In [380]:
import numpy as np
import pandas as pd
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding,Concatenate,Input,LSTM,Dense,Dropout,GRU
from tensorflow.keras.metrics import F1Score
from sklearn.model_selection import train_test_split
from itertools import chain

In [367]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [375]:
text_tokenizer = Tokenizer()

text_tokenizer.fit_on_texts(train_data['text'])

print('Vocabulary size:',len(text_tokenizer.word_index)+1)

Vocabulary size: 22701


In [376]:
train_sequences = text_tokenizer.texts_to_sequences(train_data['text'])
test_sequences = text_tokenizer.texts_to_sequences(test_data['text'])
    
max_sequence_length = max([len(x) for x in text_input_sequences])

train_sequences = np.array(pad_sequences(train_sequences, padding='post'))
test_sequences = np.array(pad_sequences(test_sequences, padding='post', maxlen=train_sequences.shape[1]))

print(max_sequence_length)

124


In [377]:
print(train_sequences.shape, test_sequences.shape)

(7613, 33) (3263, 33)


In [378]:
x_train, x_test, y_train, y_test = train_test_split(train_sequences, train_data['target'], test_size=0.2)

In [381]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model = Sequential([
    Embedding(len(text_tokenizer.word_index)+1, 150, input_length=max_sequence_length),
    LSTM(250,return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(1, activation='sigmoid')
])

model.compile(loss="binary_crossentropy",optimizer='adam',metrics=[F1Score(threshold = 0.5)])
model.summary()



In [383]:
history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1,callbacks=[early_stopping])

Epoch 1/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - f1_score: 0.8942 - loss: 0.2469 - val_f1_score: 0.7318 - val_loss: 0.5013
Epoch 2/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - f1_score: 0.9555 - loss: 0.1309 - val_f1_score: 0.7375 - val_loss: 0.5390
Epoch 3/50
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - f1_score: 0.9722 - loss: 0.0880 - val_f1_score: 0.7077 - val_loss: 0.7708


In [388]:
pred = model.predict(test_sequences)
y_pred = [(1 if p>=0.5 else 0) for p in pred]

submission = pd.DataFrame({'id':test_data['id'], 'target':y_pred})
submission.to_csv('submission.csv', index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
