In [380]:
import numpy as np
import pandas as pd
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding,Concatenate,Input,LSTM,Dense,Dropout,GRU
from tensorflow.keras.metrics import F1Score
from sklearn.model_selection import train_test_split
from itertools import chain

In [367]:
# Read the labelled training set and the unlabelled test set from the
# working directory, then preview the first rows of the training frame.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [375]:
# Build the word -> integer index from the training tweets only.
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(train_data['text'])

# Report one more than the number of distinct words in the index; the same
# quantity is used later as the Embedding layer's input dimension.
vocab_size = len(text_tokenizer.word_index) + 1
print('Vocabulary size:', vocab_size)

Vocabulary size: 22701


In [376]:
# Encode every tweet as a list of integer word indices using the fitted tokenizer.
train_encoded = text_tokenizer.texts_to_sequences(train_data['text'])
test_encoded = text_tokenizer.texts_to_sequences(test_data['text'])

# Pad at the end of each sequence. The training set determines the padded
# length; the test set is forced to that same length so both arrays have the
# same number of columns.
train_sequences = np.array(pad_sequences(train_encoded, padding='post'))
padded_length = train_sequences.shape[1]
test_sequences = np.array(
    pad_sequences(test_encoded, padding='post', maxlen=padded_length)
)

124


In [377]:
# Sanity check: both padded arrays should report the same second dimension.
print(*(padded.shape for padded in (train_sequences, test_sequences)))

(7613, 33) (3263, 33)


In [378]:
# Hold out 20% of the labelled data for validation.
# NOTE: x_test / y_test are the *validation* split of train.csv, not the
# unlabelled test.csv data (that lives in `test_sequences`).
#   - random_state pins the shuffle so the split (and every metric computed
#     from it) is reproducible across kernel restarts.
#   - stratify keeps the 0/1 target ratio the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    train_sequences,
    train_data['target'],
    test_size=0.2,
    random_state=42,
    stratify=train_data['target'],
)

In [397]:
# Embedding input dimension: one more than the number of words in the
# tokenizer's index.
embedding_input_dim = len(text_tokenizer.word_index) + 1

# Two stacked LSTMs over learned 150-d word embeddings, followed by a single
# sigmoid unit for the binary target.
model = Sequential()
model.add(Embedding(embedding_input_dim, 150))
model.add(LSTM(250, return_sequences=True))  # emit the full sequence for the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(100))                         # emit only the final state
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=[F1Score(threshold=0.5)],
)

In [401]:
# Stop on the *validation* F1, not the training F1. Monitoring 'f1_score'
# tracks the training metric, which keeps climbing while the model overfits,
# so early stopping and restore_best_weights would select an overfit epoch.
# 'val_f1_score' is the name Keras logs for the same metric evaluated on
# validation_data.
early_stopping = EarlyStopping(
    monitor='val_f1_score',
    mode='max',              # higher F1 is better
    patience=5,              # tolerate 5 epochs without improvement
    verbose=1,
    restore_best_weights=True,
)

# NOTE(review): re-running this cell without re-running the model-definition
# cell continues training the already-fitted model; rebuild the model first
# for a clean run.
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    validation_data=(x_test, y_test),
    verbose=1,
    callbacks=[early_stopping],
)

Epoch 1/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - f1_score: 0.9954 - loss: 0.0135 - val_f1_score: 0.6968 - val_loss: 1.1753
Epoch 2/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - f1_score: 0.9909 - loss: 0.0169 - val_f1_score: 0.7151 - val_loss: 1.4210
Epoch 3/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - f1_score: 0.9964 - loss: 0.0102 - val_f1_score: 0.6942 - val_loss: 1.3890
Epoch 4/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - f1_score: 0.9933 - loss: 0.0071 - val_f1_score: 0.6888 - val_loss: 1.8253
Epoch 5/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - f1_score: 0.9949 - loss: 0.0055 - val_f1_score: 0.6910 - val_loss: 1.8132
Epoch 6/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 49ms/step - f1_score: 0.9953 - loss: 0.0064 - val_f1_score: 0.6921 - val_loss: 2.1232
Epoch 7/20
[1m19

In [402]:
# Predicted probabilities for the unlabelled test tweets; the model ends in a
# single sigmoid unit, so each row holds one probability.
pred = model.predict(test_sequences)

# Flatten explicitly and threshold in one vectorised step instead of relying
# on the implicit truth value of a size-1 array per row. The result stays a
# plain list of 0/1 ints.
y_pred = (np.asarray(pred).ravel() >= 0.5).astype(int).tolist()

# Write the submission file with one row per test id.
submission = pd.DataFrame({'id': test_data['id'], 'target': y_pred})
submission.to_csv('submission.csv', index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
