In [129]:
import numpy as np
import pandas as pd
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU
from sklearn.model_selection import train_test_split

In [110]:
# Read the labelled training set and the test set from the working directory,
# in that order.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [111]:
# --- Summary of the raw training frame -------------------------------------
print('Train Data')
print('Sample size:', len(train_data))
print('\nNaN values:\n', train_data.isna().sum())
print('\nUnique values:')
for label, column in (('keyword:', 'keyword'), ('location:', 'location')):
    # dropna=False counts NaN as one distinct value, like len(Series.unique()).
    print(label, train_data[column].nunique(dropna=False))

# --- Cleaning ----------------------------------------------------------------
# Missing keyword/location entries get an explicit placeholder string.
for column, placeholder in (('keyword', 'NoKeyword'), ('location', 'NoLocation')):
    train_data[column] = train_data[column].fillna(placeholder)

# Lower-case every string column (the placeholders are lower-cased as well).
for column in ('keyword', 'location', 'text'):
    train_data[column] = train_data[column].str.lower()

Train Data
Sample size: 7613

NaN values:
 id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

Unique values:
keyword: 222
location: 3342


In [112]:
# --- Summary of the raw test frame -----------------------------------------
print('Test Data')
print('Sample size:', len(test_data))
print('\nNaN values:\n', test_data.isna().sum())
print('\nUnique values:')
for label, column in (('keyword:', 'keyword'), ('location:', 'location')):
    # dropna=False counts NaN as one distinct value, like len(Series.unique()).
    print(label, test_data[column].nunique(dropna=False))

# --- Cleaning (same steps as applied to the training frame) -----------------
# Missing keyword/location entries get an explicit placeholder string.
for column, placeholder in (('keyword', 'NoKeyword'), ('location', 'NoLocation')):
    test_data[column] = test_data[column].fillna(placeholder)

# Lower-case every string column (the placeholders are lower-cased as well).
for column in ('keyword', 'location', 'text'):
    test_data[column] = test_data[column].str.lower()

Test Data
Sample size: 3263

NaN values:
 id             0
keyword       26
location    1105
text           0
dtype: int64

Unique values:
keyword: 222
location: 1603


In [114]:
# One independent tokenizer per input column, all with default settings.
text_tokenizer, keyword_tokenizer, location_tokenizer = (
    Tokenizer(), Tokenizer(), Tokenizer()
)

# Build each vocabulary from the training frame only.
for tokenizer, column in (
    (text_tokenizer, 'text'),
    (keyword_tokenizer, 'keyword'),
    (location_tokenizer, 'location'),
):
    tokenizer.fit_on_texts(train_data[column])

# Report vocabulary sizes; the +1 is added on top of the number of words in
# word_index.
print('Vocabulary size:')
for label, tokenizer in (
    ('Text:', text_tokenizer),
    ('Keyword:', keyword_tokenizer),
    ('Location:', location_tokenizer),
):
    print(label, len(tokenizer.word_index) + 1)

Vocabulary size:
Text: 22701
Keyword: 241
Location: 3325


In [118]:
# Encode every training row with the tokenizer fitted on its column.
#
# texts_to_sequences expects a list of texts. Each row is therefore wrapped in
# a one-element list: passing the bare string (as the text column previously
# did) makes Keras iterate over the string's characters and encode the tweet
# letter by letter instead of word by word.
#
# Each entry below is a one-element list holding that row's token ids, the
# same nested layout for all three columns.
text_input_sequences = [
    text_tokenizer.texts_to_sequences([line]) for line in train_data['text']
]

keyword_input_sequences = [
    keyword_tokenizer.texts_to_sequences([line]) for line in train_data['keyword']
]

location_input_sequences = [
    location_tokenizer.texts_to_sequences([line]) for line in train_data['location']
]

In [125]:
# Group the three encoded columns row by row: each sample is
# [text sequence, keyword sequence, location sequence].
x = [
    [text_sequence, keyword_input_sequences[row], location_input_sequences[row]]
    for row, text_sequence in enumerate(text_input_sequences)
]
# Labels in the same row order as the samples.
y = list(train_data['target'])

In [126]:
# Hold out 20% of the labelled samples for evaluation. The shuffle is seeded
# so that a fresh "Restart & Run All" reproduces the same split (and therefore
# comparable metrics) on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [128]:
# Stop training once the validation loss has not improved for 3 consecutive
# epochs, and roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)