In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
#lets's import the datasets
train_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/test.csv")
sample_submission_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/sample_submission.csv")


# **Splitting the Data**
*we have separate train and test data. let's keep some train data for validation to check overfit or undefit.*


In [3]:
from sklearn.model_selection import train_test_split
X = train_df.drop("target", axis=1)
y = train_df["target"]

columns_to_remove = ["id","keyword","location"]
X = X.drop(columns=columns_to_remove)

X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=100)

In [4]:
X_train["text"].head()

7556       The Twitter update pretty much wrecked the app
2549    If GOP want to destroy America then Obama is d...
5370    @elielcruz just watching the streams was bad -...
3144    When your child needs emergency care they can ...
496     #TBT Remember that time Patrick Kane attacked ...
Name: text, dtype: object

*Creating vocabulary for train and test data using keras*

In [5]:
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [6]:
# X_train["text"].apply(word_tokenize).map(len).plot(kind="bar")

In [7]:
vocab_length = 1000
tokenizer_obj = Tokenizer(num_words=vocab_length,oov_token="<oov>")

tokenizer_obj.fit_on_texts(X_train['text'].values)
# We will fit the tokenizer on train and use the same on test to make sequences
# print(tokenizer_obj.word_index)

train_sequences = tokenizer_obj.texts_to_sequences(X_train['text'].values)
# representing each word in a sentence/record with index value of it in a vocab learnt buy tokenizer
test_sequences = tokenizer_obj.texts_to_sequences(X_test['text'].values)

*padding vectors/sequences generated for different length sentences to have a same length*

In [8]:
sequence_length = 25
#padding training data.
train_sequences = pad_sequences(train_sequences, maxlen=sequence_length)
# going with default padding as 'pre'

#padding testing data
test_sequences = pad_sequences(test_sequences,maxlen=sequence_length)

**Building the model**

In [9]:
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_length,128,input_length=sequence_length))
model.add(keras.layers.LSTM(64,return_sequences=False))
model.add(keras.layers.Dense(32,activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1,activation="sigmoid"))

In [10]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[keras.metrics.BinaryAccuracy()])

In [11]:
model.fit(train_sequences,y_train,epochs=15,batch_size=100)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x74adf65b9350>

In [12]:
model.evaluate(test_sequences,y_test,batch_size=200)



[1.0421454906463623, 0.7550886273384094]

It is evident that the model is overfitted the train data. for this we should retrain the model with the dropout layers

**Predicting for the test data**

In [13]:
columns_to_remove_test = ["keyword","location"]
test_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/test.csv")
test_df_to_predict= test_df.drop(columns=columns_to_remove_test)
test_submit_sequences = tokenizer_obj.texts_to_sequences(test_df_to_predict["text"].values)
test_submit_sequences_padded = pad_sequences(test_submit_sequences,maxlen=sequence_length)

predicted_output = model.predict(test_submit_sequences_padded,batch_size=100)

In [14]:
target_df = pd.DataFrame(data=predicted_output,columns=["target"])
target_df["target"] = target_df["target"].apply(lambda x: 0 if x<0.5 else 1)

In [15]:
test_df_predicted = pd.concat([test_df_to_predict,target_df],axis=1)
test_submission = test_df_predicted.drop(columns=["text"])

In [16]:
test_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [17]:
test_submission.to_csv(r"/kaggle/working/submission.csv",index=False)

In [18]:
model.save("nlp_disaster_prediction_model.h5")