In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
pd.set_option('max_colwidth', None)

In [3]:
#lets's import the datasets
train_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/test.csv")
sample_submission_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/sample_submission.csv")


In [4]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
test_df.shape

(3263, 4)

# **Splitting the Data**
*we have separate train and test data. let's keep some train data for validation to check overfit or undefit.*


In [6]:
from sklearn.model_selection import train_test_split
X = train_df.drop("target", axis=1)
y = train_df["target"]

columns_to_remove = ["id","keyword","location"]
X = X.drop(columns=columns_to_remove)

print(X.shape)
print(y.shape)

(7613, 1)
(7613,)


In [7]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=50, stratify=y)

In [8]:
X_train["text"].head()

3885    Today (August 6th) is the 70th anniversary of A-Bomb 'Little Boy' been dropped on Hiroshima.70000 killed outright as the city was flattened
6819                                                                                          salute to all the kids still trapped in adult bodies.
2220           Back on the beach after the deluge.  Surf camp in motion.  Our Surf Therapy programme kicked off today for... http://t.co/vjsAqPxngN
4318                             The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'
Name: text, dtype: object

In [9]:
print(X_train.shape)
print(X_test.shape)

(5709, 1)
(1904, 1)


In [10]:
pd.set_option('max_colwidth', None)
train_df[train_df['target']> 0.0] 

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1


*Creating vocabulary for train and test data using keras*

In [11]:
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from tensorflow.keras.metrics import Recall, Precision
from sklearn.metrics import confusion_matrix as CFM

import re

# **Text Preprocessing**
1. We will be replacing the special characters
2. Separating the attached words. 
    eg. **ThatGuy** to **That Guy**


In [12]:
# X_train["text"].apply(word_tokenize).map(len).plot(kind="bar")
def replace_special_char(sentence):
    pattern = re.compile(r"[^A-Z \s]",flags=re.IGNORECASE)
    sentence,_ = re.subn(pattern,' ',sentence,count=50)
    return sentence

def replace_Aa_with_Aaspace(sentence, words):
    for word in words:
        sentence = sentence.replace(word, " "+word)
    return sentence
    
 
def separarate_attached_words(sentence):
    pattern = re.compile(r"[A-Z][^A-Z]*?")
    words = re.findall(pattern,sentence)
    sentence = replace_Aa_with_Aaspace(sentence, words)
    return sentence
    
# X_train["text"] = X_train["text"].apply(lambda x: x.replace('#',''))
# X_test['text'] = X_test['text'].apply(lambda x: x.replace('#',''))

def text_preprocess(text):
    text = replace_special_char(text)
    text = separarate_attached_words(text)
    text = text.lower()
    return text
    
X_train["text"] = X_train["text"].apply(text_preprocess)
X_test['text'] = X_test['text'].apply(text_preprocess)



In [13]:
X_test.head()


Unnamed: 0,text
4193,that persian guy y o u n g s a f e eden hazard as harden is spot on flopping is identical
3703,my biggest fear is disappointing the people who believe in me
4601,traffic collision no injury i s at i s rd ave offramp south sac http t co c t ej xo lpu
2925,http t co mo a q au fa jacksonville family bands together as memorial is planned for toddler who florida http t co n k ou z ww r t
1595,now playing cliff richard i could easily fall in love with you amp shadows internet nieuws radio on http t co lk m wp qzw


In [14]:
vocab_length = 20000
tokenizer_obj = Tokenizer(num_words=vocab_length,oov_token="<oov>")

tokenizer_obj.fit_on_texts(X_train['text'].values)
# We will fit the tokenizer on train and use the same on test to make sequences
# print(tokenizer_obj.word_index)

train_sequences = tokenizer_obj.texts_to_sequences(X_train['text'].values)
# representing each word in a sentence/record with index value of it in a vocab learnt buy tokenizer
test_sequences = tokenizer_obj.texts_to_sequences(X_test['text'].values)

*padding vectors/sequences generated for different length sentences to have a same length*

In [15]:
sequence_length = 25
#padding training data.
train_sequences = pad_sequences(train_sequences, maxlen=sequence_length)
# going with default padding as 'pre'

#padding testing data
test_sequences = pad_sequences(test_sequences,maxlen=sequence_length)

**Building the model**

In [16]:
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_length,128,input_length=sequence_length))
model.add(keras.layers.LSTM(64,activation = 'tanh', return_sequences=False))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(64,activation="tanh", kernel_regularizer=regularizers.l2(0.06)))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(10,activation="tanh", kernel_regularizer=regularizers.l2(0.025)))
model.add(keras.layers.BatchNormalization())
# model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1))

In [17]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss=keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

In [18]:
model.fit(train_sequences,y_train,epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x790d59a851d0>

In [19]:
def calculate_metrics(y_true, y_pred):
    precision = Precision()
    precision.update_state(y_true, y_pred)
    precision = precision.result().numpy()

    recall = Recall()
    recall.update_state(y_true, y_pred)
    recall = recall.result().numpy()

    confusion_matrix = CFM(y_true, tf.math.round(y_pred))
                                                        
    return precision, recall, confusion_matrix



In [20]:
y_train_predicted = model.predict(train_sequences)
y_train_predicted = keras.layers.Activation('sigmoid')(y_train_predicted) # since we haven't applied in final layer

precision, recall, confusion_matrix = calculate_metrics(y_train, y_train_predicted)

print(precision, recall)
print("--------------")
print(confusion_matrix)

0.995878 0.98491645
--------------
[[3246   10]
 [  37 2416]]


In [21]:
y_test_predicted = model.predict(test_sequences)
y_test_predicted = keras.layers.Activation('sigmoid')(y_test_predicted) 

precision, recall, confusion_matrix = calculate_metrics(y_test, y_test_predicted)

print(precision, recall)
print("--------------")
print(confusion_matrix)

0.6456103 0.73716384
--------------
[[755 331]
 [215 603]]


In [22]:
model.evaluate(test_sequences,y_test,batch_size=200)



[0.8397395610809326, 0.7174369692802429]

It is evident that the model is overfitted the train data. for this we should retrain the model with the dropout layers

**Predicting for the test data**

In [23]:
columns_to_remove_test = ["keyword","location"]
test_df = pd.read_csv(r"/kaggle/input/nlp-getting-started/test.csv")
test_df_to_predict= test_df.drop(columns=columns_to_remove_test)
test_df_to_predict['text'] = test_df_to_predict['text'].apply(text_preprocess)
test_submit_sequences = tokenizer_obj.texts_to_sequences(test_df_to_predict["text"].values)
test_submit_sequences_padded = pad_sequences(test_submit_sequences,maxlen=sequence_length)

In [24]:
predicted_output = model.predict(test_submit_sequences_padded,batch_size=100)
predicted_output = tf.math.sigmoid(predicted_output) # since the outputs of the model are logits

In [25]:
target_df = pd.DataFrame(data=predicted_output,columns=["target"])
target_df["target"] = target_df["target"].apply(lambda x: 0 if x<0.5 else 1)

In [26]:
test_df_predicted = pd.concat([test_df_to_predict,target_df],axis=1)
test_submission = test_df_predicted.drop(columns=["text"])

In [27]:
test_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,1
4,11,1


In [28]:
test_submission.to_csv(r"/kaggle/working/submission.csv",index=False)

In [29]:
# model.save("nlp_disaster_prediction_model.h5")