In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
stop_words_list = nltk.corpus.stopwords.words("english")
lemma_nltk = WordNetLemmatizer()

In [3]:
data = pd.read_csv("J:/Data science/data/NLP/disaster/train.csv")

In [4]:
df = data.sample(frac=1 , random_state=42)
df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
df.shape

(7613, 5)

In [6]:
df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    text = " ".join([lemma_nltk.lemmatize(word) for word in tokens if word not in stop_words_list])
    return text

In [8]:
df['text'] = df['text'].apply(lambda x:clean_text(x))

In [9]:
x = df['text']
y = df['target']

In [10]:
x.head()

2644            new weapon cause unimaginable destruction
2227    famping thing gishwhes got soaked deluge going...
5448    dt georgegalloway rt galloway4mayor ûïthe col...
132     aftershock back school kick great want thank e...
6845    response trauma child addict develop defensive...
Name: text, dtype: object

In [11]:
y.head()

2644    1
2227    0
5448    1
132     0
6845    0
Name: target, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
x_train ,x_test ,y_train ,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [13]:
sum(set([len(i.split()) for i in x_train]))

325

In [14]:
words = sum([len(i.split()) for i in x_train])
words

53333

In [15]:
words = sum([len(i.split()) for i in x_train])
words

53333

In [16]:
len(x_train)

5329

In [17]:
avg = words / len(x_train)
round(avg)

10

In [18]:
max_tokens = 10000
max_sent_length = 10

In [19]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [20]:
text_vectorizer = TextVectorization(max_tokens=max_tokens,
                                    output_sequence_length=max_sent_length)

In [21]:
text_vectorizer.adapt(x_train)

In [22]:
from tensorflow.keras import layers
tf.random.set_seed(42)
embedding = layers.Embedding(input_dim=max_tokens,
                             input_length=max_sent_length,
                             output_dim=64)          

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,TextVectorization, Flatten, LSTM, Dropout, Activation, Embedding

In [26]:
model = Sequential([
    tf.keras.Input(shape=(1,) , dtype="string"),
    text_vectorizer,
    embedding,
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1 , activation="sigmoid")
])

In [27]:
model.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer = tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 10, 64)            640000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 673,089
Trainable params: 673,089
Non-trainable params: 0
_________________________________________________________________


In [29]:
model_history = model.fit(x_train,y_train,epochs=5)
model_history

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x24c744fdaf0>

In [30]:
model.evaluate(x_test,y_test)



[0.9691628217697144, 0.7408056259155273]

In [32]:
model2 = Sequential([
    tf.keras.Input(shape=(1,) , dtype="string"),
    text_vectorizer,
    embedding,
    tf.keras.layers.LSTM(64 , return_sequences=True),
    tf.keras.layers.LSTM(64),
    layers.Dense(1,activation="sigmoid")
])

In [33]:
model2.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer = tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [34]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 10, 64)            640000    
                                                                 
 lstm_4 (LSTM)               (None, 10, 64)            33024     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 706,113
Trainable params: 706,113
Non-trainable params: 0
________________________________________________

In [35]:
model2_history = model.fit(x_train,y_train,epochs=5)
model2_history

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x24c051aabb0>

In [36]:
model2.evaluate(x_test,y_test)



[0.6907997727394104, 0.7022767066955566]

In [65]:
model3 = Sequential([
    tf.keras.Input(shape=(1,) , dtype="string"),
    text_vectorizer,
    embedding,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(32 , activation="relu"),
    tf.keras.layers.Dense(1 , activation="sigmoid")
])

In [66]:
model3.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer = tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [67]:
model3.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 10, 64)            640000    
                                                                 
 bidirectional_8 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_14 (Dense)            (None, 32)                4128      
                                                                 
 dense_15 (Dense)            (None, 1)                 33        
                                                                 
Total params: 710,209
Trainable params: 710,209
Non-tr

In [68]:
model3_history = model3.fit(x_train,y_train,epochs=5)
model3_history

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x24c0e3cd250>

In [69]:
model3.evaluate(x_test,y_test)



[1.9970803260803223, 0.710595428943634]

In [82]:
model4 = Sequential([
    tf.keras.Input(shape=(1,) , dtype="string"),
    text_vectorizer,
    embedding,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64 , return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64 , activation="relu"),
    tf.keras.layers.Dense(1 , activation="sigmoid")
])

In [83]:
model4.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer = tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [84]:
model4.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 10, 64)            640000    
                                                                 
 bidirectional_11 (Bidirecti  (None, 10, 128)          66048     
 onal)                                                           
                                                                 
 bidirectional_12 (Bidirecti  (None, 64)               41216     
 onal)                                                           
                                                                 
 dense_18 (Dense)            (None, 64)                4160      
                                                     

In [85]:
model3_history = model4.fit(x_train,y_train,epochs=5)
model3_history

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x24c1a3f2460>

In [86]:
model4.evaluate(x_test,y_test)



[1.5362179279327393, 0.7263572812080383]