In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
stop_words_list = nltk.corpus.stopwords.words("english")
lemma_nltk = WordNetLemmatizer()

In [4]:
data = pd.read_csv("J:/Data science/data/NLP/disaster/train.csv")

In [5]:
df = data.sample(frac=1 , random_state=42)
df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [6]:
df.shape

(7613, 5)

In [7]:
df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    text = " ".join([lemma_nltk.lemmatize(word) for word in tokens if word not in stop_words_list])
    return text

In [9]:
df['text'] = df['text'].apply(lambda x:clean_text(x))

In [10]:
x = df['text']
y = df['target']

In [11]:
x.head()

2644            new weapon cause unimaginable destruction
2227    famping thing gishwhes got soaked deluge going...
5448    dt georgegalloway rt galloway4mayor ûïthe col...
132     aftershock back school kick great want thank e...
6845    response trauma child addict develop defensive...
Name: text, dtype: object

In [12]:
y.head()

2644    1
2227    0
5448    1
132     0
6845    0
Name: target, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
x_train ,x_test ,y_train ,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [14]:
sum(set([len(i.split()) for i in x_train]))

325

In [15]:
words = sum([len(i.split()) for i in x_train])
words

53333

In [16]:
words = sum([len(i.split()) for i in x_train])
words

53333

In [17]:
len(x_train)

5329

In [18]:
avg = words / len(x_train)
round(avg)

10

In [20]:
max_tokens = 10000
max_sent_length = 10

In [21]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [22]:
text_vectorizer = TextVectorization(max_tokens=max_tokens,
                                    output_sequence_length=max_sent_length)

In [23]:
text_vectorizer.adapt(x_train)

In [25]:
from tensorflow.keras import layers
tf.random.set_seed(42)
embedding = layers.Embedding(input_dim=max_tokens,
                             input_length=max_sent_length,
                             output_dim=64)          

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,TextVectorization, Flatten, LSTM, Dropout, Activation, Embedding

In [34]:
model = Sequential([
    tf.keras.Input(shape=(1,) , dtype="string"),
    text_vectorizer,
    embedding,
    tf.keras.layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation="sigmoid")
])

In [35]:
model.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer = tf.keras.optimizers.Adam(),
             metrics=["accuracy"])

In [37]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 10, 64)            640000    
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 640,065
Trainable params: 640,065
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.fit(x_train,y_train,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f607fcf7f0>

In [39]:
model.evaluate(x_test,y_test)



[0.4566921889781952, 0.7964097857475281]

In [41]:
model_2 = Sequential([
    tf.keras.Input(shape=(1,) , dtype="string"),
    text_vectorizer,
    embedding,
    tf.keras.layers.SimpleRNN(64),
    layers.Dense(1, activation="sigmoid")
])

In [42]:
model_2.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer = tf.keras.optimizers.Adam(),
             metrics=["accuracy"])

In [44]:
model_2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 10, 64)            640000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 648,321
Trainable params: 648,321
Non-trainable params: 0
_________________________________________________________________


In [45]:
model_2.fit(x_train,y_train,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f608b36a00>

In [48]:
model_2.evaluate(x_test,y_test)



[0.8105838298797607, 0.7679509520530701]

In [50]:
model_3 = Sequential([
    tf.keras.Input(shape=(1,) , dtype="string"),
    text_vectorizer,
    embedding,
    tf.keras.layers.SimpleRNN(64,return_sequences=True), # لو معملتهاش بترو هيجيب ايرور
    tf.keras.layers.SimpleRNN(64),
    layers.Dense(1, activation="sigmoid")
])

In [51]:
model_3.compile(loss=tf.keras.losses.binary_crossentropy,
              optimizer = tf.keras.optimizers.Adam(),
             metrics=["accuracy"])

In [52]:
model_3.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 10)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 10, 64)            640000    
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 10, 64)            8256      
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 656,577
Trainable params: 656,577
Non-trainable params: 0
________________________________________________

In [53]:
model_3.fit(x_train,y_train,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1f60d316430>

In [54]:
model_3.evaluate(x_test,y_test)



[1.0200862884521484, 0.74956214427948]

In [58]:
y_probs_model3 = model_3.predict(x_test)
y_probs_model3



array([[9.9675411e-01],
       [9.9937171e-01],
       [9.9970257e-01],
       ...,
       [9.9699396e-01],
       [8.3379226e-04],
       [9.9948221e-01]], dtype=float32)

In [56]:
y_probs_model3 = model_3.predict(x_test)
y_probs_model3[:3]



array([[0.9967541],
       [0.9993717],
       [0.9997026]], dtype=float32)

In [59]:
y_pred_model3 = tf.squeeze(tf.round(y_probs_model3))
y_pred_model3

<tf.Tensor: shape=(2284,), dtype=float32, numpy=array([1., 1., 1., ..., 1., 0., 1.], dtype=float32)>

In [57]:
y_pred_model3 = tf.squeeze(tf.round(y_probs_model3))
y_pred_model3[:3]

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([1., 1., 1.], dtype=float32)>

In [60]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [61]:
conf = confusion_matrix(y_test , y_pred_model3)
conf

array([[986, 275],
       [297, 726]], dtype=int64)

In [62]:
accuracy_score(y_test,y_pred_model3)

0.7495621716287215