In [85]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [86]:
stop_words_list = nltk.corpus.stopwords.words("english")
lemma_nltk = WordNetLemmatizer()

In [87]:
data = pd.read_csv("J:/Data science/data/NLP/disaster/train.csv")

In [88]:
df = data.sample(frac=1 , random_state=42)
df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [89]:
df.shape

(7613, 5)

In [90]:
df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [91]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    text = " ".join([lemma_nltk.lemmatize(word) for word in tokens if word not in stop_words_list])
    return text

In [92]:
df['text'] = df['text'].apply(lambda x:clean_text(x))

In [93]:
x = df['text']
y = df['target']

In [94]:
x.head()

2644            new weapon cause unimaginable destruction
2227    famping thing gishwhes got soaked deluge going...
5448    dt georgegalloway rt galloway4mayor ûïthe col...
132     aftershock back school kick great want thank e...
6845    response trauma child addict develop defensive...
Name: text, dtype: object

In [95]:
y.head()

2644    1
2227    0
5448    1
132     0
6845    0
Name: target, dtype: int64

In [96]:
from sklearn.model_selection import train_test_split
x_train ,x_test ,y_train ,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [97]:
words = sum([len(i.split()) for i in x_train])
words

53333

In [98]:
len(x_train)

5329

In [99]:
avg = words / len(x_train)
round(avg)

10

In [100]:
max_tokens = 10000
max_sent_length = 10

In [101]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
text_vector = TextVectorization(max_tokens = max_tokens,
                                output_sequence_length=max_sent_length)

In [102]:
text_vector.adapt(x_train)

In [103]:
sample_sent = "i love you"
text_vector([sample_sent])

<tf.Tensor: shape=(1, 10), dtype=int64, numpy=array([[ 1, 57,  1,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)>

# word Ebedding

In [104]:
from tensorflow.keras import layers
tf.random.set_seed(42)
embedding = layers.Embedding(input_dim=max_tokens,
                             input_length=max_sent_length,
                             output_dim = 128,
                             name = "Embedding_1"
                             )
embedding

<keras.layers.core.embedding.Embedding at 0x2528bd9d3a0>

In [105]:
import random
random_sent = random.choice(x_train)
sample_embedding = embedding(text_vector([random_sent]))
sample_embedding

<tf.Tensor: shape=(1, 10, 128), dtype=float32, numpy=
array([[[ 0.03567096,  0.02025517,  0.04264439, ..., -0.03750698,
         -0.01763681,  0.04190821],
        [ 0.01363448,  0.01463579, -0.00829039, ...,  0.02082897,
         -0.0216069 ,  0.00767861],
        [-0.01808893,  0.03785774, -0.00993693, ...,  0.01176222,
         -0.03872883, -0.0382408 ],
        ...,
        [-0.01238489, -0.01569571,  0.04614357, ...,  0.00714378,
         -0.04799243, -0.04700608],
        [ 0.00593823,  0.03414381, -0.03037226, ..., -0.04380504,
          0.00468301, -0.03307756],
        [ 0.03977952, -0.03782602, -0.03646283, ...,  0.00236253,
          0.03332629,  0.02803668]]], dtype=float32)>

In [106]:
sample_embedding[0]

<tf.Tensor: shape=(10, 128), dtype=float32, numpy=
array([[ 0.03567096,  0.02025517,  0.04264439, ..., -0.03750698,
        -0.01763681,  0.04190821],
       [ 0.01363448,  0.01463579, -0.00829039, ...,  0.02082897,
        -0.0216069 ,  0.00767861],
       [-0.01808893,  0.03785774, -0.00993693, ...,  0.01176222,
        -0.03872883, -0.0382408 ],
       ...,
       [-0.01238489, -0.01569571,  0.04614357, ...,  0.00714378,
        -0.04799243, -0.04700608],
       [ 0.00593823,  0.03414381, -0.03037226, ..., -0.04380504,
         0.00468301, -0.03307756],
       [ 0.03977952, -0.03782602, -0.03646283, ...,  0.00236253,
         0.03332629,  0.02803668]], dtype=float32)>

In [107]:
sample_embedding[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.03567096,  0.02025517,  0.04264439,  0.02908177, -0.01494061,
        0.04725884, -0.02846882, -0.04813309,  0.01245219, -0.03131226,
        0.01051384,  0.04916995, -0.03936596,  0.00277791,  0.03728781,
        0.0211118 ,  0.04353425, -0.01747558,  0.0106712 , -0.01926885,
       -0.00797834,  0.04189848, -0.00493519,  0.04509734, -0.02319825,
       -0.02310714,  0.04454556,  0.04610752,  0.00197711, -0.00327367,
       -0.02855313, -0.04643349, -0.01936264, -0.01700542, -0.02423441,
       -0.03488116, -0.00625985,  0.0282595 , -0.02348395, -0.04004522,
       -0.01669728, -0.01591397, -0.0194787 , -0.00074703, -0.002533  ,
       -0.01032245, -0.03525096, -0.00737032, -0.04728321, -0.04323356,
       -0.00975309, -0.02156738, -0.0441628 ,  0.02443285, -0.0375182 ,
       -0.00805396,  0.04794754, -0.01009747, -0.04560684,  0.03612586,
        0.04411975, -0.04938265,  0.00379644,  0.04981407, -0.01356051,
        0.039482

In [108]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix ,accuracy_score,precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [109]:
model_0 = Pipeline([
                    ("tfidf" , TfidfVectorizer()),
                    ("nb" , MultinomialNB())
])
model_0.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])

In [110]:
y_pred = model_0.predict(x_test)
y_pred[:10]

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0], dtype=int64)

In [111]:
score_naive = accuracy_score(y_test,y_pred)
score_naive

0.7915936952714536

In [112]:
def calculate_results(y_test,y_pred):
    model_acc = accuracy_score(y_test,y_pred)*100
    model_precition , model_recall , model_f1, _ = precision_recall_fscore_support(y_test,
                                                                                  y_pred,
                                                                                  average="weighted")
    model_results = {"accuracy" : model_acc,
                     "precition" : model_precition,
                     "recall" : model_recall,
                     "f1_score" : model_f1}
    return model_results

In [113]:
baseline_model = calculate_results(y_test,y_pred)
baseline_model

{'accuracy': 79.15936952714536,
 'precition': 0.8057580492418782,
 'recall': 0.7915936952714536,
 'f1_score': 0.785415143145281}

In [114]:
x.head()

2644            new weapon cause unimaginable destruction
2227    famping thing gishwhes got soaked deluge going...
5448    dt georgegalloway rt galloway4mayor ûïthe col...
132     aftershock back school kick great want thank e...
6845    response trauma child addict develop defensive...
Name: text, dtype: object

In [115]:
x_train.head()

2470    modiministry railway minister prabhu call mp d...
3455    man squeeze another man head bare hand literal...
1977    drug alcohol jackson vroman house httptco5oqhq...
7216    danagould waynesteratl agree background check ...
1028    know hate body buy 2 bag chip variety pack fru...
Name: text, dtype: object

In [116]:
text_vector(x)

<tf.Tensor: shape=(7613, 10), dtype=int64, numpy=
array([[  10,   73,  171, ...,    0,    0,    0],
       [   1,  126,    1, ..., 7718, 6029, 1718],
       [4437,    1,   50, ..., 1938, 7573, 8756],
       ...,
       [7850,    1, 1657, ...,  881, 4666, 3036],
       [ 177,   26,  363, ..., 1158,  575,    1],
       [1806, 1898, 2329, ...,   15,    1,    0]], dtype=int64)>

In [128]:
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,) , dtype="string")
x = text_vector(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs,outputs , name="model_1_dense")

In [129]:
model_1.compile(loss = tf.keras.losses.binary_crossentropy,
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

In [130]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 10)               0         
 ectorization)                                                   
                                                                 
 Embedding_1 (Embedding)     (None, 10, 128)           1280000   
                                                                 
 global_average_pooling1d_4   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [132]:
model_1_history = model_1.fit(x_train,y_train,epochs=5)
model_1_history

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2528d685760>

In [136]:
model_1.evaluate(x_test,y_test)



[0.48780906200408936, 0.792469322681427]

In [134]:
score_model_1 = calculate_results(y_test,y_pred)
score_naive

0.7915936952714536

In [137]:
model_1_pred_probs = model_1.predict(x_test)
model_1_pred_probs[:5]



array([[0.58484423],
       [0.9150744 ],
       [0.9907011 ],
       [0.08557672],
       [0.04157984]], dtype=float32)

In [143]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
# squeeze removes single dimensions
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [147]:
score_model_1 = calculate_results(y_test,model_1_preds)
score_model_1

{'accuracy': 79.24693520140104,
 'precition': 0.7943083059920605,
 'recall': 0.7924693520140105,
 'f1_score': 0.7903047845302098}

In [148]:
def compare_baseline_to_new_results(baseline_results, new_model_results):
    for key, value in baseline_results.items():
        print(f"Baseline {key}: {value:.2f}, New {key}: {new_model_results[key]:.2f}, Difference: {new_model_results[key]-value:.2f}")

In [152]:
compare_baseline_to_new_results(baseline_results=baseline_model, new_model_results=score_model_1)

Baseline accuracy: 79.16, New accuracy: 79.25, Difference: 0.09
Baseline precition: 0.81, New precition: 0.79, Difference: -0.01
Baseline recall: 0.79, New recall: 0.79, Difference: 0.00
Baseline f1_score: 0.79, New f1_score: 0.79, Difference: 0.00
