# In this notebook I going to intruduce a some NLP concepts.

Dataset : https://www.kaggle.com/competitions/nlp-getting-started/data?select=train.csv

In [1]:
import pandas as pd
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import datetime

import warnings

warnings.filterwarnings('ignore')

2024-05-29 07:42:49.100326: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-29 07:42:49.131362: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df_sample = pd.read_csv('Datasets/sample_submission.csv')
df_train = pd.read_csv('Datasets/train.csv')
df_test = pd.read_csv('Datasets/test.csv')

In [3]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
X_train, X_val, y_train, y_val = train_test_split(df_train['text'],df_train['target'], test_size=0.1)

In [6]:
X_train.to_numpy()

array(['Slip Sliding Away - Flash Floods Info for Writers w/Tony Nester @SonoranRattler #writingtips http://t.co/sLTtOrRLHs',
       'Eating takis then rubbing my eyes with my hands now my eyes are bleeding tears',
       'Riot Kit Bah - part of the new concept Gear coming for Autumn/Winter\n#menswear #fashion #urbanfashion\x89Û_ https://t.co/cCwzDTFbUS',
       ..., 'FINALLY a storm',
       "Erdogan's Bloody Gambit: on July 20 a suicide bombing in Turkey took the lives of 31 socialists in SuruÌ¤ http://t.co/z6xAUIDRXu @Shareaholic",
       'i hit my foot now my toe is bleeding ??'], dtype=object)

## How to vectorize and embed texts ?

In [7]:
from tensorflow.keras.layers import TextVectorization, Embedding


In [8]:
text_vectorizer =TextVectorization(max_tokens=10000,output_sequence_length=20)

2024-05-29 07:42:50.553173: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-29 07:42:50.584771: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-29 07:42:50.588786: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [9]:
text_vectorizer.adapt(X_train.to_numpy())

In [10]:
text_vectorizer(['My god, my house is in fire in this moment !!'])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[ 13, 222,  13, 285,   9,   4,  45,   4,  19, 933,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]])>

In [11]:
words_vocab = text_vectorizer.get_vocabulary()

In [12]:
words_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

In [13]:
embedding = Embedding(input_dim=10000, output_dim = 128)

In [14]:
X_train.to_numpy()[354]

'When you see your crush in the stands. (Vine by @KhadiDon) https://t.co/aSooPcYgwn'

In [15]:
embedding(text_vectorizer(X_train.to_numpy()[354]))[4]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.00913352,  0.02377174,  0.04021836,  0.00869515,  0.0104376 ,
        0.00892221, -0.00638318, -0.00819924, -0.02994398,  0.01725677,
       -0.04240071, -0.00279144,  0.0239603 ,  0.02495842, -0.04263813,
        0.02598711, -0.04152074, -0.04888999, -0.03264915,  0.04364434,
        0.04178325,  0.02318884, -0.04952515,  0.01811874, -0.00768703,
        0.03999123, -0.03430386, -0.04351454, -0.00835433,  0.04019778,
       -0.03664168, -0.04157631,  0.02191978,  0.047317  ,  0.00606273,
        0.04290285, -0.00894326,  0.03986287, -0.00946856, -0.00195   ,
       -0.02210127, -0.02409345,  0.02011472,  0.03267124, -0.01158297,
        0.01119876,  0.02479259,  0.00406191, -0.0238199 , -0.03843447,
        0.02624217,  0.01584421, -0.04218714,  0.02563209, -0.00347221,
       -0.00682299, -0.03175913,  0.0230809 , -0.00095618, -0.00468017,
       -0.019769  , -0.00910317, -0.03942901,  0.04337   , -0.03717401,
        0.034362

In [16]:
embedding(text_vectorizer(X_train.to_numpy()[354]))[5]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.00185535, -0.04480417, -0.04984268,  0.02122095,  0.01086881,
       -0.0409892 ,  0.04681208, -0.00524715, -0.04833147, -0.0239215 ,
       -0.04086485,  0.03911367, -0.00180814,  0.00273214,  0.00865927,
       -0.0364236 ,  0.01542009,  0.04647256, -0.03419872,  0.00386766,
        0.00321293, -0.04728035,  0.00191872, -0.02267531,  0.04297749,
       -0.01229533,  0.04946164, -0.01120789, -0.03117949, -0.04429674,
       -0.03149416,  0.02351351, -0.0378134 ,  0.02010048,  0.02287867,
       -0.01673215,  0.02468059,  0.00112979, -0.01094055, -0.03041484,
        0.01363974, -0.03914993,  0.03228152, -0.01739683,  0.0123139 ,
        0.01869395,  0.01167048,  0.02522956,  0.00600256,  0.03438168,
       -0.01197867,  0.02227935,  0.00586729,  0.02173677, -0.04280074,
        0.04920441, -0.0260103 , -0.02364537, -0.03780488,  0.01900494,
       -0.04627546,  0.0339686 ,  0.02595131,  0.04991647, -0.01613889,
        0.024705

## Creating a baseline

In [17]:
model_0 = Pipeline([
                     ("vectorizer", CountVectorizer()),
                    ("tfidf",TfidfTransformer()),
                    ("clf",MultinomialNB())
])

model_0.fit(X_train,y_train)

In [18]:
model_0_base = model_0.score(X_val,y_val)

print(f"the baseline model accuracy was : {model_0_base*100:.2f}%")

the baseline model accuracy was : 79.40%


In [19]:
def model_evaluation(y_true,y_pred):
    model_accuracy = accuracy_score(y_true,y_pred)
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true,y_pred,average='weighted')

    results_dict = {"accuracy":model_accuracy,
                    "precision":model_precision,
                    "recall":model_recall,
                    "f1_score":model_f1}
    return results_dict

### Evaluating the model in traning

In [20]:
y_pred = model_0.predict(X_val)
base_line = model_evaluation(y_val,y_pred)
base_line

{'accuracy': 0.7939632545931758,
 'precision': 0.811788277447686,
 'recall': 0.7939632545931758,
 'f1_score': 0.7857293435263886}

### Evaluating the model in real prediction

In [21]:
y_pred_test = model_0.predict(df_test.text)
model_evaluation(df_sample.target,y_pred_test)

{'accuracy': 0.7048728164266013,
 'precision': 1.0,
 'recall': 0.7048728164266013,
 'f1_score': 0.8268919647672119}

In [22]:
def create_tensorboard_callback(dir_name, experiment_name):
  """
  Creates a TensorBoard callback instand to store log files.

  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
  """
  log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback

# A Dense model

In [23]:
SAVE_DIR = "logs"

inputs = tf.keras.layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x) 
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model_1 = tf.keras.Model(inputs,output, name="model_1_dense")
model_1.summary()

In [24]:
model_1.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [25]:
model_1.fit(X_train,y_train,
            epochs=5,
            validation_data=(X_val,y_val),
            callbacks = [create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name="model_1_dense")])

Saving TensorBoard log files to: logs/model_1_dense/20240529-074251
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6065 - loss: 0.6555 - val_accuracy: 0.7874 - val_loss: 0.5384
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8137 - loss: 0.4760 - val_accuracy: 0.8005 - val_loss: 0.4592
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8620 - loss: 0.3685 - val_accuracy: 0.8045 - val_loss: 0.4458
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8786 - loss: 0.3084 - val_accuracy: 0.7992 - val_loss: 0.4479
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9021 - loss: 0.2560 - val_accuracy: 0.7874 - val_loss: 0.4664


<keras.src.callbacks.history.History at 0x7cd8c9ca9ff0>

In [26]:
y_pred = tf.round(model_1.predict(X_val).squeeze())

model_evaluation(y_val,y_pred)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


{'accuracy': 0.7874015748031497,
 'precision': 0.7873275013853044,
 'recall': 0.7874015748031497,
 'f1_score': 0.7855499594104705}

In [27]:
base_line

{'accuracy': 0.7939632545931758,
 'precision': 0.811788277447686,
 'recall': 0.7939632545931758,
 'f1_score': 0.7857293435263886}

# A LTSM model

In [28]:
inputs = tf.keras.layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(64)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model_2 = tf.keras.Model(inputs,outputs, name="model_2")


In [29]:
model_2.summary()

In [30]:
model_2.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [31]:
model_2.fit(X_train,y_train,
                epochs=5,
                validation_data=(X_val,y_val),
                callbacks = [create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name="model_2_LSTM")])

Saving TensorBoard log files to: logs/model_2_LSTM/20240529-074255
Epoch 1/5
[1m 32/215[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6822 - loss: 0.5608

2024-05-29 07:42:57.009349: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8600 - loss: 0.3212 - val_accuracy: 0.7861 - val_loss: 0.5131
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9399 - loss: 0.1622 - val_accuracy: 0.7874 - val_loss: 0.6163
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9522 - loss: 0.1348 - val_accuracy: 0.7795 - val_loss: 0.7495
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9566 - loss: 0.1123 - val_accuracy: 0.7612 - val_loss: 0.8497
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9658 - loss: 0.0882 - val_accuracy: 0.7638 - val_loss: 0.9624


<keras.src.callbacks.history.History at 0x7cd8b009bd90>

In [32]:
y_pred = tf.round(model_2.predict(X_val).squeeze())

model_evaluation(y_val,y_pred)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


{'accuracy': 0.7637795275590551,
 'precision': 0.7630005268606506,
 'recall': 0.7637795275590551,
 'f1_score': 0.7620079369273471}

In [42]:
inputs = tf.keras.layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(128, return_sequences=True)(x)
x = tf.keras.layers.GRU(128)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model_3 = tf.keras.Model(inputs,outputs, name="model_2")

In [43]:
model_3.summary()

In [44]:
model_3.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [45]:
model_3.fit(X_train,y_train,
                epochs=20,
                validation_data=(X_val,y_val),
                callbacks = [create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name="model_3_LSTM")])

Saving TensorBoard log files to: logs/model_3_LSTM/20240529-074446
Epoch 1/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8900 - loss: 0.2170 - val_accuracy: 0.7402 - val_loss: 0.8149
Epoch 2/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9837 - loss: 0.0414 - val_accuracy: 0.7388 - val_loss: 1.1961
Epoch 3/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9843 - loss: 0.0338 - val_accuracy: 0.7336 - val_loss: 1.7169
Epoch 4/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9810 - loss: 0.0356 - val_accuracy: 0.7375 - val_loss: 1.7065
Epoch 5/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9836 - loss: 0.0287 - val_accuracy: 0.7126 - val_loss: 1.5693
Epoch 6/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9837 - loss: 0.0288 - v

<keras.src.callbacks.history.History at 0x7cd80e22a7d0>

In [46]:
y_pred = tf.round(model_3.predict(X_val).squeeze())

model_evaluation(y_val,y_pred)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


{'accuracy': 0.7086614173228346,
 'precision': 0.7126113510851406,
 'recall': 0.7086614173228346,
 'f1_score': 0.7096820592139776}