# In this notebook I going to intruduce a some NLP concepts.

Dataset : https://www.kaggle.com/competitions/nlp-getting-started/data?select=train.csv

In [82]:
import pandas as pd
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import datetime

import warnings

warnings.filterwarnings('ignore')

In [6]:
df_sample = pd.read_csv('Datasets/sample_submission.csv')
df_train = pd.read_csv('Datasets/train.csv')
df_test = pd.read_csv('Datasets/test.csv')

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [5]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [13]:
X_train, X_val, y_train, y_val = train_test_split(df_train['text'],df_train['target'], test_size=0.1)

In [23]:
X_train.to_numpy()

array(['crushed a 6 mile run tonight. awesome',
       'Alabama home quarantined over possible Ebola case: Officials say a quarantine is in place at ... http://t.co/ztOnvgubVm #Bluehand #PJNET',
       "Tonight It's Going To Be Mayhem @ #4PlayThursdays. Everybody Free w/ Text. 1716 I ST NW (18+) http://t.co/sCu9QZp6nq",
       ...,
       "I stand alone\ndon't piss and moan\nabout my choices made\nIf I must reap the whirlwind so be it\nI'll do so with demeanor calm and staid",
       '@Erker Again?? Eep! Thought of you yesterday when I saw that hella scary hail. #armageddon?',
       'Generational \x89Û÷British schism\x89Ûª over privacy threat of drones http://t.co/dqtMTPqmBR\n  #drones #privacy http://t.co/dMsnYPtscY'],
      dtype=object)

## How to vectorize and embed texts ?

In [33]:
from tensorflow.keras.layers import TextVectorization, Embedding


In [27]:
text_vectorizer =TextVectorization(max_tokens=10000,output_sequence_length=20)

In [28]:
text_vectorizer.adapt(X_train.to_numpy())

In [29]:
text_vectorizer(['My god, my house is in fire in this moment !!'])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[ 13, 211,  13, 368,   9,   4,  43,   4,  19, 996,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]])>

In [30]:
words_vocab = text_vectorizer.get_vocabulary()

In [32]:
words_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

In [34]:
embedding = Embedding(input_dim=10000, output_dim = 128)

In [41]:
X_train.to_numpy()[354]

'Beyond all bounds; till inundation rise'

In [45]:
embedding(text_vectorizer(X_train.to_numpy()[354]))[4]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.01907905, -0.0407122 , -0.0215154 ,  0.01198933,  0.02152791,
        0.03067351, -0.03556321, -0.0065333 ,  0.02295834, -0.03823628,
       -0.01985903,  0.00056513,  0.04180951, -0.03494778,  0.04006789,
       -0.02581559,  0.01969742,  0.00437313, -0.02669712,  0.00365674,
       -0.01568324,  0.04603317,  0.00152036, -0.01918415,  0.01755692,
       -0.03701361, -0.01945529, -0.04954158,  0.01059233,  0.03069434,
        0.04056473,  0.0412131 , -0.02184223, -0.04096577, -0.02076751,
        0.00686665, -0.03202442, -0.01033839, -0.03328227,  0.005867  ,
        0.01933343,  0.01441092,  0.01163872, -0.01187   ,  0.04839936,
        0.00536287,  0.04888504, -0.00480294,  0.01996113, -0.00605484,
        0.04218319, -0.04261646, -0.03297825, -0.01536974,  0.02527202,
       -0.03286296,  0.01515731,  0.02722769, -0.0006691 , -0.00644265,
       -0.04076078,  0.00845772,  0.0300577 ,  0.04136795,  0.03705516,
        0.029047

In [46]:
embedding(text_vectorizer(X_train.to_numpy()[354]))[5]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.03855801,  0.02844906, -0.04968568,  0.02439829, -0.0335146 ,
       -0.04543892,  0.04681109,  0.01257959,  0.03344095, -0.01437332,
       -0.00024341, -0.04473727, -0.00364625,  0.01149084,  0.03589264,
       -0.00552518, -0.00506965, -0.02229854, -0.03539278, -0.04205468,
        0.03230045, -0.01153265,  0.02234424, -0.01206874,  0.0446819 ,
       -0.03950541, -0.02368436,  0.03346128,  0.02970434, -0.01693424,
       -0.00197259, -0.03769988,  0.04611614, -0.04012765,  0.03046191,
        0.04466182,  0.00591383,  0.035862  ,  0.04731469,  0.0283173 ,
        0.00097056,  0.0329938 , -0.00465686,  0.02641281,  0.00626343,
        0.00817718,  0.03491734,  0.03472303, -0.00975584, -0.03840988,
       -0.0192551 , -0.01066998,  0.00113162, -0.03391914, -0.01077908,
       -0.03372888,  0.04539695,  0.00906658, -0.02746192,  0.00361869,
        0.02804803, -0.031578  ,  0.04880694, -0.04059435, -0.02476188,
       -0.016300

## Creating a baseline

In [51]:
model_0 = Pipeline([
                     ("vectorizer", CountVectorizer()),
                    ("tfidf",TfidfTransformer()),
                    ("clf",MultinomialNB())
])

model_0.fit(X_train,y_train)

In [53]:
model_0_base = model_0.score(X_val,y_val)

print(f"the baseline model accuracy was : {model_0_base*100:.2f}%")

the baseline model accuracy was : 80.18%


In [66]:
def model_evaluation(y_true,y_pred):
    model_accuracy = accuracy_score(y_true,y_pred)
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true,y_pred,average='weighted')

    results_dict = {"accuracy":model_accuracy,
                    "precision":model_precision,
                    "recall":model_recall,
                    "f1_score":model_f1}
    return results_dict

### Evaluating the model in traning

In [94]:
y_pred = model_0.predict(X_val)
base_line = model_evaluation(y_val,y_pred)
base_line

{'accuracy': 0.8018372703412073,
 'precision': 0.8187642030321519,
 'recall': 0.8018372703412073,
 'f1_score': 0.7934458692773226}

### Evaluating the model in real prediction

In [73]:
y_pred_test = model_0.predict(df_test.text)
model_evaluation(df_sample.target,y_pred_test)

{'accuracy': 0.7036469506589028,
 'precision': 1.0,
 'recall': 0.7036469506589028,
 'f1_score': 0.8260478503327937}

In [74]:
def create_tensorboard_callback(dir_name, experiment_name):
  """
  Creates a TensorBoard callback instand to store log files.

  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
  """
  log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback

# A Dense model

In [86]:
SAVE_DIR = "model_logs"

inputs = tf.keras.layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x) 
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model_1 = tf.keras.Model(inputs,output, name="model_1_dense")
model_1.summary()

In [87]:
model_1.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [88]:
model_1.fit(X_train,y_train,
            epochs=5,
            validation_data=(X_val,y_val),
            callbacks = [create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name="model_1_dense")])

Saving TensorBoard log files to: model_logs/model_1_dense/20240528-110736
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7353 - loss: 0.6168 - val_accuracy: 0.7927 - val_loss: 0.5103
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8410 - loss: 0.4235 - val_accuracy: 0.8150 - val_loss: 0.4520
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8787 - loss: 0.3330 - val_accuracy: 0.8163 - val_loss: 0.4339
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9045 - loss: 0.2689 - val_accuracy: 0.8281 - val_loss: 0.4307
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9158 - loss: 0.2337 - val_accuracy: 0.8241 - val_loss: 0.4407


<keras.src.callbacks.history.History at 0x7f8a6c5794e0>

In [93]:
y_pred = tf.round(model_1.predict(X_val).squeeze())

model_evaluation(y_val,y_pred)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 990us/step


{'accuracy': 0.8241469816272966,
 'precision': 0.8315226901281775,
 'recall': 0.8241469816272966,
 'f1_score': 0.819781776315188}

In [95]:
base_line

{'accuracy': 0.8018372703412073,
 'precision': 0.8187642030321519,
 'recall': 0.8018372703412073,
 'f1_score': 0.7934458692773226}