# NLP fundametals
Steps involved :
 - Convert the words to numercial sequences
 - Create the word-numerical sequence matrix, which shows the word vector dense matrix
 - Build the RNN network (LSTM)
 - Compile, calssify

In [1]:
import os
import tensorflow as tf
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow_hub as hub
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split

In [2]:
def create_tensorboard_callback(experiment_name, model_name):
    """
    Create an tensorboard callback.
    """
    return tf.keras.callbacks.TensorBoard(log_dir=os.path.join("model_logs", experiment_name, model_name))

In [3]:
def create_model_checkpoint_callback(experiment_name, model_name):
    """
     Create an Model Checkpoint callback
    """
    return tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join("model_checkpoints", experiment_name, model_name+".ckpt"),
                                              save_weights_only=True, monitor="val_acc", 
                                              save_best_only=True)

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results_score(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred)*100
    print(f"[calculate_results_score] the accuracy is :: {model_accuracy}")
    model_precision, model_recall, model_f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    print(f"[calculate_results_score] The precision is : {model_precision}")
    print(f"[calculate_results_score] The recall is : {model_recall}")
    print(f"[calculate_results_score] The f1 score is : {model_f1_score}")
    return {
        "accuracy": model_accuracy,
        "precision": model_precision,
        "recall": model_recall,
        "f1_score": model_f1_score,
             }

In [5]:
train_df = pd.read_csv("../datasets/nlp-getting-started/train.csv")
test_df = pd.read_csv("../datasets/nlp-getting-started/test.csv")
train_df.head(), test_df.head()

(   id keyword location                                               text  \
 0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
 1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
 2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
 3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
 4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   
 
    target  
 0       1  
 1       1  
 2       1  
 3       1  
 4       1  ,
    id keyword location                                               text
 0   0     NaN      NaN                 Just happened a terrible car crash
 1   2     NaN      NaN  Heard about #earthquake is different cities, s...
 2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
 3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
 4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan)

In [6]:
train_data_shuffled = train_df.sample(frac=1, random_state=273)
test_data_shuffled = test_df.sample(frac=1, random_state=273)
train_data_shuffled, test_data_shuffled

(         id                keyword              location  \
 4678   6649              landslide  Melbourne, Australia   
 3002   4313           dust%20storm                   NaN   
 4321   6135                 hijack            Houston TX   
 1251   1807  buildings%20on%20fire                    UK   
 917    1327                 bloody                   AUS   
 ...     ...                    ...                   ...   
 7069  10125               upheaval      IG/SC:bjfordiani   
 2835   4079              displaced           Oakland, CA   
 4310   6119               hellfire                   NaN   
 4378   6219               hijacker           California    
 5871   8388                   ruin                   NaN   
 
                                                    text  target  
 4678                 @kemal_atlay caught in a landslide       1  
 3002  || So.... I just watched the trailed for The D...       0  
 4321  Tension In Bayelsa As Patience Jonathan Plans ...       1 

In [7]:
train_data_shuffled.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
len(train_data_shuffled), len(test_data_shuffled)

(7613, 3263)

In [9]:
# splitting data into train and validation data
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_data_shuffled["text"].to_numpy(),
                                                                            train_data_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=273)

In [10]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

In [11]:
train_sentences[:10], train_labels[:10]

(array(['For those impacted by the #CalWildfires here are some great recovery tips to help you in the aftermath http://t.co/wwxbGuBww5',
        "HEY LOOK!!!  Kash's Foundation Live for Today got blown up on People Magazine's website!!  \n\nTodd Blake... http://t.co/2Fenu1SYu6",
        'Three-alarm fire destroys two residential buildings a car in Manchester N.H. on Sunday afternoon http://t.co/rVkyj3YUVK',
        ".@jimmyfallon I crushed squirrel bones with a mortar and pestle for my school's bio dept. not really sure why #WorstSummerJob",
        "The Next Financial Crash. 'The Writing is on the Wall'. Don't Say 'You Weren't Warned' http://t.co/H7lDx29aba",
        'Walmart is taking steps to keep children safe in hot vehicles. Take a look at the innovative car seat here! http://t.co/z3nEvGlUFm',
        'Demolition Means Progress: Flint Michigan and the Fate of the American Metropolis Highsmith https://t.co/ZvoBMDxHGP',
        "Top story: @ViralSpell: 'Couple spend wedding day fee

### Tokenization vs Embeddings
Tokenisation is the process of converting, assigning a token(a, an, tensorflow) to numbers ( 0, 1,2).
Embedding is a represntation of relationships between tokens/words.

In [12]:
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=None,
                                  standardize="lower_and_strip_punctuation",
                                  split="whitespace",
                                  ngrams=None,
                                  output_mode="int",
                                  output_sequence_length=None,
                                  # pad_to_max_tokens=True
                                  )

2022-01-26 14:30:08.609164: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-26 14:30:08.664671: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-26 14:30:08.665469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-26 14:30:08.667203: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [13]:
sum([len(i.split()) for i in train_sentences])/len(train_sentences)
# average length of a sentence

14.93212669683258

In [14]:
max_vocab_length = 10000
max_length = 20 # can be tuned based on the average number of words in a tweet


text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_vocab_length,
                                                    output_mode="int",
                                                    output_sequence_length=max_length,)
text_vectorizer.adapt(train_sentences)

In [15]:
text_vectorizer.adapt(train_sentences)

In [16]:
text_vectorizer(["this is a sample sentence, hope this is converted into a vectorized format"])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  19,    9,    3, 8824,    1,  241,   19,    9,    1,   67,    3,
           1,    1,    0,    0,    0,    0,    0,    0,    0]])>

In [17]:
text_vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'be',
 'are',
 'was',
 'have',
 'like',
 'as',
 'up',
 'me',
 'just',
 'so',
 'but',
 'not',
 'amp',
 'your',
 'im',
 'out',
 'its',
 'will',
 'no',
 'an',
 'after',
 'has',
 'fire',
 'all',
 'when',
 'if',
 'we',
 'get',
 'about',
 'now',
 'new',
 'via',
 'more',
 'what',
 'dont',
 'or',
 'one',
 'been',
 'people',
 'they',
 'how',
 'over',
 'news',
 'he',
 'who',
 'us',
 'into',
 'do',
 'video',
 'were',
 'emergency',
 'disaster',
 '2',
 'can',
 'there',
 'than',
 'her',
 'police',
 'some',
 'still',
 'would',
 'crash',
 'his',
 'body',
 'off',
 'burning',
 'back',
 'got',
 'why',
 'know',
 'california',
 'buildings',
 'them',
 'had',
 'time',
 'suicide',
 'storm',
 'man',
 'cant',
 'see',
 'bomb',
 'going',
 'nuclear',
 'world',
 'two',
 'rt',
 'first',
 'day',
 'youtube',
 'our',
 'love',
 'dead',
 '3',
 'their',
 'train',
 '

In [18]:
words_vocab = text_vectorizer.get_vocabulary()
words_vocab[-5:]

['pajamas', 'painthey', 'painful', 'paine', 'paging']

In [19]:
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                            output_dim=128,
                                            input_length=max_length,
                                            )
embedding

<keras.layers.embeddings.Embedding at 0x7f6630103400>

## Base Model in NaiveBayes

In [20]:
# Base Line, built using non DL model
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [21]:
base_score = model_0.score(val_sentences, val_labels)
base_score

0.8044619422572179

In [22]:
predictions = model_0.predict(val_sentences)
predictions[:10]

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 1])

In [34]:
model_0_results = calculate_results_score(val_labels, predictions)

[calculate_results_score] the accuracy is :: 80.4461942257218
[calculate_results_score] The precision is : 0.815353279638672
[calculate_results_score] The recall is : 0.8044619422572179
[calculate_results_score] The f1 score is : 0.7975368757761941


## Base Deep NN Model

In [24]:
inputs = tf.keras.layers.Input(shape=(1,), dtype="string", name="input_layer")
# Convert the raw words into numbers
vectorization_layer = text_vectorizer(inputs)
# Convert the numerical data into embeddings based on thier sequence/weights
embedding_layer = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                            output_dim=128,
                                            input_length=max_length,
                                            )(vectorization_layer)
pooling_layer = tf.keras.layers.GlobalAveragePooling1D()(embedding_layer)
outputs=tf.keras.layers.Dense(units=1, activation="sigmoid")(pooling_layer)
model_1 = tf.keras.Model(inputs=inputs, outputs=outputs, name="BaseDenseModel")
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
tf.keras.utils.plot_model(model=model_1, show_shapes=True)
model_1.summary()

Model: "BaseDenseModel"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 20, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129


In [25]:
model_1.fit(x=train_sentences,
            y=train_labels,
            epochs=5,
            validation_data=(val_sentences, val_labels),
            callbacks=[create_model_checkpoint_callback("nlp_exp", "base_deep_model"),
                       create_tensorboard_callback("nlp_exp", "base_deep_model")])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f66b898e040>

In [27]:
model_1_pred_probs = model_1.predict(val_sentences)

In [30]:
model_1_pred_probs.shape, model_1_pred_probs[0]

((762, 1), array([0.09339862], dtype=float32))

In [32]:
model_1_predictions = tf.squeeze(tf.round(model_1_pred_probs))
model_1_predictions[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 1., 0., 0., 1., 1., 1.], dtype=float32)>

In [33]:
model_1_results = calculate_results_score(val_labels, model_1_predictions)
model_1_results

[calculate_results_score] the accuracy is :: 80.4461942257218
[calculate_results_score] The precision is : 0.8035930167819932
[calculate_results_score] The recall is : 0.8044619422572179
[calculate_results_score] The f1 score is : 0.8035135637410098


{'accuracy': 80.4461942257218,
 'precision': 0.8035930167819932,
 'recall': 0.8044619422572179,
 'f1_score': 0.8035135637410098}

In [35]:
model_0_results

{'accuracy': 80.4461942257218,
 'precision': 0.815353279638672,
 'recall': 0.8044619422572179,
 'f1_score': 0.7975368757761941}

In [37]:
# visualising the vocab data
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10], 

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [41]:
embedding_layer_weights = model_1.layers[2].get_weights()[0]
embedding_layer_weights.shape

(10000, 128)

In [48]:
import io
def create_word_embedding_files(words_in_vocab, weights):
    vector_file_path = os.path.join("model_logs", "nlp_exp", "base_deep_model", 'vectors.tsv')
    metadata_file_path = os.path.join("model_logs", "nlp_exp", "base_deep_model", 'metadata.tsv')
    with open(vector_file_path, "w", encoding="utf-8") as out_v:
        with open(metadata_file_path, "w", encoding="utf-8") as out_m:
            for index, word in enumerate(words_in_vocab):
                if index == 0:
                    continue  # skip 0, it's padding.
                vec = weights[index]
                out_v.write('\t'.join([str(x) for x in vec]) + "\n")
                out_m.write(word + "\n")

In [49]:
create_word_embedding_files(words_in_vocab, embedding_layer_weights)