# NLP fundametals
Steps involved :
 - Convert the words to numercial sequences
 - Create the word-numerical sequence matrix, which shows the word vector dense matrix
 - Build the RNN network (LSTM)
 - Compile, calssify

In [1]:
import os
import tensorflow as tf
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow_hub as hub
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split

In [2]:
def create_tensorboard_callback(experiment_name, model_name):
    """
    Create an tensorboard callback.
    """
    return tf.keras.callbacks.TensorBoard(log_dir=os.path.join("model_logs", experiment_name, model_name))

In [3]:
def create_model_checkpoint_callback(experiment_name, model_name):
    """
     Create an Model Checkpoint callback
    """
    return tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join("model_checkpoints", experiment_name, model_name+".ckpt"),
                                              save_weights_only=True, monitor="val_acc", 
                                              save_best_only=True)

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results_score(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred)*100
    print(f"[calculate_results_score] the accuracy is :: {model_accuracy}")
    model_precision, model_recall, model_f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    print(f"[calculate_results_score] The precision is : {model_precision}")
    print(f"[calculate_results_score] The recall is : {model_recall}")
    print(f"[calculate_results_score] The f1 score is : {model_f1_score}")
    return {
        "accuracy": model_accuracy,
        "precision": model_precision,
        "recall": model_recall,
        "f1_score": model_f1_score,
             }

In [5]:
train_df = pd.read_csv("../datasets/nlp-getting-started/train.csv")
test_df = pd.read_csv("../datasets/nlp-getting-started/test.csv")
train_df.head(), test_df.head()

(   id keyword location                                               text  \
 0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
 1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
 2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
 3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
 4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   
 
    target  
 0       1  
 1       1  
 2       1  
 3       1  
 4       1  ,
    id keyword location                                               text
 0   0     NaN      NaN                 Just happened a terrible car crash
 1   2     NaN      NaN  Heard about #earthquake is different cities, s...
 2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
 3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
 4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan)

In [6]:
train_data_shuffled = train_df.sample(frac=1, random_state=273)
test_data_shuffled = test_df.sample(frac=1, random_state=273)
train_data_shuffled, test_data_shuffled

(         id                keyword              location  \
 4678   6649              landslide  Melbourne, Australia   
 3002   4313           dust%20storm                   NaN   
 4321   6135                 hijack            Houston TX   
 1251   1807  buildings%20on%20fire                    UK   
 917    1327                 bloody                   AUS   
 ...     ...                    ...                   ...   
 7069  10125               upheaval      IG/SC:bjfordiani   
 2835   4079              displaced           Oakland, CA   
 4310   6119               hellfire                   NaN   
 4378   6219               hijacker           California    
 5871   8388                   ruin                   NaN   
 
                                                    text  target  
 4678                 @kemal_atlay caught in a landslide       1  
 3002  || So.... I just watched the trailed for The D...       0  
 4321  Tension In Bayelsa As Patience Jonathan Plans ...       1 

In [7]:
train_data_shuffled.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
len(train_data_shuffled), len(test_data_shuffled)

(7613, 3263)

In [9]:
# splitting data into train and validation data
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_data_shuffled["text"].to_numpy(),
                                                                            train_data_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=273)

In [10]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

In [11]:
train_sentences[:10], train_labels[:10]

(array(['For those impacted by the #CalWildfires here are some great recovery tips to help you in the aftermath http://t.co/wwxbGuBww5',
        "HEY LOOK!!!  Kash's Foundation Live for Today got blown up on People Magazine's website!!  \n\nTodd Blake... http://t.co/2Fenu1SYu6",
        'Three-alarm fire destroys two residential buildings a car in Manchester N.H. on Sunday afternoon http://t.co/rVkyj3YUVK',
        ".@jimmyfallon I crushed squirrel bones with a mortar and pestle for my school's bio dept. not really sure why #WorstSummerJob",
        "The Next Financial Crash. 'The Writing is on the Wall'. Don't Say 'You Weren't Warned' http://t.co/H7lDx29aba",
        'Walmart is taking steps to keep children safe in hot vehicles. Take a look at the innovative car seat here! http://t.co/z3nEvGlUFm',
        'Demolition Means Progress: Flint Michigan and the Fate of the American Metropolis Highsmith https://t.co/ZvoBMDxHGP',
        "Top story: @ViralSpell: 'Couple spend wedding day fee

### Tokenization vs Embeddings
Tokenisation is the process of converting, assigning a token(a, an, tensorflow) to numbers ( 0, 1,2).
Embedding is a represntation of relationships between tokens/words.

In [12]:
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=None,
                                  standardize="lower_and_strip_punctuation",
                                  split="whitespace",
                                  ngrams=None,
                                  output_mode="int",
                                  output_sequence_length=None,
                                  # pad_to_max_tokens=True
                                  )

2022-01-26 18:39:04.393207: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-01-26 18:39:04.393263: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: rocket
2022-01-26 18:39:04.393276: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: rocket
2022-01-26 18:39:04.393507: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.86.0
2022-01-26 18:39:04.393554: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.86.0
2022-01-26 18:39:04.393564: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.86.0
2022-01-26 18:39:04.394584: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operat

In [13]:
sum([len(i.split()) for i in train_sentences])/len(train_sentences)
# average length of a sentence

14.93212669683258

In [14]:
max_vocab_length = 10000
max_length = 20 # can be tuned based on the average number of words in a tweet


text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_vocab_length,
                                                    output_mode="int",
                                                    output_sequence_length=max_length,)
text_vectorizer.adapt(train_sentences)

In [15]:
text_vectorizer.adapt(train_sentences)

In [16]:
text_vectorizer(["this is a sample sentence, hope this is converted into a vectorized format"])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  19,    9,    3, 8824,    1,  241,   19,    9,    1,   67,    3,
           1,    1,    0,    0,    0,    0,    0,    0,    0]])>

In [17]:
text_vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'be',
 'are',
 'was',
 'have',
 'like',
 'as',
 'up',
 'me',
 'just',
 'so',
 'but',
 'not',
 'amp',
 'your',
 'im',
 'out',
 'its',
 'will',
 'no',
 'an',
 'after',
 'has',
 'fire',
 'all',
 'when',
 'if',
 'we',
 'get',
 'about',
 'now',
 'new',
 'via',
 'more',
 'what',
 'dont',
 'or',
 'one',
 'been',
 'people',
 'they',
 'how',
 'over',
 'news',
 'he',
 'who',
 'us',
 'into',
 'do',
 'video',
 'were',
 'emergency',
 'disaster',
 '2',
 'can',
 'there',
 'than',
 'her',
 'police',
 'some',
 'still',
 'would',
 'crash',
 'his',
 'body',
 'off',
 'burning',
 'back',
 'got',
 'why',
 'know',
 'california',
 'buildings',
 'them',
 'had',
 'time',
 'suicide',
 'storm',
 'man',
 'cant',
 'see',
 'bomb',
 'going',
 'nuclear',
 'world',
 'two',
 'rt',
 'first',
 'day',
 'youtube',
 'our',
 'love',
 'dead',
 '3',
 'their',
 'train',
 '

In [18]:
words_vocab = text_vectorizer.get_vocabulary()
words_vocab[-5:]

['pajamas', 'painthey', 'painful', 'paine', 'paging']

In [19]:
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                            output_dim=128,
                                            input_length=max_length,
                                            )
embedding

<keras.layers.embeddings.Embedding at 0x7f2ba746cfa0>

## Base Model in NaiveBayes

In [20]:
# Base Line, built using non DL model
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [21]:
base_score = model_0.score(val_sentences, val_labels)
base_score

0.8044619422572179

In [22]:
predictions = model_0.predict(val_sentences)
predictions[:10]

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 1])

In [23]:
model_0_results = calculate_results_score(val_labels, predictions)

[calculate_results_score] the accuracy is :: 80.4461942257218
[calculate_results_score] The precision is : 0.815353279638672
[calculate_results_score] The recall is : 0.8044619422572179
[calculate_results_score] The f1 score is : 0.7975368757761941


## Base Deep NN Model

In [24]:
inputs = tf.keras.layers.Input(shape=(1,), dtype="string", name="input_layer")
# Convert the raw words into numbers
vectorization_layer = text_vectorizer(inputs)
# Convert the numerical data into embeddings based on thier sequence/weights
embedding_layer = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                            output_dim=128,
                                            input_length=max_length,
                                            )(vectorization_layer)
pooling_layer = tf.keras.layers.GlobalAveragePooling1D()(embedding_layer)
outputs=tf.keras.layers.Dense(units=1, activation="sigmoid")(pooling_layer)
model_1 = tf.keras.Model(inputs=inputs, outputs=outputs, name="BaseDenseModel")
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
tf.keras.utils.plot_model(model=model_1, show_shapes=True)
model_1.summary()

Model: "BaseDenseModel"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 20, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129


In [25]:
model_1.fit(x=train_sentences,
            y=train_labels,
            epochs=5,
            validation_data=(val_sentences, val_labels),
            callbacks=[create_model_checkpoint_callback("nlp_exp", "base_deep_model"),
                       create_tensorboard_callback("nlp_exp", "base_deep_model")])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f94f9870e20>

In [26]:
model_1_pred_probs = model_1.predict(val_sentences)

In [27]:
model_1_pred_probs.shape, model_1_pred_probs[0]

((762, 1), array([0.0884866], dtype=float32))

In [28]:
model_1_predictions = tf.squeeze(tf.round(model_1_pred_probs))
model_1_predictions[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 1., 0., 0., 1., 1., 1.], dtype=float32)>

In [29]:
model_1_results = calculate_results_score(val_labels, model_1_predictions)
model_1_results

[calculate_results_score] the accuracy is :: 81.10236220472441
[calculate_results_score] The precision is : 0.8105000201894885
[calculate_results_score] The recall is : 0.8110236220472441
[calculate_results_score] The f1 score is : 0.8094559672226677


{'accuracy': 81.10236220472441,
 'precision': 0.8105000201894885,
 'recall': 0.8110236220472441,
 'f1_score': 0.8094559672226677}

In [30]:
model_0_results

{'accuracy': 80.4461942257218,
 'precision': 0.815353279638672,
 'recall': 0.8044619422572179,
 'f1_score': 0.7975368757761941}

In [31]:
# visualising the vocab data
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10], 

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [32]:
embedding_layer_weights = model_1.layers[2].get_weights()[0]
embedding_layer_weights.shape

(10000, 128)

In [33]:
import io
def create_word_embedding_files(words_in_vocab, weights):
    vector_file_path = os.path.join("model_logs", "nlp_exp", "base_deep_model", 'vectors.tsv')
    metadata_file_path = os.path.join("model_logs", "nlp_exp", "base_deep_model", 'metadata.tsv')
    with open(vector_file_path, "w", encoding="utf-8") as out_v:
        with open(metadata_file_path, "w", encoding="utf-8") as out_m:
            for index, word in enumerate(words_in_vocab):
                if index == 0:
                    continue  # skip 0, it's padding.
                vec = weights[index]
                out_v.write('\t'.join([str(x) for x in vec]) + "\n")
                out_m.write(word + "\n")

In [34]:
create_word_embedding_files(words_in_vocab, embedding_layer_weights)

## LSTM : long short term memory (RNN)

In [47]:
tf.random.set_seed(273)
model_name =  "lstm_model"

inputs = tf.keras.layers.Input(shape=(1,), dtype="string", name="input_layer")
# Vectorize the data, i.e convert to numerical encoding
# Convert the raw words into numbers
vectorization_layer_2 = text_vectorizer(inputs)
# Convert the numerical data into embeddings based on thier sequence/weights
embedding_layer_2 = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                              output_dim=128,
                                              input_length=max_length,
                                              embeddings_initializer="uniform",
                                            )(vectorization_layer_2)
# RNN layer 
recurrent_layer_2_a = tf.keras.layers.LSTM(units=64, return_sequences=True)(embedding_layer_2)
recurrent_layer_2_b = tf.keras.layers.LSTM(units=64)(recurrent_layer_2_a)
# Dense Layer
# x = tf.keras.layers.Dense(units=64, activation="relu")(recurrent_layer_2_b)
outputs = tf.keras.layers.Dense(units=1, activation="sigmoid", name=model_name)(recurrent_layer_2_b)
model_2 = tf.keras.Model(inputs, outputs, name=model_name)

model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_2_history = model_2.fit(train_sentences, train_labels,
                              epochs=5, validation_data=(val_sentences, val_labels),
                              callbacks=[create_model_checkpoint_callback("nlp_exp", "base_rnn_model"),
                                         create_tensorboard_callback("nlp_exp", "base_rnn_model")])
model_2.summary()

Epoch 1/5


2022-01-26 16:04:44.651915: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8202


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "lstm_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_10 (Embedding)    (None, 20, 128)           1280000   
                                                                 
 lstm_15 (LSTM)              (None, 20, 64)            49408     
                                                                 
 lstm_16 (LSTM)              (None, 64)                33024     
                                                                 
 lstm_model (Dense)          (None, 1)                 65        
                

In [49]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]

array([[0.04516522],
       [0.12157019],
       [0.05206184],
       [0.02336265],
       [0.99973124],
       [0.02011085],
       [0.01728604],
       [0.9997726 ],
       [0.99965596],
       [0.99875176]], dtype=float32)

In [50]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_results = calculate_results_score(val_labels, model_2_preds)
model_2_results

[calculate_results_score] the accuracy is :: 74.2782152230971
[calculate_results_score] The precision is : 0.7456836284970436
[calculate_results_score] The recall is : 0.7427821522309711
[calculate_results_score] The f1 score is : 0.7437210550636849


{'accuracy': 74.2782152230971,
 'precision': 0.7456836284970436,
 'recall': 0.7427821522309711,
 'f1_score': 0.7437210550636849}

### GRU cell , grated recurrent unit

In [62]:
tf.random.set_seed(273)
# Model 3
inputs_3 = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name="input_layer")
#vectorization
vectorization_layer_3 = text_vectorizer(inputs_3)
# embedding
embedding_layer_3 = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                              output_dim=128,
                                              input_length=max_length,
                                              embeddings_initializer="uniform",
                                            )(vectorization_layer_3)
gru_layer_a = tf.keras.layers.GRU(units=64, return_sequences=True)(embedding_layer_3)
# lstm_layer = tf.keras.layers.LSTM(units=64, return_sequences=True)(gru_layer_a)
#print("lstm layer :: {}".format(lstm_layer.shape))
gru_layer_b = tf.keras.layers.GRU(units=64)(gru_layer_a)
# actviation_layer = tf.keras.layers.Dense(units=64, activation="relu", name="activation_layer")(gru_layer_b)
# pooling_layer = tf.keras.layers.GlobalAveragePooling1D()(gru_layer_a)
outputs_3 = tf.keras.layers.Dense(units=1, activation="sigmoid", name="output_layer")(gru_layer_b)
model_3 = tf.keras.Model(inputs=inputs_3, outputs=outputs_3, name="gru_model")
model_3.summary()

Model: "gru_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_21 (Embedding)    (None, 20, 128)           1280000   
                                                                 
 gru_17 (GRU)                (None, 20, 64)            37248     
                                                                 
 gru_18 (GRU)                (None, 64)                24960     
                                                                 
 output_layer (Dense)        (None, 1)                 65        
                                                         

In [63]:
model_3.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_3_history = model_3.fit(train_sentences,
                          train_labels,
                          validation_data=(val_sentences, val_labels),
                          epochs=5,
                          callbacks=[create_model_checkpoint_callback("nlp_exp", "gru_model"),
                                     create_tensorboard_callback("nlp_exp", "gru_model")])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [64]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]

array([[0.02808117],
       [0.2096241 ],
       [0.02792463],
       [0.00976873],
       [0.99871504],
       [0.01962819],
       [0.01452878],
       [0.9993586 ],
       [0.9980019 ],
       [0.9980489 ]], dtype=float32)

In [67]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 1., 0., 0., 1., 1., 1.], dtype=float32)>

In [68]:
calculate_results_score(val_labels, model_3_preds)

[calculate_results_score] the accuracy is :: 74.1469816272966
[calculate_results_score] The precision is : 0.743531793576729
[calculate_results_score] The recall is : 0.7414698162729659
[calculate_results_score] The f1 score is : 0.7422130025392907


{'accuracy': 74.1469816272966,
 'precision': 0.743531793576729,
 'recall': 0.7414698162729659,
 'f1_score': 0.7422130025392907}

### Model 4 : Bidriectional Rnn

In [21]:
tf.random.set_seed(273)
inputs_4 = tf.keras.layers.Input(shape=(1, ), dtype=tf.string, name="input_layer")
vectorization_layer_4 = text_vectorizer(inputs_4)
embedding_layer_4 = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                            output_dim=128,
                                            input_length=max_length,
                                            embeddings_initializer="uniform",                                            
                                           )(vectorization_layer_4)
embedding_layer_4_a = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(embedding_layer_4)
embedding_layer_4_b = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64))(embedding_layer_4_a)
outputs_4 = tf.keras.layers.Dense(units=1, activation="sigmoid", name="output_layer")(embedding_layer_4_b)

model_4 = tf.keras.Model(inputs=inputs_4, outputs=outputs_4, name="bidirectional_rnn")
model_4.summary()

Model: "bidirectional_rnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 20, 128)           1280000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 20, 128)          98816     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                                 

In [22]:
model_4.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [23]:
model_4.fit(train_sentences, train_labels, validation_data=(val_sentences, val_labels),
            epochs=5, callbacks=[create_model_checkpoint_callback("nlp_exp", "bidirectional_rnn"),
                                 create_tensorboard_callback("nlp_exp", "bidirectional_rnn")] )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2b5a8b6430>

In [24]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]

array([[0.01345822],
       [0.05456653],
       [0.02790022],
       [0.02769393],
       [0.99898446],
       [0.003234  ],
       [0.00366732],
       [0.9999149 ],
       [0.99889207],
       [0.96837026]], dtype=float32)

In [25]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 1., 0., 0., 1., 1., 1.], dtype=float32)>

In [26]:
calculate_results_score(val_labels, model_4_preds)

[calculate_results_score] the accuracy is :: 77.29658792650919
[calculate_results_score] The precision is : 0.7719745677537196
[calculate_results_score] The recall is : 0.7729658792650919
[calculate_results_score] The f1 score is : 0.7703610768366989


{'accuracy': 77.29658792650919,
 'precision': 0.7719745677537196,
 'recall': 0.7729658792650919,
 'f1_score': 0.7703610768366989}

### Model 5 : CNN for text data

In [31]:
tf.random.set_seed(273)

inputs_5 = tf.keras.layers.Input(shape=(1,),dtype=tf.string, name="input_layer")
vectorization_layer_5 = text_vectorizer(inputs_5)
embedding_layer_5 = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                            output_dim=128,
                                            input_length=max_length,
                                            embeddings_initializer="uniform",)(vectorization_layer_5)
conv_layer = tf.keras.layers.Conv1D(filters=64,
                                    kernel_size=5,
                                    strides=1,
                                    activation="relu",
                                    padding="valid", name="convolution_layer")(embedding_layer_5)
pooling_layer_5 = tf.keras.layers.GlobalMaxPool1D(name="max_pooling_layer")(conv_layer)
outputs_5 = tf.keras.layers.Dense(units=1, activation="sigmoid", name="output_layer")(pooling_layer_5)

model_5 = tf.keras.Model(inputs=inputs_5, outputs=outputs_5, name="cnn_text_classifier")
model_5.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_5.summary()


Model: "cnn_text_classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_6 (Embedding)     (None, 20, 128)           1280000   
                                                                 
 convolution_layer (Conv1D)  (None, 16, 64)            41024     
                                                                 
 max_pooling_layer (GlobalMa  (None, 64)               0         
 xPooling1D)                                                     
                                                                 
 output_layer (Dense)        (None, 1)         

In [32]:
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_model_checkpoint_callback("nlp_exp", "cnn_text_classfier"),
                                         create_tensorboard_callback("nlp_exp", "cnn_text_classfier"),
                                         ])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
model_5_preds_probs = model_5.predict(val_sentences)
model_5_preds_probs[:10]

array([[1.4668882e-02],
       [3.6546665e-01],
       [3.7992895e-03],
       [1.8065184e-02],
       [9.9976707e-01],
       [1.5287101e-02],
       [6.1517954e-04],
       [1.0000000e+00],
       [9.9959135e-01],
       [9.7741640e-01]], dtype=float32)

In [34]:
model_5_preds = tf.squeeze(tf.round(model_5_preds_probs))

In [35]:
model_5_results = calculate_results_score(val_labels, model_5_preds)
model_5_results

[calculate_results_score] the accuracy is :: 78.08398950131233
[calculate_results_score] The precision is : 0.7796719357517247
[calculate_results_score] The recall is : 0.7808398950131233
[calculate_results_score] The f1 score is : 0.7796474006201792


{'accuracy': 78.08398950131233,
 'precision': 0.7796719357517247,
 'recall': 0.7808398950131233,
 'f1_score': 0.7796474006201792}

### Model 6 : Transfer Learning 

In [36]:
import tensorflow_hub as hub

# pre trained embedding layer
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [37]:
embed_samples = embed(["Hey, thats a storm out there, red alert!",
                        "That food was suh a disaster, it sucked."])

In [38]:
embed_samples

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[ 0.07470109,  0.05868598, -0.0422262 , ..., -0.04076605,
         0.04728632,  0.02345333],
       [ 0.02443152, -0.02388306,  0.06637954, ..., -0.03705139,
         0.01797296, -0.07725956]], dtype=float32)>

In [40]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name="universal_sentence_encoder")

In [50]:
model_6 = tf.keras.Sequential(layers=[
    sentence_encoder_layer,
    tf.keras.layers.Dense(units=64, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="sigmoid", name="output_layer")
], name="pretrained_use_model")
model_6.summary()

Model: "pretrained_use_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 universal_sentence_encoder   (None, 512)              256797824 
 (KerasLayer)                                                    
                                                                 
 dense_2 (Dense)             (None, 64)                32832     
                                                                 
 output_layer (Dense)        (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [51]:
model_6.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [52]:
model_6.fit(train_sentences,
            train_labels,
            validation_data=(val_sentences, val_labels),
           epochs=5,
            callbacks=[create_model_checkpoint_callback("nlp_exp", "pretrained_use_model"),
                       create_tensorboard_callback("nlp_exp", "pretrained_use_model")])

Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






<keras.callbacks.History at 0x7f2b22cfecd0>

In [53]:
model_6_pred_prods = model_6.predict(val_sentences)
model_6_pred_prods[:10]

array([[0.11684072],
       [0.14856562],
       [0.17748833],
       [0.05267456],
       [0.9908987 ],
       [0.08122388],
       [0.20797008],
       [0.98698425],
       [0.9149903 ],
       [0.920263  ]], dtype=float32)

In [54]:
model_6_preds = tf.squeeze(tf.round(model_6_pred_prods))
model_6_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 1., 0., 0., 1., 1., 1.], dtype=float32)>

In [55]:
model_6_results = calculate_results_score(val_labels, model_6_preds)

[calculate_results_score] the accuracy is :: 82.1522309711286
[calculate_results_score] The precision is : 0.8208411032035219
[calculate_results_score] The recall is : 0.821522309711286
[calculate_results_score] The f1 score is : 0.8208097274071411
