<a href="https://colab.research.google.com/github/Codehackerone/NLU-World-Bank/blob/main/Learning_from_DIsaster_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Check for GPU
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-7585b21f-2657-ab93-c499-730cde8c35be)


## Imports + helper functions

In [None]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [None]:
def create_tensorboard_callback(dir_name, experiment_name):
  """Build a TensorBoard callback that logs into a timestamped run directory.

  The log directory is "<dir_name>/<experiment_name>/<YYYYmmdd-HHMMSS>", where
  the timestamp is taken from `datetime.datetime.now()` at call time, so every
  call gets its own sub-directory. The chosen path is printed.

  Args:
    dir_name: parent directory for all experiment logs (string).
    experiment_name: sub-directory name for this experiment (string).

  Returns:
    A `tf.keras.callbacks.TensorBoard` instance configured with that log_dir.
  """
  run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  log_dir = "/".join([dir_name, experiment_name, run_stamp])
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [None]:
def plot_loss_curves(history):
  """Plot training vs. validation curves for loss and accuracy.

  The loss curves are drawn on whatever matplotlib figure is current when the
  function is called; the accuracy curves go on a newly created figure.

  Args:
    history: object exposing a `.history` dict with the keys 'loss',
      'val_loss', 'accuracy' and 'val_accuracy', each a per-epoch sequence
      (e.g. the value returned by a Keras `fit` call).
  """
  recorded = history.history
  # Read every series up front so a missing key fails before anything is drawn.
  curves = {
      "loss": (recorded['loss'], recorded['val_loss']),
      "accuracy": (recorded['accuracy'], recorded['val_accuracy']),
  }
  epochs = range(len(recorded['loss']))

  for panel, (metric, title) in enumerate((("loss", "Loss"), ("accuracy", "Accuracy"))):
    if panel > 0:
      # Only the second panel opens a fresh figure.
      plt.figure()
    train_values, val_values = curves[metric]
    plt.plot(epochs, train_values, label=f'training_{metric}')
    plt.plot(epochs, val_values, label=f'val_{metric}')
    plt.title(title)
    plt.xlabel('Epochs')
    plt.legend()

In [None]:
def compare_historys(original_history, new_history, initial_epochs=5):
    """Plot two consecutive training runs as one continuous set of curves.

    The per-epoch metrics of `new_history` are appended to those of
    `original_history`, then drawn in a two-row figure: accuracy on top, loss
    below. Each panel also gets a vertical "Start Fine Tuning" line at
    x = initial_epochs - 1.

    Args:
        original_history: object whose `.history` dict holds 'accuracy',
            'loss', 'val_accuracy' and 'val_loss' for the first run.
        new_history: same structure for the follow-up run.
        initial_epochs: number of epochs in the first run; positions the
            vertical marker.
    """
    def _stitched(metric):
        # Per-epoch values of the first run followed by those of the second.
        return original_history.history[metric] + new_history.history[metric]

    total_acc = _stitched("accuracy")
    total_loss = _stitched("loss")
    total_val_acc = _stitched("val_accuracy")
    total_val_loss = _stitched("val_loss")

    boundary = initial_epochs - 1
    # (subplot row, training series, validation series,
    #  training label, validation label, legend location, panel title)
    panels = (
        (1, total_acc, total_val_acc,
         'Training Accuracy', 'Validation Accuracy',
         'lower right', 'Training and Validation Accuracy'),
        (2, total_loss, total_val_loss,
         'Training Loss', 'Validation Loss',
         'upper right', 'Training and Validation Loss'),
    )

    plt.figure(figsize=(8, 8))
    for row, train_series, val_series, train_label, val_label, legend_loc, title in panels:
        plt.subplot(2, 1, row)
        plt.plot(train_series, label=train_label)
        plt.plot(val_series, label=val_label)
        # Span the marker over the y-range the two curves just produced.
        y_limits = plt.ylim()
        plt.plot([boundary, boundary], y_limits, label='Start Fine Tuning')
        plt.legend(loc=legend_loc)
        plt.title(title)
    plt.xlabel('epoch')
    plt.show()

## Visualising a Text dataset

In [None]:
# Single place to change where the CSVs live (previously repeated in each path literal).
# NOTE: this location is under /content/drive, i.e. a Colab Google Drive mount.
# No mount step is visible in this notebook, so on a fresh runtime Drive must be
# mounted first (or DATA_DIR pointed at a local copy of the files).
DATA_DIR = "/content/drive/MyDrive/Tf_exercises/NLP"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
#shuffle dataset
train_df_shuffled = train_df.sample(frac=1, random_state=101)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
5475,7813,quarantine,"ÌÏT: 40.707762,-74.014213",Aannnnd - 'Reddit Will Now Quarantine Offensiv...,0
6387,9128,suicide%20bomb,Na waffi,Pic of 16yr old PKK suicide bomber who detonat...,1
4343,6167,hijack,"Near Richmond, VA",Another Mac vuln!\n\nhttps://t.co/OxXRnaB8Un,0
4524,6429,hurricane,,@Hurricane_Dolce no prob,1
2771,3983,devastation,,70 Years After Atomic Bombs Japan Still Strugg...,1


In [None]:
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(test_df)}")

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [None]:
import random
# Pick a start offset that always leaves room for a window of five rows.
random_index = random.randint(0, len(train_df)-5)
sample_rows = train_df_shuffled[["text", "target"]][random_index:random_index+5]

# itertuples() yields (index, text, target); the index is not needed here.
for _, text, target in sample_rows.itertuples():
  verdict = "(real disaster)" if target > 0 else "(not real disaster)"
  print(f"Target: {target}", verdict)
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 1 (real disaster)
Text:
#Japan marks 70th anniversary of #Hiroshima atomic bombing (from @AP) http://t.co/qREInWg0GS

---

Target: 1 (real disaster)
Text:
Obama Declares Disaster for Typhoon-Devastated Saipan: Obama signs disaster declaration for Northern Marians a... http://t.co/1i19CuOv7L

---

Target: 1 (real disaster)
Text:
Benzema increasingly looks to be a casualty of Benitez's new look squad. Arsenal bound? 50-50 chance I think

---

Target: 0 (not real disaster)
Text:
Do you ever just want to obliterate an entire species off the face of the earth? I vote for mosquitoes

---

Target: 0 (not real disaster)
Text:
THIS IS RELAXING! #thunder #SoothMySlumber #WATERMELOANN #populardemand w/ @Soak... (Vine by @thewebbeffect19) https://t.co/F0QIRS5lJA

---



## Splitting Data into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=101)

In [None]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [None]:
train_sentences[:10], train_labels[:10]

(array(["@RachelRofe tired it' 5:36 am. Woke up to a thunderstorm lightning and rain. How are you?",
        "Best windows torrent client? was recommended Deluge but it looks like it was written 10 years ago with java swing and 'uses' worse",
        "#Colorado #Avalanche Men's Official Colorado Avalanche Reebok T-Shirt XL Blue 100% Cotton http://t.co/ZNSvsTGwx3 #NHL #Hockey",
        '@BuffoonMike I knew mo not doing much would bite us he was influenced by that shitty staff and injuries are not acquisitions',
        'Being bestfriends with your high school crush???? @yourboy_shawn',
        'Skinny Jeans are Hazardous for Your Health! #socialnews http://t.co/LTMa9xQXpx',
        "Robert Ballew's log statements are always at the FATAL level.",
        'My precious olive tree lost this battle...another crazy windstorm in #yyc! @weathernetwork http://t.co/N00DVXEga2',
        'Cabin Fever 2 flames https://t.co/yXnagsqvBM',
        '.@NorwayMFA #Bahrain police had previously died in a ro

## Converting text to numbers

### Text Vectorization/Tokenization

In [None]:
# TextVectorization is exposed directly under tensorflow.keras.layers in recent
# TensorFlow releases (2.6+); the old `layers.experimental.preprocessing` path is
# deprecated and no longer available in newer versions.
from tensorflow.keras.layers import TextVectorization

# Reference instantiation with every argument spelled out for illustration.
# This instance is replaced by the tuned vectorizer created a few cells below.
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (None = no cap)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None # length to pad/truncate sequences to (None = not fixed)
                                    )

In [None]:
# Avg no of tokens(words) in training set
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [None]:
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [None]:
text_vectorizer.adapt(train_sentences)

In [None]:
sample_sentence = "I love Tensorflow!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  8, 107,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [None]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
@afterShock_DeLo scuf ps live and the game... cya      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[6213, 4636, 2768,  191,    7,    2,  400,    1,    0,    0,    0,
           0,    0,    0,    0]])>

In [None]:
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab)

10000

In [None]:
# Top 5 common words
words_in_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

In [None]:
# 5 Least Common
words_in_vocab[-5:]

['palm', 'palinfoen', 'palestinian\x89Û', 'paleface', 'pale']

### Embedding Layer

In [None]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = max_vocab_length,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             input_length=max_length)
embedding

<keras.layers.embeddings.Embedding at 0x7f66f008c8d0>

In [None]:
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
Why does it say Silas sliced in that headlinelike someone chopped him up like a piece of cabbage????????????????????????  #GH      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.01507762, -0.00683278, -0.02583267, ..., -0.01785116,
          0.01210141,  0.02151849],
        [ 0.02754733, -0.02557244,  0.01697358, ..., -0.0491358 ,
         -0.02985781,  0.00187672],
        [ 0.03933248, -0.01777107, -0.03752483, ..., -0.00127109,
         -0.04870188,  0.04951696],
        ...,
        [-0.04487647, -0.04093845, -0.04499679, ...,  0.04410474,
         -0.04311071,  0.04430291],
        [-0.035471  ,  0.01975041, -0.03188761, ...,  0.02796276,
         -0.01142106,  0.0133114 ],
        [-0.03093685, -0.03192439, -0.01111617, ...,  0.02927883,
          0.03619931,  0.03194046]]], dtype=float32)>

## Building Models

### Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
model_0 = Pipeline([
                      ("tfidf", TfidfVectorizer()),
                      ("clf", MultinomialNB())                      
                    ])

In [None]:
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [None]:
# accuracy
baseline_score = model_0.score(val_sentences, val_labels)
baseline_score

0.8083989501312336

In [None]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """Summarise classification quality as a dict of four metrics.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned element-wise with `y_true`.

  Returns:
    dict with keys, in this order:
      "accuracy"  - `accuracy_score` multiplied by 100 (a percentage),
      "precision", "recall", "f1" - values from
      `precision_recall_fscore_support(..., average="weighted")`, not scaled.
  """
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth element (support) is not reported.
  precision, recall, f1, _support = precision_recall_fscore_support(
      y_true, y_pred, average="weighted")
  return {
      "accuracy": accuracy_pct,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

In [None]:
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 80.83989501312337,
 'f1': 0.8034518553856209,
 'precision': 0.8151819569724925,
 'recall': 0.8083989501312336}

### Model 1: Simple Dense Model

In [None]:
# Directory to save Tensorboard logs
SAVE_DIR = "model_logs"

In [None]:
from tensorflow.keras import layers

# Give this model its own embedding layer instead of the notebook-global
# `embedding` instance. A Keras layer owns its weights, so using one instance in
# several models makes them all train the same embedding matrix and the
# experiments stop being independent.
model_1_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="model_1_embedding")

inputs = layers.Input(shape=(1,), dtype="string")  # one raw string per example
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_1_embedding(x)  # token ids -> dense vectors
x = layers.GlobalAveragePooling1D()(x)  # average over the token axis to lower the dimensionality of the embedding
outputs = layers.Dense(1, activation='sigmoid')(x)  # single probability for the binary target
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [None]:
model_1.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d_4   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [None]:
model_1_history = model_1.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(dir_name=SAVE_DIR,
                                           experiment_name = "simple_dense_model")
                                          ]
)

Saving TensorBoard log files to: model_logs/simple_dense_model/20220625-164821
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_1.evaluate(val_sentences, val_labels)



[0.5074189901351929, 0.7913385629653931]

In [None]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[:10]

array([[0.53456867],
       [0.99796355],
       [0.9361799 ],
       [0.2596228 ],
       [0.5398504 ],
       [0.70589226],
       [0.9871221 ],
       [0.7675339 ],
       [0.8658932 ],
       [0.9352523 ]], dtype=float32)

In [None]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)>

In [None]:
model_1_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_1_preds)
model_1_results

{'accuracy': 79.13385826771653,
 'f1': 0.7901501572052335,
 'precision': 0.7904422174500914,
 'recall': 0.7913385826771654}

In [None]:
import numpy as np
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

In [None]:
# helper function to compare our baseline results
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """Print each baseline metric next to the new model's value and their difference.

  Args:
    baseline_results: dict mapping metric name to the baseline value.
    new_model_results: dict holding a value for every key of `baseline_results`.

  One line is printed per baseline key, with values formatted to two decimals;
  the difference is new minus baseline.
  """
  for metric in baseline_results:
    baseline_value = baseline_results[metric]
    new_value = new_model_results[metric]
    delta = new_value - baseline_value
    print(f"Baseline {metric}: {baseline_value:.2f}, New {metric}: {new_value:.2f}, Difference: {delta:.2f}")

compare_baseline_to_new_results(baseline_results=baseline_results, 
                                new_model_results=model_1_results)

Baseline accuracy: 80.84, New accuracy: 79.13, Difference: -1.71
Baseline precision: 0.82, New precision: 0.79, Difference: -0.02
Baseline recall: 0.81, New recall: 0.79, Difference: -0.02
Baseline f1: 0.80, New f1: 0.79, Difference: -0.01


### Model 2: LSTM

In [None]:
# Use a fresh embedding layer for this experiment. Reusing the notebook-global
# `embedding` instance would start the LSTM from weights that model_1 already
# fitted (a Keras layer instance shares its weights across every model that
# calls it), which confounds the comparison between architectures.
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="model_2_embedding")

inputs = layers.Input(shape=(1,), dtype="string")  # one raw string per example
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_2_embedding(x)  # token ids -> dense vectors
x = layers.LSTM(64)(x)  # returns only the final state (64 units)
outputs = layers.Dense(1, activation="sigmoid")(x)  # single probability for the binary target
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

In [None]:
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [None]:
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 
                                                                     "LSTM")])

Saving TensorBoard log files to: model_logs/LSTM/20220625-170829
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs.shape, model_2_pred_probs[:10]

((762, 1), array([[0.7762597 ],
        [0.9998654 ],
        [0.9987645 ],
        [0.9698967 ],
        [0.83919984],
        [0.9980527 ],
        [0.9997305 ],
        [0.88703364],
        [0.90217984],
        [0.99940324]], dtype=float32))

In [None]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [None]:
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results

{'accuracy': 75.98425196850394,
 'f1': 0.7600770472364051,
 'precision': 0.7603774031589875,
 'recall': 0.7598425196850394}

In [None]:
compare_baseline_to_new_results(baseline_results, model_2_results)

Baseline accuracy: 80.84, New accuracy: 75.98, Difference: -4.86
Baseline precision: 0.82, New precision: 0.76, Difference: -0.05
Baseline recall: 0.81, New recall: 0.76, Difference: -0.05
Baseline f1: 0.80, New f1: 0.76, Difference: -0.04


### Model 3: GRU

In [None]:
from tensorflow.keras import layers
# Use a fresh embedding layer for this experiment. Reusing the notebook-global
# `embedding` instance would start the GRU from weights already fitted by the
# earlier models (a Keras layer instance shares its weights across every model
# that calls it), which confounds the comparison between architectures.
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="model_3_embedding")

inputs = layers.Input(shape=(1,), dtype="string")  # one raw string per example
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_3_embedding(x)  # token ids -> dense vectors
x = layers.GRU(64)(x)  # returns only the final state (64 units)
outputs = layers.Dense(1, activation="sigmoid")(x)  # single probability for the binary target
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

In [None]:
model_3.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________

In [None]:
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "GRU")])

Saving TensorBoard log files to: model_logs/GRU/20220625-171123
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs.shape, model_3_pred_probs[:10]

((762, 1), array([[0.8720886 ],
        [0.9998554 ],
        [0.99952567],
        [0.99891603],
        [0.9859276 ],
        [0.9987457 ],
        [0.999549  ],
        [0.8453394 ],
        [0.978635  ],
        [0.999516  ]], dtype=float32))

In [None]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [None]:
model_3_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_3_preds)
model_3_results

{'accuracy': 73.49081364829397,
 'f1': 0.7361959050142429,
 'precision': 0.7404018942076686,
 'recall': 0.7349081364829396}

In [None]:
compare_baseline_to_new_results(baseline_results, model_3_results)

Baseline accuracy: 80.84, New accuracy: 73.49, Difference: -7.35
Baseline precision: 0.82, New precision: 0.74, Difference: -0.07
Baseline recall: 0.81, New recall: 0.73, Difference: -0.07
Baseline f1: 0.80, New f1: 0.74, Difference: -0.07


### Model 4: Bidirectional RNN

In [None]:
from tensorflow.keras import layers
# Use a fresh embedding layer for this experiment. Reusing the notebook-global
# `embedding` instance would start this model from weights already fitted by the
# earlier models (a Keras layer instance shares its weights across every model
# that calls it), which confounds the comparison between architectures.
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="model_4_embedding")

inputs = layers.Input(shape=(1,), dtype="string")  # one raw string per example
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_4_embedding(x)  # token ids -> dense vectors
x = layers.Bidirectional(layers.LSTM(64))(x)  # forward + backward LSTM outputs combined by the wrapper
outputs = layers.Dense(1, activation="sigmoid")(x)  # single probability for the binary target
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")

In [None]:
model_4.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_4.summary()

Model: "model_4_Bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense_9 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,3

In [None]:
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "bidirectional_RNN")])

Saving TensorBoard log files to: model_logs/bidirectional_RNN/20220625-171311
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]

array([[0.75901735],
       [0.9999851 ],
       [0.9977162 ],
       [0.99924564],
       [0.70407075],
       [0.9999291 ],
       [0.99998164],
       [0.8955765 ],
       [0.9472857 ],
       [0.99996185]], dtype=float32)

In [None]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [None]:
model_4_results = calculate_results(val_labels, model_4_preds)
model_4_results

{'accuracy': 75.06561679790026,
 'f1': 0.7513552906040256,
 'precision': 0.752717292375734,
 'recall': 0.7506561679790026}

In [None]:
compare_baseline_to_new_results(baseline_results, model_4_results)

Baseline accuracy: 80.84, New accuracy: 75.07, Difference: -5.77
Baseline precision: 0.82, New precision: 0.75, Difference: -0.06
Baseline recall: 0.81, New recall: 0.75, Difference: -0.06
Baseline f1: 0.80, New f1: 0.75, Difference: -0.05


### Model 5: Conv1D

In [None]:
embedding_test = embedding(text_vectorizer(["this is a test sentence"]))
conv_1d = layers.Conv1D(filters=32, kernel_size=5, activation="relu")
conv_1d_output = conv_1d(embedding_test)
max_pool = layers.GlobalMaxPool1D() 
max_pool_output = max_pool(conv_1d_output)
embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 11, 32]), TensorShape([1, 32]))

In [None]:
embedding_test[:1], conv_1d_output[:1], max_pool_output[:1]

(<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
 array([[[ 0.0015951 , -0.00167328,  0.03172447, ..., -0.02951322,
          -0.04517956,  0.1053958 ],
         [-0.03360718,  0.05941894,  0.04039135, ..., -0.02275302,
          -0.0149795 , -0.03069239],
         [-0.03469492, -0.00601064,  0.00723076, ...,  0.02648267,
           0.02052273,  0.07495757],
         ...,
         [ 0.04814656,  0.02189989,  0.00534187, ..., -0.01591825,
          -0.00617246,  0.02720873],
         [ 0.04814656,  0.02189989,  0.00534187, ..., -0.01591825,
          -0.00617246,  0.02720873],
         [ 0.04814656,  0.02189989,  0.00534187, ..., -0.01591825,
          -0.00617246,  0.02720873]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 11, 32), dtype=float32, numpy=
 array([[[0.00940246, 0.0037363 , 0.04545581, 0.03659646, 0.        ,
          0.01323595, 0.        , 0.        , 0.06671632, 0.06803234,
          0.        , 0.        , 0.        , 0.03016756, 0.02546688,
          0.01627643, 0.

In [None]:
from tensorflow.keras import layers
# Use a fresh embedding layer for this experiment. Reusing the notebook-global
# `embedding` instance would start this model from weights already fitted by the
# earlier models (a Keras layer instance shares its weights across every model
# that calls it), which confounds the comparison between architectures.
model_5_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="model_5_embedding")

inputs = layers.Input(shape=(1,), dtype="string")  # one raw string per example
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_5_embedding(x)  # token ids -> dense vectors
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)  # slide 32 filters over windows of 5 tokens
x = layers.GlobalMaxPool1D()(x)  # keep each filter's maximum over the sequence

outputs = layers.Dense(1, activation="sigmoid")(x)  # single probability for the binary target
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")


model_5.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_5.summary()

Model: "model_5_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 32)            20512     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 32)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_10 (Dense)            (None, 1)              

In [None]:
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 
                                                                     "Conv1D")])

Saving TensorBoard log files to: model_logs/Conv1D/20220625-171944
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:10]

array([[0.9676838 ],
       [0.99999607],
       [0.99726   ],
       [0.9999068 ],
       [0.99679923],
       [0.9999813 ],
       [0.99998   ],
       [0.08603763],
       [0.98503464],
       [0.9999999 ]], dtype=float32)

In [None]:
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 1., 1., 1., 1., 0., 1., 1.], dtype=float32)>

In [None]:
model_5_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_5_preds)
model_5_results

{'accuracy': 73.75328083989501,
 'f1': 0.7381076468679586,
 'precision': 0.7390489878513167,
 'recall': 0.7375328083989501}

In [None]:
compare_baseline_to_new_results(baseline_results, model_5_results)

Baseline accuracy: 80.84, New accuracy: 73.75, Difference: -7.09
Baseline precision: 0.82, New precision: 0.74, Difference: -0.08
Baseline recall: 0.81, New recall: 0.74, Difference: -0.07
Baseline f1: 0.80, New f1: 0.74, Difference: -0.07


### Model 6: TensorFlow Hub Pretrained Sentence Encoder