## Helper Functions

In [None]:
def create_tensorboard_callback(dir_name, experiment_name):
  """
  Build a TensorBoard callback that logs into a timestamped directory.

  The log directory has the form:
    "dir_name/experiment_name/YYYYmmdd-HHMMSS"

  Args:
    dir_name: root directory for the TensorBoard log files
    experiment_name: name of the experiment sub-directory (e.g. efficientnet_model_1)

  Returns:
    The tf.keras.callbacks.TensorBoard instance configured with that log directory.
  """
  # Timestamp taken at call time, so each call gets its own run directory
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  log_dir = "/".join((dir_name, experiment_name, timestamp))

  callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
  print(f"Saving TensorBoard log files to: {log_dir}")
  return callback


import matplotlib.pyplot as plt

def plot_loss_curves(history):
  """
  Plots training/validation loss and training/validation accuracy as two separate plots.

  Args:
    history: TensorFlow model History object (see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History)
  """
  metrics = history.history
  train_loss, valid_loss = metrics['loss'], metrics['val_loss']
  train_acc, valid_acc = metrics['accuracy'], metrics['val_accuracy']

  # One x position per recorded epoch
  epoch_axis = range(len(metrics['loss']))

  # Loss curves are drawn on whichever figure is current
  plt.plot(epoch_axis, train_loss, label='training_loss')
  plt.plot(epoch_axis, valid_loss, label='val_loss')
  plt.title('Loss')
  plt.xlabel('Epochs')
  plt.legend()

  # Accuracy curves get a new figure of their own
  plt.figure()
  plt.plot(epoch_axis, train_acc, label='training_accuracy')
  plt.plot(epoch_axis, valid_acc, label='val_accuracy')
  plt.title('Accuracy')
  plt.xlabel('Epochs')
  plt.legend()

def compare_historys(original_history, new_history, initial_epochs=5):
    """
    Plots the metrics of two consecutive training runs as continuous curves.

    The series from new_history are appended to those of original_history and a
    vertical marker is drawn at x = initial_epochs - 1.

    Args:
      original_history: History object from the first training run
      new_history: History object from the continued training run
      initial_epochs: Number of epochs in original_history (positions the marker)
    """
    before = original_history.history
    after = new_history.history

    # Series recorded during the first run
    first_acc, first_loss = before["accuracy"], before["loss"]
    first_val_acc, first_val_loss = before["val_accuracy"], before["val_loss"]

    # First run followed by the continued run
    total_acc = first_acc + after["accuracy"]
    total_loss = first_loss + after["loss"]
    total_val_acc = first_val_acc + after["val_accuracy"]
    total_val_loss = first_val_loss + after["val_loss"]

    # x coordinates of the vertical marker line
    marker_x = [initial_epochs-1, initial_epochs-1]

    plt.figure(figsize=(8, 8))

    # Top panel: accuracy
    plt.subplot(2, 1, 1)
    plt.plot(total_acc, label='Training Accuracy')
    plt.plot(total_val_acc, label='Validation Accuracy')
    # ylim() is read after the curves are drawn so the marker spans the panel
    plt.plot(marker_x, plt.ylim(), label='Start Fine Tuning')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    # Bottom panel: loss
    plt.subplot(2, 1, 2)
    plt.plot(total_loss, label='Training Loss')
    plt.plot(total_val_loss, label='Validation Loss')
    plt.plot(marker_x, plt.ylim(), label='Start Fine Tuning')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.xlabel('epoch')
    plt.show()

# Create function to unzip a zipfile into current working directory
# (since we're going to be downloading and unzipping a few files)
import zipfile

def unzip_data(filename):
  """
  Unzips filename into the current working directory.

  Args:
    filename (str): a filepath to a target zip folder to be unzipped.
  """
  # The context manager closes the archive even if extraction raises.
  with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

# Walk through an image classification directory and find out how many files (images)
# are in each subdirectory.
import os

def walk_through_dir(dir_path):
  """
  Prints a one-line summary for dir_path and every directory below it.

  Args:
    dir_path (str): target directory

  Returns:
    Nothing; for each directory visited it prints:
      the number of immediate subdirectories
      the number of files (reported as images)
      the directory's path
  """
  for entry in os.walk(dir_path):
    dirpath, dirnames, filenames = entry
    print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")

# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Computes accuracy, precision, recall and f1 score for a set of predictions.

  Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy" (scaled by 100), "precision",
  "recall" and "f1" (the latter three use the "weighted" average).
  """
  # Accuracy is multiplied by 100; the other metrics are left as returned
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth value returned (support) is not needed
  precision, recall, f1, _support = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return {
      "accuracy": accuracy_pct,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

## Get text datasets

In [None]:
import tensorflow as tf
import itertools
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
import datetime


In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2024-12-29 09:58:42--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.215.207, 173.194.216.207, 173.194.217.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.215.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-12-29 09:58:42 (70.0 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
unzip_data("nlp_getting_started.zip")

## Visualizing the dataset

In [None]:
import pandas as pd

In [None]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [None]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
train_df.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [None]:
len(train_df), len(test_df)

(7613, 3263)

In [None]:
import random

# Show 5 consecutive rows of the shuffled training frame, starting at a random
# position chosen so the window always fits inside the frame.
random_index = random.randint(0, len(train_df_shuffled) - 5)
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  # itertuples() yields (index, text, target); the index is not needed here
  _, text, target = row
  print(f"Target: {target}", " (real disaster)" if target > 0 else " (not real disaster)")
  print(f"Text:\n{text}")

Target: 1  (real diseaster)
Text:
150-Foot Sinkhole Opens in Lowndes County Residential Area
WCTV-35 minutes ago
Target: 1  (real diseaster)
Text:
Reddit Will Now QuarantineÛ_ http://t.co/pkUAMXw6pm #onlinecommunities #reddit #amageddon #freespeech #Business http://t.co/PAWvNJ4sAP
Target: 1  (real diseaster)
Text:
A better look at what this catastrophic rain and flooding has done to ourÛ_ https://t.co/5yRBegzafX
Target: 1  (real diseaster)
Text:
suddenly it's off &amp; on gloomy &amp; thunder so loud it shakes the windows? Not ever on the Bay Area. Miss me w/that lol http://t.co/x4eCGGvnSN
Target: 1  (real diseaster)
Text:
Heat wave adding to the misery of internally-displaced Gazans http://t.co/jW3hN9ewFT via @PressTV http://t.co/NYWrkRQ7Kn


## Split data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_target, val_target = train_test_split(train_df_shuffled['text'].to_numpy(), train_df_shuffled['target'].to_numpy(), test_size=0.1, random_state=42)

In [None]:
len(train_sentences), len(val_sentences), len(train_target), len(val_target)

(6851, 762, 6851, 762)

## Preprocessing the data for the baseline model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [None]:
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [None]:
# Upper bound on the vocabulary size kept by the vectorizer
max_vocab_length = 100000
# Every sentence is padded/truncated to this many tokens
max_length = 15
# Lower-case, strip punctuation, split on whitespace and map each token to an integer id.
# NOTE(review): pad_to_max_tokens is documented for the multi_hot/count/tf_idf output modes;
# presumably it has no effect with output_mode="int" — confirm against the Keras docs.
text_vectorization = TextVectorization(max_tokens=max_vocab_length, standardize="lower_and_strip_punctuation", split="whitespace", ngrams=None, output_mode="int", output_sequence_length=max_length, pad_to_max_tokens=True)

In [None]:
text_vectorization.adapt(train_sentences)

In [None]:
sentences = 'Excited not only about the next 6 years of school and ensuing student debt but also catastrophic climate change in my lifetime'
text_vectorization([sentences])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[5695,   34,  126,   54,    2,  274,  560,  141,    6,  185,    7,
           1, 4514, 2500,   30]])>

In [None]:
vocabularies = text_vectorization.get_vocabulary()
print(vocabularies)



## Make Baseline Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
baseline_model = Pipeline([('tfidf',TfidfVectorizer()), ('clf',MultinomialNB())])
baseline_model.fit(train_sentences, train_target)

In [None]:
baseline_score = baseline_model.score(val_sentences, val_target)
baseline_score

0.7926509186351706

In [None]:
baseline_pred = baseline_model.predict(val_sentences)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Computes accuracy, precision, recall and f1 score for a set of predictions.

  Args:
      y_true: true labels
      y_pred: predicted labels

  Returns a dictionary with the keys "accuracy" (scaled by 100), "precision",
  "recall" and "f1" (the latter three use the "weighted" average).
  """
  model_result = {"accuracy": accuracy_score(y_true, y_pred)*100}
  # precision_recall_fscore_support returns (precision, recall, f1, support);
  # support is not reported
  precision, recall, f1, _support = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_result["precision"] = precision
  model_result["recall"] = recall
  model_result["f1"] = f1
  return model_result

In [None]:
baseline_result = calculate_results(val_target, baseline_pred)
baseline_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

## Preprocessing data for sequential model

In [None]:
# Hold out 10% of the shuffled data, then split that hold-out evenly into a
# validation half and a test half. Both steps are seeded so the split is
# reproducible across runs.
train_df, holdout_df = train_test_split(train_df_shuffled[['text', 'target']], test_size=0.1, random_state=42)
val_df = holdout_df.sample(frac=0.5, random_state=42)
# Build the test frame as a new object instead of dropping rows in place
test_df = holdout_df.drop(val_df.index)

In [None]:
# Built once at definition time; a frozenset gives O(1) membership checks.
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
    "during", "each", "few", "for", "from", "further", "had", "has", "have",
    "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
    "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me",
    "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
    "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same",
    "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than",
    "that", "that's", "the", "their", "theirs", "them", "themselves", "then",
    "there", "there's", "these", "they", "they'd", "they'll", "they're",
    "they've", "this", "those", "through", "to", "too", "under", "until", "up",
    "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who",
    "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
    "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence):
    """
    Lower-cases sentence and removes every whitespace-separated stopword.

    Matching is done on whole whitespace-delimited tokens, so a stopword with
    punctuation attached (e.g. "the,") is kept.

    Args:
        sentence (str): the text to clean

    Returns:
        The remaining lower-cased tokens joined by single spaces.
    """
    words = [w for w in sentence.lower().split() if w not in _STOPWORDS]
    return " ".join(words)

In [None]:
train_sentences = train_df['text'].apply(remove_stopwords).values
train_labels = train_df['target'].values

test_sentences = test_df['text'].apply(remove_stopwords).values
test_labels = test_df['target'].values

val_sentences = val_df['text'].apply(remove_stopwords).values
val_labels = val_df['target'].values

In [None]:
def preprocessing_fn(dataset):
  """
  Turns a dataset of (text, label) pairs into a dataset of (padded token ids, label) pairs.

  Args:
    dataset: a tf.data.Dataset yielding (text, label) elements

  Returns:
    A tf.data.Dataset zipping the padded sequences with their labels.
  """
  # Vectorize each text with the module-level text_vectorization layer
  dataset_sequences = dataset.map(lambda text, label: (text_vectorization(text), label))
  # Collect the whole dataset into one batch so it can be pulled out as a single element
  dataset_sequences = dataset_sequences.ragged_batch(batch_size=dataset_sequences.cardinality())
  sequences, labels = dataset_sequences.get_single_element()
  # Pad/truncate at the end of each sequence to max_length.
  # NOTE(review): text_vectorization is built with output_sequence_length=max_length,
  # so this step is presumably a no-op apart from the dtype of the result — confirm.
  padded_sequences = tf.keras.utils.pad_sequences(sequences.numpy(),maxlen=max_length,truncating='post',padding='post')
  # Slice back into one element per example and pair each sequence with its label
  padded_sequences = tf.data.Dataset.from_tensor_slices(padded_sequences)
  labels = tf.data.Dataset.from_tensor_slices(labels)
  dataset_vectorized = tf.data.Dataset.zip(padded_sequences, labels)
  return dataset_vectorized

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels))

In [None]:
train_dataset = train_dataset.apply(preprocessing_fn)
test_dataset = test_dataset.apply(preprocessing_fn)
val_dataset = val_dataset.apply(preprocessing_fn)

In [None]:
for example in train_dataset.take(2):
  print(example)
  print()

(<tf.Tensor: shape=(15,), dtype=int32, numpy=
array([10738,  6507,   387,  2956,   978,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0], dtype=int32)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)

(<tf.Tensor: shape=(15,), dtype=int32, numpy=
array([ 1514,   209,   485, 11535,  2589,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0], dtype=int32)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)



In [None]:
SHUFFLE_BUFFER_SIZE = 1000
PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE
BATCH_SIZE = 32

train_dataset_final = (train_dataset.cache().shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE).prefetch(PREFETCH_BUFFER_SIZE))
test_dataset_final = (test_dataset.cache().batch(BATCH_SIZE).prefetch(PREFETCH_BUFFER_SIZE))
validation_dataset_final = (val_dataset.cache().batch(BATCH_SIZE).prefetch(PREFETCH_BUFFER_SIZE))

## Model 1

In [None]:
# Simple dense model: embed the token ids, average over the sequence, classify.
# The input length follows max_length instead of a hard-coded 15 so it stays in
# sync with the vectorizer configuration.
inputs = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64)
# `input_length` is omitted: it is deprecated in Keras 3 and the length is
# already fixed by the Input layer.
x = tf.keras.layers.Embedding(input_dim=max_vocab_length, output_dim=128)(inputs)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
# Single sigmoid unit for the binary target
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_1 = tf.keras.Model(inputs, outputs)



In [None]:
model_1.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])

In [None]:
model_1.summary()

In [None]:
model_1_history = model_1.fit(train_dataset_final, epochs=5, validation_data=validation_dataset_final, callbacks=[create_tensorboard_callback(dir_name="model_logs", experiment_name="simple_dense_model")])

Saving TensorBoard log files to: model_logs/simple_dense_model/20241229-095854
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 260ms/step - accuracy: 0.6304 - loss: 0.6347 - val_accuracy: 0.8084 - val_loss: 0.4466
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 202ms/step - accuracy: 0.8673 - loss: 0.3223 - val_accuracy: 0.7979 - val_loss: 0.4872
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 206ms/step - accuracy: 0.9416 - loss: 0.1594 - val_accuracy: 0.8136 - val_loss: 0.5543
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 203ms/step - accuracy: 0.9716 - loss: 0.0834 - val_accuracy: 0.7953 - val_loss: 0.6414
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 207ms/step - accuracy: 0.9866 - loss: 0.0461 - val_accuracy: 0.8058 - val_loss: 0.7781


In [None]:
model_1.evaluate(test_dataset_final)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7167 - loss: 1.0033 


[0.9671122431755066, 0.7375327944755554]

In [None]:
def convert_text_to_number(text):
  """
  Vectorizes text with text_vectorization and pads the result to max_length.

  Args:
    text: the sentences to convert (called below with the array of test sentences)

  Returns:
    The output of tf.keras.utils.pad_sequences for the vectorized sentences.
  """
  # Wrapping `text` in a list makes the dataset hold exactly one element (all of `text`),
  # which is what get_single_element() below requires
  dataset = tf.data.Dataset.from_tensor_slices([text])
  dataset_sequences = dataset.map(lambda text: (text_vectorization(text)))
  sequences = dataset_sequences.get_single_element()
  # Pad/truncate at the end of each sequence to max_length
  padded_sequences = tf.keras.utils.pad_sequences(sequences.numpy(),maxlen=max_length,truncating='post',padding='post')
  return padded_sequences

In [None]:
test_sentences_vectorized = convert_text_to_number(test_sentences)
model_1_pred_probs = model_1.predict(test_sentences_vectorized)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [None]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))

In [None]:
model_1_preds

<tf.Tensor: shape=(381,), dtype=float32, numpy=
array([1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.,
       1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
       1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0.,
       1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1.,
       0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1.,
       0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0

In [None]:
# Convert the predictions to a NumPy array for scikit-learn. np.asarray replaces
# the deprecated Tensor.cpu() call and keeps this cell re-runnable, since it
# also accepts an array that has already been converted.
model_1_preds = np.asarray(model_1_preds)
model_1_results = calculate_results(y_true=test_labels, y_pred=model_1_preds)
model_1_results

Instructions for updating:
Use tf.identity with explicit device placement instead.


{'accuracy': 73.75328083989501,
 'precision': 0.7437833907125246,
 'recall': 0.7375328083989501,
 'f1': 0.734598798576223}

In [None]:
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """
  Prints, for every metric in baseline_results, the baseline value, the new
  model's value and their difference (new minus baseline), to two decimals.

  Args:
    baseline_results: dictionary mapping metric name to the baseline value
    new_model_results: dictionary with the same metric names for the new model
  """
  for key in baseline_results:
    value = baseline_results[key]
    print(f"Baseline {key}: {value:.2f}, New {key}: {new_model_results[key]:.2f}, Difference: {new_model_results[key]-value:.2f}")

compare_baseline_to_new_results(baseline_results=baseline_result, new_model_results=model_1_results)

Baseline accuracy: 79.27, New accuracy: 73.75, Difference: -5.51
Baseline precision: 0.81, New precision: 0.74, Difference: -0.07
Baseline recall: 0.79, New recall: 0.74, Difference: -0.06
Baseline f1: 0.79, New f1: 0.73, Difference: -0.05


## Model 2

In [None]:
# Stacked LSTM model over the embedded token ids.
model_2 = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    # `input_length` is omitted: it is deprecated in Keras 3 and the length is
    # already fixed by the Input layer.
    tf.keras.layers.Embedding(input_dim=max_vocab_length, output_dim=128),
    # The first LSTM returns the full sequence so the second LSTM can consume it
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(128),
    # A single-unit output must use sigmoid: softmax over one unit is always
    # 1.0, which makes the model predict the positive class for every input.
    tf.keras.layers.Dense(1, activation="sigmoid")
])



In [None]:
model_2.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])

In [None]:
tensorboard_callback = create_tensorboard_callback(dir_name="model_logs", experiment_name="LSTM")
# Validate on the held-out validation set; validating on the training set would
# report training performance twice and hide overfitting.
model_2_history = model_2.fit(train_dataset_final, epochs=5, validation_data=validation_dataset_final, callbacks=[tensorboard_callback])

Saving TensorBoard log files to: model_logs/LSTM/20241229-100326
Epoch 1/5




[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 241ms/step - accuracy: 0.4340 - loss: 0.5801 - val_accuracy: 0.4267 - val_loss: 0.2763
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 231ms/step - accuracy: 0.4248 - loss: 0.2518 - val_accuracy: 0.4267 - val_loss: 0.1178
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 241ms/step - accuracy: 0.4339 - loss: 0.1282 - val_accuracy: 0.4267 - val_loss: 0.0983
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 248ms/step - accuracy: 0.4368 - loss: 0.0836 - val_accuracy: 0.4267 - val_loss: 0.0324
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 239ms/step - accuracy: 0.4313 - loss: 0.0409 - val_accuracy: 0.4267 - val_loss: 0.0204


In [None]:
model_2.evaluate(test_dataset_final)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4946 - loss: 0.9763


[0.9893650412559509, 0.48293963074684143]

In [None]:
model_2_pred_probs = model_2.predict(test_sentences_vectorized)
# Round the probabilities to hard 0/1 labels and convert to NumPy for
# scikit-learn (.numpy() replaces the deprecated Tensor.cpu() call).
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs)).numpy()
model_2_results = calculate_results(y_true=test_labels, y_pred=model_2_preds)

[1m 7/12[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 9ms/step  



[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
model_2_results

{'accuracy': 48.29396325459317,
 'precision': 0.23323068868359959,
 'recall': 0.48293963254593175,
 'f1': 0.31455183146354493}

In [None]:
compare_baseline_to_new_results(baseline_results=baseline_result, new_model_results=model_2_results)

Baseline accuracy: 79.27, New accuracy: 48.29, Difference: -30.97
Baseline precision: 0.81, New precision: 0.23, Difference: -0.58
Baseline recall: 0.79, New recall: 0.48, Difference: -0.31
Baseline f1: 0.79, New f1: 0.31, Difference: -0.47


## Model 3

In [None]:
# Conv1D model: two stacked 1D convolutions over the embedded token ids,
# followed by global max pooling. The input length follows max_length instead
# of a hard-coded 15 so it stays in sync with the vectorizer configuration.
inputs = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64)
# `input_length` is omitted: it is deprecated in Keras 3 and the length is
# already fixed by the Input layer.
x = tf.keras.layers.Embedding(input_dim=max_vocab_length, output_dim=128)(inputs)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu')(x)
x = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu')(x)
x = tf.keras.layers.GlobalMaxPool1D()(x)
# Single sigmoid unit for the binary target
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs)



In [None]:
model_3.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])

In [None]:
# Log under a name that matches this model, so its TensorBoard runs are not
# mixed in with the LSTM experiment's.
tensorboard_callback = create_tensorboard_callback(dir_name="model_logs", experiment_name="conv1d")
# Validate on the held-out validation set; validating on the training set would
# report training performance twice and hide overfitting.
model_3_history = model_3.fit(train_dataset_final, epochs=5, validation_data=validation_dataset_final, callbacks=[tensorboard_callback])

Saving TensorBoard log files to: model_logs/LSTM/20241229-100817
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 200ms/step - accuracy: 0.6346 - loss: 0.6206 - val_accuracy: 0.9018 - val_loss: 0.2821
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 199ms/step - accuracy: 0.8742 - loss: 0.3192 - val_accuracy: 0.9585 - val_loss: 0.1150
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 201ms/step - accuracy: 0.9506 - loss: 0.1295 - val_accuracy: 0.9896 - val_loss: 0.0378
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 198ms/step - accuracy: 0.9860 - loss: 0.0424 - val_accuracy: 0.9949 - val_loss: 0.0147
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 200ms/step - accuracy: 0.9936 - loss: 0.0178 - val_accuracy: 0.9953 - val_loss: 0.0104


In [None]:
model_3.evaluate(test_dataset_final)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7211 - loss: 1.1831


[1.1239303350448608, 0.7454068064689636]

In [None]:
model_3_pred_probs = model_3.predict(test_sentences_vectorized)
# Round the probabilities to hard 0/1 labels and convert to NumPy for
# scikit-learn (.numpy() replaces the deprecated Tensor.cpu() call).
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs)).numpy()
model_3_results = calculate_results(y_true=test_labels, y_pred=model_3_preds)
model_3_results

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


{'accuracy': 74.54068241469817,
 'precision': 0.7567129241823496,
 'recall': 0.7454068241469817,
 'f1': 0.7410902575745186}

In [None]:
compare_baseline_to_new_results(baseline_results=baseline_result, new_model_results=model_3_results)

Baseline accuracy: 79.27, New accuracy: 74.54, Difference: -4.72
Baseline precision: 0.81, New precision: 0.76, Difference: -0.05
Baseline recall: 0.79, New recall: 0.75, Difference: -0.05
Baseline f1: 0.79, New f1: 0.74, Difference: -0.05


## Model 4

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels))

In [None]:
import tensorflow_hub as hub
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",input_shape=[],dtype=tf.string,trainable=False)

In [None]:
embed_samples = sentence_encoder_layer(["When you call the universal sentence encoder on a sentence, it turns it into numbers."])
embed_samples

<tf.Tensor: shape=(1, 512), dtype=float32, numpy=
array([[ 3.59669104e-02, -8.57946873e-02, -1.15274275e-02,
         5.25982305e-03, -1.85217243e-02, -5.04201166e-02,
        -3.61694135e-02,  5.34677319e-03,  4.80591655e-02,
         4.69074361e-02, -3.72333042e-02, -1.14954598e-02,
         4.35241461e-02,  7.05099106e-02,  7.09376037e-02,
        -8.18043128e-02,  8.71716719e-03, -4.65412140e-02,
        -2.24577561e-02,  4.68687192e-02,  2.02255719e-03,
         3.09907217e-02,  2.04356313e-02,  6.39216825e-02,
        -7.64108598e-02,  8.42117891e-02, -4.57604155e-02,
        -1.06165453e-03, -2.05941647e-02,  1.24110589e-02,
         5.72753921e-02,  3.81562002e-02, -2.74211802e-02,
        -3.54347494e-03, -9.83258560e-02, -1.24485437e-02,
         3.86562981e-02,  5.03195338e-02, -2.36250591e-02,
         3.21848248e-03,  3.22520882e-02,  7.38095073e-03,
         4.47310396e-02, -4.12235968e-03,  1.15160132e-02,
         2.83772387e-02,  6.01386302e-04, -5.90335354e-02,
      

In [None]:
def preprocessing_fn(dataset):
  """
  Batches a (text, label) dataset and replaces each batch of texts with the
  output of sentence_encoder_layer.

  Args:
    dataset: a tf.data.Dataset yielding (text, label) elements

  Returns:
    The batched dataset of (encoded text, label) elements.
  """
  def _encode(text, label):
    # Labels pass through unchanged
    return sentence_encoder_layer(text), label

  # Batch first so the encoder is applied to BATCH_SIZE texts at a time
  return dataset.batch(BATCH_SIZE).map(_encode)

In [None]:
train_dataset = train_dataset.apply(preprocessing_fn)
test_dataset = test_dataset.apply(preprocessing_fn)
val_dataset = val_dataset.apply(preprocessing_fn)

In [None]:
# The datasets were already batched inside preprocessing_fn, so no .batch() here.
# NOTE(review): because of that, shuffle() on the training set reorders whole batches
# rather than individual examples — confirm this is intended.
train_dataset_final = (train_dataset.cache().shuffle(SHUFFLE_BUFFER_SIZE).prefetch(PREFETCH_BUFFER_SIZE))
test_dataset_final = (test_dataset.cache().prefetch(PREFETCH_BUFFER_SIZE))
validation_dataset_final = (val_dataset.cache().prefetch(PREFETCH_BUFFER_SIZE))

In [None]:
for example in train_dataset_final.take(2):
  print(example)
  print()

(<tf.Tensor: shape=(32, 512), dtype=float32, numpy=
array([[-0.03298512, -0.02759284, -0.04487535, ...,  0.0138586 ,
         0.0158374 ,  0.00670457],
       [ 0.04023113, -0.05577822, -0.01079957, ..., -0.0600226 ,
        -0.06444848, -0.08134638],
       [ 0.03250216,  0.00883114,  0.00721379, ..., -0.03032464,
         0.05947021,  0.00959871],
       ...,
       [ 0.03720918, -0.04668706, -0.05853461, ..., -0.04881785,
         0.0571765 ,  0.06554002],
       [-0.00012232,  0.02434209,  0.01701002, ...,  0.01389242,
         0.00435215, -0.08035246],
       [ 0.02129035, -0.00326675, -0.00538932, ..., -0.06662047,
        -0.01176247,  0.02380305]], dtype=float32)>, <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0])>)

(<tf.Tensor: shape=(32, 512), dtype=float32, numpy=
array([[ 0.03775797,  0.0064327 ,  0.05779907, ..., -0.01925476,
        -0.0515672 , -0.00543367],
       

In [None]:
# Model fed with precomputed sentence embeddings (vectors of length 512)
inputs = tf.keras.layers.Input(shape=[512,])
# Add a time axis of length 1 so the recurrent layers receive a sequence;
# each sentence is therefore a single-step sequence
x = tf.keras.layers.Reshape((1,512))(inputs)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
# Single sigmoid unit for the binary target
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs)

# Compile model
model_4.compile(loss="binary_crossentropy",optimizer=tf.keras.optimizers.Adam(),metrics=["accuracy"])

model_4.summary()

In [None]:
model_4_history = model_4.fit(train_dataset_final,epochs=5,validation_data=(validation_dataset_final),callbacks=[create_tensorboard_callback('model_logs', "sentence_encoder")])

Saving TensorBoard log files to: model_logs/sentence_encoder/20241229-104012
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 30ms/step - accuracy: 0.7245 - loss: 0.5571 - val_accuracy: 0.7953 - val_loss: 0.4556
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.8219 - loss: 0.4194 - val_accuracy: 0.8058 - val_loss: 0.4500
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step - accuracy: 0.8290 - loss: 0.4010 - val_accuracy: 0.8110 - val_loss: 0.4545
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 38ms/step - accuracy: 0.8225 - loss: 0.4005 - val_accuracy: 0.8136 - val_loss: 0.4475
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.8283 - loss: 0.3837 - val_accuracy: 0.8058 - val_loss: 0.4670


In [None]:
model_4.evaluate(test_dataset_final)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7977 - loss: 0.4560


[0.43276625871658325, 0.808398962020874]

In [None]:
def convert_text_to_number(text):
  """
  Wraps text in a one-element dataset and maps sentence_encoder_layer over it.

  Args:
    text: the sentences to encode (called below with the array of test sentences)

  Returns:
    A tf.data.Dataset whose single element is the encoder output for text.
  """
  # The extra list level makes all of `text` one dataset element
  single_element = tf.data.Dataset.from_tensor_slices([text])
  return single_element.map(lambda sentences: (sentence_encoder_layer(sentences)))

In [None]:
test_sentences_vectorized = convert_text_to_number(test_sentences)
test_sentences_vectorized

<_MapDataset element_spec=TensorSpec(shape=(None, 512), dtype=tf.float32, name=None)>

In [None]:
model_4_pred_probs = model_4.predict(test_sentences_vectorized)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [None]:
# Round the probabilities to hard 0/1 labels and convert to NumPy for
# scikit-learn (.numpy() replaces the deprecated Tensor.cpu() call).
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs)).numpy()
model_4_results = calculate_results(y_true=test_labels, y_pred=model_4_preds)
model_4_results

{'accuracy': 80.83989501312337,
 'precision': 0.8138356212639025,
 'recall': 0.8083989501312336,
 'f1': 0.806957547982612}

In [None]:
compare_baseline_to_new_results(baseline_results=baseline_result, new_model_results=model_4_results)

Baseline accuracy: 79.27, New accuracy: 80.84, Difference: 1.57
Baseline precision: 0.81, New precision: 0.81, Difference: 0.00
Baseline recall: 0.79, New recall: 0.81, Difference: 0.02
Baseline f1: 0.79, New f1: 0.81, Difference: 0.02


## Compare Model Results

In [None]:
# Collect every model's metric dictionary into one table (one column per model).
# Note: calculate_results multiplies accuracy by 100, while precision, recall
# and f1 are left unscaled, so the accuracy column is on a different scale.
all_model_results = pd.DataFrame({"baseline": baseline_result,
                                  "simple_dense": model_1_results,
                                  "lstm": model_2_results,
                                  "conv1d": model_3_results,
                                  "tf_hub_sentence_encoder": model_4_results})
# Transpose so that each model is a row and each metric a column
all_model_results = all_model_results.transpose()
all_model_results

Unnamed: 0,accuracy,precision,recall,f1
baseline,79.265092,0.811139,0.792651,0.786219
simple_dense,73.753281,0.743783,0.737533,0.734599
lstm,48.293963,0.233231,0.48294,0.314552
conv1d,74.540682,0.756713,0.745407,0.74109
tf_hub_sentence_encoder,80.839895,0.813836,0.808399,0.806958
