# Introduction to NLP Fundamentals in TensorFlow

Natural Language Processing (NLP) is a field in machine learning with the ability of a computer to understand, interpret, and generate human language. NLP is a subset of artificial intelligence and is widely used in applications like language translation, chatbots, sentiment analysis, speech recognition, etc.

## Chech for GPU

In [19]:
import tensorflow as tf
print(tf.__version__)

2.17.0


In [20]:
import matplotlib.pyplot as plt

In [21]:
!nvidia-smi

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


## Get Helper Functions

In [22]:
### We create a bunch of helpful functions throughout the course.
### Storing them here so they're easily accessible.

import tensorflow as tf

# Create a function to import an image and resize it to be able to be used with our model
def load_and_prep_image(filename, img_shape=224, scale=True):
  """
  Reads in an image from filename, turns it into a tensor and reshapes into
  (224, 224, 3).

  Parameters
  ----------
  filename (str): string filename of target image
  img_shape (int): size to resize target image to, default 224
  scale (bool): whether to scale pixel values to range(0, 1), default True
  """
  # Read in the image
  img = tf.io.read_file(filename)
  # Decode it into a tensor
  img = tf.image.decode_jpeg(img)
  # Resize the image
  img = tf.image.resize(img, [img_shape, img_shape])
  if scale:
    # Rescale the image (get all values between 0 and 1)
    return img/255.
  else:
    return img

# Note: The following confusion matrix code is a remix of Scikit-Learn's 
# plot_confusion_matrix function - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html
import itertools
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Our function needs a different name to sklearn's plot_confusion_matrix
def make_confusion_matrix(y_true, y_pred, classes=None, figsize=(10, 10), text_size=15, norm=False, savefig=False): 
  """Makes a labelled confusion matrix comparing predictions and ground truth labels.

  If classes is passed, confusion matrix will be labelled, if not, integer class values
  will be used.

  Args:
    y_true: Array of truth labels (must be same shape as y_pred).
    y_pred: Array of predicted labels (must be same shape as y_true).
    classes: Array of class labels (e.g. string form). If `None`, integer labels are used.
    figsize: Size of output figure (default=(10, 10)).
    text_size: Size of output figure text (default=15).
    norm: normalize values or not (default=False).
    savefig: save confusion matrix to file (default=False).
  
  Returns:
    A labelled confusion matrix plot comparing y_true and y_pred.

  Example usage:
    make_confusion_matrix(y_true=test_labels, # ground truth test labels
                          y_pred=y_preds, # predicted labels
                          classes=class_names, # array of class label names
                          figsize=(15, 15),
                          text_size=10)
  """  
  # Create the confustion matrix
  cm = confusion_matrix(y_true, y_pred)
  cm_norm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] # normalize it
  n_classes = cm.shape[0] # find the number of classes we're dealing with

  # Plot the figure and make it pretty
  fig, ax = plt.subplots(figsize=figsize)
  cax = ax.matshow(cm, cmap=plt.cm.Blues) # colors will represent how 'correct' a class is, darker == better
  fig.colorbar(cax)

  # Are there a list of classes?
  if classes:
    labels = classes
  else:
    labels = np.arange(cm.shape[0])
  
  # Label the axes
  ax.set(title="Confusion Matrix",
         xlabel="Predicted label",
         ylabel="True label",
         xticks=np.arange(n_classes), # create enough axis slots for each class
         yticks=np.arange(n_classes), 
         xticklabels=labels, # axes will labeled with class names (if they exist) or ints
         yticklabels=labels)
  
  # Make x-axis labels appear on bottom
  ax.xaxis.set_label_position("bottom")
  ax.xaxis.tick_bottom()

  # Set the threshold for different colors
  threshold = (cm.max() + cm.min()) / 2.

  # Plot the text on each cell
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    if norm:
      plt.text(j, i, f"{cm[i, j]} ({cm_norm[i, j]*100:.1f}%)",
              horizontalalignment="center",
              color="white" if cm[i, j] > threshold else "black",
              size=text_size)
    else:
      plt.text(j, i, f"{cm[i, j]}",
              horizontalalignment="center",
              color="white" if cm[i, j] > threshold else "black",
              size=text_size)

  # Save the figure to the current working directory
  if savefig:
    fig.savefig("confusion_matrix.png")
  
# Make a function to predict on images and plot them (works with multi-class)
def pred_and_plot(model, filename, class_names):
  """
  Imports an image located at filename, makes a prediction on it with
  a trained model and plots the image with the predicted class as the title.
  """
  # Import the target image and preprocess it
  img = load_and_prep_image(filename)

  # Make a prediction
  pred = model.predict(tf.expand_dims(img, axis=0))

  # Get the predicted class
  if len(pred[0]) > 1: # check for multi-class
    pred_class = class_names[pred.argmax()] # if more than one output, take the max
  else:
    pred_class = class_names[int(tf.round(pred)[0][0])] # if only one output, round

  # Plot the image and predicted class
  plt.imshow(img)
  plt.title(f"Prediction: {pred_class}")
  plt.axis(False);
  
import datetime

def create_tensorboard_callback(dir_name, experiment_name):
  """
  Creates a TensorBoard callback instand to store log files.

  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
  """
  log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback

# Plot the validation and training data separately
import matplotlib.pyplot as plt

def plot_loss_curves(history):
  """
  Returns separate loss curves for training and validation metrics.

  Args:
    history: TensorFlow model History object (see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History)
  """ 
  loss = history.history['loss']
  val_loss = history.history['val_loss']

  accuracy = history.history['accuracy']
  val_accuracy = history.history['val_accuracy']

  epochs = range(len(history.history['loss']))

  # Plot loss
  plt.plot(epochs, loss, label='training_loss')
  plt.plot(epochs, val_loss, label='val_loss')
  plt.title('Loss')
  plt.xlabel('Epochs')
  plt.legend()

  # Plot accuracy
  plt.figure()
  plt.plot(epochs, accuracy, label='training_accuracy')
  plt.plot(epochs, val_accuracy, label='val_accuracy')
  plt.title('Accuracy')
  plt.xlabel('Epochs')
  plt.legend();

def compare_historys(original_history, new_history, initial_epochs=5):
    """
    Compares two TensorFlow model History objects.
    
    Args:
      original_history: History object from original model (before new_history)
      new_history: History object from continued model training (after original_history)
      initial_epochs: Number of epochs in original_history (new_history plot starts from here) 
    """
    
    # Get original history measurements
    acc = original_history.history["accuracy"]
    loss = original_history.history["loss"]

    val_acc = original_history.history["val_accuracy"]
    val_loss = original_history.history["val_loss"]

    # Combine original history with new history
    total_acc = acc + new_history.history["accuracy"]
    total_loss = loss + new_history.history["loss"]

    total_val_acc = val_acc + new_history.history["val_accuracy"]
    total_val_loss = val_loss + new_history.history["val_loss"]

    # Make plots
    plt.figure(figsize=(8, 8))
    plt.subplot(2, 1, 1)
    plt.plot(total_acc, label='Training Accuracy')
    plt.plot(total_val_acc, label='Validation Accuracy')
    plt.plot([initial_epochs-1, initial_epochs-1],
              plt.ylim(), label='Start Fine Tuning') # reshift plot around epochs
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(2, 1, 2)
    plt.plot(total_loss, label='Training Loss')
    plt.plot(total_val_loss, label='Validation Loss')
    plt.plot([initial_epochs-1, initial_epochs-1],
              plt.ylim(), label='Start Fine Tuning') # reshift plot around epochs
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.xlabel('epoch')
    plt.show()
  
# Create function to unzip a zipfile into current working directory 
# (since we're going to be downloading and unzipping a few files)
import zipfile

def unzip_data(filename):
  """
  Unzips filename into the current working directory.

  Args:
    filename (str): a filepath to a target zip folder to be unzipped.
  """
  zip_ref = zipfile.ZipFile(filename, "r")
  zip_ref.extractall()
  zip_ref.close()

# Walk through an image classification directory and find out how many files (images)
# are in each subdirectory.
import os

def walk_through_dir(dir_path):
  """
  Walks through dir_path returning its contents.

  Args:
    dir_path (str): target directory
  
  Returns:
    A print out of:
      number of subdiretories in dir_path
      number of images (files) in each subdirectory
      name of each subdirectory
  """
  for dirpath, dirnames, filenames in os.walk(dir_path):
    print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")
    
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results


## Get Data

In [23]:
import pandas as pd
train_df = pd.read_csv("data/train/train.csv")
test_df = pd.read_csv("data/test/test.csv")

## Visualizing Data

In [24]:
train_df.head(), test_df.head()

(   id keyword location                                               text  \
 0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
 1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
 2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
 3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
 4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   
 
    target  
 0       1  
 1       1  
 2       1  
 3       1  
 4       1  ,
    id keyword location                                               text
 0   0     NaN      NaN                 Just happened a terrible car crash
 1   2     NaN      NaN  Heard about #earthquake is different cities, s...
 2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
 3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
 4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan)

In [25]:
train_df["text"][1]

'Forest fire near La Ronge Sask. Canada'

In [26]:
# Shuffle the training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [27]:
# Test data frame looks like
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [28]:
# Examples of each class
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [29]:
# Total number of samples
len(train_df), len(test_df)

(7613, 3263)

In [30]:
# Visualize the random training samples
import random
import matplotlib.pyplot as plt

random_index = random.randint(0, len(train_df)-5) # create random indexes not exceeding total number of samples
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
    _, text, target = row
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 0 (not real disaster)
Text:
investigate why Robert mueller didn't respond to my complaints since Nov 2011 &amp; just left me/son out her in danger http://t.co/pe2D3HCsNI

---

Target: 0 (not real disaster)
Text:
Rip mama but I'm still thuggin cause the world is a war zone

---

Target: 1 (real disaster)
Text:

---

Target: 0 (not real disaster)
Text:
Love is the weapon for this wounded generation &lt;3

---

Target: 1 (real disaster)
Text:
Youth electrocuted in Khulna  | http://t.co/3EnyNdXpPm https://t.co/GQpi7jMKan via @sharethis

---



### Split data into training and test sets

In [31]:
# Split the training data into training and validation sets
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [32]:
# Check the lengths
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [33]:
# Check the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

## Converting text into numbers

First step in working with text data is to convert them into numbers. There are few ways to do this:
* Tokenization - Direct mapping of token (a token could be a word or a character) to number.
* Embedding - Create a matrix of feature vector for each token (word) and use those vectors as features for the model.

### Text Vectorization (Tokenization)

In [34]:
train_sentences[:5], train_labels[:5]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
       dtype=object),
 array([0, 0, 1, 0, 0], dtype=int64))

In [35]:

from tensorflow.keras.layers import TextVectorization
#from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Use the default TextVectorization variables
text_vectorization = TextVectorization(max_tokens=10000, # Set max_tokens to a specific integer value
                                       standardize="lower_and_strip_punctuation",
                                       split="whitespace",
                                       ngrams=None, # Create groups of n-words?
                                       output_mode="int", # How to map tokens to numbers
                                       output_sequence_length=None, # How long do you want your sequences to be
                                       pad_to_max_tokens=True)

In [36]:
len(train_sentences[0].split())

7

In [37]:
# Find the average number of tokens in the training data
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [38]:
# Setup text vectorization variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length of the sequence

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [39]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [40]:
# Create a sample sentence and tokenize it
sample_sentence = "There's a flood in the streets of New York"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 264,    3,  232,    4,    2, 1585,    6,   50,  913,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

In [41]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
Brian Shaw + J.J. Hickson + Kenneth Faried trying to defend LaMarcus Aldridge was A BLOOD VOLCANO http://t.co/20TWGPmM7d      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[4059, 4646,    1,    1,    1,    1,  563,    5, 3065,    1,    1,
          23,    3,  353,  524]], dtype=int64)>

In [42]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary() # get all of the unique words in our training data
top_5_words = words_in_vocab[:5] # get the most common words
bottom_5_words = words_in_vocab[-5:] # get the least common words
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") # most common words
print(f"Bottom 5 least common words: {bottom_5_words}") # least common words

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [43]:
# Length of the vocabulary
len(words_in_vocab)

10000

## Creating an Embedding using an Embedding Layer

In [44]:
max_length

15

In [45]:
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length, # set input shape
                                      output_dim=128, # set size of embedding vector
                                      embeddings_initializer="uniform", # default initilization
                                      input_length=max_length) # how long is each input
embedding



<Embedding name=embedding, built=False>

In [46]:
# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into dense vectors of fixed size)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
Kids got Disney version of the game Operation only 2 AA batteries? I swear my old version had like 8 Ds and would nearly electrocute you.      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.01111512,  0.04703199, -0.00852578, ...,  0.02778682,
          0.02770183, -0.00210788],
        [-0.0274561 ,  0.01211984, -0.04456672, ...,  0.02295934,
          0.02836308,  0.03825342],
        [-0.02989386,  0.00054218,  0.01556512, ..., -0.00994333,
          0.0293901 , -0.03424597],
        ...,
        [-0.0455688 , -0.0168792 ,  0.00464436, ...,  0.00150947,
          0.04556867, -0.03075254],
        [-0.03000095, -0.04842854,  0.0039296 , ...,  0.03740156,
          0.02726556, -0.04566583],
        [-0.04700805, -0.01067149, -0.00766766, ...,  0.00782695,
         -0.0488183 , -0.04584458]]], dtype=float32)>

In [47]:
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.01111512,  0.04703199, -0.00852578,  0.00191572, -0.01065428,
       -0.01498313,  0.02633623,  0.03437978,  0.04951206,  0.03917282,
       -0.02178314, -0.0234182 ,  0.02490464,  0.02902984,  0.01318875,
       -0.04393622, -0.00545887, -0.03056594,  0.04607492,  0.02159207,
        0.04865174,  0.01969624,  0.02207855, -0.01580678, -0.04600808,
       -0.0390352 , -0.04665365, -0.02196579,  0.01595819,  0.02089738,
       -0.01135976, -0.02804445,  0.02420712, -0.04824064, -0.03410465,
       -0.02375536, -0.02223892,  0.02565563, -0.03377674, -0.01063646,
       -0.03495748,  0.03497969, -0.02479496,  0.03851665,  0.03006319,
        0.00681591, -0.04883387,  0.0403551 ,  0.03817784,  0.04651382,
        0.02931148,  0.01303226, -0.03266937,  0.00799221,  0.04458238,
       -0.00213735, -0.00535144,  0.03227497, -0.00645848,  0.01569308,
       -0.02968205,  0.00700397,  0.00305537,  0.01094271, -0.01660488,
       -0.009315

## Modelling a text dataset (setting up modelling experiments)

* Model 0: Naive Bayes (baseline)
* Model 1: Feed-forward neural network (dense model)
* Model 2: LSTM model
* Model 3: GRU model
* Model 4: Bidirectional-LSTM model
* Model 5: 1D Convolutional Neural Network
* Model 6: TensorFlow Hub Pretrained Feature Extractor
* Model 7: Same as model 6 with 10% of the training data

Steps for each model:
1. Construct the model
2. Train the model
3. Make predictions
4. Track the results

### Model 0: Naive Bayes (baseline)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create Tokenization and Modelling Pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

In [49]:
# Evaluate the baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Baseline model accuracy: {baseline_score*100:.2f}%")

Baseline model accuracy: 79.27%


In [50]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

### Create an evaluation function for model experiments

Modelling experiments are typically evaluated using:
* Accuracy - the higher the better
* Precision - the higher the better
* Recall - the higher the better
* F1-score - the higher the better

In [51]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """
    Calculates model accuracy, precision, recall and f1 score of a binary classification model.

    Args:
        y_true: true labels in the form of a 1D array
        y_pred: predicted labels in the form of a 1D array

    Returns a dictionary of accuracy, precision, recall, f1-score.
    """

    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100 # get accuracy score in percentage
    # Calculate model precision, recall and f1 score using "weighted average"
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    # Create a dictionary of results
    model_results = {"accuracy": model_accuracy,
                     "precision": model_precision,
                     "recall": model_recall,
                     "f1 score": model_f1}
    
    return model_results

In [52]:
# Get the baseline results
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)

baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1 score': 0.7862189758049549}

### Model 1: Feed-forward neural network (dense model)

In [53]:
# Create a directory to save TensorBoard logs
SAVE_DIR = "model_logs"

In [54]:
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [55]:
# Build a model using the Sequential API
from tensorflow.keras import layers
input = layers.Input(shape=(1,), dtype=tf.string) # inputs are 1-dimensional strings
x = text_vectorizer(input) # turn the input text into numbers
x = embedding(x) # create an embedding of the numberized inputs
x = layers.GlobalAveragePooling1D()(x) # condense the feature vector for each token to one vector

output = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary output so use sigmoid activation function
model_1 = tf.keras.Model(input, output, name="model_1_dense") # construct the model

In [56]:
# Compile the model
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the model
model_1_history = model_1.fit(train_sentences, train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                    experiment_name="model_1_dense")])

Saving TensorBoard log files to: model_logs/model_1_dense/20241016-040412
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.6492 - loss: 0.6507 - val_accuracy: 0.7467 - val_loss: 0.5415
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8072 - loss: 0.4633 - val_accuracy: 0.7900 - val_loss: 0.4752
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8537 - loss: 0.3583 - val_accuracy: 0.7913 - val_loss: 0.4605
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.8977 - loss: 0.2813 - val_accuracy: 0.7835 - val_loss: 0.4605
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9189 - loss: 0.2350 - val_accuracy: 0.7822 - val_loss: 0.4857


In [57]:
# Evaluate the model
model_1.evaluate(val_sentences, val_labels)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7630 - loss: 0.5125 


[0.4856758415699005, 0.7821522355079651]

In [58]:
# Make predictions
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[:10]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


array([[0.49948403],
       [0.8167913 ],
       [0.9976077 ],
       [0.2569029 ],
       [0.14629054],
       [0.95366937],
       [0.9286461 ],
       [0.9917695 ],
       [0.9756273 ],
       [0.49027598]], dtype=float32)

In [59]:
# Convert model_1 pred probs from probabilities to prediction labels
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [60]:
val_labels[:10]

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int64)

In [61]:
# Evaluate model_1 with evaluate function
model_1_results = calculate_results(y_true=val_labels,
                                    y_pred=model_1_preds)

model_1_results

{'accuracy': 78.21522309711287,
 'precision': 0.7818524785830743,
 'recall': 0.7821522309711286,
 'f1 score': 0.7816542238046842}

In [62]:
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1 score': 0.7862189758049549}

In [63]:
import numpy as np
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

### Visualizing learned embeddings

In [64]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()

len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [65]:
model_1.summary()

In [66]:
# Get the weight matrix of the embedding layer
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights.shape

(10000, 128)

In [67]:
import io

# Code to save trained embeddings to file - we got this from here: https://www.tensorflow.org/tutorials/text/word_embeddings#retrieve_the_trained_word_embeddings_and_save_them_to_disk
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

### Model 2: LSTM model

* LSTM (Long Short Term Memory) - one of the most popular LSTM cells. LSTM cells have the ability to "remember" things which are important in natural language processing tasks.

In [68]:
# Create LSTM model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
print(f"After embedding: {x.shape}")


x = layers.LSTM(64, activation="tanh")(x)
print(f"After LSTM cellL {x.shape}")

outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

After embedding: (None, 15, 128)
After LSTM cellL (None, 64)


In [69]:
# Compile the model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_2_LSTM")])

Saving TensorBoard log files to: model_logs/model_2_LSTM/20241016-040434
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.9157 - loss: 0.2831 - val_accuracy: 0.7795 - val_loss: 0.5223
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.9447 - loss: 0.1511 - val_accuracy: 0.7782 - val_loss: 0.6230
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9567 - loss: 0.1152 - val_accuracy: 0.7730 - val_loss: 0.7984
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9634 - loss: 0.0975 - val_accuracy: 0.7835 - val_loss: 0.6849
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9699 - loss: 0.0770 - val_accuracy: 0.7756 - val_loss: 0.8844


In [70]:
# Make predictions
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs.shape, model_2_pred_probs[:10]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step


((762, 1),
 array([[1.0528750e-02],
        [7.2056240e-01],
        [9.9934149e-01],
        [1.2903231e-01],
        [7.8634708e-04],
        [9.9248153e-01],
        [6.7187923e-01],
        [9.9958509e-01],
        [9.9938983e-01],
        [3.9379117e-01]], dtype=float32))

In [71]:
# Round out pred probs and reduce to 1-dim array
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [72]:
# Calculate LSTM model results
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)

model_2_results

{'accuracy': 77.55905511811024,
 'precision': 0.7794121823201254,
 'recall': 0.7755905511811023,
 'f1 score': 0.7726614113025507}

### Model 3: GRU mdoel

* GRU (Gated Recurrent Unit) - a variation of the LSTM cell but with less parameters.

In [73]:
# Build a RNN using the GRU cell
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.GRU(64, activation="tanh", return_sequences=True)(x) # return_sequences=True is required for stacking recurrent cells
# print(x.shape)
x = layers.GRU(64, activation="tanh")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_2_GRU")

In [74]:
# Model Summary
model_3.summary()

In [75]:
# Compile the model_3
model_3.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the model_3
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_3_GRU")])

Saving TensorBoard log files to: model_logs/model_3_GRU/20241016-040502
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.8581 - loss: 0.2825 - val_accuracy: 0.7822 - val_loss: 0.7949
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9720 - loss: 0.0751 - val_accuracy: 0.7730 - val_loss: 0.8374
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9757 - loss: 0.0645 - val_accuracy: 0.7756 - val_loss: 0.9960
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9771 - loss: 0.0582 - val_accuracy: 0.7664 - val_loss: 1.1446
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9784 - loss: 0.0467 - val_accuracy: 0.7769 - val_loss: 1.4052


In [76]:
# Make predictions with model_3
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10], model_3_pred_probs.shape

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step


(array([[1.6999459e-04],
        [5.5065084e-01],
        [9.9991077e-01],
        [5.5410262e-02],
        [8.4390165e-05],
        [9.9984682e-01],
        [9.0184933e-01],
        [9.9995899e-01],
        [9.9992466e-01],
        [9.1763127e-01]], dtype=float32),
 (762, 1))

In [77]:
# Convert model_3 preds probs into labels
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [78]:
# Calculate model_3 results
model_3_results = calculate_results(y_true=val_labels,
                                    y_pred=model_3_preds)
model_3_results

{'accuracy': 77.69028871391076,
 'precision': 0.7809693289921038,
 'recall': 0.7769028871391076,
 'f1 score': 0.7739165030429329}

### Model 4: Bidirectional-LSTM model

In [79]:
# Build a model with bidrrectional RNN in TensorFlow
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)

x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_bidirectional")

In [80]:
model_4.summary()

In [81]:
# Compile the model
model_4.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the model
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_4_bidirectional")])

Saving TensorBoard log files to: model_logs/model_4_bidirectional/20241016-040529
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.9450 - loss: 0.1899 - val_accuracy: 0.7743 - val_loss: 0.9546
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.9788 - loss: 0.0541 - val_accuracy: 0.7730 - val_loss: 1.1688
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.9820 - loss: 0.0404 - val_accuracy: 0.7703 - val_loss: 1.3331
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9836 - loss: 0.0393 - val_accuracy: 0.7677 - val_loss: 1.4471
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.9801 - loss: 0.0425 - val_accuracy: 0.7795 - val_loss: 1.3211


In [82]:
# Make predictions with model_4
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10], model_4_pred_probs.shape

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step


(array([[7.0028985e-04],
        [8.1401581e-01],
        [9.9997461e-01],
        [1.5989293e-01],
        [2.8766366e-05],
        [9.9989140e-01],
        [9.7885489e-01],
        [9.9998546e-01],
        [9.9997145e-01],
        [4.7260281e-01]], dtype=float32),
 (762, 1))

In [83]:
# Convert pred probs to labels
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [84]:
# Calculate model_4 results
model_4_results = calculate_results(y_true=val_labels,
                                    y_pred=model_4_preds)
model_4_results

{'accuracy': 77.95275590551181,
 'precision': 0.7841069305867823,
 'recall': 0.7795275590551181,
 'f1 score': 0.7764267379950773}

### Model 5: 1D Convolutional Neural Network

In [85]:
# Test to understand what things look like in Conv1D layer
embedding_test = embedding(text_vectorizer(["this is a test sentence"])) # turn target sentence into embedding
conv_1d_layer = layers.Conv1D(filters=32,
                              kernel_size=5, # setting this to 5 means it'll look at 5 words at a time, 3 would mean 3 words at a time
                              activation="relu")
conv_1d_output = conv_1d_layer(embedding_test)
max_pool = layers.GlobalMaxPool1D()
max_pool_output = max_pool(conv_1d_output)
print(f"Embedding output shape: {embedding_test.shape}")
print(f"Conv1D output shape: {conv_1d_output.shape}")
print(f"Max pool output shape: {max_pool_output.shape}")

Embedding output shape: (1, 15, 128)
Conv1D output shape: (1, 11, 32)
Max pool output shape: (1, 32)


In [86]:
# the outputs of each layer
print(f"Embedding output: {embedding_test}")
print(f"Conv1D output: {conv_1d_output}")
print(f"Max pool output: {max_pool_output}")

Embedding output: [[[-0.03267975  0.01071224  0.0083286  ... -0.02040259 -0.0159696
   -0.05391409]
  [-0.00767381  0.00160499  0.02372621 ...  0.00220557 -0.04373994
   -0.03787751]
  [-0.00332875 -0.06114716 -0.00819805 ... -0.03090096  0.03720066
   -0.04637963]
  ...
  [ 0.04536082 -0.04292151  0.02305425 ...  0.00076239  0.01025892
   -0.02293234]
  [ 0.04536082 -0.04292151  0.02305425 ...  0.00076239  0.01025892
   -0.02293234]
  [ 0.04536082 -0.04292151  0.02305425 ...  0.00076239  0.01025892
   -0.02293234]]]
Conv1D output: [[[0.01461227 0.01922776 0.02370081 0.00720802 0.02390547 0.
   0.         0.0511753  0.         0.05668524 0.         0.00462933
   0.06150455 0.02008806 0.         0.         0.         0.
   0.05898431 0.08298439 0.         0.07316959 0.         0.
   0.         0.00038438 0.07476184 0.         0.02630325 0.
   0.         0.        ]
  [0.         0.         0.         0.067795   0.         0.
   0.         0.06148407 0.0194143  0.03320348 0.00460175 0.00

In [87]:
# Build a model using 1D CNN
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)

x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)  # Reduced kernel size
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation="sigmoid", name="ouput_layer")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_CNN")

In [88]:
model_5.summary()

In [89]:
# Compile the model_5
model_5.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the model_5
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_5_CNN")])

Saving TensorBoard log files to: model_logs/model_5_CNN/20241016-040558
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.9423 - loss: 0.2081 - val_accuracy: 0.7677 - val_loss: 0.8379
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.9731 - loss: 0.0771 - val_accuracy: 0.7677 - val_loss: 0.9567
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9784 - loss: 0.0590 - val_accuracy: 0.7572 - val_loss: 1.0741
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9799 - loss: 0.0495 - val_accuracy: 0.7638 - val_loss: 1.1360
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9804 - loss: 0.0454 - val_accuracy: 0.7572 - val_loss: 1.1499


In [90]:
# Make prediction with model_5
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs.shape, model_5_pred_probs[:10]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


((762, 1),
 array([[2.2581117e-01],
        [9.8028767e-01],
        [9.9995518e-01],
        [5.5958748e-02],
        [1.9280760e-06],
        [9.9886870e-01],
        [9.9721515e-01],
        [9.9997324e-01],
        [9.9999958e-01],
        [7.2530919e-01]], dtype=float32))

In [91]:
# Convert pred probs to labels
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [92]:
# Calculate model_5 results
model_5_results = calculate_results(y_true=val_labels,
                                    y_pred=model_5_preds)

model_5_results

{'accuracy': 75.7217847769029,
 'precision': 0.757104469267424,
 'recall': 0.7572178477690289,
 'f1 score': 0.7560477242612442}

### Model 6: TensorFlow Hub Pretrained Feature Extractor

In [93]:
sample_sentence

"There's a flood in the streets of New York"

In [94]:
import tensorflow_hub as hub
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# embed_samples = embed([sample_sentence,
#                        "When you call the universal sentence encoder on a sentence, it turns it into numbers."])
# print(embed_samples[0][:50])





In [95]:
# embed_samples[0].shape

In [104]:
# Create a Keras Layer using the USE pretrained layer from tensorflow hub
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name="USE")

# Ceate a model using the Sequential API
from tensorflow.keras import layers
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid",
                 name="output_layer")
], name="model_6_USE")

# Compile the model_6
model_6.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Model_6 summary
model_6.summary()

In [108]:
# Ceate a model using the Sequential API
from tensorflow.keras import layers
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid",
                 name="output_layer")
], name="model_6_USE")

# Compile the model_6
model_6.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Model_6 summary
model_6.summary()

ValueError: Only instances of `keras.Layer` can be added to a Sequential model. Received: <tensorflow_hub.keras_layer.KerasLayer object at 0x0000025A801605D0> (of type <class 'tensorflow_hub.keras_layer.KerasLayer'>)

In [None]:
# Fit the model_6
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "tf_hub_sentence_encoder")])

In [None]:
# Make predictions with USE TF Hub Model
model_6_pred_probs = model_6.predict(val_sentences)
model_1_pred_probs[:10]

In [None]:
# Convert predicton probabilities to labels
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_preds[:10]

In [None]:
# Calculate model_6 results
model_6_results = calculate_results(y_true=val_labels,
                                    y_pred=model_6_preds)

model_6_results

In [None]:
# Check for tensorflow gpu availability
print(tf.config.list_physical_devices("GPU"))

### Model 7: Same as model 6 with 10% of the training data

In [109]:
# Make 10% data split
train_10_percent_split = int(0.1 * len(train_sentences))
train_sentences_10_percent = train_sentences[:train_10_percent_split]
train_labels_10_percent = train_labels[:train_10_percent_split]

In [110]:
# Check number of each label in the updated training data subset
pd.Series(np.array(train_labels_10_percent)).value_counts()

0    406
1    279
Name: count, dtype: int64

In [111]:
# Check the number of targets in our subset of data
train_df_shuffled["target"].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [112]:
# Build model_7 same as model_6
model_7 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid", name="output_layer")
], name="model_7_USE")

# Compile the model_7
model_7.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a model_7 summery
model_7.summary()

ValueError: Only instances of `keras.Layer` can be added to a Sequential model. Received: <tensorflow_hub.keras_layer.KerasLayer object at 0x0000025A801605D0> (of type <class 'tensorflow_hub.keras_layer.KerasLayer'>)

In [None]:
# Fit the model_7 with 10% data split
model_7_history = model_7.fit(train_sentences_10_percent,
                              train_labels_10_percent,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,

                                                                     "tf_hub_sentence_ecoder_10_percent_correct_spilt")])

In [None]:
# Make predictions with the model_7
model_7_pred_probs = model_7.predict(val_sentences)
model_7_pred_probs[:10]

In [None]:
# Convert pred probs to labels
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))

model_7_preds[:10]

In [None]:
# Calculate model_7 results
model_7_results = calculate_results(y_true=val_labels,
                                    y_pred=model_7_preds)

model_7_results

### Comparing the performance of each of our models