# Intro to NLP Fundamentals in TensorFlow - Exercises

In [1]:
!nvidia-smi -L

GPU 0: Tesla K80 (UUID: GPU-9b42fd71-30ec-055a-6fc9-02d35c70bbee)


In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras import layers

In [3]:
!wget "https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py" --quiet
from helper_functions import *

In [25]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of binary
  classification model.
  """
  model_acc = accuracy_score(y_true, y_pred) * 100
  model_prec, model_rec, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return {"accuracy": model_acc,
          "precision": model_prec,
          "recall": model_rec,
          "f1": model_f1}

## Import data

In [4]:
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip" --quiet
unzip_data("nlp_getting_started.zip")
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)

print(f"# of Train Samples: {len(train_df)}\n# of Test Samples: {len(test_df)}")

# of Train Samples: 7613
# of Test Samples: 3263


In [6]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42
                                                                            )

In [7]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

### Tokenise data

In [9]:
print(f"Average Number of tokens (words in the training tweets): {round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))}")

Average Number of tokens (words in the training tweets): 15


In [12]:
from tensorflow.keras.layers import TextVectorization
import random

In [10]:
MAX_VOCAB_LENGTH = 10000
MAX_LENGTH = 15

text_vectorizer = TextVectorization(max_tokens=MAX_VOCAB_LENGTH,
                                    output_mode="int",
                                    output_sequence_length=MAX_LENGTH)
text_vectorizer.adapt(train_sentences)

In [13]:
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
        \nVectorised version:\n{text_vectorizer([random_sentence])}")

Original text:
Emergency Shutdown Systems - Edmonton http://t.co/F8GvWkFqox        
Vectorised version:
[[  73 4628 3359    1    1    0    0    0    0    0    0    0    0    0
     0]]


### Create Embeddings

In [15]:
from tensorflow.keras.layers import Embedding
embedding = Embedding(input_dim=MAX_VOCAB_LENGTH,
                      output_dim=128,
                      input_length=MAX_LENGTH)
embedding

<keras.layers.embeddings.Embedding at 0x7ff39c2af750>

In [16]:
random_sentence = random.choice(train_sentences)
embed_sentence = embedding(text_vectorizer([random_sentence]))
print(f"Original text:\n{random_sentence}\n\
      \nEmbedded version:")
embed_sentence

Original text:
Casually on the phone with Jasmine while she cries and screams about a spider
      
Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 3.7898276e-02,  3.1052899e-02,  3.0861925e-02, ...,
         -3.0985450e-02,  2.6774082e-02, -1.4554083e-02],
        [-3.2461204e-02,  1.9508157e-02,  4.3862749e-02, ...,
          3.0261304e-02,  2.5658194e-02,  4.1552354e-02],
        [ 5.7811663e-04,  4.2915177e-02, -2.6910782e-02, ...,
         -3.9522424e-03,  2.9156718e-02, -4.3930411e-02],
        ...,
        [ 4.9429845e-02,  1.8701162e-02, -4.6542633e-02, ...,
          1.9087840e-02,  4.8537780e-02, -4.7273781e-02],
        [ 2.7370300e-02,  2.0382669e-02, -1.3287522e-02, ...,
         -2.5889123e-02, -7.3134899e-05, -1.0794081e-02],
        [-2.1476634e-03,  1.9142162e-02,  9.2877075e-04, ...,
          2.5511160e-03, -8.7789446e-04,  3.3294234e-02]]], dtype=float32)>

## 1. Rebuild model_1, model_2 and model_5 using the Sequential API.

### Model 1: Feed Forward Neural Network

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

MOD1_NAME = "Model1_FFNN"

model_1 = Sequential(
    [
     tf.keras.Input(shape=(1,), dtype=tf.string),
     text_vectorizer,
     embedding,
     layers.GlobalAveragePooling1D(),
     layers.Dense(1, activation="sigmoid")
    ],
    name = MOD1_NAME
)

model_1.summary()

model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels)
                              )

Model: "Model1_FFNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 15)                0         
_________________________________________________________________
embedding (Embedding)        (None, 15, 128)           1280000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_results = calculate_results(y_true=val_labels, y_pred=model_1_preds)
model_1_results

{'accuracy': 78.74015748031496,
 'f1': 0.7841130596930417,
 'precision': 0.7932296029485675,
 'recall': 0.7874015748031497}

### Model 4: Gated Recurrent Neural Network

In [28]:
MOD3_NAME = "Model3_GRU"

model_3 = Sequential(
    [
     layers.Input(shape=(1,), dtype=tf.string),
     text_vectorizer,
     embedding,
     layers.GRU(64),
     layers.Dense(1, activation="sigmoid")
    ],
    name=MOD3_NAME
)

model_3.summary()

model_3.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

model_3_history = model_3.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

model_3_pred_probs = model_3.predict(val_sentences)
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_results = calculate_results(y_true=val_labels, y_pred=model_3_preds)
model_3_results

Model: "Model3_GRU"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 15)                0         
_________________________________________________________________
embedding (Embedding)        (None, 15, 128)           1280000   
_________________________________________________________________
gru (GRU)                    (None, 64)                37248     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 77.55905511811024,
 'f1': 0.7736182129212565,
 'precision': 0.7772070861555818,
 'recall': 0.7755905511811023}

### Model 5: 1D-Conv

In [35]:
MOD5_NAME = "Model5_Conv1D"

model_5 = Sequential(
    [
     layers.Input(shape=(1,), dtype=tf.string),
     text_vectorizer,
     embedding,
     layers.Conv1D(64, 5, strides=1, activation="relu", padding="valid"),
     layers.GlobalMaxPool1D(),
     layers.Dense(1, activation="sigmoid")
    ],
    name=MOD5_NAME
)

model_5.summary()

model_5.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

model_5_history = model_5.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

model_5_pred_probs = model_5.predict(val_sentences)
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_results = calculate_results(val_labels, model_5_preds)
model_5_results

Model: "Model5_Conv1D"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 15)                0         
_________________________________________________________________
embedding (Embedding)        (None, 15, 128)           1280000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 11, 64)            41024     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 65        
Total params: 1,321,089
Trainable params: 1,321,089
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 75.98425196850394,
 'f1': 0.7586850461611226,
 'precision': 0.7597625353264251,
 'recall': 0.7598425196850394}

## 2. Train baseline on only 10% of the training data

In [40]:
perc_data = 0.1
index = round(len(train_sentences) * perc_data)
index

685

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB # multi-nomial niave bayes
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
                   ("tfidf", TfidfVectorizer()),
                   ("clf", MultinomialNB())
])
# shuffled train set on split so can take the first 10% and assume reasonably
# random distribution
model_0.fit(train_sentences[:index], train_labels[:index])
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Baseline Model Accuracy: {baseline_score*100:.2f}%")

baseline_preds = model_0.predict(val_sentences)
baseline_results = calculate_results(y_true=val_labels, y_pred=baseline_preds)
baseline_results
# i.e. roughly 9% worse when training on only 10% of the training data

Baseline Model Accuracy: 70.21%


{'accuracy': 70.20997375328083,
 'f1': 0.6736831571468213,
 'precision': 0.7599524002753854,
 'recall': 0.7020997375328084}