# Introduction to NLP Fundamentals in TensorFlow

## Preprocessing

In [1]:
import tensorflow as tf


In [2]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2023-11-11 08:36:34--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.1’


2023-11-11 08:36:35 (26.1 MB/s) - ‘helper_functions.py.1’ saved [10246/10246]



In [3]:
from helper_functions import *

In [4]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2023-11-11 08:36:36--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.211.207, 173.194.212.207, 173.194.213.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.211.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip.1’


2023-11-11 08:36:37 (80.0 MB/s) - ‘nlp_getting_started.zip.1’ saved [607343/607343]



In [5]:
unzip_data('nlp_getting_started.zip')

In [6]:
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
train_df_shuffled = train_df.sample(frac=1)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
4467,6350,hostages,,'Well guess what young girls. You aren't damse...,0
1603,2314,collapse,New York City,Interview on The Collapse of Materialism Best ...,1
1926,2769,curfew,,She just said does he have a curfew 'nope'??,0
4756,6766,lightning,,Don't blink ?? won't see the Lightning take th...,1
7291,10432,whirlwind,"brooklyn, NYC",#picthis http://t.co/br7gmMh5Ek ÛÓ And IÛªm ...,0


In [9]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
train_df.shape,test_df.shape

((7613, 5), (3263, 4))

In [11]:
import random
random_index = random.randint(1,len(train_df)-5)

for row in train_df_shuffled[['text','target']][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f'Target: {target}', 'REAL DISASTER!!' if target > 0 else 'Not a DISASTER')

Target: 1 REAL DISASTER!!
Target: 0 Not a DISASTER
Target: 1 REAL DISASTER!!
Target: 0 Not a DISASTER
Target: 0 Not a DISASTER


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_data, val_data, train_labels, val_labels = train_test_split(train_df_shuffled.text.to_numpy(),
                                                                train_df_shuffled.target.to_numpy(),
                                                                test_size = 0.1)


In [14]:
train_labels[1]

1

In [15]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=None,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=None,
    # pad_to_max_tokens=True
)

In [16]:
round(sum([len(i.split()) for i in train_data])/len(train_data))

15

In [17]:
max_vocab_length = 10000
max_length = 15

text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode='int',
    output_sequence_length=max_length
    )

In [18]:
text_vectorizer.adapt(train_data)

In [19]:
embeding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length
)

In [20]:
embeding(text_vectorizer([train_data[0]]))

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.02269403,  0.01318047, -0.01738417, ..., -0.04162874,
          0.02060318, -0.00843298],
        [-0.0213043 , -0.0216939 ,  0.01565554, ..., -0.02613225,
         -0.02975353,  0.00141418],
        [-0.0149999 ,  0.01216196,  0.02103059, ..., -0.04175618,
         -0.01991791,  0.01093575],
        ...,
        [-0.04401923,  0.01391837,  0.04421905, ...,  0.00443709,
         -0.02114259,  0.04818742],
        [ 0.02183056, -0.01801525, -0.03028114, ..., -0.0009114 ,
          0.00988491,  0.03107685],
        [ 0.02183056, -0.01801525, -0.03028114, ..., -0.0009114 ,
          0.00988491,  0.03107685]]], dtype=float32)>

## Model ZERO

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('clf',MultinomialNB())

    ]

)

model_0.fit(train_data, train_labels)

In [22]:
baseline_score = model_0.score(val_data,val_labels)
baseline_score

0.8254593175853019

In [23]:
baseline_preds = model_0.predict(val_data)
baseline_preds[:10]

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0])

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_recall_fscore_support

In [25]:
confusion_matrix(val_labels,baseline_preds), f1_score(val_labels,baseline_preds)

(array([[400,  34],
        [ 99, 229]]),
 0.7749576988155668)

In [26]:
def calculate_results(y_true, y_pred):

  model_accuracy = accuracy_score(y_true,y_pred)

  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true,y_pred,average='weighted')

  model_results = {
      "accuracy" : model_accuracy,
      "precision" : model_precision,
      "recall" : model_recall,
      "f1_score" : model_f1
  }

  return model_results

In [27]:
calculate_results(val_labels,baseline_preds)

{'accuracy': 0.8254593175853019,
 'precision': 0.8313553146431383,
 'recall': 0.8254593175853019,
 'f1_score': 0.8219409840161348}

## Model 1

In [28]:
# Create callback
from helper_functions import create_tensorboard_callback

SAVE_DIR = 'model_logs'



In [29]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)

x = text_vectorizer(inputs)
x = embeding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model_1 = tf.keras.Model(inputs,outputs, name='model_1_dense')

In [30]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [31]:
model_1.compile(loss='binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(),
                metrics=['accuracy'])


In [32]:
model_1.fit(
    x= train_data,
    y= train_labels,
    epochs=5,
    validation_data = (val_data,val_labels)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c93115701f0>

In [33]:
pred_1 = model_1.predict(val_data)
pred_1.shape



(762, 1)

In [34]:
calculate_results(val_labels,tf.round(pred_1))

{'accuracy': 0.8202099737532809,
 'precision': 0.8197423772488158,
 'recall': 0.8202099737532809,
 'f1_score': 0.8194542446320726}

## Recurrent Neural Networks (RNN's)

### Model 2: LSTM

In [35]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)

x = text_vectorizer(inputs)

x = embeding(x)
print(x.shape)
x = layers.LSTM(16)(x)
print(x.shape)
# x = layers.GlobalAveragePooling1D()(x)
# x = layers.LSTM(64)(x)
print(x.shape)
# x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
print(outputs.shape)
model_2 = tf.keras.Model(inputs, outputs)

(None, 15, 128)
(None, 16)
(None, 16)
(None, 1)


In [36]:
model_2.compile(loss = 'binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

In [37]:
model_2.fit(x=train_data,
            y=train_labels,
            epochs=5,
            validation_data=(val_data,val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c930ec4a350>

### Model 3: GRU

In [38]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)

x = text_vectorizer(inputs)

x = embeding(x)
print(x.shape)
x = layers.GRU(2,return_sequences=True)(x)
print(x.shape)
x = layers.LSTM(2)(x)
print(x.shape)
outputs = layers.Dense(1,activation='sigmoid')(x)


model_3 = tf.keras.Model(inputs,outputs)

(None, 15, 128)
(None, 15, 2)
(None, 2)


In [39]:
model_3.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [40]:
model_3.fit(
    x = train_data,
    y = train_labels,
    epochs = 5,
    validation_data = (val_data,val_labels)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c930c951390>

### Model 4: Bidirectional

In [41]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)

x = text_vectorizer(inputs)

x = embeding(x)
print(x.shape)
x = layers.Bidirectional(layers.GRU(2,return_sequences=True))(x)
print(x.shape)
x = layers.Bidirectional(layers.LSTM(2,return_sequences=True))(x)
print(x.shape)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation='sigmoid')(x)


model_4 = tf.keras.Model(inputs,outputs)

(None, 15, 128)
(None, 15, 4)
(None, 15, 4)


In [42]:
model_4.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [43]:
model_4.fit(
    x = train_data,
    y = train_labels,
    epochs = 5,
    validation_data = (val_data,val_labels)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c930e8ec970>

### Model 5: 1D CNN

In [44]:
from tensorflow.keras import layers

inputs = layers.Input((1,), dtype='string')

x = text_vectorizer(inputs)

x = embeding(x)

x = layers.Bidirectional(layers.GRU(64,return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64,return_sequences=True))(x)
# x = layers.Bidirectional(layers.Dense(96,activation='relu'))(x)
# x = layers.Dense(96,activation='relu')
# x = layers.Dense(96,activation='relu')
# x = layers.Dense(96,activation='relu')
# x = layers.Dense(96,activation='relu')
# x = layers.Dense(96,activation='relu')



x = layers.Conv1D(96,48,padding='same', activation='relu')(x)
x = layers.Conv1D(48,24,padding='same', activation='relu')(x)
x = layers.Conv1D(24,15,padding='same', activation='relu')(x)

x = layers.GlobalMaxPooling1D()(x)

x = layers.Dense(96,activation='relu')(x)
x = layers.Dense(48,activation='relu')(x)
x = layers.Dense(24,activation='relu')(x)
x = layers.Dense(15,activation='relu')(x)
# print(x.shape)

outputs = layers.Dense(1,activation='sigmoid')(x)


model_5 = tf.keras.Model(inputs,outputs)


In [45]:
model_5.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional_2 (Bidirecti  (None, 15, 128)           74496     
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 15, 128)           98816     
 onal)                                                           
                                                           

In [46]:
model_5.compile(loss='binary_crossentropy',
                optimizer= tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [47]:
model_5.fit(
    x=train_data,
    y=train_labels,
    epochs=5,
    validation_data=(val_data,val_labels)
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c930d819a80>

## Model 6: Tensorflow Hub

In [48]:
import tensorflow_hub as hub


embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [49]:
sentence_encoder_layer = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False)

In [50]:
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    tf.keras.layers.Dense(128),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(32),
    tf.keras.layers.Dense(24),
    tf.keras.layers.Dense(16),
    tf.keras.layers.Dense(1,activation='sigmoid')
    ])

In [51]:
model_6.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [52]:
model_6.fit(
    x=train_data,
    y=train_labels,
    epochs=5,
    validation_data=(val_data,val_labels)
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c92fa377d00>

In [53]:
calculate_results(val_labels,tf.squeeze(tf.round(model_6.predict(val_data))))



{'accuracy': 0.8188976377952756,
 'precision': 0.81856239039989,
 'recall': 0.8188976377952756,
 'f1_score': 0.8178953399968445}

In [54]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## Model 7: With only 10% of The Data

In [55]:
train_df_10_pct = train_df_shuffled[['text','target']].sample(frac=0.1,random_state=42)

In [56]:
train_df_10_pct.shape, train_df_shuffled.shape

((761, 2), (7613, 5))

In [57]:
train_data_10_pct = train_df_10_pct['text'].to_list()
train_labels_10_pct = train_df_10_pct['target'].to_list()

In [58]:
model_7 = tf.keras.models.clone_model(model_6)

model_7.compile(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = 'adam',
    metrics = ['accuracy']
)

model_7.fit(
    x=train_data_10_pct,
    y=train_labels_10_pct,
    epochs=5,
    validation_data=(val_data,val_labels)
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c92fc475de0>

In [59]:
model_6.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense_9 (Dense)             (None, 128)               65664     
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dense_11 (Dense)            (None, 32)                2080      
                                                                 
 dense_12 (Dense)            (None, 24)                792       
                                                                 
 dense_13 (Dense)            (None, 16)                400       
                                                                 
 dense_14 (Dense)            (None, 1)                 1

In [84]:
df = pd.DataFrame(model_6.predict(val_data),val_labels,columns=['preds']).reset_index()
df.columns = ['target','preds_prob']
df['preds'] = df['preds_prob'].apply(np.round).astype(int)
df.head()



Unnamed: 0,target,preds_prob,preds
0,0,0.521801,1
1,0,0.170438,0
2,0,0.728305,1
3,0,0.099395,0
4,0,0.814704,1


In [96]:
df_wrong_preds = df[df['target'] != df['preds']].sort_values('target') #.sort_values('preds_prob',ascending=False)

In [97]:
df_wrong_preds.head()

Unnamed: 0,target,preds_prob,preds
0,0,0.521801,1
460,0,0.903399,1
439,0,0.560879,1
411,0,0.603477,1
392,0,0.575245,1
