In [4]:
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"


--2021-08-11 15:22:41--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 142.251.2.128, 74.125.137.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2021-08-11 15:22:41 (121 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [5]:

!unzip "/content/nlp_getting_started.zip" -d "/content/dataset"

Archive:  /content/nlp_getting_started.zip
  inflating: /content/dataset/sample_submission.csv  
  inflating: /content/dataset/test.csv  
  inflating: /content/dataset/train.csv  


In [6]:
import pandas as pd 
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


In [7]:
train_df = pd.read_csv("/content/dataset/train.csv")

In [13]:
#print first 10 tweets
for i, row in enumerate(train_df[:10][["text","target"]].itertuples()): 
  print("Disaster:" if row[2]==1 else "Not a Disaster:")
  print(row[1]) 
  print("")

Disaster:
Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all

Disaster:
Forest fire near La Ronge Sask. Canada

Disaster:
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected

Disaster:
13,000 people receive #wildfires evacuation orders in California 

Disaster:
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 

Disaster:
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires

Disaster:
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas

Disaster:
I'm on top of the hill and I can see a fire in the woods...

Disaster:
There's an emergency evacuation happening now in the building across the street

Disaster:
I'm afraid that the tornado is coming to our area...



In [44]:
#Shuffle our DF
train_df_shuffled = train_df.sample(frac=1, random_state=42)

In [45]:
#Split our data 90-10
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                    train_df_shuffled["target"].to_numpy(),
                                                    test_size = 0.1,
                                                    random_state=42)

In [46]:
len(X_train) , len(X_test)

(6851, 762)

In [47]:
#get the first 10 sentences and labels
X_train[:10], y_train[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object), array([0, 

In [48]:
## convert text to numbers using Tokenization (vectorization of text)
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens=None, #no limit
                                    standardize="lower_and_strip_punctuation", #remove punctuation and make letters lowercase
                                    split="whitespace", #whitespace delimiter
                                    ngrams = None, #dont group anything, every token alone
                                    output_mode ="int",
                                    output_sequence_length=None,#length of each sentence == length of largest sentence
                                    pad_to_max_tokens=True)

In [49]:
sum([len(i.split()) for i in X_train]) / len(X_train) #avg length of tweet

14.901036345059115

In [50]:
#hyperparameters
max_vocab_length = 10000 #number of words in the vocabulary 
max_length = 15 #tweet average length

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length= max_length)
#vectorize the text
text_vectorizer.adapt(X_train)

In [51]:
sample_sentence = " is the greatest of all time gdsga fdaoj fkdo"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   9,    2, 1669,    6,   44,   92,    1,    1,    1,    0,    0,
           0,    0,    0,    0]])>

In [52]:
#select a random sentence and vectorize it
import random
text_vectorizer([random.choice(X_train)])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1, 3417,    6,    3,  849,  738,   77,  355,    7,  926,  355,
          51, 6246,  854,  132]])>

In [53]:
#get top 5 words in the vocab and bottom 5 words
text_vectorizer.get_vocabulary()[:5] , text_vectorizer.get_vocabulary()[-5:]

(['', '[UNK]', 'the', 'a', 'in'],
 ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1'])

In [54]:
#Using Embedding instead of vectorization
from tensorflow.keras import layers
embedding = layers.Embedding(input_dim= max_vocab_length,
                             output_dim=128, #USE THE NUMERS THAT ARE DIVISIBLE BY 8 FOR MORE GPU PERFORAMNCE
                             input_length=max_length)

In [62]:
#trying a random sentence
sample_sentence = " is the greatest of all time gdsga fdaoj fkdo"
embedding(text_vectorizer([sample_sentence]))

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.00776813,  0.03387128,  0.04187045, ..., -0.04238109,
          0.00833943, -0.00994902],
        [-0.03547393, -0.02146465, -0.03459584, ..., -0.02077488,
         -0.01670638,  0.03268284],
        [-0.03837866, -0.04802859, -0.00513575, ...,  0.03004959,
         -0.03456316, -0.01085935],
        ...,
        [-0.00390359,  0.01558534, -0.04648587, ..., -0.00117791,
         -0.03440706,  0.03916972],
        [-0.00390359,  0.01558534, -0.04648587, ..., -0.00117791,
         -0.03440706,  0.03916972],
        [-0.00390359,  0.01558534, -0.04648587, ..., -0.00117791,
         -0.03440706,  0.03916972]]], dtype=float32)>

In [66]:
##using a ML learning algorithm: Naive Bayes 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

naive_bayes = Pipeline([
                        ("tfidf", TfidfVectorizer()),
                        ("clf", MultinomialNB())
])
naive_bayes.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [67]:
#WE ACHIEVED 79.2%
naive_bayes.score(X_test, y_test)

0.7926509186351706

In [86]:
# simple dense model using functional api
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x) #very important, to change shape from 15,1 to 1 only
outputs = layers.Dense(1,activation ="sigmoid")(x)
simple_dense = tf.keras.Model(inputs,outputs, name="simpleNN")

In [87]:
simple_dense.compile(loss="binary_crossentropy",
                     optimizer="adam",
                     metrics=["accuracy"])

simple_dense.fit(x=X_train,
                 y=y_train,
                 epochs=5,
                 validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f55c4ea7450>

In [90]:
tf.squeeze(tf.round(simple_dense.predict(X_test)))

<tf.Tensor: shape=(762,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0

In [102]:
simple_dense.get_layer("embedding").get_weights()
## 10,000 X 128
## what this means is that, we have 10k words, each of them is inside a 128 dimensional space, so each of them has 128 values. if a word has similar 128 values to another word, then these words are similar to each other


[array([[-0.05099023, -0.09464528,  0.00926686, ..., -0.03141614,
         -0.05031032,  0.06775701],
        [-0.00635716, -0.07694145, -0.01740959, ..., -0.0357018 ,
          0.00128732,  0.00787335],
        [-0.08969884, -0.08862485,  0.01367988, ..., -0.05321268,
         -0.01921013,  0.06614075],
        ...,
        [-0.03714328, -0.03884866, -0.04933063, ...,  0.04352114,
         -0.02638441, -0.01665139],
        [ 0.06781704, -0.18737489, -0.1681846 , ...,  0.0713449 ,
         -0.15821569, -0.06430142],
        [ 0.12874596, -0.20377828, -0.1739399 , ...,  0.16613746,
         -0.20951635, -0.07969934]], dtype=float32)]

In [118]:
## Recurrent Neural Networks GRU 
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x= layers.GRU(64, return_sequences=True)(x) #if you want to stack layers of RNN, u must use return sequences
x = layers.GRU(64, return_sequences=True)(x)
x = layers.GlobalAveragePooling1D()(x) # gets rid of a dimension, so that the output isnt 15x1
x = layers.Dense(64, activation="relu")(x)

outputs = layers.Dense(1, activation ="sigmoid")(x)
GRU_model = tf.keras.Model(inputs,outputs, name="model_LSTM")



In [119]:
GRU_model.compile(loss="binary_crossentropy",
                     optimizer="adam",
                     metrics=["accuracy"])

GRU_model.fit(x=X_train,
              y=y_train,
              epochs=5,
              validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f55bbdb8710>

In [120]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
def calculate_results(y_true, y_pred):
	""""
	EVALUATE ACCURACY, PRECISION, RECALL, F1 SCORE
	"""
	model_accuracy = accuracy_score(y_true, y_pred) * 100
	model_precision, model_recall, model_f1,_ = precision_recall_fscore_support(y_true, y_pred,average="weighted")
	model_results = {"accuracy":model_accuracy,
					 "precision":model_precision,
					 "recall" :model_recall,
					 "f1":model_f1}
	return model_results
calculate_results(y_true=y_test,
                  y_pred=tf.squeeze(tf.round(LSTM_model.predict(X_test))))

{'accuracy': 78.08398950131233,
 'f1': 0.777681907957685,
 'precision': 0.7856875884556138,
 'recall': 0.7808398950131233}

In [None]:
## Slightly better performance than LSTM