### Practice code

In [1]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [2]:
# Reading the data from dataset
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Shuffle training dataframe
train_df_shuffle = train_df.sample(frac=1, random_state=42)
train_df_shuffle.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [6]:
# test dataframe 
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# How many total samples
len(train_df), len(test_df)

(7613, 3263)

In [11]:
# visualize random training examples
import random
random_index = random.randint(0, len(train_df)-5)   # Create random indexes not higher then the total number of samples
for row in train_df_shuffle[["text", "target"]][random_index:random_index+5].itertuples():
    _, text, target = row
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 1 (real disaster)
Text:
#News FedEx no longer to transport bioterror germs in wake of anthrax lab mishaps (say what?): åÊFedEx no... http://t.co/K0Y7xFxmXA #TCOT

---

Target: 1 (real disaster)
Text:
@peterjukes But there are good grounds to believe that 'political military catastrophe' was a crime planned and committed by individuals.

---

Target: 0 (not real disaster)
Text:
Kijima_Matako: Breaking news! Unconfirmed! I just heard a loud bang nearby. in what appears to be a blast of wind from my neighbour's ass.

---

Target: 1 (real disaster)
Text:
Firefighting consumes Forest Service budget sparks political clash: Forest Service report cites increasing cost ofÛ_ http://t.co/lSWsitnkuk

---

Target: 1 (real disaster)
Text:
8th person dies in NY Legionnaires' disease outbreak http://t.co/fJdM8QHYAI #SEBEE

---



In [13]:
# Splitting the data into training and validation sets
from sklearn.model_selection import train_test_split

train_sentences ,val_sentences, train_labels, val_labels = train_test_split(train_df_shuffle["text"].to_numpy(),
                                                                            train_df_shuffle["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)


In [14]:
# Check the length 
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [15]:
# Check the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

## Convert text into numbers

### Text vectorization

In [16]:
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [17]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Use the default TextVectorization variables
text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=None)

In [18]:
# Find the average number of tokens(words) in the training tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [20]:
# Setup text vectorization variables
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [21]:
# Fit the text vector to the training text
text_vectorizer.adapt(train_sentences)

In [22]:
sample_sentence = "Light no fire!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[540,  40,  42,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [23]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original text: \n {random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text: 
 @welshninja87 click on the tag there's lots of them. RT them to hijack the hashtag      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[6843, 2159,   11,    2, 2270,  264, 1504,    6,   93,   96,   93,
           5,  623,    2, 3786]], dtype=int64)>

In [24]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottoom_5_words = words_in_vocab[-5:]
print(f"Number of words vocab: {len(words_in_vocab)}")
print(f"5 Most common words: {top_5_words}")
print(f"5 least common words: {bottoom_5_words}")

Number of words vocab: 10000
5 Most common words: ['', '[UNK]', 'the', 'a', 'in']
5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### Creating an embedding using embedding layer

In [25]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             input_length=max_length)
embedding

<keras.layers.core.embedding.Embedding at 0x1dbe024a7a0>

In [27]:
# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
print(f"Original Text:\n {random_sentence}\
      \n\nEmbedded Version:")

# Embed the random sentence (turn it into dense vectors of fixed size)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original Text:
 'Kessler Syndrome' is the name for the catastrophic exponential proliferation of Space debris and destruction of satellites. #GravityMovie      

Embedded Version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.01014906, -0.0214717 ,  0.00748205, ...,  0.02187506,
          0.0318298 ,  0.04430157],
        [-0.00226279,  0.04939541, -0.04617533, ..., -0.01779212,
         -0.01637324, -0.04355131],
        [ 0.0073084 ,  0.04017427,  0.04599061, ...,  0.00712402,
          0.04439776,  0.00459231],
        ...,
        [-0.04588966,  0.046596  ,  0.03436495, ..., -0.01757075,
         -0.02012315,  0.01480701],
        [-0.00471244,  0.03206224,  0.03278699, ..., -0.00394832,
          0.01192967, -0.02308027],
        [ 0.02449772,  0.00784804,  0.00581126, ...,  0.00566329,
         -0.03834822,  0.00410309]]], dtype=float32)>

In [28]:
# Check out a single token's embedding
sample_embed[0][0], sample_embed[0][0].shape, random_sentence[0]

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.01014906, -0.0214717 ,  0.00748205,  0.03739164,  0.04649284,
         0.04080017,  0.04157147, -0.04003718,  0.03285554,  0.0022526 ,
         0.01330577, -0.03245846,  0.00392222, -0.01445568,  0.03363129,
         0.00464828,  0.00873305,  0.02622036,  0.01873243,  0.03005281,
         0.00482553,  0.02874872, -0.0005447 ,  0.03162852, -0.00409057,
         0.04740312,  0.02221587, -0.00962995, -0.03535304,  0.03430052,
        -0.02371203,  0.00508406, -0.03001424,  0.02149788,  0.03214446,
         0.01928563, -0.03394683,  0.03087327, -0.00542686,  0.03008243,
        -0.03070699, -0.01425355, -0.01199425, -0.04103585, -0.01712825,
        -0.02945799, -0.03551795,  0.04806659, -0.00796696, -0.03389177,
        -0.03801572,  0.03076713, -0.03903864, -0.04384003, -0.00544085,
         0.0102886 ,  0.01676612,  0.04317284,  0.04311074,  0.02332561,
        -0.01046283,  0.00065017, -0.03640562,  0.03035723, -0.04755399,
  

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sentences, train_labels)



In [31]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model acheives an accuracy of: {baseline_score*100:.2f}%")

Our baseline model acheives an accuracy of: 79.27%


In [35]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

In [36]:
train_labels

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [37]:
# Creating an evaluation function for our model experiments

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """
    Calculates model accuracy, precision, recall and f1 score of a binary classification model.
    """
    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100

    # Calculate model precision, recall, f1 score using "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {"accuracy": model_accuracy,
                     "precision": model_precision,
                     "recall": model_recall,
                     "f1": model_f1}
    
    return model_results

In [38]:
baseline_results = calculate_results(y_true=val_labels,
                                   y_pred=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [40]:
# Model 1 with Feed forward Neural Network

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [41]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d_1   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [42]:
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [44]:
mode_1_history = model_1.fit(train_sentences,
                             train_labels,
                             epochs=5,
                             validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
model_1.evaluate(val_sentences, val_labels)



[0.48370620608329773, 0.787401556968689]

In [46]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape



(762, 1)

In [47]:
model_1_pred_probs[:10]

array([[0.32507536],
       [0.76466364],
       [0.9974267 ],
       [0.08926271],
       [0.11345116],
       [0.93579555],
       [0.9094333 ],
       [0.99345666],
       [0.9555588 ],
       [0.2167963 ]], dtype=float32)

In [48]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [49]:
model_1_results = calculate_results(y_true=val_labels,
                                    y_pred=model_1_preds)
model_1_results

{'accuracy': 78.74015748031496,
 'precision': 0.7942180127180873,
 'recall': 0.7874015748031497,
 'f1': 0.7838012115396069}

In [50]:
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [51]:
# Model 2: LSTM

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.LSTM(96)(x)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 15, 128)           131584    
                                                                 
 lstm_1 (LSTM)               (None, 96)                86400     
                                                                 
 dense_1 (Dense)             (None, 64)                6208      
                                                      

In [53]:
model_2.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [54]:
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[9.9660996e-03],
       [5.9340572e-01],
       [9.9963677e-01],
       [4.9345307e-02],
       [4.4096154e-04],
       [9.9304938e-01],
       [8.2460850e-01],
       [9.9984121e-01],
       [9.9962115e-01],
       [4.2051205e-01]], dtype=float32)

In [56]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)

model_2_results

{'accuracy': 77.69028871391076,
 'precision': 0.7795556446582096,
 'recall': 0.7769028871391076,
 'f1': 0.7744873664507351}