# Kaggle contest training

The URL of the contest: https://www.kaggle.com/c/nlp-getting-started/overview

In [None]:
!nvidia-smi -L

GPU 0: Tesla K80 (UUID: GPU-d9648ade-c31e-ab81-be67-d6db48bddb26)


In [None]:
## Get helper functions
!wget https://raw.githubusercontent.com/BaoLocPham/Tensorflow_Deep_Learning/main/Utils/helper_functions.py

--2021-08-17 11:56:26--  https://raw.githubusercontent.com/BaoLocPham/Tensorflow_Deep_Learning/main/Utils/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11240 (11K) [text/plain]
Saving to: ‘helper_functions.py’


2021-08-17 11:56:26 (31.2 MB/s) - ‘helper_functions.py’ saved [11240/11240]



In [None]:
# import series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

## Get text dataset

In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

# unzip data
unzip_data("nlp_getting_started.zip")

--2021-08-17 11:56:29--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.188.128, 64.233.189.128, 108.177.97.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.188.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2021-08-17 11:56:29 (108 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
import pandas as pd

test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")
sample_submit_df = pd.read_csv("sample_submission.csv")

In [None]:
## shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


### Split data into training and validation set

In [None]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [None]:
len(train_sentences), len(val_sentences)

(6851, 762)

### Preprocess data

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
VOCAB_SIZE = 10000
MAX_LENGTH = 255

text_vectorizer = TextVectorization(max_tokens=VOCAB_SIZE,
                                       output_mode="int",
                                        output_sequence_length=MAX_LENGTH)
text_vectorizer.adapt(train_sentences)

In [None]:
import random 

# Choose a random sentence from the training data and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Origin text:\n {random_sentence}\
          \n\nVectorize version:")
text_vectorizer([random_sentence])

Origin text:
 Now that's what you call a batting collapse #theashes          

Vectorize version:


<tf.Tensor: shape=(1, 255), dtype=int64, numpy=
array([[  48,  215,   55,   12,  425,    3, 6124,  155, 7673,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [None]:
# get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common words
bottom_5_words = words_in_vocab[-5:] # least common words
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"5 most common words: {top_5_words}")
print(f"5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
5 most common words: ['', '[UNK]', 'the', 'a', 'in']
5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [None]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=VOCAB_SIZE,
                             input_length=MAX_LENGTH,
                             output_dim=128,
                             embeddings_initializer="uniform")

In [None]:
# Get a random sentence in the training set 
random_sentence = random.choice(train_sentences)
print(f"Original text: \n{random_sentence}\
\n\nEmbedded version:")
# Embed the ramdon sentence (turn it into dense vector of fixed size)
sample_vectorized = text_vectorizer([random_sentence])
print(f"Sample vectorized shape: {sample_vectorized.shape}")
sample_embedded = embedding(sample_vectorized)
sample_embedded

Original text: 
A poignant reminder that in war there are many casualties. http://t.co/Mwmt3BdR5L

Embedded version:
Sample vectorized shape: (1, 255)


<tf.Tensor: shape=(1, 255, 128), dtype=float32, numpy=
array([[[ 0.01427845,  0.00697158,  0.02343445, ..., -0.00322936,
          0.01475923, -0.04973565],
        [-0.0059801 , -0.02082992,  0.02860582, ...,  0.04510782,
          0.04722795, -0.03120303],
        [ 0.00378575, -0.04992047,  0.03134001, ...,  0.01537198,
         -0.01993409, -0.01678674],
        ...,
        [ 0.01989795,  0.02853802,  0.01318219, ...,  0.02367428,
         -0.02803319, -0.02223553],
        [ 0.01989795,  0.02853802,  0.01318219, ...,  0.02367428,
         -0.02803319, -0.02223553],
        [ 0.01989795,  0.02853802,  0.01318219, ...,  0.02367428,
         -0.02803319, -0.02223553]]], dtype=float32)>

## Optimized loading data

## Training model

Approachs:
* model 1: Feed forward model (Dense model) -> Tokenization + Embedding
* model 2: Conv1D -> Tokenization + Embedding



### Model 1 Feed forward model (Dense model) -> Tokenization + Embedding

In [None]:
# num_class = len(train_sentences.class_names)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
# Create the model
inputs = layers.Input(shape=(1,),dtype=tf.string, name="input_layer")

x = text_vectorizer(inputs)
x = embedding(x)

x = layers.Flatten(name="flatten_layer")(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

model_1 = tf.keras.Model(inputs, outputs, name="model_1")

In [None]:
model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [None]:
model_1.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 255)               0         
_________________________________________________________________
embedding (Embedding)        (None, 255, 128)          1280000   
_________________________________________________________________
flatten_layer (Flatten)      (None, 32640)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 32641     
Total params: 1,312,641
Trainable params: 1,312,641
Non-trainable params: 0
_________________________________________________________________


In [None]:
model_1_history = model_1.fit(train_sentences,
                              train_labels,
                              validation_data=(val_sentences, val_labels),
                              epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Model 2: Conv1D -> Tokenization + Embedding

In [None]:
# Create the model
inputs = layers.Input(shape=(1,), dtype=tf.string, name="input_layer")

x = text_vectorizer(inputs)
x = embedding(x)

x = layers.Conv1D(filters=64,
                  kernel_size=5,
                  padding="valid",
                  activation="relu")(x)

x = layers.GlobalAveragePooling1D(name="global_average_pool_1D")(x)
x = layers.Dense(64, activation="relu")(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

model_2 = tf.keras.Model(inputs, outputs, name="model_2")

In [None]:
# Compile the model
model_2.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [None]:
model_2.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 255)               0         
_________________________________________________________________
embedding (Embedding)        (None, 255, 128)          1280000   
_________________________________________________________________
conv1d (Conv1D)              (None, 251, 64)           41024     
_________________________________________________________________
global_average_pool_1D (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65  

In [None]:
# fit the model
model_2_history = model_2.fit(train_sentences, 
                              train_labels,
                              validation_data=(val_sentences, val_labels),
                              epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Model 3: Transfer learning: Feature extraction

Using Universal Sentence Embedding: https://tfhub.dev/google/universal-sentence-encoder/4

In [None]:
import tensorflow_hub as hub

# Create a Keras layer using USE pretrained layer from tensorflow hub
feature_extractor_layer = hub.KerasLayer(handle="https://tfhub.dev/google/universal-sentence-encoder/4",
                                 input_shape=[],
                                dtype=tf.string,
                                trainable=False,
                                name="USE_layer")

In [None]:
# Create a model

# inputs = layers.Input(shape=(1,), dtype=tf.string, name="input_layer")

# # stack the embedding layer
# x = embedding_layer(inputs)
# x = layers.Dense(64, activation="relu")(x)

# outputs = layers.Dense(1, activation="sigmoid")(x)

# model_3 = tf.keras.Model(inputs, outputs, name="model_3_feature_extract")

In [None]:
# Create a model
model_3 = tf.keras.Sequential([
  feature_extractor_layer,
  layers.Dense(64, activation="relu"),
  layers.Dense(1, activation="sigmoid")
])

# Compile the model
model_3.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

In [None]:
model_3.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
USE_layer (KerasLayer)       (None, 512)               256797824 
_________________________________________________________________
dense_5 (Dense)              (None, 64)                32832     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [None]:
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              validation_data=(val_sentences, val_labels),
                              epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_4 = tf.keras.Sequential([
  feature_extractor_layer,
  layers.Dense(256, activation="relu"),
  layers.Dense(64, activation="relu"),
  layers.Dense(1, activation="sigmoid")
], name="model_4_feature_extraction")

# Compile the model
model_4.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

In [None]:
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              validation_data=(val_sentences, val_labels),
                              epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Model 5: Transfer learning: Fine tuning

In [None]:
import tensorflow_hub as hub

# Create a Keras layer using USE pretrained layer from tensorflow hub
feature_extractor_fine_tune_layer = hub.KerasLayer(handle="https://tfhub.dev/google/universal-sentence-encoder/4",
                                 input_shape=[],
                                dtype=tf.string,
                                trainable=True,
                                name="USE_fine_tune_layer")

In [None]:
model_5 = tf.keras.Sequential([
  feature_extractor_layer,
  layers.Dense(256, activation="relu"),
  layers.Dense(64, activation="relu"),
  layers.Dense(1, activation="sigmoid")
], name="model_5_fine_tuning")

# Compile the model
model_5.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=["accuracy"]
)

In [None]:
checkpoint_path = "model_5_fine_tune_new/checkpoint.ckpt"

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         monitor="val_accuracy",
                                                         save_best_only=True,
                                                         save_weights_only=True,
                                                         save_freq='epoch')

earlystopping_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                          patience=10,
                                                          restore_best_weights=True)

In [None]:
model_5_history_new = model_5.fit(train_sentences,
                              train_labels,
                              validation_data=(val_sentences, val_labels),
                              epochs=50,
                              callbacks=[checkpoint_callback, earlystopping_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


In [None]:
model_5.evaluate(val_sentences, val_labels)



[0.4189437925815582, 0.8280839920043945]

In [None]:
model_5.load_weights("/content/model_5_fine_tune/checkpoint.ckpt")

model_5.evaluate(val_sentences, val_labels)



[0.42635491490364075, 0.8333333134651184]

In [None]:
model_5.save("/content/drive/MyDrive/Tensorflow Certificate Exam/Extra/NPL_disaster_tweets_kaggle_fine_tune")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Tensorflow Certificate Exam/Extra/NPL_disaster_tweets_kaggle_fine_tune/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Tensorflow Certificate Exam/Extra/NPL_disaster_tweets_kaggle_fine_tune/assets


### Model 6: LSTM model

In [None]:
# Create models
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string, name="input_layer")

x = text_vectorizer(inputs)
x = embedding(x)

# x = layers.LSTM(units=64, return_sequences=True)(x)
# when you stack RNN cells together, you need to return sequence = True
x = layers.LSTM(64)(x)

# x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid", name="output_layer")(x)

model_6 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

In [None]:
# compile the model
# Compile the model
model_6.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [None]:
checkpoint_path = "model_6_checkpoint/checkpoint.ckpt"

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         monitor="val_accuracy",
                                                         save_best_only=True,
                                                         save_weights_only=True,
                                                         save_freq='epoch')

earlystopping_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                          patience=5,
                                                          restore_best_weights=True)

In [None]:
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              validation_data=(val_sentences, val_labels),
                              epochs=20,
                              callbacks=[checkpoint_callback,
                                         earlystopping_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

KeyboardInterrupt: ignored

### Create predictions

In [None]:
import pandas as pd

test_df = pd.read_csv("test.csv")

test_questions =  test_df[["id", "text"]]

test_questions.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
text_question = test_df[["text"]].to_numpy()

In [None]:
text_question.shape

(3263, 1)

In [None]:
pred_prob = model_5.predict(text_question)

In [None]:
pred = tf.squeeze(tf.cast(tf.round(pred_prob), tf.int16))
# pred[:10]
pred.shape

TensorShape([3263])

In [None]:
id_questions = test_df[["id"]].to_numpy()
id_questions = tf.squeeze(id_questions)
id_questions.shape

TensorShape([3263])

In [None]:
id_questions[:10]

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([ 0,  2,  3,  9, 11, 12, 21, 22, 27, 29])>

In [None]:
demo_submission = pd.DataFrame({"id":id_questions, "target":pred})
demo_submission.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


In [None]:
demo_submission.to_csv("demo_submission.csv",index=False)

In [None]:
submission_1 = pd.DataFrame({"id":id_questions, "target":pred})

In [None]:
submission_1.head(20)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [None]:
submission_1.to_csv("submission_1.csv", index=False)

KeyboardInterrupt: ignored