In [1]:
# Pin Python's hash seed through the environment.
# NOTE(review): PYTHONHASHSEED is presumably only read when an interpreter
# starts, so this may affect child processes only -- confirm it is sufficient.
import os

os.environ.update({'PYTHONHASHSEED': '1'})

In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import datetime
import json
from contextlib import redirect_stdout
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
import time

from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def reset_random_seeds(seed=1):
    '''
    Seed every random number generator used by this notebook.

    Sets the PYTHONHASHSEED environment variable and seeds Python's
    built-in `random` module, NumPy and TensorFlow with the same value.

    Args:
        seed: Integer seed applied to all generators. Defaults to 1, the
            value that used to be hard-coded.
    '''
    # Local import: the notebook's import cell does not import `random`.
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)  # previously left unseeded
    np.random.seed(seed)
    tf.random.set_seed(seed)


reset_random_seeds()

In [None]:
# Load the IMDB reviews and make an 80/20 train/test split that is
# stratified on the sentiment label.

data = pd.read_csv('IMDB Dataset.csv')
reviews, sentiments = data['review'], data['sentiment']
x_train, x_test, y_train, y_test = train_test_split(
    reviews, sentiments,
    test_size=0.2, stratify=sentiments, random_state=0,
)

In [None]:
# Base hyper-parameters shared by every model in this notebook.

hyper_params = dict(
    VOCAB_SIZE=50000,
    BS=512,
    LR=0.01,
    OOV_TOK="<OOV>",
)

In [None]:
# Fit the tokenizer on the training reviews only, then apply it to both the
# train and test reviews.
# NOTE(review): hyper_params['VOCAB_SIZE'] is not passed to the Tokenizer, so
# no vocabulary cap is requested here -- confirm that is intended.

tokenizer = Tokenizer(oov_token=hyper_params['OOV_TOK'])
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

train_sequences, test_sequences = [
    tokenizer.texts_to_sequences(split) for split in (x_train, x_test)
]

In [None]:
# Sequence length and padding/truncation behaviour used when padding below.

hyper_params.update(
    MAX_LENGTH=520,
    PADDING_TYPE="post",
    TRUNC_TYPE="post",
)

In [None]:
# Pad/truncate the train and test sequences with the same settings taken
# from hyper_params.

pad_options = {
    'maxlen': hyper_params['MAX_LENGTH'],
    'padding': hyper_params['PADDING_TYPE'],
    'truncating': hyper_params['TRUNC_TYPE'],
}
training_padded = pad_sequences(train_sequences, **pad_options)
testing_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Convert the categorical labels into float32 numpy arrays:
# 'positive' -> 1, anything else -> 0.

train_labels = np.asarray(
    [1 if sentiment == 'positive' else 0 for sentiment in y_train]
).astype('float32')
test_labels = np.asarray(
    [1 if sentiment == 'positive' else 0 for sentiment in y_test]
).astype('float32')

In [None]:
# Custom callback that records the time taken per epoch.

class timelogCallback(keras.callbacks.Callback):
    '''
    Keras callback that records the duration of every training epoch.

    Attributes:
        timelog: List of per-epoch durations in seconds for the most recent
            training run. A new list is bound at the start of each run, so a
            list saved from an earlier run is not modified by later ones.
        start_time: Clock reading taken at the start of the current epoch.
    '''

    def __init__(self):
        super().__init__()
        # Defined up front so the attributes exist before training starts.
        self.timelog = []
        self.start_time = None

    def on_train_begin(self, logs=None):
        # Rebind (rather than clear) so earlier runs keep their own list.
        self.timelog = []

    def on_epoch_begin(self, batch, logs=None):
        # The first argument is named `batch` for backward compatibility;
        # this hook is an epoch-level hook.
        # perf_counter is monotonic, unlike the wall clock from time.time().
        self.start_time = time.perf_counter()

    def on_epoch_end(self, batch, logs=None):
        self.timelog.append(time.perf_counter() - self.start_time)

# <center>GRU-1 Grid Search Hyper-Parameters</center>

In [None]:
# Callbacks used by every run of the GRU-1 grid search. The patience values
# were decided based on the vanilla 1-layer GRU trained earlier.

timelog = timelogCallback()
earlystop = keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=10, verbose=1)
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=5, min_lr=5e-5, verbose=1)

In [None]:
# Builder for the 1-layer GRU model used in the grid search.

def create_GRU1(em_dims, units):
    '''
    Build and compile a 1-layer GRU binary classifier.

    Args:
        em_dims: Output dimension of the embedding layer.
        units: Number of units in the GRU layer.

    Returns:
        A compiled keras.Sequential model: Adam optimizer with learning
        rate hyper_params['LR'], binary cross-entropy loss, accuracy metric.
    '''
    GRU1_model = keras.Sequential([
        # The input length follows the padding length instead of repeating
        # the literal 520, so the two cannot drift apart.
        keras.layers.Embedding(len(word_index)+1, em_dims,
                               input_length=hyper_params['MAX_LENGTH']),
        keras.layers.GRU(units),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    GRU1_model.compile(optimizer=keras.optimizers.Adam(learning_rate=hyper_params['LR']),
                     loss='binary_crossentropy', metrics=['accuracy'])

    return GRU1_model


In [None]:
# Candidate embedding dimensions and GRU unit counts for the grid search.

em_dims, unit1 = [128, 256], [100, 200]

In [None]:
# Train one GRU-1 model per (embedding dimension, units) combination and keep
# its History object and per-epoch timings in a dictionary for later cells.

GRU1_history_dict = {}

for em_dim, u1 in [(dim, units) for dim in em_dims for units in unit1]:
    print(f"Training with embedding dimensions: {em_dim}, unit1: {u1}")
    model = create_GRU1(em_dim, u1)
    history = model.fit(
        training_padded, train_labels,
        validation_data=(testing_padded, test_labels),
        batch_size=hyper_params['BS'], epochs=250,
        callbacks=[reduce_lr, earlystop, timelog],
    )
    # Stored as a (History, list of epoch durations) tuple.
    run_name = f'GRU1_history_{em_dim}_{u1}'
    GRU1_history_dict[run_name] = (history, timelog.timelog)

In [None]:
# Save the history and the per-epoch timings of each GRU-1 run to CSV files.

for title, item in GRU1_history_dict.items():
    run_history, epoch_times = item

    # Convert the history.history dict and the timing list to DataFrames.
    hist_df = pd.DataFrame(run_history.history)
    time_df = pd.DataFrame(epoch_times)

    # Pass the path and let pandas open the file itself: a handle from a
    # plain open(..., 'w') is not opened with newline='', which pandas asks
    # for when given a file object (otherwise rows can be separated by blank
    # lines on Windows).
    hist_csv_file = 'History_' + str(title) + '.csv'
    hist_df.to_csv(hist_csv_file)

    time_csv_file = 'Time_' + str(title) + '.csv'
    time_df.to_csv(time_csv_file)

In [None]:
# Plot training and validation loss for every GRU-1 run and save the figure
# as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('loss', 'Training Loss'), ('val_loss', 'Val Loss')):
    for key, item in GRU1_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('GRU1_Gridsearch_Loss.html')


In [None]:
# Plot training and validation accuracy for every GRU-1 run and save the
# figure as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Val Accuracy')):
    for key, item in GRU1_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('GRU1_Gridsearch_Accuracy.html')


In [None]:
# Bar plot of the validation accuracy of each GRU-1 run, taken from the last
# epoch of training, sorted in descending order. Saved as an html file.
# (plotly.graph_objects is already imported as `go` in the import cell.)

x = [str(key) + 'Val Accuracy' for key in GRU1_history_dict]
y = [item[0].history['val_accuracy'][-1] for item in GRU1_history_dict.values()]

fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y))

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

fig.write_html('GRU1_Barplot_Val_Acc.html')

# <center>GRU-2 Grid Search Hyper-Parameters</center>

In [None]:
# Callbacks used by every run of the GRU-2 grid search. The patience values
# were decided based on the vanilla 2-layer GRU trained earlier.

timelog = timelogCallback()
earlystop = keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=12, verbose=1)
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=4, min_lr=5e-5, verbose=1)

In [None]:
# Builder for the 2-layer GRU model used in the grid search.

def create_GRU2(em_dims, unit1, unit2):
    '''
    Build and compile a 2-layer (stacked) GRU binary classifier.

    Args:
        em_dims: Output dimension of the embedding layer.
        unit1: Number of units in the first GRU layer.
        unit2: Number of units in the second GRU layer.

    Returns:
        A compiled keras.Sequential model: Adam optimizer with learning
        rate hyper_params['LR'], binary cross-entropy loss, accuracy metric.
    '''
    GRU2_model = keras.Sequential([
        # The input length follows the padding length instead of repeating
        # the literal 520, so the two cannot drift apart.
        keras.layers.Embedding(len(word_index)+1, em_dims,
                               input_length=hyper_params['MAX_LENGTH']),
        # return_sequences=True so the second GRU receives the full sequence.
        keras.layers.GRU(unit1, return_sequences=True),
        keras.layers.Dropout(0.5),
        keras.layers.GRU(unit2),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    GRU2_model.compile(optimizer=keras.optimizers.Adam(learning_rate=hyper_params['LR']),
                     loss='binary_crossentropy', metrics=['accuracy'])

    return GRU2_model

In [None]:
# Candidate embedding dimensions and per-layer GRU unit counts for the grid search.

em_dims, unit1, unit2 = [128, 256], [100, 200], [100, 200]

In [None]:
# Train one GRU-2 model per (embedding dimension, unit1, unit2) combination
# and keep its History object and per-epoch timings in a dictionary.

GRU2_history_dict = {}

grid = [(dim, first, second)
        for dim in em_dims for first in unit1 for second in unit2]

for em_dim, u1, u2 in grid:
    print(f"Training with em_dim:{em_dim}, unit1:{u1}, unit2:{u2}")
    model = create_GRU2(em_dim, u1, u2)
    history = model.fit(
        training_padded, train_labels,
        validation_data=(testing_padded, test_labels),
        batch_size=hyper_params['BS'], epochs=250,
        callbacks=[reduce_lr, earlystop, timelog],
    )
    # Stored as a (History, list of epoch durations) tuple.
    run_name = f'GRU2_history_{em_dim}_{u1}_{u2}'
    GRU2_history_dict[run_name] = (history, timelog.timelog)

In [None]:
# Save the history and the per-epoch timings of each GRU-2 run to CSV files.

for title, item in GRU2_history_dict.items():
    run_history, epoch_times = item

    # Convert the history.history dict and the timing list to DataFrames.
    hist_df = pd.DataFrame(run_history.history)
    time_df = pd.DataFrame(epoch_times)

    # Pass the path and let pandas open the file itself: a handle from a
    # plain open(..., 'w') is not opened with newline='', which pandas asks
    # for when given a file object (otherwise rows can be separated by blank
    # lines on Windows).
    hist_csv_file = 'History_' + str(title) + '.csv'
    hist_df.to_csv(hist_csv_file)

    time_csv_file = 'Time_' + str(title) + '.csv'
    time_df.to_csv(time_csv_file)

In [None]:
# Plot training and validation loss for every GRU-2 run and save the figure
# as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('loss', 'Training Loss'), ('val_loss', 'Val Loss')):
    for key, item in GRU2_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('GRU2_Gridsearch_Loss.html')


In [None]:
# Plot training and validation accuracy for every GRU-2 run and save the
# figure as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Val Accuracy')):
    for key, item in GRU2_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('GRU2_Gridsearch_Accuracy.html')


In [None]:
# Bar plot of the validation accuracy of each GRU-2 run, taken from the last
# epoch of training, sorted in descending order. Saved as an html file.
# (plotly.graph_objects is already imported as `go` in the import cell.)

x = [str(key) + 'Val Accuracy' for key in GRU2_history_dict]
y = [item[0].history['val_accuracy'][-1] for item in GRU2_history_dict.values()]

fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y))

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

fig.write_html('GRU2_Barplot_Val_Acc.html')

# <center>LSTM-1 Grid Search Hyper-Parameters</center>

In [None]:
# Callbacks used by every run of the LSTM-1 grid search. The patience values
# were decided based on the vanilla 1-layer LSTM trained earlier.

timelog = timelogCallback()
earlystop = keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=16, verbose=1)
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=8, min_lr=5e-5, verbose=1)

In [None]:
# Builder for the 1-layer LSTM model used in the grid search.

def create_LSTM1(em_dims, units):
    '''
    Build and compile a 1-layer LSTM binary classifier.

    Args:
        em_dims: Output dimension of the embedding layer.
        units: Number of units in the LSTM layer.

    Returns:
        A compiled keras.Sequential model: Adam optimizer with learning
        rate hyper_params['LR'], binary cross-entropy loss, accuracy metric.
    '''
    LSTM1_model = keras.Sequential([
        # The input length follows the padding length instead of repeating
        # the literal 520, so the two cannot drift apart.
        keras.layers.Embedding(len(word_index)+1, em_dims,
                               input_length=hyper_params['MAX_LENGTH']),
        keras.layers.LSTM(units),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    LSTM1_model.compile(optimizer=keras.optimizers.Adam(learning_rate=hyper_params['LR']),
                     loss='binary_crossentropy', metrics=['accuracy'])

    return LSTM1_model

In [None]:
# Candidate embedding dimensions and LSTM unit counts for the grid search.

em_dims, unit1 = [128, 256], [100, 200]

In [None]:
# Train one LSTM-1 model per (embedding dimension, units) combination and keep
# its History object and per-epoch timings in a dictionary for later cells.

LSTM1_history_dict = {}

for em_dim, u1 in [(dim, units) for dim in em_dims for units in unit1]:
    model = create_LSTM1(em_dim, u1)
    history = model.fit(
        training_padded, train_labels,
        validation_data=(testing_padded, test_labels),
        batch_size=hyper_params['BS'], epochs=250,
        callbacks=[reduce_lr, earlystop, timelog],
    )
    # Stored as a (History, list of epoch durations) tuple.
    run_name = f'LSTM1_history_{em_dim}_{u1}'
    LSTM1_history_dict[run_name] = (history, timelog.timelog)

In [None]:
# Save the history and the per-epoch timings of each LSTM-1 run to CSV files.

for title, item in LSTM1_history_dict.items():
    run_history, epoch_times = item

    # Convert the history.history dict and the timing list to DataFrames.
    hist_df = pd.DataFrame(run_history.history)
    time_df = pd.DataFrame(epoch_times)

    # Pass the path and let pandas open the file itself: a handle from a
    # plain open(..., 'w') is not opened with newline='', which pandas asks
    # for when given a file object (otherwise rows can be separated by blank
    # lines on Windows).
    hist_csv_file = 'History_' + str(title) + '.csv'
    hist_df.to_csv(hist_csv_file)

    time_csv_file = 'Time_' + str(title) + '.csv'
    time_df.to_csv(time_csv_file)

In [None]:
# Plot training and validation loss for every LSTM-1 run and save the figure
# as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('loss', 'Training Loss'), ('val_loss', 'Val Loss')):
    for key, item in LSTM1_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('LSTM1_Gridsearch_Loss.html')


In [None]:
# Plot training and validation accuracy for every LSTM-1 run and save the
# figure as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Val Accuracy')):
    for key, item in LSTM1_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('LSTM1_Gridsearch_Accuracy.html')


In [None]:
# Bar plot of the validation accuracy of each LSTM-1 run, taken from the last
# epoch of training, sorted in descending order. Saved as an html file.
# (plotly.graph_objects is already imported as `go` in the import cell.)

x = [str(key) + 'Val Accuracy' for key in LSTM1_history_dict]
y = [item[0].history['val_accuracy'][-1] for item in LSTM1_history_dict.values()]

fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y))

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

fig.write_html('LSTM1_Barplot_Val_Acc.html')

# <center>LSTM-2 Grid Search Hyper-Parameters</center>

In [None]:
# Callbacks used by every run of the LSTM-2 grid search. The patience values
# were decided based on the vanilla 2-layer LSTM trained earlier.

timelog = timelogCallback()
earlystop = keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=16, verbose=1)
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=8, min_lr=5e-5, verbose=1)

In [None]:
# Builder for the 2-layer LSTM model used in the grid search.

def create_LSTM2(em_dims, unit1, unit2):
    '''
    Build and compile a 2-layer (stacked) LSTM binary classifier.

    Args:
        em_dims: Output dimension of the embedding layer.
        unit1: Number of units in the first LSTM layer.
        unit2: Number of units in the second LSTM layer.

    Returns:
        A compiled keras.Sequential model: Adam optimizer with learning
        rate 0.005, binary cross-entropy loss, accuracy metric.
    '''
    LSTM2_model = keras.Sequential([
        # The input length follows the padding length instead of repeating
        # the literal 520, so the two cannot drift apart.
        keras.layers.Embedding(len(word_index)+1, em_dims,
                               input_length=hyper_params['MAX_LENGTH']),
        # return_sequences=True so the second LSTM receives the full sequence.
        keras.layers.LSTM(unit1, return_sequences=True),
        keras.layers.Dropout(0.5),
        keras.layers.LSTM(unit2),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    # NOTE(review): this builder uses a literal learning rate of 0.005 while
    # the other builders read hyper_params['LR'] -- confirm it is intentional.
    LSTM2_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.005),
                     loss='binary_crossentropy', metrics=['accuracy'])

    return LSTM2_model


In [None]:
# Candidate embedding dimensions and per-layer LSTM unit counts for the grid search.

em_dims, unit1, unit2 = [128, 256], [100, 200], [100, 200]

In [None]:
# Train one LSTM-2 model per (embedding dimension, unit1, unit2) combination
# and keep its History object and per-epoch timings in a dictionary.

LSTM2_history_dict = {}
for em_dim in em_dims:
    for u1 in unit1:
        for u2 in unit2:
            model = create_LSTM2(em_dim, u1, u2)
            history = model.fit(training_padded, train_labels, batch_size = hyper_params['BS'],
                                epochs=250, validation_data=(testing_padded, test_labels),
                                callbacks=[reduce_lr, earlystop, timelog])
            # Keys are prefixed LSTM2 (previously mislabelled LSTM1) so the
            # CSV file names and plot legends built from them name the right model.
            LSTM2_history_dict[f'LSTM2_history_{em_dim}_{u1}_{u2}'] = history, timelog.timelog


In [None]:
# Save the history and the per-epoch timings of each LSTM-2 run to CSV files.

for title, item in LSTM2_history_dict.items():
    run_history, epoch_times = item

    # Convert the history.history dict and the timing list to DataFrames.
    hist_df = pd.DataFrame(run_history.history)
    time_df = pd.DataFrame(epoch_times)

    # Pass the path and let pandas open the file itself: a handle from a
    # plain open(..., 'w') is not opened with newline='', which pandas asks
    # for when given a file object (otherwise rows can be separated by blank
    # lines on Windows).
    hist_csv_file = 'History_' + str(title) + '.csv'
    hist_df.to_csv(hist_csv_file)

    time_csv_file = 'Time_' + str(title) + '.csv'
    time_df.to_csv(time_csv_file)

In [None]:
# Plot training and validation loss for every LSTM-2 run and save the figure
# as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('loss', 'Training Loss'), ('val_loss', 'Val Loss')):
    for key, item in LSTM2_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('LSTM2_Gridsearch_Loss.html')


In [None]:
# Plot training and validation accuracy for every LSTM-2 run and save the
# figure as an html file. All training curves are added before the validation ones.

fig = go.Figure()

for metric, suffix in (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Val Accuracy')):
    for key, item in LSTM2_history_dict.items():
        run_history = item[0]
        fig.add_trace(go.Scatter(x=run_history.epoch,
                                 y=run_history.history[metric],
                                 name=str(key) + suffix))

fig.show()
fig.write_html('LSTM2_Gridsearch_Accuracy.html')


In [None]:
# Bar plot of the validation accuracy of each LSTM-2 run, taken from the last
# epoch of training, sorted in descending order. Saved as an html file.
# (plotly.graph_objects is already imported as `go` in the import cell.)

x = [str(key) + 'Val Accuracy' for key in LSTM2_history_dict]
y = [item[0].history['val_accuracy'][-1] for item in LSTM2_history_dict.values()]

fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y))

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

fig.write_html('LSTM2_Barplot_Val_Acc.html')