In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import numpy as np
import random
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet 
from random import sample
import string

from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm

In [2]:
# Load the IMDB reviews CSV into a DataFrame; later cells read its
# 'review' and 'sentiment' columns.
data = pd.read_csv('./IMDB Dataset.csv')

In [3]:
# RNN hyper-parameters, kept in one mapping so later cells look them up by key.
# NOTE: the visible fit() calls pass epochs=250 directly instead of reading 'EPOCHS'.

rnn_hyper_param = dict(
    VOCAB_SIZE=50000,     # handed to the Tokenizer as num_words
    EPOCHS=150,
    BS=1024,              # batch size
    LR=0.005,             # learning rate given to Adam in create_rnn
    OOV_TOK="<OOV>",      # token the Tokenizer substitutes for unknown words
    MAX_LENGTH=520,       # maxlen used when padding/truncating sequences
    PADDING_TYPE="post",
    TRUNC_TYPE="post",
)

In [4]:
# CNN hyper-parameters, kept in one mapping so later cells look them up by key.
# NOTE: the visible fit() calls pass epochs=250 directly instead of reading 'EPOCHS'.

cnn_hyper_param = dict(
    VOCAB_SIZE=50000,
    EPOCHS=150,
    BS=1024,              # batch_size passed to CNN.fit
    LR=0.001,             # learning rate given to Adam in create_cnn
    OOV_TOK="<OOV>",
    MAX_LENGTH=520,       # maxlen used when padding/truncating sequences
    PADDING_TYPE="post",
    TRUNC_TYPE="post",
)

In [5]:
# Hold out a stratified 20% test split with a fixed seed. The test-set size is
# kept the same as in earlier experiments so generalization stays comparable.

x_train, x_test, y_train, y_test = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.2,
    random_state=0,
    stratify=data['sentiment'],
)

In [6]:
# Perform tokenization and apply the dictionary to our train and test set.
# Convert the categorical test labels to a NumPy array of integers.

# The vocabulary is fitted on the training reviews only; the test reviews are
# merely mapped through it.
tokenizer = Tokenizer(num_words=rnn_hyper_param['VOCAB_SIZE'], oov_token=rnn_hyper_param['OOV_TOK'])
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)
testing_padded = pad_sequences(test_sequences, padding=rnn_hyper_param['PADDING_TYPE'],
                               maxlen=rnn_hyper_param['MAX_LENGTH'], truncating=rnn_hyper_param['TRUNC_TYPE'])

# Encode the test labels ('positive' -> 1, anything else -> 0).
# The labels are looked up in the untouched `data` frame via the index of
# `x_test` instead of rewriting `y_test` from itself: the old in-place loop
# turned every label into 0 if this cell was executed a second time, because
# the already-encoded integers never equal 'positive'.
# Assumes `x_test` is still the pandas Series returned by train_test_split.
y_test = (data.loc[x_test.index, 'sentiment'] == 'positive').to_numpy().astype(int)

In [9]:
def avgNestedLists(nested_vals):
    """
    Column-wise mean of a collection of lists that may differ in length.

    For every position up to the length of the longest inner list, the
    values of all inner lists that reach that position are averaged with
    ``np.nanmean`` (so NaN entries are ignored). Lists that are too short
    simply do not contribute to the later positions.

    Returns a plain Python list with one mean per position; an empty input
    yields an empty list.
    """
    # Length of the longest inner list decides how many columns exist.
    longest = max((len(row) for row in nested_vals), default=0)

    return [
        np.nanmean([row[col] for row in nested_vals if col < len(row)])
        for col in range(longest)
    ]

# <center>Create Model Functions</center>

In [10]:
# Function creates a CNN model, compiles and returns it

def create_cnn():
    """Build and compile the CNN sentiment classifier.

    Reads the module-level ``word_index`` (to size the embedding table) and
    ``cnn_hyper_param`` (sequence length and learning rate).

    Architecture: Embedding (30-dim) -> two blocks of
    Conv1D(kernel 3, relu) / MaxPooling1D(padding='same') / Dropout(0.3)
    with 40 and 5 filters -> Flatten -> Dense(20, relu) -> Dropout(0.5)
    -> Dense(1, sigmoid).

    Returns:
        A ``keras.Sequential`` model compiled with binary cross-entropy,
        the Adam optimizer and the accuracy metric.
    """
    model = keras.Sequential([
        # The input length comes from the same config value that is used for
        # padding, so the model and the padded data cannot drift apart.
        # NOTE(review): the tokenizer is built with num_words=VOCAB_SIZE, so an
        # embedding table sized from the full word_index may be larger than
        # needed - confirm before shrinking it.
        keras.layers.Embedding(len(word_index) + 1, 30,
                               input_length=cnn_hyper_param['MAX_LENGTH']),

        keras.layers.Conv1D(40, 3, activation='relu'),
        keras.layers.MaxPooling1D(padding='same'),
        keras.layers.Dropout(0.3),

        keras.layers.Conv1D(5, 3, activation='relu'),
        keras.layers.MaxPooling1D(padding='same'),
        keras.layers.Dropout(0.3),

        keras.layers.Flatten(),
        keras.layers.Dense(20, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=cnn_hyper_param['LR']),
                  metrics=['accuracy'])
    return model

In [11]:
# Function creates an RNN model, compiles and returns it

def create_rnn():
    """Build and compile the stacked-LSTM sentiment classifier.

    Reads the module-level ``word_index`` (to size the embedding table) and
    ``rnn_hyper_param`` (sequence length and learning rate).

    Architecture: Embedding (128-dim) -> LSTM(100, return_sequences=True)
    -> Dropout(0.5) -> LSTM(100) -> Dropout(0.5) -> Dense(1, sigmoid).

    Returns:
        A ``keras.Sequential`` model compiled with binary cross-entropy,
        the Adam optimizer and the accuracy metric.
    """
    model = keras.Sequential([
        # The input length comes from the same config value that is used for
        # padding, so the model and the padded data cannot drift apart.
        keras.layers.Embedding(len(word_index) + 1, 128,
                               input_length=rnn_hyper_param['MAX_LENGTH']),
        # The first LSTM returns the full sequence so the second LSTM can consume it.
        keras.layers.LSTM(100, return_sequences=True),
        keras.layers.Dropout(0.5),
        keras.layers.LSTM(100),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=rnn_hyper_param['LR']),
                  loss='binary_crossentropy', metrics=['accuracy'])

    return model


# <center>Best CNN with Data Augmentation</center>

In [12]:
# Callback patience values, as decided after analyzing the Vanilla CNN.

# Stop training after 12 epochs without an improvement in val_loss.
earlystop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=12,
    verbose=1,
)

# Scale the learning rate by 0.2 after 4 epochs without an improvement in
# val_loss, never going below min_lr.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=4,
    min_lr=0.00005,
    verbose=1,
)

In [13]:
# Train the CNN on various augmented datasets and write the history results to csv

CNN_aug_hist = {}
# NOTE(review): `data_dict` is not defined in any cell shown above; only its
# keys (`size`) are used here, the values (`dataset`) are never read.
for size, dataset in data_dict.items():
    print(f"Training using {size} of dataset.")

    # read csv for that augmented file
    _dataset = pd.read_csv(f'data_aug_{size}.csv')
    x_train = list(_dataset['review'])
    # Encode labels in one vectorized step: 'positive' -> 1, anything else -> 0.
    y_train = (_dataset['sentiment'] == 'positive').to_numpy().astype(int)
    train_sequences = tokenizer.texts_to_sequences(x_train)
    training_padded = pad_sequences(train_sequences, padding=cnn_hyper_param['PADDING_TYPE'],
                                    maxlen=cnn_hyper_param['MAX_LENGTH'], truncating=cnn_hyper_param['TRUNC_TYPE'])

    # A fresh model per dataset size so no weights carry over between runs.
    CNN = create_cnn()
    # NOTE(review): epochs is hard-coded to 250 while cnn_hyper_param['EPOCHS']
    # is 150 - confirm which one is intended.
    # NOTE(review): validation_data is the held-out test split, so early
    # stopping and the LR schedule are driven by test data.
    history = CNN.fit(training_padded, y_train, batch_size=cnn_hyper_param['BS'],
                      epochs=250, validation_data=(testing_padded, y_test),
                      callbacks=[reduce_lr, earlystop])

    CNN_aug_hist[f'{size}'] = history

    hist_df = pd.DataFrame(history.history)
    # save to csv: hand pandas the path so it opens the file itself. Writing
    # through a text-mode handle opened without newline='' can produce blank
    # lines between rows on Windows.
    hist_csv_file = 'CNN_Aug_History_2_' + str(size) + '.csv'
    hist_df.to_csv(hist_csv_file)

Training using 0.01 of dataset.
Train on 6800 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 00009: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 00013: early stopping
Training using 0.05 of dataset.
Train on 18000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 00013: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 00017: early stopping
Training using 0.1 of dataset.
Train on 20000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Ep

Epoch 17/250
Epoch 00017: early stopping
Training using 0.2 of dataset.
Train on 40000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 00011: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 00015: early stopping
Training using 0.3 of dataset.
Train on 60000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 00010: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 00014: early stopping
Training using 0.4 of dataset.
Train on 80000 samples, validate on 10000 samples
Epoch 1/250

Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 00010: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 00014: early stopping
Training using 0.6 of dataset.
Train on 120000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 00010: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 00014: early stopping
Training using 0.7 of dataset.
Train on 140000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/250
Epo

In [16]:
# Train the CNN on the augmented dataset that was applied on the ENTIRE training dataset.
# This step is done because the previous cell only trains up to 90% of dataset.

_dataset = pd.read_csv('data_aug_1.0.csv')
x_train = list(_dataset['review'])
# Encode labels in one vectorized step: 'positive' -> 1, anything else -> 0.
y_train = (_dataset['sentiment'] == 'positive').to_numpy().astype(int)
train_sequences = tokenizer.texts_to_sequences(x_train)
training_padded = pad_sequences(train_sequences, padding=cnn_hyper_param['PADDING_TYPE'],
                                maxlen=cnn_hyper_param['MAX_LENGTH'], truncating=cnn_hyper_param['TRUNC_TYPE'])

# Three independent runs on the same data, each with a freshly built model.
for i in range(3):
    print("================================================")
    print(f"ITERATION {i}")
    print("================================================")

    CNN = create_cnn()
    # NOTE(review): epochs is hard-coded to 250 while cnn_hyper_param['EPOCHS']
    # is 150 - confirm which one is intended.
    history = CNN.fit(training_padded, y_train, batch_size=cnn_hyper_param['BS'],
                      epochs=250, validation_data=(testing_padded, y_test),
                      callbacks=[reduce_lr, earlystop])

    hist_df = pd.DataFrame(history.history)
    # save to csv: hand pandas the path so it opens the file itself. Writing
    # through a text-mode handle opened without newline='' can produce blank
    # lines between rows on Windows.
    hist_csv_file = f'CNN_Aug_History_{i+1}_1.0.csv'
    hist_df.to_csv(hist_csv_file)

ITERATION 0
Train on 200000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 00009: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 00013: early stopping
ITERATION 1
Train on 200000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 00009: ReduceLROnPlateau reducing learning rate to 5e-05.
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 00013: early stopping
ITERATION 2
Train on 200000 samples, validate on 10000 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoc

# <center>Best RNN with Data Augmentation</center>

In [17]:
# Callbacks for the RNN runs. These rebind the same `reduce_lr` / `earlystop`
# names that the CNN section used, with longer patience values.

# Stop training after 16 epochs without an improvement in val_loss.
earlystop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=16,
    verbose=1,
)

# Scale the learning rate by 0.2 after 8 epochs without an improvement in
# val_loss, never going below min_lr.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=8,
    min_lr=0.00005,
    verbose=1,
)

In [21]:
# Read the dataset and convert categorical data to integers

_dataset = pd.read_csv('data_aug_1.0.csv')
x_train = list(_dataset['review'])
# Encode labels in one vectorized step: 'positive' -> 1, anything else -> 0.
y_train = (_dataset['sentiment'] == 'positive').to_numpy().astype(int)
# Map the reviews through the already-fitted tokenizer, then pad/truncate them
# to the RNN sequence length.
train_sequences = tokenizer.texts_to_sequences(x_train)
training_padded = pad_sequences(train_sequences, padding=rnn_hyper_param['PADDING_TYPE'],
                                maxlen=rnn_hyper_param['MAX_LENGTH'], truncating=rnn_hyper_param['TRUNC_TYPE'])



In [None]:
# Train our RNN on the various augmented datasets and store the history into a csv file.

RNN_aug_hist = {}
# NOTE(review): `data_dict` is not defined in any cell shown above; only its
# keys (`size`) are used here, the values (`dataset`) are never read.
for size, dataset in data_dict.items():
    print(f"Training using {size} of dataset.")

    # read csv for that augmented file
    _dataset = pd.read_csv(f'data_aug_{size}.csv')
    x_train = list(_dataset['review'])
    # Encode labels in one vectorized step: 'positive' -> 1, anything else -> 0.
    y_train = (_dataset['sentiment'] == 'positive').to_numpy().astype(int)
    train_sequences = tokenizer.texts_to_sequences(x_train)
    # The config dict is named `rnn_hyper_param`; the previous spelling
    # `RNN_hyper_param` does not exist and raised a NameError.
    training_padded = pad_sequences(train_sequences, padding=rnn_hyper_param['PADDING_TYPE'],
                                    maxlen=rnn_hyper_param['MAX_LENGTH'], truncating=rnn_hyper_param['TRUNC_TYPE'])

    # A fresh model per dataset size so no weights carry over between runs.
    RNN = create_rnn()
    # NOTE(review): epochs is hard-coded to 250 while rnn_hyper_param['EPOCHS']
    # is 150 - confirm which one is intended.
    history = RNN.fit(training_padded, y_train, batch_size=rnn_hyper_param['BS'],
                      epochs=250, validation_data=(testing_padded, y_test),
                      callbacks=[reduce_lr, earlystop])

    RNN_aug_hist[f'{size}'] = history

    hist_df = pd.DataFrame(history.history)
    # save to csv: the file gets an RNN prefix (it used to be written as
    # 'CNN_Aug_History_<size>.csv', mislabelling the RNN results as CNN ones).
    # pandas is given the path so it opens the file itself.
    hist_csv_file = 'RNN_Aug_History_' + str(size) + '.csv'
    hist_df.to_csv(hist_csv_file)