In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import joblib
import uuid


In [None]:
# Random identifier for this run: the low 64 bits of a fresh UUID4.
uniqueid = uuid.uuid4().int & 0xFFFFFFFFFFFFFFFF


In [None]:
# True: load the GloVe vectors and force model index 0 (the GloVe-initialised builder).
# False: skip GloVe loading and build the model selected by `model_to_use`.
use_pre_trained_embeds = False

In [None]:
# `model_dir` is the folder (relative to the parent directory) that the trained
# model is saved under. `dataset_dir` is not referenced by any other visible cell.
dataset_dir = "imdb"
model_dir = "models"

# load a preprocessed dataframe see: (https://github.com/3nws/twitter-text-classification/blob/main/notebooks/process_dataframes.ipynb)
# NOTE(review): joblib.load unpickles the file, so only load pickles you trust.
df = joblib.load("../dataframes/df_imdb.pkl")

In [None]:
# Dimensions of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:

import seaborn as sns

# Class balance of the labels. The column is passed as `x=` because recent
# seaborn releases bind the first positional argument to `data`, not `x`;
# the keyword form is accepted by both old and new releases.
sns.countplot(x=df.sentiment)


In [None]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    """Tally how often each whitespace-separated token occurs.

    Args:
        text_col: object exposing ``.values`` as an iterable of strings.

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    tally = Counter()
    for document in text_col.values:
        # Counter.update adds one per token occurrence.
        tally.update(document.split())
    return tally


# Token frequencies over the review column of the whole frame
# (computed before the train/validation split).
counter = counter_word(df.review)

In [None]:
# Number of distinct whitespace-separated tokens.
len(counter)

In [None]:
# Show a bounded sample of (token, count) pairs instead of rendering every
# entry of the Counter as cell output.
list(counter.items())[:20]

In [None]:
# The five most frequent tokens with their counts.
counter.most_common(5)

In [None]:
# Vocabulary size; reused as the Tokenizer's `num_words` and as the first
# argument (input size) of every Embedding layer.
num_unique_words = len(counter)

In [None]:
# Split dataset into training and validation set
# The first 80% of the rows train the model; the remaining 20% validate it.
# NOTE(review): the split is positional and nothing in this cell shuffles the
# rows; confirm the pickled frame is already shuffled, otherwise the label
# distribution of the two sets may differ.
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
train_sentences = train_df.review.to_numpy()
train_labels = train_df.sentiment.to_numpy()
val_sentences = val_df.review.to_numpy()
val_labels = val_df.sentiment.to_numpy()

In [None]:
# Sizes of the training and validation text arrays.
train_sentences.shape, val_sentences.shape

In [None]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
# `num_words` is set to the whitespace-token vocabulary size counted over the whole frame.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [None]:
# each word has unique index
# (word -> integer mapping held by the tokenizer fitted on the training sentences)
word_index = tokenizer.word_index

In [None]:
# Show a bounded sample of the word -> index mapping instead of rendering the
# whole vocabulary as cell output.
dict(list(word_index.items())[:20])

In [None]:
# Number of entries in the tokenizer's word index.
# NOTE(review): not referenced by any other visible cell.
len_of_vocab = len(word_index)

In [None]:
# Convert both splits to integer sequences with the tokenizer that was fitted
# on the training sentences only.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [None]:
# Side-by-side check: five raw training sentences and their integer encodings.
print(train_sentences[10:15])
print(train_sequences[10:15])

In [None]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
# max_length = max([len(text) for text in train_sequences])
# A fixed cap of 128 is used instead of the longest training sequence
# (the commented-out alternative above).
max_length = 128
max_length

In [None]:

# Pad or truncate every sequence to `max_length`; both padding and truncation
# are applied at the end of the sequence ("post").
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape

In [None]:
# One padded training example.
train_padded[10]

In [None]:
# The same training example at each stage: raw text, integer sequence, padded sequence.
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

In [None]:
# Check reversing the indices

# Invert the tokenizer mapping: integer index -> word.
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [None]:
# Show a bounded sample of the index -> word mapping instead of rendering the
# whole vocabulary as cell output.
dict(list(reverse_word_index.items())[:20])

In [None]:
def decode(sequence):
    """Turn a sequence of token indices back into a space-separated string.

    Each index is looked up in the notebook-level ``reverse_word_index``;
    indices that are not in the mapping are rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in sequence)
    return " ".join(words)

In [None]:
# Round-trip check: decode one (unpadded) training sequence back to words.
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

In [None]:
# Width of the embedding vectors. It is defined unconditionally because every
# model builder reads it, including the ones that learn embeddings from scratch.
embedding_dim = 32

if use_pre_trained_embeds:
    embeddings_dictionary = {}

    # Read as text so the keys are `str`, matching tokenizer.word_index keys
    # (bytes keys read in binary mode would never match a lookup).
    # The context manager closes the file even if parsing fails.
    with open('../embeds/glove.6B.300d.txt', encoding='utf-8') as glove_file:
        for line in glove_file:
            # Each line is "<word> <v1> <v2> ...", separated by single spaces.
            records = line.rstrip().split(' ')
            word = records[0]
            embeddings_dictionary[word] = np.asarray(records[1:], dtype='float32')

    # Pre-trained vectors dictate the embedding width; take it from the file
    # so the matrix rows and the Embedding layer agree with the vectors.
    embedding_dim = len(next(iter(embeddings_dictionary.values())))

    # Row i holds the vector of the word with tokenizer index i; words without
    # a GloVe vector keep an all-zero row.
    embeddings_matrix = np.zeros((num_unique_words, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if index >= num_unique_words:
            # Outside the matrix (and outside the Embedding layer's input range).
            continue
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embeddings_matrix[index] = embedding_vector


In [None]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy


In [None]:
# 'softmax' activation function returns a probability distribution
# Binary for 0-1, Categorical for 2 or more classes, SparseCategorical for when labels are integers
# Dropout is used to prevent overfitting by randomly setting inputs to 0 at a low rate
# For stacked LSTMs set return_sequences to True except for the last one
# trainable parameter in Embedding layer should still be set to True when using already trained weights (it is by default anyway)

# 0
def Glove_Double_Bi_LSTM_w_Loss_Sparse_Cat():
    """Build and compile a two-layer bidirectional LSTM classifier initialised with GloVe weights.

    Reads the notebook-level names ``num_unique_words``, ``embedding_dim``,
    ``max_length`` and ``embeddings_matrix`` (the last one is only created when
    ``use_pre_trained_embeds`` is True).

    Returns:
        A compiled ``Sequential`` model: Embedding -> BiLSTM(64, sequences) ->
        BiLSTM(64) -> Dense(2, softmax), with sparse categorical cross-entropy loss.
    """
    model = Sequential()
    model.add(Embedding(num_unique_words, embedding_dim,
                        input_length=max_length, name="embeddinglayer", weights=[embeddings_matrix], trainable=True))
    # The first recurrent layer returns the full sequence so the second can consume it.
    model.add(Bidirectional(LSTM(64, dropout=0.3, return_sequences=True)))
    model.add(Bidirectional(LSTM(64, dropout=0.3)))
    model.add(Dense(2, activation="softmax"))
    loss = SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    optim = Adam(learning_rate=0.001)
    metrics = [
        "accuracy",
        "sparse_categorical_accuracy",
    ]
    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model

# 1
def LSTM_w_Loss_Binary():
    """Build and compile a single-LSTM binary classifier with a sigmoid output.

    Reads the notebook-level names ``num_unique_words``, ``embedding_dim`` and
    ``max_length``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM(64) ->
        Dense(1, sigmoid), with binary cross-entropy loss.
    """
    model = Sequential()
    model.add(Embedding(num_unique_words, embedding_dim,
                               input_length=max_length, name="embeddinglayer"))
    model.add(LSTM(64, dropout=0.1))
    model.add(Dense(1, activation="sigmoid"))
    loss = BinaryCrossentropy(from_logits=False)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    optim = Adam(learning_rate=0.001)
    metrics = [
        "accuracy",
        "binary_accuracy",
    ]
    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model
    
# 2
def LSTM_w_Loss_Sparse_Cat():
    """Build and compile a single-LSTM classifier with a 2-unit softmax output.

    Reads the notebook-level names ``num_unique_words``, ``embedding_dim`` and
    ``max_length``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM(64) ->
        Dense(2, softmax), with sparse categorical cross-entropy loss.
    """
    model = Sequential()
    model.add(Embedding(num_unique_words, embedding_dim,
                               input_length=max_length, name="embeddinglayer"))
    model.add(LSTM(64, dropout=0.1))
    model.add(Dense(2, activation="softmax"))
    loss = SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    optim = Adam(learning_rate=0.001)
    metrics = [
        "accuracy",
        "sparse_categorical_accuracy",
    ]
    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model

# 3
def Bi_LSTM_w_Loss_Sparse_Cat():
    """Build and compile a single bidirectional-LSTM classifier with a 2-unit softmax output.

    Reads the notebook-level names ``num_unique_words``, ``embedding_dim`` and
    ``max_length``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> BiLSTM(64) ->
        Dense(2, softmax), with sparse categorical cross-entropy loss.
    """
    model = Sequential()
    model.add(Embedding(num_unique_words, embedding_dim,
                               input_length=max_length, name="embeddinglayer"))
    model.add(Bidirectional(LSTM(64, dropout=0.1)))
    model.add(Dense(2, activation="softmax"))
    loss = SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    optim = Adam(learning_rate=0.001)
    metrics = [
        "accuracy",
        "sparse_categorical_accuracy",
    ]
    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model

# 4
def Double_Bi_LSTM_w_Loss_Sparse_Cat():
    """Build and compile a two-layer bidirectional LSTM classifier with learned embeddings.

    Reads the notebook-level names ``num_unique_words``, ``embedding_dim`` and
    ``max_length``.

    Returns:
        A compiled ``Sequential`` model: Embedding -> BiLSTM(64, sequences) ->
        BiLSTM(64) -> Dense(2, softmax), with sparse categorical cross-entropy loss.
    """
    model = Sequential()
    model.add(Embedding(num_unique_words, embedding_dim,
                        input_length=max_length, name="embeddinglayer"))
    # The first recurrent layer returns the full sequence so the second can consume it.
    model.add(Bidirectional(LSTM(64, dropout=0.3, return_sequences=True)))
    model.add(Bidirectional(LSTM(64, dropout=0.3)))
    model.add(Dense(2, activation="softmax"))
    loss = SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    optim = Adam(learning_rate=0.001)
    metrics = [
        "accuracy",
        "sparse_categorical_accuracy",
    ]
    model.compile(loss=loss, optimizer=optim, metrics=metrics)
    return model

In [None]:
# Model builder functions; positions follow the "# 0" .. "# 4" comments that
# precede their definitions.
models = [
    Glove_Double_Bi_LSTM_w_Loss_Sparse_Cat,
    LSTM_w_Loss_Binary,
    LSTM_w_Loss_Sparse_Cat,
    Bi_LSTM_w_Loss_Sparse_Cat,
    Double_Bi_LSTM_w_Loss_Sparse_Cat,
]

# Index into `models`; -1 selects the last entry.
model_to_use = -1

# The GloVe builder (index 0) is forced whenever pre-trained embeddings are enabled.
model_idx = 0 if use_pre_trained_embeds else model_to_use


In [None]:


# Call the selected builder to get a compiled model, then print its layer summary.
model = models[model_idx]()

model.summary()

In [None]:
# Shape of the padded training input passed to fit().
train_padded.shape

In [None]:
# Train for 4 epochs, evaluating on the validation split after each epoch.
# `history` is read by the plotting cells.
history = model.fit(train_padded, train_labels, epochs=4, validation_data=(val_padded, val_labels), verbose=1)

In [None]:

# Plot the training loss recorded per epoch.

plt.plot(history.history['loss'])


In [None]:
# Model outputs for the padded training inputs.
predictions = model.predict(train_padded)


In [None]:

# Shape of the prediction array.
predictions.shape


In [None]:
# Inspect the raw model outputs.
predictions

In [None]:
# Model outputs for the padded validation inputs.
val_predictions = model.predict(val_padded)


In [None]:
# Spot-check one validation example: text, true label, model output.
print(val_sentences[42])
print(val_labels[42])
print(val_predictions[42])


In [None]:
# Only for BinaryCrossentropy
# predictions = [1 if p > 0.5 else 0 for p in predictions]
# predictions

In [None]:

# evaluate() returns the loss followed by one value per compiled metric.
# Every builder in this notebook compiles two metrics, so there are three
# values; keep the loss and the first ("accuracy") metric, ignore the rest.
val_loss, val_acc, *_ = model.evaluate(val_padded, val_labels)
val_loss, val_acc


In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
    """Draw the training and validation curves of one metric on the current axes.

    Args:
        history: object whose ``.history`` dict holds per-epoch values under
            ``metric`` and under ``'val_' + metric``.
        metric: key of the training series, e.g. ``'accuracy'`` or ``'loss'``.
    """
    val_metric = 'val_' + metric
    curves = history.history
    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])


# Two panels side by side: accuracy (y-axis capped at 1) and loss (y-axis floored at 0),
# each with its training and validation curve.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)


In [None]:
# The destination encodes the builder index, this run's unique id and the
# validation accuracy.
model_save_dir = f'../{model_dir}/NN_model_{model_idx}_{uniqueid}_{val_acc}'

# NOTE(review): a save path without a file extension is only accepted by some
# Keras versions; confirm against the installed version.
model.save(model_save_dir)


In [None]:
# Load the model back from the path it was just saved to.
loaded_model = load_model(model_save_dir)


In [None]:
# Layer summary of the reloaded model, for comparison with the summary printed after building.
loaded_model.summary()

In [None]:
# For debugging purposes


# model = tf.keras.Model(inputs=model.input,
#                        outputs=[model.get_layer("embeddinglayer").output])

# feature = model.predict(val_padded)

# feature, feature.shape