# Mount Google Drive

First, we mount our Google Drive so that we may integrate Google Colab seamlessly in the generation of codes.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
os.chdir('/content/gdrive/MyDrive/Colab Notebooks/AI6127/Project Chatbot')

# Import Libraries

In [3]:
import tensorflow as tf

In [4]:
from tensorflow import keras

In [5]:
from tensorflow.keras.layers import Input,Dense,LSTM
from tensorflow.keras.models import Model

In [6]:
import re
import random
import numpy as np
import json
import pandas as pd

In [7]:
import nltk
nltk.download('wordnet')
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1. EXTRACTING DATA FROM THE JSON FILE

The SMSes are nested pretty deeply in the original json file. Next couple of cells are aimed at extracting the data into a dataframe format.

In [8]:
# load the dataset
with open("./SMS Messages/smsCorpus_en_2015.03.09_all.json") as f:
    data = json.load(f)

In [9]:
# get the messages details only of the dataset (smsCorpus)
listofMsg = data['smsCorpus']['message']

In [10]:
messages = [i['text']['$'] for i in listofMsg]
countries = [i['source']['userProfile']['country']['$'] for i in listofMsg]

In [11]:
sms = pd.DataFrame({'country':countries,'sms_text':messages})
sms['sms_text'] = sms['sms_text'].astype('str')

In [12]:
# simple function to clean the text and remove non-ascii characters
def clean_text(text):    
    text = text.encode("ascii", errors="ignore").decode("ascii") #remove non-ascii, Chinese characters
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\n\n", " ", text)
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    text = text.strip(" ")
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(' +',' ', text).strip() # get rid of multiple spaces and replace with a single space    
    return text

In [13]:
sms["clean_text"] = sms['sms_text'].map(lambda text: clean_text(text))

# Comment out the stemming or lemmatization that you do not want to use
#sms["clean_text"] = sms["clean_text"].map(lambda text: PorterStemmer().stem(text))
# sms["clean_text"] = sms["clean_text"].map(lambda text: LancasterStemmer().stem(text))
# sms["clean_text"] = sms["clean_text"].map(lambda text: WordNetLemmatizer().lemmatize(text))

sms = sms.dropna(subset=['clean_text'])

In [14]:
#adding a word count col for filtering

sms['word_count'] = sms['clean_text'].str.count(' ') + 1

## 1.1 CUTTING OUT THE NOISE

A bigger dataset isn't necessarily a better one if it is merely noisy. Prior to creating the training and validation sets, I filtered out SMSes of 3 words or less (too few words) and kept only those sent by users in Singapore.

In [15]:
crit1 = sms['word_count'] > 3
crit2 = sms['country'] == 'SG'
crit3 = sms['country'] == 'Singapore'

sms = sms[crit1 & (crit2 | crit3)].copy().reset_index()

In [16]:
sms.sample(3)

Unnamed: 0,index,country,sms_text,clean_text,word_count
24693,45850,Singapore,WHAT WHERE IS IT,WHAT WHERE IS IT,4
26427,47867,Singapore,Yup I told him le! he says it's in hostel!,Yup I told him le he says it s in hostel,11
9885,24999,Singapore,"Hello, just a reminder. Ulu pandan needs more ...",Hello just a reminder Ulu pandan needs more cl...,20


In [17]:
questions = list(sms['clean_text'])[0:29359]
response = list(sms['clean_text'])[1:29360]

In [18]:
pairs = list(zip(questions,response))

## 2. CREATE TRAINING AND VALIDATION SET

A disadvantage of using LSTM is the propensity to run out of RAM. As a result, for training purposes, we take in a random set of 3,000 consecutive lines in the pair of dialouge text set.
In the subsequent training function, we will indicate that the losses be calculated on 20% of the training set, which will not be trained.

In [19]:
total_pairs = len(pairs)
training_size = 2500

In [20]:
start = random.randrange(0, total_pairs-training_size) 

In [21]:
input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

In [22]:
for line in pairs[start:start+training_size]:
    input_doc, target_doc = line[0], line[1]
    # Appending each input sentence to input_docs
    input_docs.append(input_doc)
    # Splitting words from punctuation
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
    # Redefine target_doc below and append it to target_docs
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)
    # Now we split up each sentence into words and add each unique word to our vocabulary set
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        if token not in target_tokens:
            target_tokens.add(token)

In [23]:
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

In [24]:
input_features_dict = dict([(token, i) for i, token in enumerate(input_tokens)])
target_features_dict = dict([(token, i) for i, token in enumerate(target_tokens)])

In [25]:
reverse_input_features_dict = dict((i, token) for token, i in input_features_dict.items())
reverse_target_features_dict = dict((i, token) for token, i in target_features_dict.items())

# 3. TRAINING SET UP

To train our seq2seq model we will use three matrices of one-hot vectors, Encoder input data, Decoder input data, and Decoder output data.

In [26]:
#Maximum length of sentences in input and target documents
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]",input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]",target_doc)) for target_doc in target_docs])

In [27]:
encoder_input_data = np.zeros((len(input_docs), 
                               max_encoder_seq_length,num_encoder_tokens),
                              dtype='float32')
decoder_input_data = np.zeros((len(input_docs), 
                               max_decoder_seq_length,num_decoder_tokens),
                              dtype='float32')
decoder_target_data = np.zeros((len(input_docs), 
                                max_decoder_seq_length,num_decoder_tokens),
                               dtype='float32')

In [28]:
for line, (input_doc, target_doc) in enumerate(zip(input_docs,target_docs)):
    for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]",input_doc)):
        #Assign 1. for the current line, timestep, & word in encoder_input_data
        encoder_input_data[line, timestep,input_features_dict[token]] = 1.
    for timestep, token in enumerate(target_doc.split()):
        decoder_input_data[line, timestep,target_features_dict[token]] = 1.
        if timestep > 0:
            decoder_target_data[line, timestep - 1,target_features_dict[token]] = 1.

Our encoder model requires an input layer which defines a matrix for holding the one-hot vectors and an LSTM layer with some number of hidden states. Decoder model structure is almost the same as encoder’s but here we pass in the state data along with the decoder inputs.

In [29]:
#Dimensionality
dimensionality = 256

In [30]:
#The batch size and number of epochs
batch_size = 1
epochs = 5

In [31]:
#Encoder
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(dimensionality, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

In [32]:
#Decoder
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(dimensionality, return_sequences=True,return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# 4. TRAINING AND EVALUATION

We will create our seq2seq model and train it with encoder and decoder data

In [33]:
#Model
training_model = Model([encoder_inputs, decoder_inputs],decoder_outputs)

The optimizar Adamax has a default learning rate of 0.001, which we will employ. Adamax is sometimes superior to Adam, specially in models with embeddings.

In [34]:
#Compiling
training_model.compile(optimizer='adamax',
                       loss='categorical_crossentropy', 
                       metrics=['accuracy'],sample_weight_mode='temporal')

In [35]:
#Training
hist = training_model.fit([encoder_input_data,decoder_input_data],decoder_target_data,
                          batch_size = batch_size,epochs = epochs,validation_split = 0.2)
training_model.save('training_model.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
with open('training_details.json', 'w') as f:
    json.dump(hist.history, f)

As a fraction of the training data is used as validation data, the model will set apart this fraction of the training data and will not train on it. It will evaluate the loss and any model metrics on this data at the end of each epoch. Hence, the perplexity score can be calculated off the last epoch.

In [40]:
with open("./training_details.json") as f:
    data = json.load(f)

In [41]:
import math

In [42]:
p_score = math.exp(np.mean(data['loss']))
print(f'The perplexity is {p_score:.3f}')

The perplexity is 2.752
