<a href="https://colab.research.google.com/github/AndrewPochapsky/chatbot/blob/master/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import re
import numpy as np
from pathlib import Path
from collections import Counter
from ast import literal_eval
import spacy
import pickle
import random
import math

In [0]:
base_path = Path('drive/My Drive/datasets/cornell movie-dialogs corpus')


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


# Word2Vec


Text Preprocessing

In [0]:
def preprocess(s):
    s = s.replace('\n',' ').lower()
    return s

def tokenize(corpus):
    tokenizer = spacy.blank("en").tokenizer
    doc = tokenizer(corpus)
    tokens = []
    for token in doc:
        if(token.text.strip() != ""):
            tokens.append(token.text)
    return tokens

def process_dataset():
    all_words = ""
    with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
        for line in f:
            parts = line.split(' +++$+++ ')
            all_words += parts[-1]
    return all_words

def generate_vocab(tokens, min_freq = 0):
    all_unique_words_counter = Counter(tokens)
    vocab = {}
    index = 0
    for w in all_unique_words_counter.keys():
        if(all_unique_words_counter[w] >= min_freq and w.strip() != ""):
            vocab[w] = index
            index += 1
    return vocab

def subsample(tokens, t = 1e-5):
    """
        Paper: https://arxiv.org/pdf/1310.4546.pdf
    """
    sampled_tokens = []
    counter = Counter(tokens)
    for token in tokens:
        f_w = counter[token]/len(tokens)
        p_w = 1 - math.sqrt(t/f_w)
        val = random.uniform(0, 1)
        if(val >= p_w):
            sampled_tokens.append(token)
            
    return sampled_tokens
        
        
        
        
def create_training_matrices(vocab, all_words, window_size = 5):	
	"""
        Returns x_train: Tensor()
    """
	numTotalWords = len(all_words)
	xTrain=[]
	yTrain=[]
	for i in range(numTotalWords):
		wordsAfter = all_words[i + 1:i + window_size + 1]
		wordsBefore = all_words[max(0, i - window_size):i]
		wordsAdded = wordsAfter + wordsBefore
		for word in wordsAdded:
			xTrain.append(vocab[all_words[i]])
			yTrain.append(vocab[word])
	return Tensor(xTrain), Tensor(yTrain)


In [5]:

full_corpus = process_dataset() 
full_corpus = preprocess(full_corpus)
print('Begin Tokenization')
tokens = tokenize(full_corpus) # list of string
print('Generating vocab')
vocab = generate_vocab(tokens)
print(len(tokens))
print('Subsampling data')
tokens = subsample(tokens, 1e-5)
print(len(tokens))
print('Getting training data')
x_train, y_train = create_training_matrices(vocab, tokens, window_size = 3)

Begin Tokenization
Generating vocab
4194111
Subsampling data
688935
Getting training data


SkipGram Model

In [0]:
class SkipGramModel(nn.Module):
    def __init__(self, emb_size, emb_dim):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dim = emb_dim
        self.center_embeddings = nn.Embedding(emb_size, emb_dim, sparse = True)
        self.context_embeddings = nn.Embedding(emb_size, emb_dim, sparse = True)
        self.init_emb()
    
    def init_emb(self):
        #initrange = 0.5 / self.emb_dim
        #self.center_embeddings.weight.data.uniform_(-initrange, initrange)
        nn.init.kaiming_uniform_(self.center_embeddings.weight, a=math.sqrt(5))
        self.context_embeddings.weight.data.uniform_(-0, 0)
        
    def forward(self, pos_center, pos_context, neg_context):
        losses = []
        emb_center = self.center_embeddings(pos_center.long())
        emb_context = self.context_embeddings(pos_context.long())
        score = torch.mul(emb_center, emb_context).squeeze()
        #print(score.shape)
        score = torch.sum(score, dim = 1)
        score = F.logsigmoid(score) # I think it is logsigmoid since we are doing nll loss func?
        losses.append(sum(score))
        
        
        neg_emb_context = self.context_embeddings(neg_context.long())
        #print(neg_emb_context.shape)
        neg_score = torch.bmm(neg_emb_context, emb_center.unsqueeze(2)).squeeze()
        neg_score = torch.sum(neg_score, dim = 1)
        neg_score = F.logsigmoid(-1 * neg_score)
        losses.append(sum(neg_score))
        return -1 * sum(losses)
        
        
        

In [0]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size = 128, shuffle = False, num_workers = 4)

def map_to_index(np_array, vocab):
    output = torch.zeros(np_array.shape)
    for i in range(len(np_array)):
        output[i] = Tensor(list(map(lambda x: vocab[x], np_array[i])))
    return output

In [0]:
lr = 3e-3

num_epochs = 1
neg_sample_size = 5
emb_dim = 100
emb_size = len(vocab.keys())
model = SkipGramModel(emb_size, emb_dim)
optim = torch.optim.SGD(model.parameters(), lr = lr) #cant use mom or wd since that would require calculating for all the params, too expensive
i = 0
for epoch in range(num_epochs):
    total_loss = 0
    for xb, yb in train_dl:
        #neg sampling
        neg_context = np.random.choice(
            tokens,
            size=(len(xb), neg_sample_size)
        )
        
        neg_context = map_to_index(neg_context, vocab)
        #print(type(neg_context))
        optim.zero_grad()
        loss = model(xb, yb, neg_context)
        total_loss += loss
        loss.backward()
        optim.step()
        if(i % 100 == 0):
            print('completed {0}/{1} batches. Avg loss per batch: {2}'.format(i, len(train_dl), total_loss/(i)))
        i+=1
    
    
torch.save(model.state_dict(), base_path/'word2vec.pt')

completed 0/32296 batches. Avg loss per batch: inf
completed 100/32296 batches. Avg loss per batch: 179.21861267089844
completed 200/32296 batches. Avg loss per batch: 178.33065795898438
completed 300/32296 batches. Avg loss per batch: 178.03488159179688
completed 400/32296 batches. Avg loss per batch: 177.8870849609375
completed 500/32296 batches. Avg loss per batch: 177.79783630371094
completed 600/32296 batches. Avg loss per batch: 177.73805236816406
completed 700/32296 batches. Avg loss per batch: 177.69508361816406
completed 800/32296 batches. Avg loss per batch: 177.66226196289062
completed 900/32296 batches. Avg loss per batch: 177.63685607910156
completed 1000/32296 batches. Avg loss per batch: 177.61656188964844
completed 1100/32296 batches. Avg loss per batch: 177.60043334960938
completed 1200/32296 batches. Avg loss per batch: 177.5867156982422
completed 1300/32296 batches. Avg loss per batch: 177.57479858398438
completed 1400/32296 batches. Avg loss per batch: 177.564605712

# Seq2Seq

Get the data

In [0]:
line_map = {}
with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        line_num = parts[0]
        #-2 to get rid of \n
        text = parts[-1][:-1]
        line_map[line_num] = text

In [0]:
table = []
with open(base_path/'movie_conversations.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        #get the referenced line numbers
        line_nums = re.findall('L[0-9]+', parts[-1])
        #form pairs
        
        for i in range(len(line_nums) - 1):
            pair = (line_nums[i], line_nums[i+1])
            #df.loc[df['column_name'] == some_value]
            first = line_map[line_nums[i]]
            second = line_map[line_nums[i+1]]
            table.append([tokenize(preprocess(first)), tokenize(preprocess(second))])
        
data_df = pd.DataFrame(table, columns = ['in', 'out'])
data_df.to_csv(base_path/'processed_data.csv', index = False)
data_df.head()

Unnamed: 0,in,out
0,"[can, we, make, this, quick, ?, roxanne, korri...","[well, ,, i, thought, we, 'd, start, with, pro..."
1,"[well, ,, i, thought, we, 'd, start, with, pro...","[not, the, hacking, and, gagging, and, spittin..."
2,"[not, the, hacking, and, gagging, and, spittin...","[okay, ..., then, how, 'bout, we, try, out, so..."
3,"[you, 're, asking, me, out, ., that, 's, so, c...","[forget, it, .]"
4,"[no, ,, no, ,, it, 's, my, fault, --, we, did,...","[cameron, .]"


In [0]:
def converter(x):
    #convert "list" to list
    return literal_eval(x)

converters={'in': converter, 'out': converter}
df = pd.read_csv(base_path/'processed_data.csv', converters = converters)


Build the model

In [8]:
emb_dim = 100
emb_size = len(vocab.keys())
word2vec_model = SkipGramModel(emb_size, emb_dim)
word2vec_model.load_state_dict(torch.load(base_path/'word2vec.pt'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [0]:
#add special tokens to the vocab
vocab['xxxpad'] = emb_size
emb_size += 1
vocab['xxxeos'] = emb_size
emb_size += 1

In [0]:
def create_embedding_matrix(word2vec_model, emb_size, emb_dim, padding_idx, eos_idx):
    new_emb = nn.Embedding(emb_size, emb_dim, padding_idx=padding_idx)
    old_emb_weights = word2vec_model.center_embeddings.weight.data
    for i in range(len(old_emb_weights)):
        new_emb.weight.data[i] = old_emb_weights[i]
    
    #init the new embeddings to zero
    new_emb.weight.data[padding_idx].uniform_(-old_emb_weights.mean(), old_emb_weights.mean())
    new_emb.weight.data[eos_idx].uniform_(-old_emb_weights.mean(), old_emb_weights.mean())
    
    return new_emb

In [0]:
emb = create_embedding_matrix(word2vec_model, emb_size, emb_dim, vocab['xxxpad'], vocab['xxxeos'])    

In [0]:
def get_max_x_and_y(df):
    max_x = -1
    for x in df['in']:
        if(len(x) > max_x):
            max_x = len(x)
            
    max_y = -1
    for y in df['out']:
        if(len(y) > max_y):
            max_y = len(y)
    max_y += 1
    with open(base_path/'max.txt', 'w+') as f:
        f.write(str(max_x) + ',' + str(max_y))
    
    return max_x, max_y

In [26]:
get_max_x_and_y(df)

(370, 683)

In [0]:
max_x = None
max_y = None
with open(base_path/'max.txt') as f:
    for line in f:
        max_x, max_y = line.split(',')
        max_x = int(max_x)
        max_y = int(max_y)

In [0]:
def get_x_y_tensors(df, vocab, max_x, max_y):
    num_samples = len(df.index)
    res_x = torch.zeros(num_samples, max_x).long() 
    res_y = torch.zeros(num_samples, max_y).long()
    for sample_idx in range(num_samples):
        if(sample_idx % 10000 == 0):
            print('done {0}/{1} samples'.format(sample_idx, num_samples))
        x, y = df.iloc[sample_idx, :]
        
        x_tensor = torch.zeros(max_x) + vocab['xxxpad']
        y_tensor = torch.zeros(max_y) + vocab['xxxpad']
        

        num_padding = max_x - len(x)
         
        #populate the rest of it with actual input
        token_index = 0;
        for i in range(num_padding, max_x):
            x_tensor[i] = vocab[x[token_index]] 
            token_index += 1
               
            
        #add input to the output
        for i in range(len(y)):
            y_tensor[i] = vocab[y[i]]
            
        #add end of stream token
        y_tensor[len(y)] = vocab['xxxeos']
            
        res_x[sample_idx] = x_tensor.long()
        res_y[sample_idx] = y_tensor.long()
        
    return res_x, res_y
    

In [29]:
x, y = get_x_y_tensors(df, vocab, max_x, max_y)

done 0/221616 samples
done 10000/221616 samples
done 20000/221616 samples
done 30000/221616 samples
done 40000/221616 samples
done 50000/221616 samples
done 60000/221616 samples
done 70000/221616 samples
done 80000/221616 samples
done 90000/221616 samples
done 100000/221616 samples
done 110000/221616 samples
done 120000/221616 samples
done 130000/221616 samples
done 140000/221616 samples
done 150000/221616 samples
done 160000/221616 samples
done 170000/221616 samples
done 180000/221616 samples
done 190000/221616 samples
done 200000/221616 samples
done 210000/221616 samples
done 220000/221616 samples


In [0]:
with open(base_path/'x_tensors.pkl', 'wb') as f:
    pickle.dump(x, f)

with open(base_path/'y_tensors.pkl', 'wb') as f:
    pickle.dump(y, f)

In [0]:
def get_saved_tensors():
    x = None
    y = None
    with open(base_path/'x_tensors.pkl', 'rb') as f:
        x = pickle.load(f)
    with open(base_path/'y_tensors.pkl', 'rb') as f:
        y = pickle.load(f)
    return x, y

In [0]:
x, y = get_saved_tensors()

In [0]:
total_data = TensorDataset(x, y)

In [0]:
class Seq2SeqModel(nn.Module):
    def __init__(self, encoder_emb, decoder_emb, num_hidden, output_length, num_layers, padding_idx):
        super().__init__()
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.padding_idx = padding_idx
        self.output_length = output_length
        
        self.encoder_emb_size = encoder_emb.embedding_dim
        self.decoder_emb_size = decoder_emb.embedding_dim
        self.decoder_vocab_size = decoder_emb.num_embeddings
        
        self.encoder_emb = encoder_emb
        self.encoder_gru = nn.GRU(self.encoder_emb_size, self.num_hidden, num_layers = self.num_layers, batch_first = True)
        self.encoder_out = nn.Linear(self.num_hidden, self.decoder_emb_size, bias = False)
        
        self.decoder_emb = decoder_emb
        self.decoder_gru = nn.GRU(self.decoder_emb_size, self.decoder_emb_size, num_layers = self.num_layers, batch_first = True)
        self.out = nn.Linear(self.decoder_emb_size, self.decoder_vocab_size)
        self.out.weight.data = self.decoder_embedding.weight.data
        
    def encoder(self, bs, inp):
        h = self.init_hidden(bs)
        emb = self.encoder_emb(inp)
        _, h = self.encoder_gru(emb, h)
        h = self.encoder_out(h)
        return h
    
    def decoder(self, decoder_inp, h):
        emb = self.decoder_emb(decoder_inp).unsqueeze(1)
        out_pred, h = self.decoder_gru(emb, h)
        out_pred = self.out(out_pred)
        return out_pred, h
        
    def forward(self, inp):
        bs, seq_len = inp.size()
        h = self.encoder(bs, inp)
        dec_inp = inp.new_zeros(bs).long()
        
        res = []
        for i in range(self.output_length):
            h, out_pred = self.decoder(dec_inp, h)
            dec_inp = out_pred.max(1)[1]
            if (dec_inp==self.pad_idx).all(): break
        return torch.stack(res, dim = 1)
        
    def init_hidden(self, bs): return one_param(self).new_zeros(self.nl, bs, self.nh)
        
        