<a href="https://colab.research.google.com/github/AndrewPochapsky/chatbot/blob/master/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import re
import numpy as np
from pathlib import Path
from collections import Counter
import spacy
import pickle
import random
import math

In [0]:
base_path = Path('drive/My Drive/datasets/cornell movie-dialogs corpus')


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


# Training Data Setup

In [0]:
line_map = {}
with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        line_num = parts[0]
        #-2 to get rid of \n
        text = parts[-1][:-2]
        line_map[line_num] = text
      
        


In [0]:
table = []
with open(base_path/'movie_conversations.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        #get the referenced line numbers
        line_nums = re.findall('L[0-9]+', parts[-1])
        #form pairs
        
        for i in range(len(line_nums) - 1):
            pair = (line_nums[i], line_nums[i+1])
            #df.loc[df['column_name'] == some_value]
            first = line_map[line_nums[i]]
            second = line_map[line_nums[i+1]]
            table.append([first, second])
        
            
            
data_df = pd.DataFrame(table, columns = ['in', 'out'])

In [5]:
data_df.head()

Unnamed: 0,in,out
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it
4,"No, no, it's my fault -- we didn't have a prop...",Cameron


# Word2Vec


Text Preprocessing

In [6]:
def preprocess(s):
    s = s.replace('\n',' ').lower()
    return s

def tokenize(corpus):
    tokenizer = spacy.blank("en").tokenizer
    doc = tokenizer(corpus)
    tokens = []
    for token in doc:
        if(token.text.strip() != ""):
            tokens.append(token.text)
    return tokens

def process_dataset():
    all_words = ""
    with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
        for line in f:
            parts = line.split(' +++$+++ ')
            all_words += parts[-1]
    return all_words

def generate_vocab(tokens, min_freq = 0):
    all_unique_words_counter = Counter(tokens)
    vocab = {}
    index = 0
    for w in all_unique_words_counter.keys():
        if(all_unique_words_counter[w] >= min_freq and w.strip() != ""):
            vocab[w] = index
            index += 1
    return vocab

def subsample(tokens, t = 1e-5):
    """
        Paper: https://arxiv.org/pdf/1310.4546.pdf
    """
    sampled_tokens = []
    counter = Counter(tokens)
    for token in tokens:
        f_w = counter[token]/len(tokens)
        p_w = 1 - math.sqrt(t/f_w)
        val = random.uniform(0, 1)
        if(val >= p_w):
            sampled_tokens.append(token)
            
    return sampled_tokens
        
        
        
        
def create_training_matrices(vocab, all_words, window_size = 5):	
	"""
        Returns x_train: Tensor()
    """
	numTotalWords = len(all_words)
	xTrain=[]
	yTrain=[]
	for i in range(numTotalWords):
		wordsAfter = all_words[i + 1:i + window_size + 1]
		wordsBefore = all_words[max(0, i - window_size):i]
		wordsAdded = wordsAfter + wordsBefore
		for word in wordsAdded:
			xTrain.append(vocab[all_words[i]])
			yTrain.append(vocab[word])
	return Tensor(xTrain), Tensor(yTrain)

full_corpus = process_dataset() 
full_corpus = preprocess(full_corpus)
print('Begin Tokenization')
tokens = tokenize(full_corpus) # list of string
print('Generating vocab')
vocab = generate_vocab(tokens)
print(len(tokens))
print('Subsampling data')
tokens = subsample(tokens, 1e-5)
print(len(tokens))
print('Getting training data')
x_train, y_train = create_training_matrices(vocab, tokens, window_size = 3)


Begin Tokenization
Generating vocab
4194111
Subsampling data
689239
Getting training data


SkipGram Model

In [0]:
class SkipGramModel(nn.Module):
    def __init__(self, emb_size, emb_dim):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dim = emb_dim
        self.center_embeddings = nn.Embedding(emb_size, emb_dim, sparse = True)
        self.context_embeddings = nn.Embedding(emb_size, emb_dim, sparse = True)
        self.init_emb()
    
    def init_emb(self):
        initrange = 0.5 / self.emb_dim
        self.center_embeddings.weight.data.uniform_(-initrange, initrange)
        self.context_embeddings.weight.data.uniform_(-0, 0)
        
    def forward(self, pos_center, pos_context, neg_context):
        losses = []
        emb_center = self.center_embeddings(pos_center.long())
        emb_context = self.context_embeddings(pos_context.long())
        score = torch.mul(emb_center, emb_context).squeeze()
        #print(score.shape)
        score = torch.sum(score, dim = 1)
        score = F.logsigmoid(score) # I think it is logsigmoid since we are doing nll loss func?
        losses.append(sum(score))
        
        
        neg_emb_context = self.context_embeddings(neg_context.long())
        #print(neg_emb_context.shape)
        neg_score = torch.bmm(neg_emb_context, emb_center.unsqueeze(2)).squeeze()
        neg_score = torch.sum(neg_score, dim = 1)
        neg_score = F.logsigmoid(-1 * neg_score)
        losses.append(sum(neg_score))
        return -1 * sum(losses)
        
        
        

In [0]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size = 512, shuffle = True, num_workers = 4)

def map_to_index(np_array, vocab):
    output = torch.zeros(np_array.shape)
    for i in range(len(np_array)):
        output[i] = Tensor(list(map(lambda x: vocab[x], np_array[i])))
    return output

In [9]:
lr = 3e-3

num_epochs = 1
neg_sample_size = 3
emb_dim = 100
emb_size = len(vocab.keys())
model = SkipGramModel(emb_size, emb_dim).cuda()
optim = torch.optim.SGD(model.parameters(), lr = lr) #cant use mom or wd since that would require calculating for all the params, too expensive
print(len(train_dl))
for epoch in range(num_epochs):
    total_loss = 0
    for xb, yb in train_dl:
        #neg sampling
        neg_context = np.random.choice(
            tokens,
            size=(len(xb), neg_sample_size)
        )
        
        neg_context = map_to_index(neg_context, vocab)
        #print(type(neg_context))
        loss = model(xb.cuda(), yb.cuda(), neg_context.cuda())
        total_loss += loss
        loss.backward()
        optim.step()
    print('training_loss: ' + str(total_loss/len(xb)))


8077


RuntimeError: ignored