<a href="https://colab.research.google.com/github/AndrewPochapsky/chatbot/blob/master/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import torch
import re
from fastai.text import *
from pathlib import Path
from collections import Counter
import spacy

In [0]:
base_path = Path('drive/My Drive/datasets/cornell movie-dialogs corpus')


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Training Data Setup

In [0]:
line_map = {}
with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        line_num = parts[0]
        #-2 to get rid of \n
        text = parts[-1][:-2]
        line_map[line_num] = text
      
        


In [0]:
movie_lines_df.head()

Unnamed: 0,LineNum,text
0,L1045,They do not
1,L1044,They do to
2,L985,I hope so
3,L984,She okay
4,L925,Let's go


In [0]:
table = []
with open(base_path/'movie_conversations.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        #get the referenced line numbers
        line_nums = re.findall('L[0-9]+', parts[-1])
        #form pairs
        
        for i in range(len(line_nums) - 1):
            pair = (line_nums[i], line_nums[i+1])
            #df.loc[df['column_name'] == some_value]
            first = line_map[line_nums[i]]
            second = line_map[line_nums[i+1]]
            table.append([first, second])
        
            
            
data_df = pd.DataFrame(table, columns = ['in', 'out'])

In [0]:
data_df.head()

Unnamed: 0,in,out
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it
4,"No, no, it's my fault -- we didn't have a prop...",Cameron


# Word2Vec


Text Preprocessing

In [0]:
def preprocess(s):
    s = s.replace('\n',' ').lower()
    return s

Tokenize

In [0]:
def tokenize(corpus):
    tokenizer = spacy.blank("en").tokenizer
    return tokenizer(corpus)

In [0]:
def process_dataset():
    all_words = ""
    with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
        for line in f:
            parts = line.split(' +++$+++ ')
            all_words += clean_message(parts[-1])
    return all_words, Counter(all_words.split())

def generate_vocab(dictionary):
    all_unique_words = list(dictionary.keys())
    vocab = {}
    index = 0
    for w in all_unique_words:
        vocab[w] = index
        index += 1
    return vocab

def create_training_matrices(vocab, corpus, window_size = 5):	
	allWords = corpus.split()
	numTotalWords = len(allWords)
	xTrain=[]
	yTrain=[]
	for i in range(numTotalWords):
		if i % 100000 == 0:
			print ('Finished %d/%d total words' % (i, numTotalWords))
		wordsAfter = allWords[i + 1:i + window_size + 1]
		wordsBefore = allWords[max(0, i - window_size):i]
		wordsAdded = wordsAfter + wordsBefore
		for word in wordsAdded:
			xTrain.append(vocab[allWords[i]])
			yTrain.append(vocab[word])
	return xTrain, yTrain

full_corpus, dataset_dictionary = process_dataset() 
full_corpus = pre_process(full_corpus)
tokenized_corpus = tokenize(full_corpus)
#vocab = generate_vocab(dataset_dictionary)
#x_train, y_train = create_training_matrices(vocab, full_corpus)


they
do
not
they
do
to
i
hope
so
she
okay
let
's
go
wow
okay
--
you
're
gon
na
need
to
learn
how
to
lie
no
i
'm
kidding
you
know
how
sometimes
you
just
become
this
"
persona
"
and
you
do
n't
know
how
to
quit
like
my
fear
of
wearing
pastels
the
"
real
you
"
what
good
stuff
i
figured
you
'd
get
to
the
good
stuff
eventually
thank
god
if
i
had
to
hear
one
more
story
about
your
coiffure
me
this
endless
blonde
babble
i
'm
like
boring
myself
what
crap
do
you
listen
to
this
crap
no
then
guillermo
says
"
if
you
go
any
lighter
you
're
gon
na
look
like
an
extra
on
90210
"
you
always
been
this
selfish
but
then
that
's
all
you
had
to
say
well
no
you
never
wanted
to
go
out
with
'
me
did
you
i
was
i
looked
for
you
back
at
the
party
but
you
always
seemed
to
be
"
occupied
"
tons
have
fun
tonight
i
believe
we
share
an
art
instructor
you
know
chastity
looks
like
things
worked
out
tonight
huh
hi
who
knows
all
i
've
ever
heard
her
say
is
that
she
'd
dip
before
dating
a
guy
that
smokes
so
that
's
the
kind
o

KeyboardInterrupt: ignored

3492043
