In [1]:
import tensorflow as tf
import random
import nltk
import itertools
from collections import defaultdict
import numpy as np
import pickle

  from ._conv import register_converters as _register_converters


In [2]:
lines=open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [3]:
lines[0:5]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."]

In [4]:
line_ids = {}
for line in lines:
    line_lst = line.split(' +++$+++ ')
    if len(line_lst) == 5:
        line_ids[line_lst[0]] = line_lst[4]

In [5]:
conv = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [6]:
conv[0:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]

In [7]:
convs = []
for line in conv:
    line_lst = line.split(' +++$+++ ')
    line = line_lst[-1][1:-1].replace("'", "").replace(" ", "")
    convs.append(line.split(','))


In [8]:
# seperate question and answers 
questions = []
answers = []
for conv in convs:
    if len(conv) %2 != 0:
        conv = conv[:-1]    
    for i in range(len(conv)):
        if i%2 == 0:
            questions.append(line_ids[conv[i]])
        else:
            answers.append(line_ids[conv[i]]) 


In [9]:
# Conversion to lowercase
questions = [ line.lower() for line in questions ]
answers = [ line.lower() for line in answers ]

In [10]:
questions[0]

'can we make this quick?  roxanne korrine and andrew barrett are having an incredibly horrendous public break- up on the quad.  again.'

In [11]:
answers[0]

"well, i thought we'd start with pronunciation, if that's okay with you."

In [12]:
# remove special characters 
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz ' # space is included in whitelist
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''
def filter_line(line, whitelist):
    return ''.join([ ch for ch in line if ch in whitelist ])

In [13]:
questions = [ filter_line(line, EN_WHITELIST) for line in questions ]
answers = [ filter_line(line, EN_WHITELIST) for line in answers ]

In [14]:
questions[0]

'can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again'

In [15]:
answers[0]

'well i thought wed start with pronunciation if thats okay with you'

In [16]:
def filter_data(qseq, aseq):
    filtered_q, filtered_a = [], []
    raw_data_len = len(qseq)
    for i in range(raw_data_len):
        qlen, alen = len(qseq[i].split(' ')), len(aseq[i].split(' '))
        if qlen >= 2 and qlen <= 25:
            if alen >= 2 and alen <= 25:
                filtered_q.append(qseq[i])
                filtered_a.append(aseq[i])

    return filtered_q, filtered_a

In [17]:
questions, answers = filter_data(questions, answers)

In [18]:
# tokenization
question_tokens = [ [w.strip() for w in wordlist.split(' ') if w] for wordlist in questions ]
answer_tokens   = [ [w.strip() for w in wordlist.split(' ') if w] for wordlist in answers ]

In [19]:
question_tokens[0]

['can',
 'we',
 'make',
 'this',
 'quick',
 'roxanne',
 'korrine',
 'and',
 'andrew',
 'barrett',
 'are',
 'having',
 'an',
 'incredibly',
 'horrendous',
 'public',
 'break',
 'up',
 'on',
 'the',
 'quad',
 'again']

In [20]:
answer_tokens[0]

['well',
 'i',
 'thought',
 'wed',
 'start',
 'with',
 'pronunciation',
 'if',
 'thats',
 'okay',
 'with',
 'you']

In [21]:
# Vectorization: Words to index and index to words
UNK = 'unk'
freq_dist = nltk.FreqDist(itertools.chain(*(question_tokens + answer_tokens)))
vocab = freq_dist.most_common(8000)
index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
word2index = dict([(w,i) for i,w in enumerate(index2word)] )

In [22]:
def filter_unk(qtokenized, atokenized, w2idx):
    data_len = len(qtokenized)

    filtered_q, filtered_a = [], []

    for qline, aline in zip(qtokenized, atokenized):
        unk_count_q = len([ w for w in qline if w not in w2idx ])
        unk_count_a = len([ w for w in aline if w not in w2idx ])
        if unk_count_a <= 2:
            if unk_count_q > 0:
                if unk_count_q/len(qline) > 0.2:
                    pass
            filtered_q.append(qline)
            filtered_a.append(aline)

    return filtered_q, filtered_a

In [23]:
#Filter unknowns
qtokenized, atokenized = filter_unk(question_tokens, answer_tokens, word2index)

In [24]:
qtokenized[0]

['can',
 'we',
 'make',
 'this',
 'quick',
 'roxanne',
 'korrine',
 'and',
 'andrew',
 'barrett',
 'are',
 'having',
 'an',
 'incredibly',
 'horrendous',
 'public',
 'break',
 'up',
 'on',
 'the',
 'quad',
 'again']

In [25]:
atokenized[0]

['well',
 'i',
 'thought',
 'wed',
 'start',
 'with',
 'pronunciation',
 'if',
 'thats',
 'okay',
 'with',
 'you']

In [26]:
data_len = len(question_tokens)
# numpy arrays to store indices
idx_q = np.zeros([data_len, 25], dtype=np.int32) 
idx_a = np.zeros([data_len, 25], dtype=np.int32)

In [27]:
idx_a.shape

(98706, 25)

In [28]:
data_len

98706

In [29]:
def pad_seq(seq, lookup, maxlen):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            indices.append(lookup[UNK])
    return indices + [0]*(maxlen - len(seq))

In [30]:
for i in range(data_len):
    q_indices = pad_seq(question_tokens[i], word2index, 25)
    a_indices = pad_seq(answer_tokens[i], word2index, 25)
    idx_q[i] = np.array(q_indices)
    idx_a[i] = np.array(a_indices)

In [31]:
idx_q[0]

array([  52,   22,  114,   17,  899,    1,    1,   11, 4069, 7579,   28,
        410,   81, 3704,    1, 1257,  501,   55,   29,    4,    1,  183,
          0,    0,    0], dtype=int32)

In [32]:
idx_a[0]

array([ 43,   3, 140, 605, 331,  34,   1,  46,  49, 106,  34,   2,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
      dtype=int32)

In [33]:
np.save('idx_q.npy', idx_q)
np.save('idx_a.npy', idx_a)

In [34]:
metadata = {
        'w2idx' : word2index,
        'idx2w' : index2word,
        'freq_dist' : freq_dist
            }

In [35]:
with open('metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

In [36]:
def load_data(PATH=''):
    # read data control dictionaries
    with open(PATH + 'metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    # read numpy arrays
    idx_q = np.load(PATH + 'idx_q.npy')
    idx_a = np.load(PATH + 'idx_a.npy')
    return metadata, idx_q, idx_a

In [37]:
def split_dataset(x, y, ratio = [0.7, 0.15, 0.15] ):
    # number of examples
    data_len = len(x)
    lens = [ int(data_len*item) for item in ratio ]

    trainX, trainY = x[:lens[0]], y[:lens[0]]
    testX, testY = x[lens[0]:lens[0]+lens[1]], y[lens[0]:lens[0]+lens[1]]
    validX, validY = x[-lens[-1]:], y[-lens[-1]:]

    return (trainX,trainY), (testX,testY), (validX,validY)

In [38]:
(trainX, trainY), (testX, testY), (validX, validY) = split_dataset(idx_q, idx_a)

In [39]:
tf.reset_default_graph()
self.enc_ip = [ tf.placeholder(shape=[None,],
                dtype=tf.int64,
                name='ei_{}'.format(t)) for t in range(xseq_len) ]

self.labels = [ tf.placeholder(shape=[None,],
                dtype=tf.int64,
                name='ei_{}'.format(t)) for t in range(yseq_len) ]

self.dec_ip = [ tf.zeros_liek(self.enc_ip[0], dtype=tf.int64, name='GO')] + self.labels[:-1]


NameError: name 'xseq_len' is not defined