# Part 1 **Preparation**

RNN are called recurrent because they perform the same task for every element of a sequence
They loop back to themselves, that's called memory.
<br><br>
Sigmoid activation eventually leads to vanishing gradient for "deep" neural nets. 
<br><br>
They are especially important for the first layers, which are responsible for low-level features.
<br><br>
ReLu activation wont work for negative values. ReLu would still end up with vanishing gradients.
<br><br>
Leaky ReLu has a negative derivation for negative values. The zero is not there but small values.
<br><br>
LSTMs introduce a memory pipeline. They solve the problem. Here for RNN, each Neuron is an LSTM cell. 


## 1.1 pyTorch

In [168]:
import torch

In [169]:
# 1-D tensor
a = torch.tensor([2,2,1])
print(a)

tensor([2, 2, 1])


In [170]:
# This is a 2-D 
b = torch.tensor([[2,1,4],[3,5,4],[1,2,0],[4,3,2]])
print(b)

tensor([[2, 1, 4],
        [3, 5, 4],
        [1, 2, 0],
        [4, 3, 2]])


In [171]:
# The size of a tensor
print(a.shape)
print(b.shape)
print(a.size())
print(b.size())

torch.Size([3])
torch.Size([4, 3])
torch.Size([3])
torch.Size([4, 3])


In [172]:
# Get the height/number of rows of b
print(b.shape[0])

4


In [173]:
c = torch.FloatTensor([[2,1,4],[3,5,4],[1,2,0],[4,3,2]])
# c = torch.tensor([1,2,2], dtype = torch.float)
print(c)
print(c.dtype)

tensor([[2., 1., 4.],
        [3., 5., 4.],
        [1., 2., 0.],
        [4., 3., 2.]])
torch.float32


In [174]:
d = torch.DoubleTensor([[2,1,4],[3,5,4],[1,2,0],[4,3,2]])
# c = torch.tensor([1,2,2], dtype = torch.double)
print(d)
print(d.dtype)

tensor([[2., 1., 4.],
        [3., 5., 4.],
        [1., 2., 0.],
        [4., 3., 2.]], dtype=torch.float64)
torch.float64


In [175]:
print(c.mean())
print(c.std())

tensor(2.5833)
tensor(1.5050)


In [176]:
print(d.mean())
print(d.std())

tensor(2.5833, dtype=torch.float64)
tensor(1.5050, dtype=torch.float64)


In [177]:
# Reshape b
# if one dimension is -1, then its size can be inferred
print(b.view(-1, 1))
print(b.view(12))
print(b.view(-1, 4))
print(b.view(3, 4))
b = b.view(-1, 1)
print(b)
print(b.shape)
print()
three_dim = torch.randn(2, 3, 4)
print()
print(three_dim)
print()
print(three_dim.view(2,12))
print()
print(three_dim.view(2,-1))


tensor([[2],
        [1],
        [4],
        [3],
        [5],
        [4],
        [1],
        [2],
        [0],
        [4],
        [3],
        [2]])
tensor([2, 1, 4, 3, 5, 4, 1, 2, 0, 4, 3, 2])
tensor([[2, 1, 4, 3],
        [5, 4, 1, 2],
        [0, 4, 3, 2]])
tensor([[2, 1, 4, 3],
        [5, 4, 1, 2],
        [0, 4, 3, 2]])
tensor([[2],
        [1],
        [4],
        [3],
        [5],
        [4],
        [1],
        [2],
        [0],
        [4],
        [3],
        [2]])
torch.Size([12, 1])


tensor([[[ 1.6049, -0.0043, -0.5227,  0.9592],
         [ 0.5820, -0.4637,  0.5721, -0.0293],
         [-0.0214,  1.8570,  1.1003, -0.5689]],

        [[-0.2166,  0.1264, -2.0458, -1.7349],
         [-0.9254, -1.9582, -0.5273,  0.6805],
         [-0.1811,  0.6854, -0.6828,  0.5640]]])

tensor([[ 1.6049, -0.0043, -0.5227,  0.9592,  0.5820, -0.4637,  0.5721, -0.0293,
         -0.0214,  1.8570,  1.1003, -0.5689],
        [-0.2166,  0.1264, -2.0458, -1.7349, -0.9254, -1.9582, -0.5273,

In [178]:
# create a matrix with random numbers taken from a normal distribution with mean 0 and variance 1
r2 = torch.randn(4,4)
print(r2)
print(r2.dtype)

tensor([[-0.1457,  0.8138,  0.0161, -0.1944],
        [-0.4398,  0.5313, -1.5113, -0.2654],
        [-0.7952,  1.5608,  1.8891,  1.6383],
        [-0.1394,  0.0750, -0.2893, -0.2829]])
torch.float32


In [179]:
# create an array of 5 random inteeres from values between 6 and 9
in_array = torch.randint(6,10,(5,))
print(in_array)
print(in_array.dtype)

tensor([8, 9, 7, 9, 8])
torch.int64


In [180]:
# create a 2-D array of size 3x3 filled with random integers from values between 6 and 9
in_array2 = torch.randint(6,10,(3,3))
print(in_array2)

tensor([[9, 6, 8],
        [7, 6, 7],
        [6, 7, 7]])


In [181]:
# get the number of elements
print(torch.numel(in_array))
print(torch.numel(in_array2))

5
9


In [182]:
# construct a 3x3 matrix of zeros and of dtype long:
z = torch.zeros(3,3,dtype=torch.long)
print(z)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [183]:
# construct a 3x3 matrix of ones
o = torch.ones(3,3)
print(o)
print(o.dtype)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
torch.float32


In [184]:
r2_like = torch.randn_like(r2, dtype=torch.double)
print(r2_like)

tensor([[ 0.3392, -1.6874, -0.2047,  1.2174],
        [ 1.2757, -1.2088,  1.3244, -1.5746],
        [ 0.0298, -0.8718, -0.2900,  0.0554],
        [ 0.5342,  0.6592,  1.1108,  0.6998]], dtype=torch.float64)


In [185]:
# add two tensors, make sure they are the same size and data type
add_result = torch.add(r2_like,r2)
print(add_result)

tensor([[ 0.1935, -0.8736, -0.1886,  1.0230],
        [ 0.8358, -0.6776, -0.1869, -1.8401],
        [-0.7653,  0.6890,  1.5991,  1.6937],
        [ 0.3948,  0.7342,  0.8215,  0.4169]], dtype=torch.float64)


In [186]:
# in place addition, inplace operation
r2.add_(r2_like)
print(r2)

tensor([[ 0.1935, -0.8736, -0.1886,  1.0230],
        [ 0.8358, -0.6776, -0.1869, -1.8401],
        [-0.7653,  0.6890,  1.5991,  1.6937],
        [ 0.3948,  0.7342,  0.8215,  0.4169]])


In [187]:
# matrix slicing
print(r2[:,1])
print(r2[:,:2])
print(r2[:3,:])
num_ten = r2[2,3]
print(num_ten)
print(num_ten.item())
print(r2[2,:])

tensor([-0.8736, -0.6776,  0.6890,  0.7342])
tensor([[ 0.1935, -0.8736],
        [ 0.8358, -0.6776],
        [-0.7653,  0.6890],
        [ 0.3948,  0.7342]])
tensor([[ 0.1935, -0.8736, -0.1886,  1.0230],
        [ 0.8358, -0.6776, -0.1869, -1.8401],
        [-0.7653,  0.6890,  1.5991,  1.6937]])
tensor(1.6937)
1.6936734914779663
tensor([-0.7653,  0.6890,  1.5991,  1.6937])


## 1.2 Numpy Bridge

In [188]:
import numpy as np
# Converting a torch tensor to a NumPy Array
a = torch.ones(5)
b = a.numpy()
print(b)

# See how the numpy array changed its value
a.add_(1)
print(a)
print(b)

[1. 1. 1. 1. 1.]
tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [189]:
# convert numpy array to torch tensor
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [190]:
# move the tensor to the GPU
r2 = r2.cuda()
print(r2)

tensor([[ 0.1935, -0.8736, -0.1886,  1.0230],
        [ 0.8358, -0.6776, -0.1869, -1.8401],
        [-0.7653,  0.6890,  1.5991,  1.6937],
        [ 0.3948,  0.7342,  0.8215,  0.4169]], device='cuda:0')


In [191]:
CUDA = torch.cuda.is_available()
print(CUDA)
if CUDA:
  add_result = add_result.cuda()
  print(add_result)

True
tensor([[ 0.1935, -0.8736, -0.1886,  1.0230],
        [ 0.8358, -0.6776, -0.1869, -1.8401],
        [-0.7653,  0.6890,  1.5991,  1.6937],
        [ 0.3948,  0.7342,  0.8215,  0.4169]], device='cuda:0',
       dtype=torch.float64)


In [192]:
# convert a list to a tensor
a = [2,3,4,1]
print(a)
to_list = torch.tensor(a)
print(to_list, to_list.dtype)

[2, 3, 4, 1]
tensor([2, 3, 4, 1]) torch.int64


In [193]:
data = [[1.,2.],[3.,4.],[5.,6.],[7.,8.]]
T = torch.tensor(data)
print(T, T.dtype)

tensor([[1., 2.],
        [3., 4.],
        [5., 6.],
        [7., 8.]]) torch.float32


## 1.3 Tensor Concatenation

In [194]:
first_1 = torch.randn(2, 5)
print(first_1)
second_1 = torch.randn(3, 5)
print(second_1)
con_1 = torch.cat([first_1, second_1])
print()
print(con_1)
print()
first_2 = torch.randn(2, 3)
second_2 = torch.randn(2, 5)
print(first_2)
print(second_2)
con_2 = torch.cat([first_2, second_2], 1)
print(con_2)

tensor([[ 1.8082, -0.8210, -0.0235,  2.3321, -1.4842],
        [-1.1543, -0.7313,  0.7369, -1.3759, -1.6354]])
tensor([[ 0.7411,  1.4919,  0.5363,  0.0944, -0.3182],
        [ 1.2736, -0.2378, -0.3954,  0.4853, -0.2460],
        [ 1.3595,  1.2056,  1.3826,  0.7342, -0.0476]])

tensor([[ 1.8082, -0.8210, -0.0235,  2.3321, -1.4842],
        [-1.1543, -0.7313,  0.7369, -1.3759, -1.6354],
        [ 0.7411,  1.4919,  0.5363,  0.0944, -0.3182],
        [ 1.2736, -0.2378, -0.3954,  0.4853, -0.2460],
        [ 1.3595,  1.2056,  1.3826,  0.7342, -0.0476]])

tensor([[ 0.2545, -0.8233, -0.2017],
        [ 0.7761,  1.8821,  0.3406]])
tensor([[-0.0149, -2.2492,  0.0748, -0.8012, -1.4579],
        [-1.6666,  0.9097, -0.4240,  1.0185, -0.1013]])
tensor([[ 0.2545, -0.8233, -0.2017, -0.0149, -2.2492,  0.0748, -0.8012, -1.4579],
        [ 0.7761,  1.8821,  0.3406, -1.6666,  0.9097, -0.4240,  1.0185, -0.1013]])


## Adding dimensions to tensor

In [195]:
tensor_1 = torch.tensor([1,2,3,4])
print(tensor_1)
tensor_a = torch.unsqueeze(tensor_1, 0)
print(tensor_a)
print(tensor_a.shape)
tensor_b = torch.unsqueeze(tensor_1, 1)
print(tensor_b)
print(tensor_b.shape)
print()
tensor_2 = torch.rand(2,3,4)
print(tensor_2)
print()
tensor_c = tensor_2[:,:,2]
print(tensor_c)
print(tensor_c.shape)
print()
tensor_d = torch.unsqueeze(tensor_c, 2)
print(tensor_d)
print(tensor_d.shape)

tensor([1, 2, 3, 4])
tensor([[1, 2, 3, 4]])
torch.Size([1, 4])
tensor([[1],
        [2],
        [3],
        [4]])
torch.Size([4, 1])

tensor([[[0.8841, 0.7729, 0.4977, 0.7353],
         [0.5057, 0.4887, 0.3625, 0.0291],
         [0.5705, 0.0687, 0.3374, 0.7471]],

        [[0.0322, 0.1514, 0.4635, 0.4413],
         [0.9635, 0.1517, 0.7576, 0.7530],
         [0.5919, 0.0848, 0.5322, 0.8456]]])

tensor([[0.4977, 0.3625, 0.3374],
        [0.4635, 0.7576, 0.5322]])
torch.Size([2, 3])

tensor([[[0.4977],
         [0.3625],
         [0.3374]],

        [[0.4635],
         [0.7576],
         [0.5322]]])
torch.Size([2, 3, 1])


## 2.1 Building a Chatbot
Code adopted from https://pytorch.org/tutorials/beginner7chatbot_tutorial.html

In [196]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import  re
import os
import unicodedata
import codecs
import itertools

In [197]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")

In [198]:
print(device)

cuda


### 2.2. Preprocessing

In [199]:
lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [200]:
# visualize some lines
with open(lines_filepath, 'r', encoding="iso-8859-1") as file:
  lines = file.readlines()
for line in lines[:8]:
  print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [201]:
# split each line of the file into a dictionary of fields (lineID, characterID, moveiID, character, text)
line_fields = ["lineID", "characterID", "movieID", "character", "text"]
lines = {}
with open(lines_filepath, 'r', encoding="iso-8859-1") as f:
  for line in f:
    values = line.split(" +++$+++ ")
    # Extract fields
    lineObj = {}
    for i, field in enumerate(line_fields):
      lineObj[field] = values[i]
    lines[lineObj['lineID']] = lineObj

In [202]:
lines['L194']

{'character': 'BIANCA',
 'characterID': 'u0',
 'lineID': 'L194',
 'movieID': 'm0',
 'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'}

In [203]:
[]# groups fields of lines from 'loadlines' into conversations based on +movie-conversations.txt"
conv_fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
conversations = []
with open(conv_filepath, 'r', encoding='iso-8859-1') as f:
  for line in f:
    values = line.split(" +++$+++ ")
    # Extract fields
    convObj = {}
    for i, field in enumerate(conv_fields):
      convObj[field] = values[i]
    # convert string result from split to list, since convObj["utteranceIDs"] == "['L598485','L598486',...]"
    lineIds = eval(convObj["utteranceIDs"])
    # Reassamble lines
    convObj["lines"] = []
    for lineId in lineIds:
      convObj["lines"].append(lines[lineId])
      conversations.append(convObj)

In [204]:
print(conv_fields)
print(convObj)
conversations[0]


['character1ID', 'character2ID', 'movieID', 'utteranceIDs']
{'character1ID': 'u9030', 'character2ID': 'u9034', 'movieID': 'm616', 'utteranceIDs': "['L666520', 'L666521', 'L666522']\n", 'lines': [{'lineID': 'L666520', 'characterID': 'u9034', 'movieID': 'm616', 'character': 'VEREKER', 'text': 'Well I assure you, Sir, I have no desire to create difficulties. 45\n'}, {'lineID': 'L666521', 'characterID': 'u9030', 'movieID': 'm616', 'character': 'DURNFORD', 'text': "And I assure you, you do not In fact I'd be obliged for your best advice. What have your scouts seen?\n"}, {'lineID': 'L666522', 'characterID': 'u9034', 'movieID': 'm616', 'character': 'VEREKER', 'text': 'So far only their scouts. But we have had reports of a small Impi farther north, over there. \n'}]}


{'character1ID': 'u0',
 'character2ID': 'u2',
 'lines': [{'character': 'BIANCA',
   'characterID': 'u0',
   'lineID': 'L194',
   'movieID': 'm0',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'character': 'CAMERON',
   'characterID': 'u2',
   'lineID': 'L195',
   'movieID': 'm0',
   'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'character': 'BIANCA',
   'characterID': 'u0',
   'lineID': 'L196',
   'movieID': 'm0',
   'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'character': 'CAMERON',
   'characterID': 'u2',
   'lineID': 'L197',
   'movieID': 'm0',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}],
 'movieID': 'm0',
 'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n"}

In [205]:
# extract pairs of sentences from conversations
qa_pairs = []
for conversation in conversations:
  # iterate over all the lines of the conversation
  for i in range(len(conversation["lines"]) - 1):
    inputLine = conversation["lines"][i]["text"].strip()
    targetLine = conversation["lines"][i+1]["text"].strip()
    # filter wrong samples (if only one of the lists is empty)
    if inputLine and targetLine:
      qa_pairs.append([inputLine, targetLine])


In [206]:
qa_pairs[0]

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you."]

In [207]:
# define path to a new file
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
  writer = csv.writer(outputfile, delimiter = delimiter)
  for pair in qa_pairs:
    writer.writerow(pair)
print("Done writing to file")



Writing newly formatted file...
Done writing to file


In [208]:
# Visualize some lines
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
with open(datafile, 'rb') as file:
  lines = file.readlines()
for line in lines[:8]:
  print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"Can we make this quick

In [209]:
PAD_token = 0 # Use for padding short sentences
SOS_token = 1 # Start-of-sentence token
EOS_token = 2 # End-of-sentence token

class Vocabulary:
  def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
    self.num_words = 3 # count sos, eos, pad
  
  def addSentence(self, sentence):
    for word in sentence.split(' '):
      self.addWord(word)
    
  def addWord(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.num_words
      self.word2count[word] = 1
      self.index2word[self.num_words] = word
      self.num_words += 1
    else: 
      self.word2count[word] += 1

  # Remove words below a certain coun threshold
  def trim(self, min_count):
    keep_words = []
    for k, v in self.word2count.items():
      if v>= min_count:
        keep_words.append(k)

    print('keep_words {} / {} = {:.4f}'.format(len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)))
    # Reinitialize dictionaries
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
    self.num_words = 3 # count sos, eos, pad

    for word in keep_words:
      self.addWord(word)

      

In [210]:
def unicodeToAscii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [211]:
unicodeToAscii("Montréal,Françoise... ")

'Montreal,Francoise... '

In [212]:
def normalizeString(s):
  s = unicodeToAscii(s.lower().strip())
  # replace any character .!? by the whitespace + the character e.g. '!' --> ' !'.
  # \1 means the first bracketed group.
  # do not consider \1.
  # r to escape the backslash.
  s = re.sub(r"([.!?])", r" \1", s)
  # remove any sequence that is not a sequence of lower or uppercase letters
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  # remove a sequence of whitespaces
  s = re.sub(r"\s+", r" ", s).strip()
  return s

In [213]:
normalizeString("aa123aa!s's   dd?")

'aa aa !s s dd ?'

In [214]:
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
# read file and split into lines
print("Reading and processing file....Please Wait")
lines = open(datafile, encoding='utf-8').read().strip().split("\n")
# Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print("Done Reading!")
voc = Vocabulary("cornell movie-dialogs corpus")

Reading and processing file....Please Wait
Done Reading!


In [215]:
len(pairs[0][0].split())

25

In [216]:
# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
MAX_LENGTH = 10 # Maximum sentence length to consider (max words)
def filterPair(p):
  # Input sentences need to preserve the last word for EOS token
  return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH

def filterPairs(pairs):
  return [pair for pair in pairs if filterPair(pair)]

In [217]:
pairs = [pair for pair in pairs if len(pair) > 1]
print("There are {} pairs/conversations in the dataset".format(len(pairs)))
pairs = filterPairs(pairs)
print("There are {} pairs/conversations in the dataset".format(len(pairs)))

There are 1504021 pairs/conversations in the dataset
There are 425344 pairs/conversations in the dataset


In [218]:
for pair in pairs:
  voc.addSentence(pair[0])
  voc.addSentence(pair[1])
print("Counted words: ", voc.num_words)
for pair in pairs[:10]:
  print(pair)

Counted words:  18008
['there .', 'where ?']
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['have fun tonight ?', 'tons']


In [219]:
MIN_COUNT = 3 # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
  # Trim words used under the MIN_COUNT from voc
  voc.trim(MIN_COUNT)
  # Filter out pairs with trimmed words
  keep_pairs = []
  for pair in pairs:
    input_sentence = pair[0]
    output_sentence = pair[1]
    keep_input = True
    keep_output = True
    # Check input sentence
    for word in input_sentence.split(' '):
      if word not in voc.word2index:
        keep_input = False
        break
    # Check input sentence
    for word in output_sentence.split(' '):
      if word not in voc.word2index:
        keep_output = False
        break

    # Only keep pairs that do not contain trimmed word(s) in their input or or output sentence
    if keep_input and keep_output:
      keep_pairs.append(pair)

  print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs)/len(pairs)))
  return keep_pairs

trim = trimRareWords(voc, pairs, MIN_COUNT)
    

keep_words 15963 / 18005 = 0.8866
Trimmed from 425344 pairs to 421890, 0.9919 of total


## 2.3 Data preparation

In [220]:
def indexesFromSentence(voc, sentence):
  return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [221]:
# Test the function
indexesFromSentence(voc, pairs[1][0])

[3, 4, 2]

In [222]:
# Define some samples for testing
inp = []
out = []
i = 0
for pair in pairs[:10]:
  inp.append(pair[0])
  out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes

['there .', 'there .', 'you have my word . as a gentleman', 'you have my word . as a gentleman', 'hi .', 'hi .', 'you know chastity ?', 'you know chastity ?', 'have fun tonight ?', 'have fun tonight ?']
10


[[3, 4, 2],
 [3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [16, 4, 2],
 [7, 24, 25, 6, 2],
 [7, 24, 25, 6, 2],
 [8, 32, 22, 6, 2],
 [8, 32, 22, 6, 2]]

In [223]:
# zip by longer list: itertools.zip_longest(a,b) --> zip object --> past to a list or iterate through it

In [224]:
def zeroPadding(l, fillvalue = 0):
  return list(itertools.zip_longest(*l, fillvalue = fillvalue))

In [225]:
leng = [len(ind) for ind in indexes]
max(leng)

9

In [226]:
# Test the function
test_result = zeroPadding(indexes)
print(len(test_result))
test_result

9


[(3, 3, 7, 7, 16, 16, 7, 7, 8, 8),
 (4, 4, 8, 8, 4, 4, 24, 24, 32, 32),
 (2, 2, 9, 9, 2, 2, 25, 25, 22, 22),
 (0, 0, 10, 10, 0, 0, 6, 6, 6, 6),
 (0, 0, 4, 4, 0, 0, 2, 2, 2, 2),
 (0, 0, 11, 11, 0, 0, 0, 0, 0, 0),
 (0, 0, 12, 12, 0, 0, 0, 0, 0, 0),
 (0, 0, 13, 13, 0, 0, 0, 0, 0, 0),
 (0, 0, 2, 2, 0, 0, 0, 0, 0, 0)]

In [227]:
def binaryMatrix(l, value = 0):
  m = []
  for i, seq in enumerate(l):
    m.append([])
    for token in seq:
      if token == PAD_token:
        m[i].append(0)
      else:
        m[i].append(1)
  return m

In [228]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [0, 0, 1, 1, 0, 0, 1, 1, 1, 1],
 [0, 0, 1, 1, 0, 0, 1, 1, 1, 1],
 [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]]

In [229]:
def inputVar(l, voc):
  indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
  lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
  padList = zeroPadding(indexes_batch)
  padVar = torch.LongTensor(padList)
  return padVar, lengths

In [230]:
def outputVar(l, voc):
  indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
  max_target_len = max([len(indexes) for indexes in indexes_batch])
  padList = zeroPadding(indexes_batch)
  mask = binaryMatrix(padList)
  mask = torch.ByteTensor(mask)
  padVar = torch.LongTensor(padList)
  return padVar, mask, max_target_len


In [231]:
# Return all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
  # Sort the questions in descending length
  pair_batch.sort(key = lambda x: len(x[0].split(" ")), reverse = True)
  input_batch, output_batch = [], []
  for pair in pair_batch:
    input_batch.append(pair[0])
    output_batch.append(pair[1])
  inp, lengths = inputVar(input_batch, voc)
  # assert len(inp9 == lengths[0]
  output, mask, max_target_len = outputVar(output_batch, voc)
  return inp, lengths, output, mask, max_target_len

In [232]:
# example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input variable")
print(input_variable)
print("lenghts: ", lengths)
print("target_variable:")
print(target_variable)
print("mask:")
print(mask)
print("max_target_len:", max_target_len)


input variable
tensor([[ 103,   26,   26,  255, 1221],
        [1703,  212,    8,   23,    6],
        [ 120,  569,   41,    4,    2],
        [  67,   62,   62,    2,    0],
        [ 703,   41,    4,    0,    0],
        [ 426,  177,    2,    0,    0],
        [  68,  309,    0,    0,    0],
        [   6,    4,    0,    0,    0],
        [   2,    2,    0,    0,    0]])
lenghts:  tensor([9, 9, 6, 4, 3])
target_variable:
tensor([[  26,   35,  863,  125, 1222],
        [ 209,    7,    4,  304,    4],
        [ 120,   14,    2,    7,    9],
        [  24,   69,    0,   40,  316],
        [  68,    4,    0,   41,    4],
        [   2,    2,    0,  383,   78],
        [   0,    0,    0,    4,   38],
        [   0,    0,    0,    4,  271],
        [   0,    0,    0,    4,    4],
        [   0,    0,    0,    2,    2]])
mask:
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 0, 1, 1],
        [1, 1, 0, 1, 1],
        [1, 1, 0, 1, 1],
        [0, 0, 0

## Building the model

In [233]:
class EncoderRNN(nn.Module):
  def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
    super(EncoderRNN, self).__init__()
    self.n_layers = n_layers
    self.hidden_size = hidden_size
    self.embedding = embedding
    # Initialize GRU; the input_size and hidden_size are both set to 'hidden_size'
    # because our input size is a word embedding with number of features == hidden_size
    self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

  # overriding the forward function
  def forward(self, input_seq, input_lengths, hidden = None):
    # input_seq: batch of input sentences; shape=(max_length, batch_size)
    # input_lengths: list of sentence lengths corresponding to each sentence in the batch
    # hidden state, of shape: (n_layers x num_directions, batch_size, hidden_size)
    # Convert word indexes to embeddings
    embedded = self.embedding(input_seq)
    # Pack padded batch of sequences for RNN module
    packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
    # Forward pass through GRU
    outputs, hidden = self.gru(packed, hidden)
    # Unpack padding
    outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
    # Sum bidirectional GRU outputs
    outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
    # return output and final hidden state
    return outputs, hidden
    # outputs: the output features h_t form the last layer of the GRU, for each timestep (sum of bidirectional outputs)
    # outputs shape = (max_length, batch_size, hidden_size)
    # hidden: hidden state for the last timestep, of shape = (n_layers x num_directions, batch_size, hidden_size)


In [234]:
# Luong attention layer
class Attn(nn.Module):
  def __init__(self, method, hidden_size):
    super(Attn, self).__init__()
    self.method = method
    self.hidden_size = hidden_size

  def dot_score(self, hidden, encoder_output):
    # Element-Wise Multiply the curernt target decoder state with the encoder output and sum them
    return torch.sum(hidden * encoder_output, dim = 2)

  def forward(self, hidden, encoder_outputs):
    # hidden of shape: (1, batch_size, hidden_size)
    # encoder_outputs of shape: (max_length, batch_size, hidden_size)

    # Calculate the attention weights (energies)
    attn_energies = self.dot_score(hidden, encoder_outputs) # (max_length, batch_size)
    # transpose max_length and batch_size dimensions
    attn_energies = attn_energies.t()                       # (batch_size, max_length)
    # return the softmax normalized probability scores (with added dimension)
    return F.softmax(attn_energies, dim = 1).unsqueeze(1)   # (batch_size, l, max_length)

In [235]:
class LuongAttnDecoderRNN(nn.Module):
  def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers = 1, dropout = 0.1):
    super(LuongAttnDecoderRNN, self).__init__()
    self.attn_model = attn_model
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout

    # Define layers
    self.embedding = embedding
    self.embedding_dropout = nn.Dropout(dropout)
    self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout = (0 if n_layers == 1 else dropout))
    self.concat = nn.Linear(hidden_size * 2, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)

    self.attn = Attn(attn_model, hidden_size)

  def forward(self, input_step, last_hidden, encoder_outputs):
    # iput_step: one time step (one word) of input sequence batch; shape = (1, batch_size)
    # last_hidden: final hidden layer of GRU; shape = (n layers x num_directions, batch_size, hidden_size)
    # encoder_outputs: encoder model's output; shape = (max_length, batch_size, hidden_size)
    # Note: we run this one step (word) at a time

    # Get embedding of current input word
    embedded = self.embedding(input_step)
    embedded = self.smedding_dropout(embedded)

    # Forward through unidirectional GRU
    rnn_output, hidden = self.gru(embedded, last_hidden)
    # Calculate attention weights from the current GRU output
    attn_weights = self.attn(rnn_output, encoder_outputs)
    # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
    # (batch_size, 1, max_length) bmm with (batch_size, max_length, hidden) = (batch_size, 1, hidden)
    context = attn_weights.bmm(encoder_outputs.transpose(0,1))
    # concatenate weighted context vector and GRU output
    rnn_output = rnn_output.squeeze(0)
    context = context.squeze(1)
    concat_input = torch.cat((rnn_output, context), 1)
    concat_output = torch.tanh(self.concat(concat_input))
    # predict next word using Luong eq. 6
    output = self.out(concat_output)
    output = F.softmax(output, dim = 1)
    # Return output and final hidden state
    return output, hidden
    # output: softmax normalized tensor giving probabilities of each word being the correct next word in the decoded sequence
    # shape = (batch_size, voc.num_words)
    # hidden: final hidden state of GRU; shape = (n_layers x num_directions, batch_size, hidden_size)
    


## Training the model

In [238]:
# Creating the loss function
def maskNLLLoss(decoder_out, target, mask):
  nTotal = mask.sum() # How many element to consider
  target = target.view(-1, 1)
  # decoder_out shape: (batch_size, vocab_size), target_size = (batch_size, 1)
  gathered_tensor = torch.gather(decoder_out, 1, target)
  # calculate the negative log likelihoods
  crossEntrophy = -torch.log(gathered_tensor)
  # Select the non-zero elements
  loss = crossEntrophy.masked_select(mask)
  # calculate the mean of the loss
  loss = loss.mean()
  loss = loss.to(device)
  return loss, nTotal.item()

In [239]:
# visualize what is happening in the loss function
# decoder_out shape : (batch_size, vocab_size), target_size = (batch_size, 1)
dec_o = torch.rand(5,7)
dec_o = F.softmax(dec_o, dim = 1)
tar = torch.tensor([2,1,5,4,0], dtype = torch.long)
tar = tar.view(-1, 1)
mask = torch.tensor([1,0,1,1,0], dtype = torch.uint8)
print(dec_o)
print(tar)
gath_ten = torch.gather(dec_o, 1, tar)
print(gath_ten)
print(gath_ten.shape)
crossEntropy = -torch.log(gath_ten)
print("cross entropy")
print(crossEntropy)
mask = mask.unsqueeze(1)
loss = crossEntropy.masked_select(mask)
print("Loss:")
print(loss)
print(loss.shape)
print("Sum of mask elements (How many elements we are considering):", mask.sum())
print("Mean of the Loss:", loss.mean())
print("Mean of the cross-entropy loss (without masking):", crossEntropy.mean())


tensor([[0.0936, 0.1425, 0.2297, 0.1032, 0.1476, 0.1395, 0.1438],
        [0.1639, 0.1464, 0.1899, 0.0884, 0.1050, 0.0947, 0.2117],
        [0.1012, 0.1176, 0.1077, 0.1163, 0.2514, 0.1466, 0.1592],
        [0.1032, 0.1848, 0.1482, 0.2078, 0.1180, 0.1431, 0.0949],
        [0.1187, 0.1127, 0.1128, 0.1050, 0.2370, 0.1036, 0.2103]])
tensor([[2],
        [1],
        [5],
        [4],
        [0]])
tensor([[0.2297],
        [0.1464],
        [0.1466],
        [0.1180],
        [0.1187]])
torch.Size([5, 1])
cross entropy
tensor([[1.4708],
        [1.9216],
        [1.9200],
        [2.1372],
        [2.1315]])
Loss:
tensor([1.4708, 1.9200, 2.1372])
torch.Size([3])
Sum of mask elements (How many elements we are considering): tensor(3)
Mean of the Loss: tensor(1.8427)
Mean of the cross-entropy loss (without masking): tensor(1.9162)




In [241]:
# Visualize  training
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable shape:", input_variable.shape)
print("lengths shape:", lengths.shape)
print("target_variable_shape:", target_variable.shape)
print("mask shape:", mask.shape)
print("max_target_len:", max_target_len)

# Define the parameters
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding = nn.Embedding(voc.num_words, hidden_size)

# define the encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
# ensure dropout layers are in train mode
encoder.train()
decoder.train()

# initialize optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
print("Encoder Outputs Shape:", encoder_outputs.shape)
print("Last Encoder Hidden Shape", encoder_hidden.shape)

decoder_input = torch.LongTensor([SOS_token for _ in range(small_batch_size)])
decoder_input = decoder_input.to(device)
print("Initial decoder hidden state shape:", decoder_input.shape)
print(decoder_input)

# Set initial decoder hidden state to the encoder's final hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
print("Initial decoder hidden state shape: ", decoder_hidden.shape)
print("\n")
print("----------------------------------------------------------")
print("timesteps of the GRU")
print("----------------------------------------------------------")
print("\n")

# Assume we are using Teacher Forcing
for t in range(max_target_len):
  decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
  print("decoder Output Shape:", decoder_output.shape)
  print("decoder Hidden Shape:", decoder_hidden.shape)
  # Teacher forcing: next input is current target
  decoder_input = target_variable[t].view(1, -1)
  print("The target variable at the current timestep before reshaping and its shape:", target_variable[t], target_variable[t].shape)
  print("The Decoder input shape (reshaping the target variable):", decoder_input.shape)
  # calculate and accumulate loss
  print("The mask at the current timestep:", mask[t])
  print("The mask at the current timestep shape:", mask[t].shape)
  mask_loss, nTotal  =maskNLLLoss(decoder_output, target_variable[t], mask[t])
  print("Mask Loss:", mask_loss)
  print("Total:", nTotal)
  loss += nTotal
  print(n_totals)
  encoder_optimizer.step()
  decoder_optimizer.step()
  returned_loss = sum(print_losses) / n_totals
  print("Returned Loss:", returned_loss)
  print("\n")
  print("-------------------------- Done one timestep ------------")
  print("\n")






input_variable shape: torch.Size([9, 5])
lengths shape: torch.Size([5])
target_variable_shape: torch.Size([8, 5])
mask shape: torch.Size([8, 5])
max_target_len: 8


RuntimeError: ignored

In [None]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding, encoder_optimizer,
          decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
  
  # zero gradients
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  # set device options
  input_variable = input_variable.to(device)
  lengths = lengths.to(device)
  target_variable = target_variable.to(device)
  mask = mask.to(device)

  # initialize variables
  loss = 0
  print_losses = []
  n_totals = 0

  # Forward pass through encoder
  encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

  # create initial decoder input (start with SOS tokens for each sentence)
  decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
  decoder_input = decoder_input.to(device)

  # set initial decoder hidden state to the encoder's final hidden state
  decoder_hidden = encoder_hidden[:decoder.n_layers]

  # determine if we are using teacher forcing this iteration
  use_teacher_forcing = True
  # use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

  # Forwarding batch of sequence through decoder one time step at a time
  if use_teacher_forcing:
    for t in range(max_target_len):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
      # Teacher forcing: next input is current target
      decoder_input = target_variable[t].view(1, -1)
      # calculate and accumulate loss
      mask_total, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
      loss += mask_loss
      print_losses.append(mask_loss.item() * nTotal)
      n_totals += nTotal
  else:
    for t in range(max_target_len):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
      # No teacher forcing: next input is decoder's own current output
      _, topi = decoder_output.topk(1)
      decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
      decoder_input = decoder_input.to(device)
      # calculate and accumulate loss
      mask_loss, nTotal = maskNLLLoss = (decoder_output, target_variable[t], mask[t])
      loss += mask_loss
      print_losses.append(mask_loss.item() * nTotal)
      n_totals += nTotal

  # Perform backpropagation
  loss.backward()

  # clip gradients: gradients are modified in place
  _ = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
  _ = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

  # adjust model weights
  encoder_optimizer.step()
  decoder_optimizer.step()

  return sum(print_losses) / n_totals
  




### Now that you understood the basics of a chatbot and how it works, you can continue from here:


Yuan-Kuei Wu’s pytorch-chatbot implementation

https://github.com/ywk991112/pytorch-chatbot
<br><br>

Sean Robertson’s practical-pytorch seq2seq-translation example

https://github.com/spro/practical-pytorch/tree/master/seq2seq-translation
<br><br>

FloydHub’s Cornell Movie Corpus preprocessing code

https://github.com/floydhub/textutil-preprocess-cornell-movie-corpus
<br><br>
Pytorch

https://pytorch.org/tutorials/beginner/chatbot_tutorial.html



# Part 2 **chatbot**

## preparations

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [5]:
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("data", corpus_name)

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "movie_lines.txt"))

b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


In [6]:
# Splits each line of the file into a dictionary of fields
def loadLines(fileName, fields):
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            lineObj = {}
            for i, field in enumerate(fields):
                lineObj[field] = values[i]
            lines[lineObj['lineID']] = lineObj
    return lines


# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def loadConversations(fileName, lines, fields):
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            convObj = {}
            for i, field in enumerate(fields):
                convObj[field] = values[i]
            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
            utterance_id_pattern = re.compile('L[0-9]+')
            lineIds = utterance_id_pattern.findall(convObj["utteranceIDs"])
            # Reassemble lines
            convObj["lines"] = []
            for lineId in lineIds:
                convObj["lines"].append(lines[lineId])
            conversations.append(convObj)
    return conversations


# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

In [7]:
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict, conversations list, and field ids
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

# Load lines and process conversations
print("\nProcessing corpus...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
                                  lines, MOVIE_CONVERSATIONS_FIELDS)

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)


Processing corpus...

Loading conversations...

Writing newly formatted file...

Sample lines from file:
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't dat

In [8]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [9]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64271 sentence pairs
Counting words...
Counted words: 18008

pairs:
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [10]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165, 0.8272 of total


In [11]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[1592,  199,  742,    7,  787],
        [  25,    7,    4,  540,    6],
        [   8,   14,  742,    6,    2],
        [ 544, 5499,    4,    2,    0],
        [4407,   66,    2,    0,    0],
        [  75,    2,    0,    0,    0],
        [   7,    0,    0,    0,    0],
        [   4,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([9, 6, 5, 4, 3])
target_variable: tensor([[  25,   25,   25,   25,   16],
        [ 387,  200,  247,  200,    4],
        [ 660,   50,  117,  177,  115],
        [ 483,    6, 2975,  541,  673],
        [   4,   50,   76,    4,    3],
        [   2,    6,    4,    2,    6],
        [   0,    2,    2,    0,    2]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [False, 

In [12]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [13]:
# Luong attention layer
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [14]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [15]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [16]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [17]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [18]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [19]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [20]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [21]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.9591
Iteration: 2; Percent complete: 0.1%; Average loss: 8.8486
Iteration: 3; Percent complete: 0.1%; Average loss: 8.6990
Iteration: 4; Percent complete: 0.1%; Average loss: 8.3454
Iteration: 5; Percent complete: 0.1%; Average loss: 8.0792
Iteration: 6; Percent complete: 0.1%; Average loss: 7.4043
Iteration: 7; Percent complete: 0.2%; Average loss: 6.8912
Iteration: 8; Percent complete: 0.2%; Average loss: 6.9441
Iteration: 9; Percent complete: 0.2%; Average loss: 6.7648
Iteration: 10; Percent complete: 0.2%; Average loss: 6.4145
Iteration: 11; Percent complete: 0.3%; Average loss: 6.2179
Iteration: 12; Percent complete: 0.3%; Average loss: 5.8093
Iteration: 13; Percent complete: 0.3%; Average loss: 5.5705
Iteration: 14; Percent complete: 0.4%; Average loss: 5.6295
Iteration: 15; Percent complete: 0.4%; Average loss: 5.4547
Iteration: 16; Percent complete: 0.4%

In [23]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

In [1]:
# Begin chatting
evaluateInput(encoder, decoder, searcher, voc)



CUDA = torch.cuda.is_available()
print(CUDA)
if CUDA:
  add_result = add_result.cuda()
  print(add_result)