In [1]:
# Basic packages
import json, re, itertools, random, pickle
import numpy as np
import pandas as pd

# Spacy for tokenization
import spacy 
import en_core_web_md
nlp = en_core_web_md.load()

# Gensim
from gensim.corpora.dictionary import Dictionary
import gensim

In [2]:
# read Data
with open("BBT_episodes.json","r") as file:
    data = json.load(file)
  
# Before converting dict to list add episode markers and remove unwanted texts
removeScripts = ["Story: Chuck Lorre & Bill Prady","Story: Chuck Lorre","None",'Credits sequence.', 'Credit sequence.','Teleplay: Robert Cohen & Dave Goetsch']
scripts = []
for key, value in sorted(data.items()):
    script = ["StartofEpisode:"] + [str(val).strip() for val in value if str(val).strip() not in removeScripts] + ["EndofEpisode:"]
    scripts.append(script)

# All script as single sentence
script = " ".join(map(str,scripts))

# No. of Scripts
print("No. of Episodes Available: ",len(scripts))

No. of Episodes Available:  231


In [4]:
# Keep only dialogues from characters and scene descriptions
r = re.compile(".*:.*")
for i in range(len(scripts)):
  scripts[i] = list(filter(r.match, scripts[i]))

# No. of Scripts
print("No. of Episodes Available: ",len(scripts))

# No. of dialogues
print("No. of Dialogues: ", np.sum([len(script) for script in scripts]) -  2 * len(scripts))

No. of Episodes Available:  231
No. of Dialogues:  50957


In [6]:
# Take a peek
scripts[0][:10]

['StartofEpisode:',
 'Scene: A corridor at a sperm bank.',
 'Sheldon: So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.',
 'Leonard: Agreed, what’s your point?',
 'Sheldon: There’s no point, I just think it’s a good idea for a tee-shirt.',
 'Leonard: Excuse me?',
 'Receptionist: Hang on.',
 'Leonard: One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti.',
 'Receptionist: Can I help you?',
 'Leonard: Yes. Um, is this the High IQ sperm bank?']

In [5]:
# the first word of the dialogue denotes the person speaking, we will give unique identifier for this
# replace sheldon: with sheldonspeaks
def speaking(s):
    if re.findall("^[A-z ]+:", s):
        # Extract name and replace space if its multi word before adding speaks to it
        toReplace = str(re.findall("^[A-z ]+:", s)[0])
        return s.replace(toReplace, toReplace[:-1].replace(" ","") + "speaks")

def recursively_apply(l, f):
    for n, i in enumerate(l):
        if type(i) is list:
            l[n] = recursively_apply(l[n], f)
        elif type(i) is str:
            l[n] = f(i)
    return l
  
# Apply function
modified_scripts = recursively_apply(scripts, speaking)

# Flatten list
modified_scripts = [str(y) for x in modified_scripts for y in x]

# Use only dialogues
r = re.compile(".*speaks.*")
modified_scripts = list(filter(r.match, modified_scripts))

# Get starts of episode indexs
start_ind = [ind for ind, val in enumerate(modified_scripts) if val == 'StartofEpisodespeaks']

In [6]:
# Random check
ind = start_ind[10]
modified_scripts[ind: ind+10]

['StartofEpisodespeaks',
 'Scenespeaks The living room of the apartment. Leonard and Sheldon are playing the three dimensional chess game from the original Star Trek series. It is Leonard’s move. He takes his time, moving round the board and checking things from various angles. Finally he tentatively makes a move. Sheldon moves almost immediately.',
 'Sheldonspeaks Checkmate.',
 'Leonardspeaks O-o-o-o-h! Again?',
 'Sheldonspeaks Obviously you’re not well suited for three-dimensional chess, perhaps three dimensional candyland would be more your speed.',
 'Leonardspeaks Just reset the board.',
 'Sheldonspeaks It must be humbling to suck on so many different levels.',
 'Leonardspeaks Hey!',
 'Pennyspeaks Did you get my mail.',
 'Leonardspeaks Yeah, right here. How was Nebraska?']

In [9]:
# Collect NER types so that similar meaning multi-words can be standardised
NER = {}
for doc in nlp.pipe(modified_scripts, n_threads = -1, batch_size= 5000):
    for w in doc.ents:
        if w.label_ in NER.keys():
            if not bool(re.search('speaks', str(w))):
                NER[w.label_].append(str(w))
        else:
            NER[w.label_] = []
            if not bool(re.search('speaks', str(w))):
                NER[w.label_].append(str(w))

In [10]:
# Find the Nr of occurence of each NER type
def stat(NER_LIST):
    stat = pd.DataFrame(pd.DataFrame(NER_LIST)[0].value_counts())
    stat.reset_index(inplace = True)
    stat.columns = ["Name", "Count"]
    return stat.loc[stat.Count > 10]

NER_dflist = {}
for ner in list(NER.keys()):
    NER_dflist[ner] = stat(NER[ner])

# Write to file
def save_xls(dict_df, path):
    writer = pd.ExcelWriter(path)
    for key in dict_df:
        dict_df[key].to_excel(writer, '%s' % key, index = False)
    writer.save()
        
#save_xls(NER_dflist, 'NER_stat.xls')

In [11]:
# Check number of dialogues spoken by each person
regex = r'\b\w+\b'
speaker = [x for x in re.findall(regex," ".join(modified_scripts)) if bool(re.search("speaks", x))]
speaker = stat(speaker)
speaker.head(10)

Unnamed: 0,Name,Count
0,Sheldonspeaks,10935
1,Leonardspeaks,9242
2,Pennyspeaks,7246
3,Howardspeaks,5555
4,Rajspeaks,4462
5,Amyspeaks,3350
6,Bernadettespeaks,2599
7,Scenespeaks,2115
8,Stuartspeaks,716
9,EndofEpisodespeaks,231


In [12]:
# Replace every item in list
replace = [ (['Sheldon Cooper', 'Shelly'], 'Sheldon'),
            (['Wil Wheaton'], 'Wheaton'),
            (['Stephen Hawking'], 'Hawking'),
            (['Howie', 'Howard Wolowitz', 'Wolowitz'], 'Howard'),
            (['Amy Farrah Fowler'], 'Amy'),
            (['Bernie'], 'Bernadette'),
            (['Hofstadter', 'Leonard Hofstadter'], 'Leonard'),
            (['Rajesh', 'Koothrappali'], 'Raj'),
            (['Leslie Winkle'], 'Leslie'),
            (['The Cheesecake Factory', 'the Cheesecake Factory'], 'CheesecakeFactory'),
            (['Game of Thrones'], 'GameofThrones'),
            (['Fun with Flags'], 'FunwithFlags'),
            (['Los Angeles'], 'LosAngeles'),
            (['New Jersey'], 'NewJersey'),
            (['New Delhi'], 'NewDelhi'),
            (['Star Trek'], 'StarTrek'),
            (['Star Wars'], 'StarWars')
          ]

for val in replace:
    toreplace = "(" + ")|(".join(val[0]) + ")"
    for i in range(len(modified_scripts)):
        modified_scripts[i] = re.sub(toreplace, val[1], modified_scripts[i])

In [13]:
# Random Check if properly replaced or not
r = re.compile(".*Shelly.*")
list(filter(r.match, modified_scripts))

[]

In [14]:
# Split the script into normalised words
scripts_norms = []
scripts_tokens = []
for doc in nlp.pipe(modified_scripts, n_threads = -1, batch_size= 5000):
    norms = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.text.lower() for word in doc ]
    tokens = [word.norm_.lower().strip() if word.norm_ != 'gonna' else 'a' for word in doc ]
    scripts_norms.append(norms)
    scripts_tokens.append(tokens)

In [15]:
# print list of scripts
def prnt_tok(lst):
    print( "\n\n".join([" ".join(l) for l in lst] ))

ind = start_ind[10]
prnt_tok(scripts_tokens[ind - 5: ind+5])

pennyspeaks leo , you are a very sweet , really funny guy . you are going to do okay .

tobyspeaks one day at a time , penny , one day at a time .

leonardspeaks how long is he going to stay here .

sheldonspeaks he 's a homeless drug addict , leonard , where is he going to go ? boy , you have a lot to learn about lying .

endofepisodespeaks

startofepisodespeaks

scenespeaks the living room of the apartment . leonard and sheldon are playing the three dimensional chess game from the original startrek series . it is leonard 's move . he takes his time , moving round the board and checking things from various angles . finally he tentatively makes a move . sheldon moves almost immediately .

sheldonspeaks checkmate .

leonardspeaks o - o - o - o - h ! again ?

sheldonspeaks obviously you are not well suited for three - dimensional chess , perhaps three dimensional candyland would be more your speed .


In [16]:
# Assign a unique index to them.
START, END = '<s>', '</s>'
START_EP, END_EP = 'startofepisodespeaks', 'endofepisodespeaks'

# Add next person to speak as last word of the script, along with start and end identifiers
for i in range(len(scripts_tokens) - 1):
  if scripts_tokens[i] == [END_EP] or scripts_tokens[i + 1] == [END_EP] or scripts_tokens[i] == [START_EP]:
    scripts_tokens[i] = [START] + scripts_tokens[i] + [END]
  else:
    scripts_tokens[i] = [START] + scripts_tokens[i] + [scripts_tokens[i + 1][0]] + [END]
    
# Remove the last element
del scripts_tokens[-1]

In [17]:
# random check
ind = start_ind[30]
prnt_tok(scripts_tokens[ind - 5 : ind + 5])

<s> scenespeaks the climbing centre . howardspeaks </s>

<s> howardspeaks you got to give him credit for sticking with it . leonardspeaks </s>

<s> leonardspeaks i do not think he have it in him . rajspeaks </s>

<s> rajspeaks he almost made it to the top this time . </s>

<s> endofepisodespeaks </s>

<s> startofepisodespeaks </s>

<s> scenespeaks the apartment . the guys are studying a complex chart on the whiteboard . leonardspeaks </s>

<s> leonardspeaks hmmm . sheldonspeaks </s>

<s> sheldonspeaks the problem appears to be unsolvable . rajspeaks </s>

<s> rajspeaks maybe you could run some computer simulations . howardspeaks </s>


In [18]:
modified_scripts[ind - 5: ind + 5]

['Scenespeaks The climbing centre.',
 'Howardspeaks You gotta give him credit for sticking with it.',
 'Leonardspeaks I didn’t think he had it in him.',
 'Rajspeaks He almost made it to the top this time.',
 'EndofEpisodespeaks',
 'StartofEpisodespeaks',
 'Scenespeaks The apartment. The guys are studying a complex chart on the whiteboard.',
 'Leonardspeaks Hmmm.',
 'Sheldonspeaks The problem appears to be unsolvable.',
 'Rajspeaks Maybe you could run some computer simulations.']

In [19]:
# Check if last episode is mapped correctly
prnt_tok(scripts_tokens[-5:])

<s> ramonaspeaks mmm . no big deal , i enjoy spending time with you . sheldonspeaks </s>

<s> sheldonspeaks and i with you . question , are you seeking a romantic relationship with me ? ramonaspeaks </s>

<s> ramonaspeaks what if i were ? sheldonspeaks </s>

<s> sheldonspeaks well , that would raise a number of problems . we are colleagues . i am currently in a relation … excuse me a moment . scenespeaks </s>

<s> scenespeaks princeton . </s>


In [20]:
# Find weird scripts: no dialogue at all
find = ['<s>', 'pennyspeaks', 'aliciaspeaks', '</s>']
ind = []
for i, v in enumerate(scripts_tokens):
    if v == find:
        ind.append(i)

# Delete the original script
del scripts_tokens[ind[0]]

# Fix continuity
scripts_tokens[ind[0] - 1][-2] = scripts_tokens[ind[0]][1]
prnt_tok(scripts_tokens[ind[0] - 5 : ind[0] + 5])

<s> pennyspeaks apples and oranges here , sheldon . i am telling you , that girl is a user , iceskating through the life on her looks , taking advantage of innocent weak - willed men , getting auditions for stupid network shows . it creams my corn . sheldonspeaks </s>

<s> sheldonspeaks may i interject something here ? pennyspeaks </s>

<s> pennyspeaks please . sheldonspeaks </s>

<s> sheldonspeaks you got the wrong mustard . scenespeaks </s>

<s> scenespeaks the laundry room . aliciaspeaks </s>

<s> aliciaspeaks guess what ? i got the part on csi . pennyspeaks </s>

<s> pennyspeaks oh boy . aliciaspeaks </s>

<s> aliciaspeaks something wrong ? pennyspeaks </s>

<s> pennyspeaks uh , no . no , no , no , you know , congratulations , i think you will make a great hooker . aliciaspeaks </s>

<s> aliciaspeaks thank you . hey , i got to ask you something , how much do physicists make ? pennyspeaks </s>


In [21]:
# Check number of words in each dialogue
First, Last = [START, START_EP, END], [START, END_EP, END]
lens = [len(script) for script in scripts_tokens if script != First and script != Last]
length, count = np.unique(lens, return_counts= True)
pd.DataFrame({"Length" : length, "Count" : count}).head(60)

Unnamed: 0,Length,Count
0,5,38
1,6,3226
2,7,2265
3,8,3123
4,9,2685
5,10,3071
6,11,2727
7,12,2659
8,13,2594
9,14,2418


In [22]:
# check all dialogues with length of 5
length = 5
len_fil = [s for s in scripts_tokens if len(s) == length]
len_fil[:10]

[['<s>', 'leonardspeaks', 'morning', 'sheldonspeaks', '</s>'],
 ['<s>', 'scenespeaks', 'cheesecakefactory', 'sheldonspeaks', '</s>'],
 ['<s>', 'togetherspeaks', 'awesome', '!', '</s>'],
 ['<s>', 'scenespeaks', 'cheesecakefactory', 'pennyspeaks', '</s>'],
 ['<s>', 'leonardspeaks', 'yep', 'scenespeaks', '</s>'],
 ['<s>', 'ramonaspeaks', '4-a.', 'pennyspeaks', '</s>'],
 ['<s>', 'computervoicespeaks', 'honey', '.', '</s>'],
 ['<s>', 'leonardspeaks', 'great', '.', '</s>'],
 ['<s>', 'pennyspeaks', 'yes', 'sheldonspeaks', '</s>'],
 ['<s>', 'leonardspeaks', 'yes', 'howardspeaks', '</s>']]

In [23]:
# Convert consecutive dialogues as input and target for the model 
# Remove the next person to speak from target dialogue
input = []
output = []

for i in range(len(scripts_tokens) - 1):
  if scripts_tokens[i + 1] == Last or scripts_tokens[i] == First or scripts_tokens[i] == Last:
    pass
  else:
    input.append(scripts_tokens[i])
    output.append(scripts_tokens[i + 1][:-2] + [END])
    
# Now, make them as `script_pairs`. 
script_pairs = list(zip(input, output))

In [24]:
# Print functions and random checks
def prnt_pair(tup):
    inp, out = tup
    prn = " ".join(inp) + "\n" + " ".join(out) 
    print(prn.strip(), end ="\n\n")

def prnt_scripts(scripts):
    [prnt_pair(pair) for pair in scripts]
    
# Sample of input and target for the model
prnt_scripts(script_pairs[-5:])

<s> sheldonspeaks how thoughtful . thank you . ramonaspeaks </s>
<s> ramonaspeaks mmm . no big deal , i enjoy spending time with you . </s>

<s> ramonaspeaks mmm . no big deal , i enjoy spending time with you . sheldonspeaks </s>
<s> sheldonspeaks and i with you . question , are you seeking a romantic relationship with me ? </s>

<s> sheldonspeaks and i with you . question , are you seeking a romantic relationship with me ? ramonaspeaks </s>
<s> ramonaspeaks what if i were ? </s>

<s> ramonaspeaks what if i were ? sheldonspeaks </s>
<s> sheldonspeaks well , that would raise a number of problems . we are colleagues . i am currently in a relation … excuse me a moment . </s>

<s> sheldonspeaks well , that would raise a number of problems . we are colleagues . i am currently in a relation … excuse me a moment . scenespeaks </s>
<s> scenespeaks princeton </s>



In [25]:
# Decide maxmium sequence length
print("No. of scripts before trimming: ", len(script_pairs))
max_len = 30
trim_scripts = [pair for pair in script_pairs if len(pair[0]) <= max_len and len(pair[1]) <= max_len]

print("No. of scripts after filtering: ", len(trim_scripts))

No. of scripts:  50548
No. of scripts after filtering:  39006


In [26]:
prnt_scripts(trim_scripts[-5:])

<s> sheldonspeaks how thoughtful . thank you . ramonaspeaks </s>
<s> ramonaspeaks mmm . no big deal , i enjoy spending time with you . </s>

<s> ramonaspeaks mmm . no big deal , i enjoy spending time with you . sheldonspeaks </s>
<s> sheldonspeaks and i with you . question , are you seeking a romantic relationship with me ? </s>

<s> sheldonspeaks and i with you . question , are you seeking a romantic relationship with me ? ramonaspeaks </s>
<s> ramonaspeaks what if i were ? </s>

<s> ramonaspeaks what if i were ? sheldonspeaks </s>
<s> sheldonspeaks well , that would raise a number of problems . we are colleagues . i am currently in a relation … excuse me a moment . </s>

<s> sheldonspeaks well , that would raise a number of problems . we are colleagues . i am currently in a relation … excuse me a moment . scenespeaks </s>
<s> scenespeaks princeton </s>



In [28]:
emb_data = [pair[0] for pair in trim_scripts]
emd_model = gensim.models.Word2Vec(emb_data, size = 300, min_count = 5, workers = 3)
emd_model.train(scripts_tokens, total_examples=len(emb_data), epochs = 100)

(48951220, 92598900)

In [29]:
# Use only inputs for word embedding model and build vocabulary
emb_data = [pair[0] for pair in trim_scripts]
emd_model = gensim.models.Word2Vec(size = 300, min_count = 5, iter = 10, workers=4)
emd_model.build_vocab(emb_data)

# load initial embeddings for common words from google wordvec
emd_model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True, lockf = 1.0)

# train for new words
emd_model.train(emb_data, total_examples = len(emb_data), epochs = 100)

In [33]:
# Get Mappings
pad, pad_idx = '<pad>', 0
unk, unk_idx = '<unk>', 1
vocab = [pad, unk, *list(emd_model.wv.vocab.keys())]
word2index = {word:emd_model.wv.vocab[word].index + 2 for word in vocab if word in emd_model.wv.vocab}
word2index[pad] = pad_idx
word2index[unk] = unk_idx
index2word = {index:word for word, index in word2index.items()}

# Example
word2index['leonard'], word2index[index2word[word2index['leonard']]]

(70, 70)

In [34]:
# Padding is zero vector and initialize random vec for unknown token
vectors = np.zeros((len(vocab), 300))
vectors[1] = np.random.uniform(-0.1, 0.1, (1, 300))
for i in np.arange(2, len(vocab)):
    vectors[i] = emd_model.wv[index2word[i]]

In [35]:
# get count of words in training data
word_count = {}

for sentence in trim_scripts:
    for word in sentence[0]:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
            
print('Number of unique words:', len(word_count))

Number of unique words: 15066


In [38]:
# Save data
np.save('vectors.npy', vectors)
with open('word2index.json', 'w') as fp:
    json.dump(word2index, fp)

with open('scripts.pickle', 'wb') as f:
    pickle.dump(trim_scripts, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('word2index.json', 'r') as fp:
    word2index = json.load(fp)
vectors = np.load('vectors.npy')
max_len = 30

with open('scripts.pickle', 'rb') as f:
    trim_scripts = pickle.load(f)

In [39]:
# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [62]:
class BBTDataset(Dataset):
    '''
        BBT scripts modelling.
    '''
    
    def __init__(self, data, word2idx, seq_length):
        self.inp_script = [pair[0] for pair in data]
        self.out_script = [pair[1] for pair in data]
        self.word2idx = word2idx
        self.seq_length = seq_length
        self.unk = set()
    
    def __len__(self):
        return len(self.inp_script)
    
    def __getitem__(self, idx):
        '''
            Returns a pair of tensors containing word indices
            for the specified sentence pair in the dataset.
        '''
        
        # init torch tensors, note that 0 is the padding index
        inp_tensor = torch.zeros(self.seq_length, dtype=torch.long)
        out_tensor = torch.zeros(self.seq_length, dtype=torch.long)
        
        # Get sentence pair
        input_script = self.inp_script[idx]
        output_script = self.out_script[idx]
        
        
        # Load word indices
        for i, word in enumerate(input_script):
            if word in self.word2idx and word_count[word] > 5:
                inp_tensor[i] = self.word2idx[word]
            else:
                inp_tensor[i] = self.word2idx[unk]
                self.unk.add(word)
        
        for i, word in enumerate(output_script):
            if word in self.word2idx and word_count[word] > 5:
                out_tensor[i] = self.word2idx[word]
            else:
                out_tensor[i] = self.word2idx[unk]
                self.unk.add(word)
            
        sample = {'input_tensor': inp_tensor, 'input_script': input_script,
                  'output_tensor': out_tensor, 'output_script': output_script}
        return sample

In [63]:
bbtDataset = BBTDataset(trim_scripts, word2index, seq_length = max_len)

In [64]:
sample = bbtDataset[20]
sample.keys()

dict_keys(['input_tensor', 'input_script', 'output_tensor', 'output_script'])

In [65]:
print('Input example:')
print('Sentence:', sample['input_script'])
print('Tensor:', sample['input_tensor'])

print('\nTarget example:')
print('Sentence:', sample['output_script'])
print('Tensor:', sample['output_tensor'])

Input example:
Sentence: ['<s>', 'leonardspeaks', 'see', 'you', '.', 'scenespeaks', '</s>']
Tensor: tensor([  3,   7,  91,   8,   2,  28,   4,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0])

Target example:
Sentence: ['<s>', 'scenespeaks', 'the', 'stairs', 'of', 'the', 'apartment', 'building', '.', '</s>']
Tensor: tensor([   3,   28,   13,  877,   29,   13,   95,  522,    2,    4,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0])


In [66]:
# Build dataloader to check how the batching works
dataloader = DataLoader(bbtDataset, batch_size=5,shuffle=False, num_workers=4)

In [68]:
for i in dataloader:
    batch = i
    break

for i in range(5):
    print('Input Script:', batch['input_script'][i])
    print('Output Script:', batch['output_script'][i],'\n')

Input Script: ('<s>', '<s>', '<s>', '<s>', '<s>')
Output Script: ('<s>', '<s>', '<s>', '<s>', '<s>') 

Input Script: ('leonardspeaks', 'sheldonspeaks', 'leonardspeaks', 'receptionistspeaks', 'leonardspeaks')
Output Script: ('sheldonspeaks', 'leonardspeaks', 'receptionistspeaks', 'leonardspeaks', 'receptionistspeaks') 

Input Script: ('agreed', 'there', 'excuse', 'can', 'yes')
Output Script: ('there', 'excuse', 'hang', 'yes', 'if') 

Input Script: (',', "'s", 'me', 'i', '.')
Output Script: ("'s", 'me', 'on', '.', 'you') 

Input Script: ('what', 'no', '?', 'help', 'um')
Output Script: ('no', '?', '.', 'um', 'have') 



In [69]:
class EncoderBiGRU(nn.Module):
    def __init__(self, hidden_size, pretrained_embeddings):
        super(EncoderBiGRU, self).__init__()
        
        # Model parameters
        self.hidden_size = hidden_size
        self.embedding_dim = pretrained_embeddings.shape[1]
        self.vocab_size = pretrained_embeddings.shape[0]
        self.num_layers = 2
        self.dropout = 0.1 if self.num_layers > 1 else 0
        self.bidirectional = True
        
        
        # Construct the layers
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        self.embedding.weight.requires_grad = False
        
        self.gru = nn.GRU(self.embedding_dim,
                            self.hidden_size,
                            self.num_layers,
                            batch_first = True,
                            dropout=self.dropout,
                            bidirectional=self.bidirectional)
        
        # Initialize hidden to hidden weights in GRU to the Identity matrix
        # PyTorch GRU has 3 different hidden to hidden weights stacked in one matrix
        identity_init = torch.eye(self.hidden_size)
        self.gru.weight_hh_l0.data.copy_(torch.cat([identity_init]*3, dim=0))
        self.gru.weight_hh_l0_reverse.data.copy_(torch.cat([identity_init]*3, dim=0))
        self.gru.weight_hh_l1.data.copy_(torch.cat([identity_init]*3, dim=0))
        self.gru.weight_hh_l1_reverse.data.copy_(torch.cat([identity_init]*3, dim=0))
    
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output = self.gru(embedded, hidden)
        return output
    
    def initHidden(self, batch_size):
        
        hidden_state = torch.zeros(self.num_layers*(2 if self.bidirectional else 1),
                                   batch_size,
                                   self.hidden_size, 
                                   device=device)
        
        return hidden_state

In [70]:
test_batch_size = 1
test_seq_length = 3
test_hidden_size = 5
test_encoder = EncoderBiGRU(test_hidden_size, vectors).to(device)
test_hidden = test_encoder.initHidden(test_batch_size)

# Create an input tensor of random indices
test_inputs = torch.randint(0, 50, (test_batch_size, test_seq_length), dtype=torch.long, device=device)

test_encoder_output, test_encoder_hidden = test_encoder.forward(test_inputs, test_hidden)

print("The final output of the GRU Encoder on our test input is: \n\n", test_encoder_output.shape)

print('\n\nEncoder output tensor: \n\n', test_encoder_output)

The final output of the GRU Encoder on our test input is: 

 torch.Size([1, 3, 10])


Encoder output tensor: 

 tensor([[[-0.0499, -0.1631,  0.2637,  0.3069,  0.0464, -0.1635, -0.4241,
           0.1198, -0.3655, -0.2922],
         [ 0.2388, -0.3575,  0.5471,  0.1993,  0.1039, -0.0111, -0.6888,
          -0.0569, -0.3468, -0.3591],
         [ 0.1542, -0.3440,  0.6296,  0.3255,  0.1205, -0.0647, -0.2563,
           0.0969, -0.1122, -0.0884]]])


In [71]:
test_encoder_hidden

tensor([[[ 0.7023, -0.3842, -0.0874, -0.9902, -0.2901]],

        [[-0.1193,  0.1815,  0.9926,  0.0030, -0.5165]],

        [[ 0.1542, -0.3440,  0.6296,  0.3255,  0.1205]],

        [[-0.1635, -0.4241,  0.1198, -0.3655, -0.2922]]])

In [91]:
class AttnDecoderGRU(nn.Module):
    def __init__(self, decoder_hidden_size, pretrained_embeddings, seq_length):
        super(AttnDecoderGRU, self).__init__()
        # Embedding parameters
        self.embedding_dim = pretrained_embeddings.shape[1]
        self.output_vocab_size = pretrained_embeddings.shape[0]
        
        # GRU parameters
        self.decoder_hidden_size = decoder_hidden_size
        self.num_layers = 2 # Potentially add more layers to LSTM later
        self.dropout = 0.1 if self.num_layers > 1 else 0 # Potentially add dropout later
        
        # Attention parameters
        self.seq_length = max_len
        self.encoder_hidden_dim = 2*decoder_hidden_size
        
        # Construct embedding layer for output 
        self.embedding = nn.Embedding(self.output_vocab_size, self.embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        self.embedding.weight.requires_grad = False # we don't want to train the embedding weights
        
        # Construct layer that calculates attentional weights
        self.attn = nn.Linear(self.decoder_hidden_size + self.embedding_dim, self.seq_length)
        
        # Construct layer that compresses the combined matrix of the input embeddings
        # and the encoder inputs after attention has been applied
        self.attn_with_input = nn.Linear(self.embedding_dim + self.encoder_hidden_dim, self.embedding_dim)
        
        # gru for Decoder
        self.gru = nn.GRU(self.embedding_dim,
                            self.decoder_hidden_size,
                            self.num_layers,
                            dropout=self.dropout)
        
        # Initialize hidden to hidden weights in GRU to the Identity matrix
        # PyTorch GRU has 3 different hidden to hidden weights stacked in one matrix
        identity_init = torch.eye(self.decoder_hidden_size)
        self.gru.weight_hh_l0.data.copy_(torch.cat([identity_init]*3, dim=0))
        self.gru.weight_hh_l1.data.copy_(torch.cat([identity_init]*3, dim=0))
        
        # Output layer
        self.out = nn.Linear(self.decoder_hidden_size, self.output_vocab_size)
    
    def forward(self, input, hidden, encoder_output):
        # Input word indices, should have dim(1, batch_size), output will be (1, batch_size, embedding_dim)
        embedded = self.embedding(input)
        
        # Calculate Attention weights
        attn_weights = F.softmax(self.attn(torch.cat((hidden[0], embedded[0]), 1)), dim=1)
        attn_weights = attn_weights.unsqueeze(1) # Add dimension for batch matrix multiplication
        
        # Apply Attention weights
        attn_applied = torch.bmm(attn_weights, encoder_output)
        attn_applied = attn_applied.squeeze(1) # Remove extra dimension, dim are now (batch_size, encoder_hidden_size)
        
        # Prepare GRU input tensor

        attn_combined = torch.cat((embedded[0], attn_applied), 1) # Combine embedding input and attn_applied,
        gru_input = F.relu(self.attn_with_input(attn_combined)) # pass through fully connected with ReLU
        gru_input = gru_input.unsqueeze(0) # Add seq dimension so tensor has expected dimensions for lstm
        
        output, hidden = self.gru(gru_input, hidden) # Output dim = (1, batch_size, decoder_hidden_size)
        output = F.log_softmax(self.out(output[0]), dim=1) # softmax over all words in vocab
        
        return output, hidden, attn_weights

In [92]:
# Test the decoder on sample inputs to check that the dimensions of everything is correct
test_decoder_hidden_size = 5

test_decoder = AttnDecoderGRU(test_decoder_hidden_size, vectors, test_seq_length).to(device)

In [241]:
def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion):
    
    # Initialize encoder hidden state
    encoder_hidden = encoder.initHidden(input_tensor.shape[0])
    
    # clear the gradients in the optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # run forward pass through encoder on entire sequence
    encoder_output, encoder_hidden = encoder.forward(input_tensor, encoder_hidden)
    
    # Initialize decoder input(Start of Sentence tag) and hidden state from encoder
    decoder_input =  torch.tensor([word2index[START]]*input_tensor.shape[0], dtype=torch.long, device=device).unsqueeze(0)
    
    # Use correct initial hidden state dimensions depending on type of RNN
    decoder_hidden = encoder_hidden[1::2].contiguous()
    
    # Initialize loss
    loss = 0
    
    # Implement teacher forcing
    use_teacher_forcing = True if random.random() < 0.5 else False

    if use_teacher_forcing:
        # Step through target output sequence
        for di in range(1, max_len):
            output, decoder_hidden, attn_weights = decoder(decoder_input,
                                                           decoder_hidden,
                                                           encoder_output)
            
            # Feed target as input to next item in the sequence
            decoder_input = target_tensor[di].unsqueeze(0)
            loss += criterion(output, target_tensor[di])
    else:
        # Step through target output sequence
        for di in range(1, max_len):
            
            # Forward pass through decoder
            output, decoder_hidden, attn_weights = decoder(decoder_input,
                                                           decoder_hidden,
                                                           encoder_output)
            
            # Feed output as input to next item in the sequence
            decoder_input = output.topk(1)[1].view(1,-1).detach()
            
            # Calculate loss
            loss += criterion(output, target_tensor[di])
    
    # Compute the gradients
    loss.backward()
    
    # Clip the gradients
    nn.utils.clip_grad_norm_(encoder.parameters(), 25)
    nn.utils.clip_grad_norm_(decoder.parameters(), 25)
    
    # Update the weights
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.item()

In [242]:
def trainIters(encoder, decoder, dataloader, epochs, print_every_n_batches=100, learning_rate=0.01):
    
    # keep track of losses
    plot_losses = []

    # Initialize Encoder Optimizer
    encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
    encoder_optimizer = optim.Adam(encoder_parameters, lr=learning_rate)
    
    # Initialize Decoder Optimizer
    decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
    decoder_optimizer = optim.Adam(decoder_parameters, lr=learning_rate)

    # Specify loss function, ignore the <pad> token index so it does not contribute to loss.
    criterion = nn.NLLLoss(ignore_index=0)
    
    # Cycle through epochs
    for epoch in range(epochs):
        loss_avg = 0
        print(f'Epoch {epoch + 1}/{epochs}')
        # Cycle through batches
        for i, batch in enumerate(dataloader):
            
            input_tensor = batch['input_tensor'].to(device)
            target_tensor = batch['output_tensor'].transpose(1,0).to(device)
            

            loss = train(input_tensor, target_tensor, encoder, decoder,
                         encoder_optimizer, decoder_optimizer, criterion)
            
            loss_avg += loss
            if i % print_every_n_batches == 0 and i != 0:
                loss_avg /= print_every_n_batches
                print(f'After {i} batches, average loss/{print_every_n_batches} batches: {loss_avg}')
                plot_losses.append(loss)
                loss_avg = 0
    return plot_losses

In [243]:
# Set hyperparameters and construct dataloader
hidden_size = 256
batch_size = 16
dataloader = DataLoader(bbtDataset, batch_size=batch_size,
                        shuffle=True, num_workers=4) 

In [244]:
encoder_gru = EncoderBiGRU(hidden_size, vectors).to(device)
decoder_gru = AttnDecoderGRU(hidden_size, vectors, max_len).to(device)

In [103]:
from_scratch = True # Set to False if you have saved weights and want to load them

if not from_scratch:
        # Load weights from earlier model
    encoder_gru_state_dict = torch.load('encoder1_gru.pth')
    decoder_gru_state_dict = torch.load('decoder1_gru.pth')

    encoder_gru.load_state_dict(encoder_gru_state_dict)
    decoder_gru.load_state_dict(decoder_gru_state_dict)
else:
    print('Training model from scratch.')

Training model from scratch.


In [247]:
# For dataset 1, models were trained for 3 epochs
# For dataset 2, models were trained for 50 epochs
print('Training GRU based network.')
learning_rate = 0.0001
encoder_gru.train() # Set model to training mode
decoder_gru.train() # Set model to training mode

gru_losses = trainIters(encoder_gru, decoder_gru, dataloader, epochs=47, learning_rate = learning_rate)
np.save('gru2_losses.npy', gru_losses)

Training GRU based network.
Epoch 1/47
After 100 batches, average loss/100 batches: 109.56063678741455
After 200 batches, average loss/100 batches: 105.89036998748779
After 300 batches, average loss/100 batches: 106.4497661972046
After 400 batches, average loss/100 batches: 106.74870559692383
After 500 batches, average loss/100 batches: 106.91029273986817
After 600 batches, average loss/100 batches: 111.22699851989746
After 700 batches, average loss/100 batches: 110.87524394989013
After 800 batches, average loss/100 batches: 106.74352241516114
After 900 batches, average loss/100 batches: 107.70466983795166
After 1000 batches, average loss/100 batches: 104.62533634185792
After 1100 batches, average loss/100 batches: 103.82937576293945
After 1200 batches, average loss/100 batches: 106.01955200195313
After 1300 batches, average loss/100 batches: 108.12676761627198
After 1400 batches, average loss/100 batches: 106.81457172393799
After 1500 batches, average loss/100 batches: 107.22124481201

After 700 batches, average loss/100 batches: 104.47098751068116
After 800 batches, average loss/100 batches: 98.26775093078614
After 900 batches, average loss/100 batches: 102.76000801086425
After 1000 batches, average loss/100 batches: 104.90059677124023
After 1100 batches, average loss/100 batches: 102.4737738418579
After 1200 batches, average loss/100 batches: 102.22017658233642
After 1300 batches, average loss/100 batches: 105.48122699737549
After 1400 batches, average loss/100 batches: 103.04278522491455
After 1500 batches, average loss/100 batches: 101.58248489379883
After 1600 batches, average loss/100 batches: 102.53911865234375
After 1700 batches, average loss/100 batches: 104.29577617645263
After 1800 batches, average loss/100 batches: 104.21519512176513
After 1900 batches, average loss/100 batches: 98.20664264678955
After 2000 batches, average loss/100 batches: 104.42839248657226
After 2100 batches, average loss/100 batches: 103.40879444122315
After 2200 batches, average los

After 1400 batches, average loss/100 batches: 100.72310012817383
After 1500 batches, average loss/100 batches: 98.64979553222656
After 1600 batches, average loss/100 batches: 99.08288402557373
After 1700 batches, average loss/100 batches: 97.28545463562011
After 1800 batches, average loss/100 batches: 102.77111045837403
After 1900 batches, average loss/100 batches: 97.23604354858398
After 2000 batches, average loss/100 batches: 94.54350360870362
After 2100 batches, average loss/100 batches: 103.3486206817627
After 2200 batches, average loss/100 batches: 98.23368495941162
After 2300 batches, average loss/100 batches: 99.9844435119629
After 2400 batches, average loss/100 batches: 99.92993698120117
Epoch 12/47
After 100 batches, average loss/100 batches: 101.61166446685792
After 200 batches, average loss/100 batches: 98.45233192443848
After 300 batches, average loss/100 batches: 99.06790744781495
After 400 batches, average loss/100 batches: 97.74601657867431
After 500 batches, average los

After 2200 batches, average loss/100 batches: 98.8921768951416
After 2300 batches, average loss/100 batches: 96.3934049987793
After 2400 batches, average loss/100 batches: 98.20318386077881
Epoch 17/47
After 100 batches, average loss/100 batches: 99.17813907623291
After 200 batches, average loss/100 batches: 96.33238723754883
After 300 batches, average loss/100 batches: 93.6120894241333
After 400 batches, average loss/100 batches: 96.07028602600097
After 500 batches, average loss/100 batches: 99.86515686035156
After 600 batches, average loss/100 batches: 97.29045688629151
After 700 batches, average loss/100 batches: 96.82001003265381
After 800 batches, average loss/100 batches: 97.43315044403076
After 900 batches, average loss/100 batches: 103.61631141662598
After 1000 batches, average loss/100 batches: 97.01843246459961
After 1100 batches, average loss/100 batches: 96.03460060119629
After 1200 batches, average loss/100 batches: 93.68364624023438
After 1300 batches, average loss/100 ba

After 600 batches, average loss/100 batches: 93.27973114013672
After 700 batches, average loss/100 batches: 94.49757961273194
After 800 batches, average loss/100 batches: 91.58651371002198
After 900 batches, average loss/100 batches: 92.40607669830322
After 1000 batches, average loss/100 batches: 97.27389980316163
After 1100 batches, average loss/100 batches: 89.93472522735595
After 1200 batches, average loss/100 batches: 93.47131542205811
After 1300 batches, average loss/100 batches: 99.56501945495606
After 1400 batches, average loss/100 batches: 94.99928058624268
After 1500 batches, average loss/100 batches: 93.36105175018311
After 1600 batches, average loss/100 batches: 99.74368019104004
After 1700 batches, average loss/100 batches: 92.09591815948487
After 1800 batches, average loss/100 batches: 91.16223461151122
After 1900 batches, average loss/100 batches: 96.17536037445069
After 2000 batches, average loss/100 batches: 97.55055141448975
After 2100 batches, average loss/100 batches

After 1500 batches, average loss/100 batches: 90.02308612823487
After 1600 batches, average loss/100 batches: 94.78154136657714
After 1700 batches, average loss/100 batches: 92.9025271987915
After 1800 batches, average loss/100 batches: 94.16095630645752
After 1900 batches, average loss/100 batches: 95.84950012207031
After 2000 batches, average loss/100 batches: 92.99199123382569
After 2100 batches, average loss/100 batches: 93.66306007385253
After 2200 batches, average loss/100 batches: 86.89715316772461
After 2300 batches, average loss/100 batches: 95.02153858184815
After 2400 batches, average loss/100 batches: 94.5888879776001
Epoch 28/47
After 100 batches, average loss/100 batches: 92.80522686004639
After 200 batches, average loss/100 batches: 89.45699008941651
After 300 batches, average loss/100 batches: 90.83427902221679
After 400 batches, average loss/100 batches: 93.64577423095703
After 500 batches, average loss/100 batches: 94.5161665725708
After 600 batches, average loss/100 

After 2400 batches, average loss/100 batches: 89.98759967803954
Epoch 33/47
After 100 batches, average loss/100 batches: 93.66020877838135
After 200 batches, average loss/100 batches: 90.33451354980468
After 300 batches, average loss/100 batches: 89.86055465698242
After 400 batches, average loss/100 batches: 92.28289386749267
After 500 batches, average loss/100 batches: 92.06903747558594
After 600 batches, average loss/100 batches: 87.64637699127198
After 700 batches, average loss/100 batches: 92.55069244384765
After 800 batches, average loss/100 batches: 86.9928115081787
After 900 batches, average loss/100 batches: 87.10671592712403
After 1000 batches, average loss/100 batches: 90.50275707244873
After 1100 batches, average loss/100 batches: 93.18638568878174
After 1200 batches, average loss/100 batches: 94.34265716552734
After 1300 batches, average loss/100 batches: 86.71833976745606
After 1400 batches, average loss/100 batches: 95.66843772888184
After 1500 batches, average loss/100 b

After 800 batches, average loss/100 batches: 85.45818172454834
After 900 batches, average loss/100 batches: 89.47337585449219
After 1000 batches, average loss/100 batches: 89.36895957946777
After 1100 batches, average loss/100 batches: 86.01013772964478
After 1200 batches, average loss/100 batches: 88.79148273468017
After 1300 batches, average loss/100 batches: 87.70394046783447
After 1400 batches, average loss/100 batches: 92.00237712860107
After 1500 batches, average loss/100 batches: 88.15280971527099
After 1600 batches, average loss/100 batches: 89.28662929534912
After 1700 batches, average loss/100 batches: 88.20903285980225
After 1800 batches, average loss/100 batches: 90.71666542053222
After 1900 batches, average loss/100 batches: 89.38992572784424
After 2000 batches, average loss/100 batches: 84.18445335388184
After 2100 batches, average loss/100 batches: 85.53436954498291
After 2200 batches, average loss/100 batches: 88.52389167785644
After 2300 batches, average loss/100 batch

After 1700 batches, average loss/100 batches: 88.75022712707519
After 1800 batches, average loss/100 batches: 86.40903263092041
After 1900 batches, average loss/100 batches: 90.07275394439698
After 2000 batches, average loss/100 batches: 90.46919109344482
After 2100 batches, average loss/100 batches: 89.3794552230835
After 2200 batches, average loss/100 batches: 85.46105281829834
After 2300 batches, average loss/100 batches: 85.61431381225586
After 2400 batches, average loss/100 batches: 89.24904258728027
Epoch 44/47
After 100 batches, average loss/100 batches: 87.42599590301514
After 200 batches, average loss/100 batches: 88.62701236724854
After 300 batches, average loss/100 batches: 82.59353847503662
After 400 batches, average loss/100 batches: 89.94264068603516
After 500 batches, average loss/100 batches: 87.45118957519531
After 600 batches, average loss/100 batches: 89.47705894470215
After 700 batches, average loss/100 batches: 85.0923664855957
After 800 batches, average loss/100 b

In [250]:
learning_rate = learning_rate / 10
gru_losses = trainIters(encoder_gru, decoder_gru, dataloader, epochs=25, learning_rate = learning_rate)
np.save('gru2_losses.npy', gru_losses)

Epoch 1/25
After 100 batches, average loss/100 batches: 83.25332092285156
After 200 batches, average loss/100 batches: 77.87617252349854
After 300 batches, average loss/100 batches: 80.29739896774292
After 400 batches, average loss/100 batches: 84.31723602294922
After 500 batches, average loss/100 batches: 82.50146308898925
After 600 batches, average loss/100 batches: 83.69223701477051
After 700 batches, average loss/100 batches: 84.5814856338501
After 800 batches, average loss/100 batches: 83.75546901702882
After 900 batches, average loss/100 batches: 80.55284404754639
After 1000 batches, average loss/100 batches: 82.66737648010253
After 1100 batches, average loss/100 batches: 84.23172630310059
After 1200 batches, average loss/100 batches: 79.9211441040039
After 1300 batches, average loss/100 batches: 80.58261238098144
After 1400 batches, average loss/100 batches: 83.7788981628418
After 1500 batches, average loss/100 batches: 78.10280094146728
After 1600 batches, average loss/100 batc

After 1000 batches, average loss/100 batches: 78.06983642578125
After 1100 batches, average loss/100 batches: 85.19825824737549
After 1200 batches, average loss/100 batches: 79.49597217559814
After 1300 batches, average loss/100 batches: 79.69749286651611
After 1400 batches, average loss/100 batches: 82.00688556671143
After 1500 batches, average loss/100 batches: 83.80936943054199
After 1600 batches, average loss/100 batches: 79.35424644470214
After 1700 batches, average loss/100 batches: 80.48421611785889
After 1800 batches, average loss/100 batches: 80.40604885101318
After 1900 batches, average loss/100 batches: 85.70776134490967
After 2000 batches, average loss/100 batches: 82.1398987197876
After 2100 batches, average loss/100 batches: 81.08880558013917
After 2200 batches, average loss/100 batches: 83.3204210281372
After 2300 batches, average loss/100 batches: 81.63839599609375
After 2400 batches, average loss/100 batches: 77.04505462646485
Epoch 7/25
After 100 batches, average loss

After 1900 batches, average loss/100 batches: 83.39104915618897
After 2000 batches, average loss/100 batches: 76.655425491333
After 2100 batches, average loss/100 batches: 85.46368213653564
After 2200 batches, average loss/100 batches: 80.06365928649902
After 2300 batches, average loss/100 batches: 77.5754443359375
After 2400 batches, average loss/100 batches: 80.56542201995849
Epoch 12/25
After 100 batches, average loss/100 batches: 80.85513097763061
After 200 batches, average loss/100 batches: 79.75637836456299
After 300 batches, average loss/100 batches: 80.92876247406006
After 400 batches, average loss/100 batches: 80.67231349945068
After 500 batches, average loss/100 batches: 80.82845897674561
After 600 batches, average loss/100 batches: 78.635363407135
After 700 batches, average loss/100 batches: 74.73740753173828
After 800 batches, average loss/100 batches: 81.39788818359375
After 900 batches, average loss/100 batches: 76.17833911895752
After 1000 batches, average loss/100 batch

After 400 batches, average loss/100 batches: 84.92262863159179
After 500 batches, average loss/100 batches: 80.91072757720947
After 600 batches, average loss/100 batches: 78.08289093017578
After 700 batches, average loss/100 batches: 83.0232364654541
After 800 batches, average loss/100 batches: 75.88555599212647
After 900 batches, average loss/100 batches: 81.12280452728271
After 1000 batches, average loss/100 batches: 82.26556632995606
After 1100 batches, average loss/100 batches: 75.49871444702148
After 1200 batches, average loss/100 batches: 81.2570351409912
After 1300 batches, average loss/100 batches: 76.50247680664063
After 1400 batches, average loss/100 batches: 80.68050968170166
After 1500 batches, average loss/100 batches: 78.02974227905274
After 1600 batches, average loss/100 batches: 79.72847465515137
After 1700 batches, average loss/100 batches: 78.14262397766113
After 1800 batches, average loss/100 batches: 79.1760050201416
After 1900 batches, average loss/100 batches: 82.

After 1300 batches, average loss/100 batches: 80.07353569030762
After 1400 batches, average loss/100 batches: 74.77135082244872
After 1500 batches, average loss/100 batches: 82.13839933395386
After 1600 batches, average loss/100 batches: 77.42219871520996
After 1700 batches, average loss/100 batches: 80.92265396118164
After 1800 batches, average loss/100 batches: 77.8900549697876
After 1900 batches, average loss/100 batches: 79.22735229492187
After 2000 batches, average loss/100 batches: 79.45392837524415
After 2100 batches, average loss/100 batches: 77.9352583694458
After 2200 batches, average loss/100 batches: 75.49488655090332
After 2300 batches, average loss/100 batches: 80.49289291381837
After 2400 batches, average loss/100 batches: 78.91890731811523
Epoch 23/25
After 100 batches, average loss/100 batches: 80.23210235595702
After 200 batches, average loss/100 batches: 77.25273990631104
After 300 batches, average loss/100 batches: 81.64512329101562
After 400 batches, average loss/1

In [251]:
torch.save(encoder_gru.state_dict(), 'encoder2_gru.pth')
torch.save(decoder_gru.state_dict(), 'decoder2_gru.pth')

In [107]:
def get_batch(dataloader):
    for batch in dataloader:
        return batch

In [108]:
def evaluate(input_tensor, encoder, decoder):
    with torch.no_grad():
        encoder_hidden = encoder.initHidden(1)
        encoder.eval()
        decoder.eval()

        encoder_output, encoder_hidden = encoder(input_tensor.to(device), encoder_hidden)

        decoder_input =  torch.tensor([word2index[START]]*input_tensor.shape[0], dtype=torch.long, device=device).unsqueeze(0)
        try:
            encoder.lstm
            decoder_hidden = (encoder_hidden[0][1::2].contiguous(), encoder_hidden[1][1::2].contiguous())
        except AttributeError:
            decoder_hidden = encoder_hidden[1::2].contiguous()

        output_list = []
        attn_weight_list = np.zeros((max_len, max_len))
        for di in range(1, max_len):
            output, decoder_hidden, attn_weights = decoder(decoder_input,
                                                           decoder_hidden,
                                                           encoder_output)

            decoder_input = output.topk(1)[1].detach()
            output_list.append(output.topk(1)[1])
            word = index2word[output.topk(1)[1].item()]

            attn_weight_list[di] += attn_weights[0,0,:].cpu().numpy()
        return output_list, attn_weight_list

In [155]:
batch = get_batch(dataloader)
input_tensor = batch['input_tensor'][11].unsqueeze_(0)
gru_output_list, gru_attn = evaluate(input_tensor, encoder_gru, decoder_gru)

In [193]:
def predictScript(input):
    doc = nlp(input)
    input_script = [START] + [word.norm_.lower().strip() if word.norm_ != 'gonna' else 'a' for word in doc ] + [END]
    inp_tensor = torch.zeros(max_len, dtype=torch.long)
    for i, word in enumerate(input_script):
        if word in word2index and word_count[word] > 5:
            inp_tensor[i] = word2index[word]
        else:
            print("Missing Word: ", word)
    gru_output_list, _ = evaluate(inp_tensor.unsqueeze_(0), encoder_gru, decoder_gru)
    output = " ".join(input_script) + "\n"
    print("Input and Output: ")
    for index in gru_output_list:
        word = index2word[index[0,0].item()]
        if word != '</s>':
            output += ' ' + word
        else:
            output += ' ' + word 
            print(output.strip(), end = "\n\n")
            break
    

predictScript()

Input and Output: 
<s> sheldonspeaks hello , there pennyspeaks </s>
 <s> pennyspeaks what are you doing ? </s>



In [252]:
scripts = ["sheldonspeaks hello! pennyspeaks", 
           "sheldonspeaks Can you drive me to work? pennyspeaks", 
           "sheldonspeaks knock knock pennyspeaks", 
           "sheldonspeaks i want ice cream. pennyspeaks",
           "leonardspeaks where is amy?. sheldonspeaks",
           "howardspeaks shall we play paint ball? sheldonspeaks"
        ]

for script in scripts:
    predictScript(script)

Input and Output: 
<s> sheldonspeaks hello ! pennyspeaks </s>
 pennyspeaks hey , you , </s>

Input and Output: 
<s> sheldonspeaks can you drive me to work ? pennyspeaks </s>
 pennyspeaks no . </s>

Input and Output: 
<s> sheldonspeaks knock knock pennyspeaks </s>
 pennyspeaks oh , </s>

Input and Output: 
<s> sheldonspeaks i want ice cream . pennyspeaks </s>
 pennyspeaks yeah , i . </s>

Input and Output: 
<s> leonardspeaks where is amy ? . sheldonspeaks </s>
 sheldonspeaks it 's <unk> . </s>

Input and Output: 
<s> howardspeaks shall we play paint ball ? sheldonspeaks </s>
 sheldonspeaks we are <unk> . . . . . . . . . </s>



In [249]:
scripts = ["sheldonspeaks hello! pennyspeaks", 
           "sheldonspeaks Can you drive me to work? pennyspeaks", 
           "sheldonspeaks knock knock pennyspeaks", 
           "sheldonspeaks i want ice cream. pennyspeaks",
           "leonardspeaks where is amy?. sheldonspeaks",
           "howardspeaks shall we play paint ball? sheldonspeaks"
        ]

for script in scripts:
    predictScript(script)

Input and Output: 
<s> sheldonspeaks hello ! pennyspeaks </s>
 pennyspeaks hey , you , </s>

Input and Output: 
<s> sheldonspeaks can you drive me to work ? pennyspeaks </s>
 pennyspeaks no . </s>

Input and Output: 
<s> sheldonspeaks knock knock pennyspeaks </s>
 pennyspeaks oh , </s>

Input and Output: 
<s> sheldonspeaks i want ice cream . pennyspeaks </s>
 pennyspeaks yeah , i , . . </s>

Input and Output: 
<s> leonardspeaks where is amy ? . sheldonspeaks </s>
 sheldonspeaks <unk> . . </s>

Input and Output: 
<s> howardspeaks shall we play paint ball ? sheldonspeaks </s>
 sheldonspeaks i . . . . . . . . . </s>



In [240]:
scripts = ["sheldonspeaks hello! pennyspeaks", 
           "sheldonspeaks Can you drive me to work? pennyspeaks", 
           "sheldonspeaks knock knock pennyspeaks", 
           "sheldonspeaks i want ice cream. pennyspeaks",
           "leonardspeaks where is amy?. sheldonspeaks",
           "howardspeaks shall we play paint ball? sheldonspeaks"
        ]

for script in scripts:
    predictScript(script)

Input and Output: 
<s> sheldonspeaks hello ! pennyspeaks </s>
 <s> pennyspeaks hi . </s>

Input and Output: 
<s> sheldonspeaks can you drive me to work ? pennyspeaks </s>
 <s> pennyspeaks no . </s>

Input and Output: 
<s> sheldonspeaks knock knock pennyspeaks </s>
 <s> pennyspeaks what are you doing here ? </s>

Input and Output: 
<s> sheldonspeaks i want ice cream . pennyspeaks </s>
 <s> pennyspeaks oh - huh . </s>

Input and Output: 
<s> leonardspeaks where is amy ? . sheldonspeaks </s>
 <s> sheldonspeaks in this room , leonard and i are ready to get rid of you like pizza and a half . </s>

Input and Output: 
<s> howardspeaks shall we play paint ball ? sheldonspeaks </s>
 <s> sheldonspeaks in a <unk> </s>



In [238]:
[script for script in modified_scripts if bool(re.search("knock", script))]

['Scenespeaks The hallway. Leonard knocks on Penny’s door.',
 'Scenespeaks Sheldon’s bedroom. He is building a model of some kind of double helix. There is a knock on the door.',
 'Scenespeaks The apartment living room. There is a knock on the door.',
 'Scenespeaks The same, later. Leonard is dressed as Frodo. Howard appears to be Peter Pan. There is a knock on the door.',
 'Scenespeaks The hallway. Howard knocks on Penny’s door with his bow.',
 'Scenespeaks The apartment, there is a knock on the door.',
 'Scenespeaks Outside Penny’s door. Leonard knocks.',
 'Scenespeaks Sheldon’s office. He is making measurements on maps. There is a knock on the door.',
 'Howardspeaks And even if you can make it to Boston, what are you going to do, knock on the door and say to Mrs Bell, “hey Mrs Bell, big fan of your husband, can I come in and watch him invent the telephone?”',
 'Rajspeaks Mrs Bell was deaf, she’s not even going to hear you knock.',
 'Sheldonspeaks Same paradox. If you were to travel 

In [161]:
print('Input Script:')
output = ''
for index in input_tensor[0]:
    word = index2word[index.item()]
    if word != '</s>':
        output += ' ' + word
    else:
        output += ' ' + word
        print(output)
        break
        
print('Model Prediction:')
output = ''
for index in gru_output_list:
    word = index2word[index[0,0].item()]
    if word != '</s>':
        output += ' ' + word
    else:
        output += ' ' + word
        print(output)
        break
        
print('Output Script:')
output = ''
for index in batch['output_tensor'][11].unsqueeze_(0)[0]:
    word = index2word[index.item()]
    if word != '</s>':
        output += ' ' + word
    else:
        output += ' ' + word
        print(output)
        break

Input Script:
 <s> leonardspeaks i am sure she will still love him . sheldonspeaks </s>
Model Prediction:
 <s> sheldonspeaks well , i do not <unk> . . . . </s>
Output Script:
 <s> sheldonspeaks i would not . </s>
