In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/PhD/courses/ME592/project

/content/drive/MyDrive/PhD/courses/ME592/project


In [None]:
!nvidia-smi

Fri May  7 04:50:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    27W /  70W |   1822MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!python -m nltk.downloader -d /usr/share/nltk_data wordnet

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import packages

In [None]:
import gensim
import gensim.downloader as api
import nltk
from nltk.corpus import wordnet

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import urllib.request
import zipfile

#wiki_embeddings = api.load('glove-wiki-gigaword-100')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Dataset and preprocessing

In [None]:
########## download data ###########
def download_data(corpus):
    """Download and unpack the Cornell Movie-Dialogs corpus unless it is already present.

    Args:
        corpus: Directory the archive is extracted into. If this path already
            exists, nothing is downloaded.

    The archive is fetched into the current working directory and removed again
    after extraction, including when extraction raises.
    """
    if os.path.exists(corpus):
        return

    archive_url = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'
    archive_name = 'cornell_movie_dialogs_corpus.zip'

    print('Downloading data ...')
    urllib.request.urlretrieve(archive_url, archive_name)
    try:
        with zipfile.ZipFile(archive_name, 'r') as zip_ref:
            zip_ref.extractall(corpus)
    finally:
        # Do not leave the archive behind when extraction fails.
        os.remove(archive_name)
corpus = 'cornell movie-dialogs corpus'
download_data(corpus)

In [None]:
##### See some examples of the original data file #####
# The archive unpacks into a folder of the same name inside the target folder.
# Derive the folder name with basename so that re-running this cell yields the
# same path instead of nesting it one level deeper on every run.
corpus_dirname = os.path.basename(corpus)
corpus = os.path.join(corpus_dirname, corpus_dirname)
print(corpus)

def printLines(file, n=10):
    """Print the first `n` lines of `file`, each as a raw bytes literal.

    Args:
        file: Path of the file to preview; it is opened in binary mode.
        n: Number of leading lines to print. `None` prints every line; a
            negative value keeps slice semantics (all but the last `-n` lines).
    """
    with open(file, 'rb') as datafile:
        if n is not None and n < 0:
            # islice rejects negative bounds, so fall back to list slicing here.
            head = datafile.readlines()[:n]
        else:
            # Read only the lines that are actually printed, not the whole file.
            head = list(itertools.islice(datafile, n))
    for line in head:
        print(line)
printLines(os.path.join(corpus, "movie_lines.txt"))

cornell movie-dialogs corpus/cornell movie-dialogs corpus
b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


In [None]:
def loadLines(fileName, fields):
    """Parse a ' +++$+++ '-separated file into a dict of records keyed by line id.

    Args:
        fileName: Path of the file to read (decoded as iso-8859-1).
        fields: Field names, in column order; must include 'lineID'.

    Returns:
        Dict mapping each record's 'lineID' to a dict of field name -> raw
        column value. The last column keeps its trailing newline.

    Raises:
        IndexError: If a line has fewer columns than `fields`.
    """
    records_by_id = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for raw_line in f:
            columns = raw_line.split(' +++$+++ ')
            record = {name: columns[position] for position, name in enumerate(fields)}
            records_by_id[record['lineID']] = record
    return records_by_id

MOVIE_LINES_FIELDS = ['lineID', 'characterID', 'movieID', 'character', 'text']
lines = loadLines(os.path.join(corpus, 'movie_lines.txt'), MOVIE_LINES_FIELDS)
print('one sample of lines dictionary output =\n', lines['L60017'])

one sample of lines dictionary output =
 {'lineID': 'L60017', 'characterID': 'u412', 'movieID': 'm25', 'character': 'BRIAN', 'text': 'Lucky guess.  And a case of scotch to a captain in station assignments.\n'}


In [None]:
printLines(os.path.join(corpus, "movie_conversations.txt"))

b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']\n"


Build a list of all conversations, attaching the full line records to each one.

In [None]:
def loadConversations(fileName, lines, fields):
    """Parse the conversations file and attach the referenced line records.

    Args:
        fileName: Path of the ' +++$+++ '-separated conversations file
            (decoded as iso-8859-1).
        lines: Dict mapping line id -> line record, as built by `loadLines`.
        fields: Field names, in column order; must include 'utteranceIDs'.

    Returns:
        List of dicts, one per input line, holding the raw column values plus
        a 'lines' entry: the records from `lines` for every id found in the
        'utteranceIDs' column, in order of appearance.

    Raises:
        IndexError: If a line has fewer columns than `fields`.
        KeyError: If an utterance id is missing from `lines`.
    """
    # Compiled once here instead of once per input line.
    utterance_id_pattern = re.compile('L[0-9]+')
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(' +++$+++ ')
            convObj = {field: values[i] for i, field in enumerate(fields)}
            lineIds = utterance_id_pattern.findall(convObj['utteranceIDs'])
            convObj['lines'] = [lines[lineId] for lineId in lineIds]
            conversations.append(convObj)
    return conversations

MOVIE_CONVERSATIONS_FIELDS = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']
conversations = loadConversations(os.path.join(corpus, 'movie_conversations.txt'),
                                         lines, MOVIE_CONVERSATIONS_FIELDS)

print('one sample of conversations list output =\n', conversations[10])

one sample of conversations list output =
 {'character1ID': 'u0', 'character2ID': 'u2', 'movieID': 'm0', 'utteranceIDs': "['L367', 'L368']\n", 'lines': [{'lineID': 'L367', 'characterID': 'u2', 'movieID': 'm0', 'character': 'CAMERON', 'text': 'How do you get your hair to look like that?\n'}, {'lineID': 'L368', 'characterID': 'u0', 'movieID': 'm0', 'character': 'BIANCA', 'text': "Eber's Deep Conditioner every two days. And I never, ever use a blowdryer without the diffuser attachment.\n"}]}


Extract question answer pairs

In [None]:
def extractSentencePairs(conversations):
    """Turn every two consecutive utterances of a conversation into a pair.

    Args:
        conversations: Iterable of dicts whose 'lines' entry is a list of
            records with a 'text' field.

    Returns:
        List of [input, target] string pairs with surrounding whitespace
        stripped. Pairs in which either side is empty are left out.
    """
    qa_pairs = []
    for conversation in conversations:
        utterances = conversation['lines']
        # Pair each utterance with the one that follows it.
        for current, following in zip(utterances, utterances[1:]):
            question = current['text'].strip()
            answer = following['text'].strip()
            if question and answer:
                qa_pairs.append([question, answer])
    return qa_pairs
qa_pairs = extractSentencePairs(conversations)
print('one sample of question answer pairs list output =\n', qa_pairs[50])


one sample of question answer pairs list output =
 ['Is he oily or dry?', "Combination.  I don't know -- I thought he'd be different.  More of a gentleman..."]


Write the extracted sentence pairs to a tab-separated file, `formatted_movie_lines.txt`.

In [None]:
def extract_conversations(corpus):
    """Create the tab-separated sentence-pair file once and preview it.

    Args:
        corpus: Directory containing 'movie_lines.txt' and
            'movie_conversations.txt'; the formatted file is written here too.

    Returns:
        Path of 'formatted_movie_lines.txt' inside `corpus`.

    If the formatted file already exists it is reused as is. Its first lines
    are printed in either case.
    """
    print('Extracting conversations ...')
    datafile = os.path.join(corpus, 'formatted_movie_lines.txt')
    if not os.path.exists(datafile):
        line_fields = ['lineID', 'characterID', 'movieID', 'character', 'text']
        conversation_fields = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']

        print('\nProcessing corpus ... ')
        lines = loadLines(os.path.join(corpus, 'movie_lines.txt'), line_fields)
        print('\nLoading conversations ...')
        conversations = loadConversations(os.path.join(corpus, 'movie_conversations.txt'),
                                          lines, conversation_fields)

        print('\nWriting newly formatted file ...')
        with open(datafile, 'w', encoding='utf-8') as outputfile:
            # One pair per row: input sentence, a tab, then the target sentence.
            writer = csv.writer(outputfile, delimiter='\t', lineterminator='\n')
            for pair in extractSentencePairs(conversations):
                writer.writerow(pair)
    else:
        print('Formatted file was saved before')

    print('\nSample lines from file:')
    printLines(datafile)
    print('')
    return datafile
datafile = extract_conversations(corpus)

Extracting conversations ...
Formatted file was saved before

Sample lines from file:
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tS

In [None]:
#### global variables
# Reserved vocabulary indices; Voc pre-populates index2word with these three.
PAD_token = 0  # fill value used when padding sequences to a common length
SOS_token = 1  # first decoder input during training
EOS_token = 2  # appended to every sentence when it is converted to indices
# Pairs are kept only if both sentences have fewer than MAX_LENGTH tokens;
# it is also the length passed to the searcher at evaluation time.
MAX_LENGTH = 10
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Vocabulary class

In [None]:
class Voc:
    """Vocabulary mapping words to indices and back, with occurrence counts.

    Indices 0-2 are reserved for the PAD, SOS and EOS tokens, so the first
    real word receives index 3.
    """

    def __init__(self, name):
        """Create an empty vocabulary called `name`."""
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:'PAD', SOS_token:'SOS', EOS_token:'EOS'}
        self.num_words = 3

    def addSentence(self, sentence):
        """Register every space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.num_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.num_words = new_index + 1

    def trim(self, min_count):
        """Drop words seen fewer than `min_count` times; runs at most once.

        The kept words are re-registered from scratch, so they get new
        indices and every count restarts at 1.
        """
        if self.trimmed:
            return

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total_words = len(self.word2index)
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, len(keep_words) / total_words
        ))

        # Reset all mappings, then rebuild them from the surviving words.
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:'PAD', SOS_token:'SOS', EOS_token:'EOS'}
        self.num_words = 3
        for word in keep_words:
            self.addWord(word)

        self.trimmed = True

In [None]:
########## String processing  ###########
def unicodeToAscii(s):
    """Return `s` with combining marks (e.g. accents) removed.

    The string is NFD-decomposed and every character of Unicode category
    'Mn' is dropped; all other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def normalizeString(s):
    """Lowercase `s`, remove accents and punctuation, and keep only a-z words.

    Args:
        s: Raw sentence.

    Returns:
        The sentence as lowercase ASCII letters separated by single spaces,
        with no leading or trailing whitespace.
    """
    s = unicodeToAscii(s.lower().strip())
    # Delete everything that is neither a word character nor whitespace.
    # This removes all punctuation, '.', '!' and '?' included, without
    # inserting a space ("it's" -> "its").
    s = re.sub(r'[^\w\s]', '', s)
    # Collapse each remaining run of non-letters (digits, underscores,
    # whitespace, non-ASCII letters) into one space.
    s = re.sub(r"[^a-zA-Z]+", r" ", s)
    return s.strip()

def readVocs(datafile, corpus_name):
    """Read the tab-separated pair file and create an empty vocabulary.

    Args:
        datafile: Path of the UTF-8 file with tab-separated sentences per line.
        corpus_name: Name given to the new `Voc`.

    Returns:
        Tuple `(voc, pairs)`: an empty `Voc` and a list holding, for each
        line, the list of its normalized tab-separated fields.
    """
    print('Reading lines ...')
    # Context manager so the file handle is closed once the text is read.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

def filterPair(p):
    """Return True when both sentences of `p` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces. The second sentence is
    only inspected when the first one is short enough.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    """Return the pairs accepted by `filterPair`, in their original order."""
    return list(filter(filterPair, pairs))

def loadPrepareData(corpus, corpus_name, datafile):
    """Load the sentence pairs, drop over-long ones, and build the vocabulary.

    Args:
        corpus: Not used by this function.
        corpus_name: Name given to the vocabulary.
        datafile: Path of the tab-separated sentence-pair file.

    Returns:
        Tuple `(voc, pairs)`: the populated `Voc` and the length-filtered pairs.
    """
    print('Start preparing training data ...')
    voc, all_pairs = readVocs(datafile, corpus_name)
    print('Read {!s} sentence pairs'.format(len(all_pairs)))
    short_pairs = filterPairs(all_pairs)
    print('Trimmed to {!s} sentence pairs'.format(len(short_pairs)))
    print('Counting words ...')
    for pair in short_pairs:
        # Only the input and target sentences feed the vocabulary.
        for sentence in pair[:2]:
            voc.addSentence(sentence)
    print('Counted words:', voc.num_words)
    return voc, short_pairs
corpus_name = 'cornell movie-dialogs corpus'
#### load conversations and make tensors
voc, pairs = loadPrepareData(corpus, corpus_name, datafile)
# print('\npairs:')
# for pair in pairs[:10]:
#     print(pair)
# Preview only the first few entries of each mapping: the vocabulary holds
# thousands of words and printing it whole floods the cell output.
preview_size = 10
print("index2word =", dict(itertools.islice(voc.index2word.items(), preview_size)))
print("word2count =", dict(itertools.islice(voc.word2count.items(), preview_size)))
print("word2index =", dict(itertools.islice(voc.word2index.items(), preview_size)))

Start preparing training data ...
Reading lines ...
Read 221282 sentence pairs
Trimmed to 92244 sentence pairs
Counting words ...
Counted words: 26983


To drop rare words, change `MIN_COUNT` below: words that occur fewer than `MIN_COUNT` times are removed from the vocabulary, together with every sentence pair that contains one of them.

In [None]:
def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from `voc` and drop pairs that use a removed word.

    Args:
        voc: Vocabulary object; `voc.trim(MIN_COUNT)` is called on it first.
        pairs: List of [input, target] sentence pairs.
        MIN_COUNT: Threshold passed to `voc.trim`.

    Returns:
        The pairs whose two sentences consist only of words in `voc.word2index`.
    """
    voc.trim(MIN_COUNT)

    def only_known_words(sentence):
        return all(token in voc.word2index for token in sentence.split(' '))

    keep_pairs = []
    for pair in pairs:
        question, answer = pair[0], pair[1]
        # The target is only checked when the input passed.
        if only_known_words(question) and only_known_words(answer):
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs
MIN_COUNT = 3
pairs = trimRareWords(voc, pairs, MIN_COUNT)
#print(pairs)
#print(len(voc.index2word.keys()))

keep_words 11140 / 26980 = 0.4129
Trimmed from 92244 pairs to 75192, 0.8151 of total


# Adversarial attack
### Run this section only if you want to generate adversarial attacks

In [None]:
def tokenized_sentences():
  """Tokenize both sentences of every entry in the global `pairs`.

  Returns a flat list of token lists, ordered input then target for each
  pair, as produced by `gensim.utils.simple_preprocess`.
  """
  return [
      gensim.utils.simple_preprocess(pair[side])
      for pair in pairs
      for side in (0, 1)
  ]
#tokenized_sentences()[0:10]

In [None]:
def w2v_model_gen():
    """Train a Word2Vec model on the tokenized sentence pairs.

    Returns:
        A `gensim.models.Word2Vec` model with 50-dimensional vectors, a
        context window of 5 and `min_count=1`.
    """
    sentences = tokenized_sentences()
    shared_kwargs = dict(
        window=5,     # context words considered before/after a word
        min_count=1,  # keep words that occur at least once in the corpus
    )
    try:
        # gensim >= 4.0 names the embedding dimension `vector_size`.
        return gensim.models.Word2Vec(sentences, vector_size=50, **shared_kwargs)
    except TypeError:
        # Older gensim releases only know the original `size` keyword.
        return gensim.models.Word2Vec(sentences, size=50, **shared_kwargs)
def sim_words_corpus_gen(word,w2v_model):
  """Return `w2v_model.wv.most_similar(word)` for the corpus-trained model."""
  return w2v_model.wv.most_similar(word)
#w2v_model = w2v_model_gen()
#sim_words_corpus_gen('want',w2v_model)

In [None]:
def sim_words_wiki(word, wiki_embeddings):
  """Return the words most similar to `word` in pre-trained embeddings.

  Args:
    word: Word to look up.
    wiki_embeddings: Either a model exposing its vectors as `.wv`, or the
      keyed vectors themselves.

  Returns:
    The result of `most_similar(word)`, or None when `word` is unknown.
  """
  # Accept both a full model (vectors under `.wv`) and bare keyed vectors.
  vectors = getattr(wiki_embeddings, 'wv', wiki_embeddings)
  # Membership via `in` avoids the `.vocab` attribute, which newer gensim
  # releases no longer provide.
  if word in vectors:
    return vectors.most_similar(word)
  return None
#sim_words_wiki('phone', wiki_embeddings)

In [None]:
def find_synonyms(word):
  """Collect WordNet lemma names for `word` that differ from `word` itself.

  Lemmas are gathered across all synsets in order; duplicates are kept.
  """
  return [
      lemma.name()
      for synset in wordnet.synsets(word)
      for lemma in synset.lemmas()
      if lemma.name() != word
  ]
#find_synonyms('good')

In [None]:
def make_sen_tokenized(sen):
  """Remove English stop words from `sen` and tokenize what is left.

  Returns the token list produced by `gensim.utils.simple_preprocess`.
  """
  # Load the stop-word list once per call (it was reloaded for every word
  # before) and use a set for fast membership tests.
  stop_words = set(nltk.corpus.stopwords.words('english'))
  content_text = ' '.join(word for word in sen.split() if word not in stop_words)
  return gensim.utils.simple_preprocess(content_text)
def select_random_word(sen):
  """Pick a random non-stop-word token of `sen`.

  Returns `(word, index)`, where `index` is the position of the word's first
  occurrence in the token list from `make_sen_tokenized`, or None when that
  list is empty.
  """
  tokens = make_sen_tokenized(sen)
  if len(tokens) == 0:
    return None
  chosen = random.choice(tokens)
  return chosen, tokens.index(chosen)
    
#select_random_word(pairs[50][0])

Replace Random word with a random word from vocabulary

In [None]:
def replace_random_word_of_sen_with_rand(sen):
  """Build attacked sentences by swapping one word for random vocabulary words.

  One token of the stop-word-filtered sentence is chosen at random and
  replaced, in turn, by 10 words drawn with replacement from the global
  `voc`. Progress is printed along the way.

  Returns:
    List of 10 attacked sentences, or None when the sentence has no
    eligible token.
  """
  rand_word_index = select_random_word(sen)
  print("----------------------------------")
  print("random word and its index in the sentence =", rand_word_index)
  if not rand_word_index:
    return None

  rand_index = rand_word_index[1]
  token_sen = make_sen_tokenized(sen)
  print("original sentence=", ' '.join(token_sen))
  random_words_from_voc = random.choices(list(voc.word2index.keys()), k=10)
  print("selected random words from vocabulary =", random_words_from_voc)

  list_adv_sens_rand = []
  for random_word_voc in random_words_from_voc:
    # The same position is overwritten for every replacement word.
    token_sen[rand_index] = random_word_voc
    list_adv_sens_rand.append(' '.join(token_sen))
  print("attacked sentences =", list_adv_sens_rand)
  return list_adv_sens_rand
print(voc.word2index.keys())
for i in range(30,40):
  replace_random_word_of_sen_with_rand(pairs[i][0])

----------------------------------
random word and its index in the sentence = None
----------------------------------
random word and its index in the sentence = ('babe', 2)
original sentence= like total babe
selected random words from vocabulary = ['heats', 'dyer', 'uhura', 'foley', 'episode', 'que', 'mississippi', 'cucamonga', 'stream', 'night']
attacked sentences = ['like total heats', 'like total dyer', 'like total uhura', 'like total foley', 'like total episode', 'like total que', 'like total mississippi', 'like total cucamonga', 'like total stream', 'like total night']
----------------------------------
random word and its index in the sentence = ('joey', 1)
original sentence= hate joey
selected random words from vocabulary = ['resetting', 'mcquire', 'higher', 'opportunity', 'sorry', 'brace', 'status', 'scrunchie', 'youd', 'appeal']
attacked sentences = ['hate resetting', 'hate mcquire', 'hate higher', 'hate opportunity', 'hate sorry', 'hate brace', 'hate status', 'hate scrunchi

Replace random word with synonym

In [None]:
def replace_random_word_of_sen_with_syn(sen):
  """Build attacked sentences by swapping one word for its WordNet synonyms.

  One token of the stop-word-filtered sentence is chosen at random; every
  synonym of it that is also in the global `voc` yields one attacked
  sentence. Progress is printed along the way.

  Returns:
    List of attacked sentences (possibly empty), or None when the sentence
    has no eligible token.
  """
  rand_word_index = select_random_word(sen)
  print("----------------------------------")
  print("random word and its index in the sentence =", rand_word_index)
  if not rand_word_index:
    return None

  rand_word, rand_index = rand_word_index[0], rand_word_index[1]
  syn_words = find_synonyms(rand_word)
  print(syn_words)
  if syn_words is None:
    return None

  # Keep only synonyms the model's vocabulary knows.
  syn_words_corpus = [syn_word for syn_word in syn_words if syn_word in voc.word2index]
  print("synonym words that are in corpus =", syn_words_corpus)
  token_sen = make_sen_tokenized(sen)
  print("original sentence=", ' '.join(token_sen))

  list_adv_sens_syn = []
  for syn_word_corpus in syn_words_corpus:
    token_sen[rand_index] = syn_word_corpus
    list_adv_sens_syn.append(' '.join(token_sen))
  print("attacked sentences =", list_adv_sens_syn)
  return list_adv_sens_syn

for i in range(30,40):
  replace_random_word_of_sen_with_syn(pairs[i][0])

----------------------------------
random word and its index in the sentence = None
----------------------------------
random word and its index in the sentence = ('like', 0)
['the_like', 'the_likes_of', 'ilk', 'wish', 'care', 'similar', 'same', 'alike', 'similar', 'comparable', 'corresponding']
synonym words that are in corpus = ['wish', 'care', 'similar', 'same', 'alike', 'similar']
original sentence= like total babe
attacked sentences = ['wish total babe', 'care total babe', 'similar total babe', 'same total babe', 'alike total babe', 'similar total babe']
----------------------------------
random word and its index in the sentence = ('hate', 0)
['hatred', 'detest']
synonym words that are in corpus = ['detest']
original sentence= hate joey
attacked sentences = ['detest joey']
----------------------------------
random word and its index in the sentence = ('different', 1)
['unlike', 'dissimilar']
synonym words that are in corpus = []
original sentence= back different story
attacked se

Replace random word with a similar word from vocabulary


In [None]:
def replace_random_word_of_sen_with_sim(sen,w2v_model):
  """Build attacked sentences by swapping one word for Word2Vec neighbours.

  One token of the stop-word-filtered sentence is chosen at random and
  replaced, in turn, by each word returned by `sim_words_corpus_gen` for it.
  Progress is printed along the way.

  Args:
    sen: Sentence to attack.
    w2v_model: Word2Vec model used to look up similar words.

  Returns:
    List of attacked sentences, or None when the sentence has no eligible
    token or no similar words were found.
  """
  rand_word_index = select_random_word(sen)
  print("----------------------------------")
  print("random word and its index in the sentence =", rand_word_index)
  if not rand_word_index:
    return None

  rand_word, rand_index = rand_word_index[0], rand_word_index[1]
  sim_words = sim_words_corpus_gen(rand_word,w2v_model)
  print("similar words to the randmly selected word =", sim_words)
  if len(sim_words) == 0:
    return None

  token_sen = make_sen_tokenized(sen)
  print("original sentence=", ' '.join(token_sen))
  list_adv_sens_sim = []
  for sim_word in sim_words:
    # Each entry is a (word, similarity) tuple; only the word is used.
    token_sen[rand_index] = sim_word[0]
    list_adv_sens_sim.append(' '.join(token_sen))
  print("attacked sentences =", list_adv_sens_sim)
  return list_adv_sens_sim

w2v_model = w2v_model_gen()
for i in range(30,40):
  replace_random_word_of_sen_with_sim(pairs[i][0],w2v_model)

----------------------------------
random word and its index in the sentence = None
----------------------------------
random word and its index in the sentence = ('like', 0)
similar words to the randmly selected word = [('alike', 0.6980416774749756), ('rather', 0.635463297367096), ('indicate', 0.6013042330741882), ('hunk', 0.6005441546440125), ('mature', 0.592094361782074), ('want', 0.5866808891296387), ('remember', 0.5833290219306946), ('love', 0.5831726789474487), ('appreciate', 0.568580687046051), ('hate', 0.5570109486579895)]
original sentence= like total babe
attacked sentences = ['alike total babe', 'rather total babe', 'indicate total babe', 'hunk total babe', 'mature total babe', 'want total babe', 'remember total babe', 'love total babe', 'appreciate total babe', 'hate total babe']
----------------------------------
random word and its index in the sentence = ('hate', 0)
similar words to the randmly selected word = [('liked', 0.9249138236045837), ('loved', 0.9177319407463074)

## Training the Model

In [None]:
########## String to tensor  ###########
def indexesFromSentence(voc, sentence):
    """Map each space-separated word of `sentence` to its index, then append EOS.

    Raises:
        KeyError: If a word is not in `voc.word2index`.
    """
    indexes = [voc.word2index[token] for token in sentence.split(' ')]
    indexes.append(EOS_token)
    return indexes

def zeroPadding(l):
    """Transpose a batch of index lists, padding short ones with PAD_token.

    Returns a list of tuples: entry `t` holds position `t` of every sequence.
    """
    columns = itertools.zip_longest(*l, fillvalue=PAD_token)
    return list(columns)

def binaryMatrix(l):
    """Return a 0/1 matrix shaped like `l`: 0 where the token is PAD_token, else 1."""
    return [[0 if token == PAD_token else 1 for token in seq] for seq in l]

def inputVar(l, voc):
    """Convert input sentences into a padded index tensor plus their lengths.

    Returns:
        Tuple `(padVar, lengths)`: a LongTensor built from the padded,
        transposed index lists, and a tensor with each sequence's length
        (EOS included).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor(list(map(len, indexes_batch)))
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths

def outputVar(l, voc):
    """Convert target sentences into a padded index tensor and a padding mask.

    Returns:
        Tuple `(padVar, mask, max_target_len)`: the padded LongTensor, a
        BoolTensor that is False at padded positions, and the length of the
        longest target (EOS included).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(len(indexes) for indexes in indexes_batch)
    padList = zeroPadding(indexes_batch)
    mask = torch.BoolTensor(binaryMatrix(padList))
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

def batch2TrainData(voc, pair_batch):
    """Turn a batch of sentence pairs into the tensors consumed by `train`.

    `pair_batch` is sorted in place by input length (word count), longest
    first.

    Returns:
        Tuple `(inp, lengths, output, mask, max_target_len)`.
    """
    pair_batch.sort(key=lambda pair: len(pair[0].split(' ')), reverse=True)
    input_batch = [pair[0] for pair in pair_batch]
    output_batch = [pair[1] for pair in pair_batch]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

def show_some_tensors():
    """Print the tensors built from 5 randomly chosen pairs of the global `pairs`."""
    small_batch_size = 5
    sample = [random.choice(pairs) for _ in range(small_batch_size)]
    input_variable, lengths, target_variable, mask, max_target_len = batch2TrainData(voc, sample)

    labelled_values = (
        ("input_variable:", input_variable),
        ("lengths:", lengths),
        ("target_variable:", target_variable),
        ("mask:", mask),
        ("max_target_len:", max_target_len),
    )
    for label, value in labelled_values:
        print(label, value)




########## training and evaluation  ###########
def maskNLLLoss(inp, target, mask):
    """Average negative log-likelihood over the positions selected by `mask`.

    Args:
        inp: Decoder output; column `target[i]` of row `i` is gathered, so it
            is presumably (batch, vocab) probabilities - TODO confirm.
        target: Target indices, one per row of `inp`.
        mask: Boolean tensor marking which rows count towards the loss.

    Returns:
        Tuple `(loss, n)`: the mean loss (moved to `device`) and the number
        of selected positions as a Python number.
    """
    nTotal = mask.sum()
    target_column = target.view(-1, 1)
    picked = torch.gather(inp, 1, target_column).squeeze(1)
    crossEntropy = -torch.log(picked)
    loss = crossEntropy.masked_select(mask).mean().to(device)
    return loss, nTotal.item()



# def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
         # encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
         encoder_optimizer, decoder_optimizer, batch_size, clip, teacher_forcing_ratio):
    """Run one optimization step on a single batch.

    Args:
        input_variable, lengths: Padded input batch and its sequence lengths.
        target_variable, mask, max_target_len: Padded targets, their padding
            mask and the number of decoding steps.
        encoder, decoder: Models being trained.
        embedding: Not used by this function.
        encoder_optimizer, decoder_optimizer: Optimizers stepped at the end.
        batch_size: Number of sequences in the batch.
        clip: Maximum gradient norm, applied to encoder and decoder separately.
        teacher_forcing_ratio: Probability of feeding the ground-truth token,
            rather than the decoder's own top prediction, for the whole batch.

    Returns:
        The loss averaged over all non-padded target tokens.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    loss = 0
    print_losses = []
    n_totals = 0

    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Every sequence starts decoding from the SOS token: shape (1, batch_size).
    decoder_input = torch.LongTensor([[SOS_token] * batch_size]).to(device)

    # Initialize the decoder with the first `decoder.n_layers` encoder states.
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decided once per batch, not once per time step.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
        if use_teacher_forcing:
            # Next input is the ground-truth token.
            decoder_input = target_variable[t].view(1, -1)
        else:
            # Next input is the decoder's own most likely token.
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        # Weight each step by its token count so the final average is per token.
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    loss.backward()
    nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_losses)/n_totals


def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
              embedding, encoder_n_layers, decoder_n_layers, hidden_size, save_dir, n_iteration, batch_size,
              print_every, save_every, clip, corpus_name, checkpoint_iter, teacher_forcing_ratio):
    """Train for `n_iteration` iterations, logging and checkpointing on the way.

    Each iteration draws `batch_size` pairs at random (with replacement) from
    `pairs` and runs one `train` step. The average loss is printed every
    `print_every` iterations and a checkpoint is written every `save_every`
    iterations to
    `save_dir/model_name/corpus_name/<enc_layers>-<dec_layers>_<hidden>/<iteration>_checkpoint.tar`.

    Args:
        checkpoint_iter: If truthy, training resumes at `checkpoint_iter + 1`.
        (The remaining arguments are forwarded to `train` or used to name and
        fill the checkpoint.)
    """
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if checkpoint_iter:
        start_iteration = checkpoint_iter + 1
    print('Training ...')
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                    decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip,
                    teacher_forcing_ratio)
        print_loss += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if iteration % save_every == 0:
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                # Misspelled key kept so code written against older
                # checkpoints still finds the iteration number.
                'itereation': iteration,
                'en':encoder.state_dict(),
                'de':decoder.state_dict(),
                'en_opt':encoder_optimizer.state_dict(),
                'de_opt':decoder_optimizer.state_dict(),
                'loss':loss,
                'voc_dict':voc.__dict__,
                'embedding':embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))




# def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
def evaluate(encoder, decoder, searcher, voc, sentence):
    """Decode a reply to `sentence` with `searcher` and return it as words.

    Args:
        encoder, decoder: Not used directly by this function.
        searcher: Callable taking `(input_batch, lengths, max_length)` and
            returning `(tokens, scores)`.
        voc: Vocabulary used to index the sentence and to decode the tokens.
        sentence: Normalized input sentence.

    Raises:
        KeyError: If the sentence contains a word missing from `voc`.
    """
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch]).to(device)
    # Transposed so that the sequence dimension comes first.
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1).to(device)
    tokens, scores = searcher(input_batch, lengths, MAX_LENGTH)
    return [voc.index2word[token.item()] for token in tokens]

def evaluateInput(encoder, decoder, searcher, voc):
    """Chat with the model interactively until the user types 'q' or 'quit'.

    Each line is normalized, decoded with `evaluate`, and printed without
    'EOS'/'PAD' tokens. Words missing from the vocabulary produce an error
    message instead of ending the loop.
    """
    while True:
        try:
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break
            input_sentence = normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            output_words = [word for word in output_words if word not in ('EOS', 'PAD')]
            print('Bot: ', ' '.join(output_words))
        except KeyError:
            print('Error: Encountered unknown word.')

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded input sequences."""

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        """
        Args:
            hidden_size: GRU input and hidden size (the embedding must
                produce vectors of this size).
            embedding: Embedding module applied to the input indices.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout between GRU layers; ignored when n_layers == 1.
        """
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        gru_dropout = 0 if n_layers == 1 else dropout
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=gru_dropout, bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden = None):
        """Encode a padded batch.

        Returns:
            Tuple `(outputs, hidden)`: the forward and backward GRU outputs
            summed into `hidden_size` features, and the final GRU hidden state.
        """
        embedded = self.embedding(input_seq)
        # pack_padded_sequence needs the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu())
        packed_outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)
        # First half of the feature axis is one direction, second half the other.
        first_half = outputs[:, :, :self.hidden_size]
        second_half = outputs[:, :, self.hidden_size:]
        return first_half + second_half, hidden


# Luong attention layer
class Attn(nn.Module):
    """Attention layer supporting the ``'dot'``, ``'general'`` and ``'concat'`` scores.

    ``forward`` turns the selected score into weights by transposing the 2-D
    score matrix, applying a softmax over dim 1 and inserting a singleton
    dimension at position 1.
    """

    def __init__(self, method, hidden_size):
        """Validate ``method`` and create the parameters that score needs.

        Raises:
            ValueError: If ``method`` is not one of ``'dot'``, ``'general'``,
                ``'concat'``.
        """
        super().__init__()
        self.method = method
        if method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, 'is not an appropriate attention method.')
        self.hidden_size = hidden_size
        if method == 'general':
            self.attn = nn.Linear(hidden_size, hidden_size)
        elif method == 'concat':
            self.attn = nn.Linear(hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialised memory (possibly
            # NaN/inf), so give the scoring vector an explicit small init.
            bound = 1.0 / max(hidden_size, 1) ** 0.5
            self.v = nn.Parameter(torch.empty(hidden_size))
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Score = sum over dim 2 of ``hidden * encoder_output``."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score = sum over dim 2 of ``hidden * Linear(encoder_output)``."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    # Backward-compatible alias for the original, misspelled method name.
    genereal_socre = general_score

    def concat_score(self, hidden, encoder_output):
        """Score = sum over dim 2 of ``v * tanh(Linear([hidden; encoder_output]))``."""
        # Repeat hidden along dim 0 so it lines up with every encoder position.
        expanded_hidden = hidden.expand(encoder_output.size(0), -1, -1)
        energy = self.attn(torch.cat((expanded_hidden, encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights for ``hidden`` against ``encoder_outputs``.

        Both inputs are treated as 3-D tensors whose last dimension is summed
        away by the score functions (presumably ``(seq_len, batch, hidden)``
        for ``encoder_outputs`` - confirm against the caller).
        """
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        else:  # 'dot' - the constructor rejects anything else
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Swap the two score dimensions so the softmax (dim=1) runs over what
        # was dim 0 of the scores.
        attn_energies = attn_energies.t()

        return F.softmax(attn_energies, dim=1).unsqueeze(1)



class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        """Record the configuration and build the decoder sub-modules.

        Args:
            attn_model: Attention method name forwarded to ``Attn``.
            embedding: Embedding module applied to the input step.
            hidden_size: Size of the GRU input and hidden state.
            output_size: Size of the final output layer.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout applied to the embedding and, when
                ``n_layers`` is not 1, inside the GRU.
        """
        super().__init__()
        # Plain configuration values.
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Sub-modules, created in the same order as before so parameter
        # initialisation consumes random numbers identically.
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(2 * hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            tuple: ``(output, hidden)`` where ``output`` is a softmax (dim 1)
            over the ``self.out`` projection and ``hidden`` is the GRU's new
            hidden state.
        """
        # Embed the current input and apply dropout to the embedding.
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        gru_out, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights times the (dim 0/1 swapped) encoder outputs give the context.
        weights = self.attn(gru_out, encoder_outputs)
        context = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)
        # Join GRU output and context, squash, then project and normalise.
        combined = torch.cat((gru_out.squeeze(0), context), 1)
        attentional = self.concat(combined).tanh()
        return F.softmax(self.out(attentional), dim=1), hidden


class GreedySearchDecoder(nn.Module):
    """Greedy decoder: at every step feed back the highest-scoring token."""

    def __init__(self, encoder, decoder):
        """Keep references to the encoder and decoder to drive."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode exactly ``max_length`` steps.

        Returns:
            tuple: ``(all_tokens, all_scores)`` - 1-D tensors holding the
            chosen token index and its score for every step.
        """
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Seed the decoder with the first ``decoder.n_layers`` slices of the
        # encoder's final hidden state.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        decoder_input = SOS_token * torch.ones(1, 1, device=device, dtype=torch.long)
        # Start from empty tensors so a zero-step decode still yields tensors
        # of the right dtype and device.
        token_chunks = [torch.zeros([0], device=device, dtype=torch.long)]
        score_chunks = [torch.zeros([0], device=device)]
        for _ in range(max_length):
            decoder_output, decoder_hidden = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            best_scores, best_tokens = torch.max(decoder_output, dim=1)
            token_chunks.append(best_tokens)
            score_chunks.append(best_scores)
            # The next input needs a leading dimension again.
            decoder_input = best_tokens.unsqueeze(0)
        return torch.cat(token_chunks, dim=0), torch.cat(score_chunks, dim=0)

# Start calling functions

In [None]:
print('Device to use:', device)

########## variables (no need to change anything else besides these) ##########
#### folder/file name
corpus = 'cornell movie-dialogs corpus'
corpus_name = 'cornell movie-dialogs corpus'
save_dir = 'save'

#### vocabulary trimming
MIN_COUNT = 3  # for trimming rare words

#### models configuration
model_name = 'cb_model'
attn_model = 'dot' # general, concat
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

#### training configuration
clip = 50.0
teacher_forcing_ratio = 0.5 # 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0  # decoder learning rate = learning_rate * this factor
n_iteration = 5000
print_every = 1000
save_every = 1000

#### continue training or new training
loadFilename = None
checkpoint_iter = None
#### if load from saved model, run the following
# checkpoint_iter = 38000
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))

########## end of variables ##########




#### download data and do extract conversations
download_data(corpus)
# From here on `corpus` is the nested '<corpus>/<corpus>' path.
corpus = os.path.join(corpus, corpus)
datafile = extract_conversations(corpus)

#### load conversations and make tensors
voc, pairs = loadPrepareData(corpus, corpus_name, datafile)
# print('\npairs:')
# for pair in pairs[:10]:
#     print(pair)

pairs = trimRareWords(voc, pairs, MIN_COUNT)
# show_some_tensors()

#### build models
print('Building encoder and decoder ...')
# One embedding instance is shared by the encoder and the decoder.
embedding = nn.Embedding(voc.num_words, hidden_size)
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

encoder.train()
decoder.train()

if loadFilename:
    # map_location=device remaps the stored tensors onto the active device,
    # so a checkpoint saved on GPU also loads on a CPU-only runtime.
    checkpoint = torch.load(loadFilename, map_location=device)

    voc.__dict__ = checkpoint['voc_dict']
    embedding.load_state_dict(checkpoint['embedding'])
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])
    encoder_optimizer.load_state_dict(checkpoint['en_opt'])
    decoder_optimizer.load_state_dict(checkpoint['de_opt'])

    # Keep restored optimizer state on the same device as the models
    # (previously an unconditional .cuda(), which fails without a GPU).
    for optimizer in (encoder_optimizer, decoder_optimizer):
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

print('Starting Training!')
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, hidden_size,
          save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, checkpoint_iter,
          teacher_forcing_ratio)

NameError: ignored

In [None]:
# Restore the trained encoder/decoder weights from a saved checkpoint.
# map_location=device remaps the stored tensors onto the active device, so a
# checkpoint written on GPU can also be loaded on a CPU-only runtime.
checkpoint = torch.load('/content/drive/MyDrive/PhD/courses/ME592/project/save/cb_model/cornell movie-dialogs corpus/2-2_500/6000_checkpoint.tar', map_location=device)
encoder.load_state_dict(checkpoint['en'])
decoder.load_state_dict(checkpoint['de'])

<All keys matched successfully>

In [None]:
# Wrap the two networks in the greedy decoder used for chatting.
searcher = GreedySearchDecoder(encoder, decoder)

# Put both networks in evaluation mode before generating replies.
decoder.eval()
encoder.eval()

# Interactive loop; type 'q' or 'quit' to leave.
evaluateInput(encoder, decoder, searcher, voc)

> hello
Bot:  hi
> how are you
Bot:  im fine
> are you fine
Bot:  yes
> how are you doing
Bot:  im sorry
> do you like me
Bot:  yes
> do you hate me
Bot:  no
> do you love me
Bot:  yes
> are you ok
Bot:  im fine
> how many words do you know
Bot:  pupils
> can you chat
Bot:  sure


KeyboardInterrupt: ignored