# Chatbot with PyTorch

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals


import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import zipfile
# Select the compute device: CUDA when torch reports it as available, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

## Load File path of the data

In [2]:
# Corpus name and the folder under data/ that the corpus files are read from.
name="cornell_movie-dialogs_corpus"
corpus = os.path.join("data",name)

## Have a look at the Data files within the zip

The corpus contains `movie_characters_metadata.txt`, `movie_conversations.txt`, `movie_lines.txt` and `movie_titles_metadata.txt`; these are the files we will be working with the most.

In [3]:
# Path of the tab-separated file that the question/answer pairs are written to.
formatter_movies_lines_file = os.path.join(corpus,"formatted_lines.txt")
delimiter ="\t"
# Interpret any escape sequences contained in the delimiter string.
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Placeholders; both names are reassigned once the corpus files have been parsed.
lines={}
conversations=[]
# Column names, in file order, for movie_lines.txt and movie_conversations.txt.
LINE_FIELDs=["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

## Segregate lines

In [4]:
def loadAndSeperatelines(filename, fields):
    """Parse a " +++$+++ "-separated corpus file into records keyed by line ID.

    Args:
        filename: Path of the file to read (decoded as iso-8859-1).
        fields: Column names in file order; must include 'lineID'.

    Returns:
        dict mapping each record's 'lineID' value to a dict of
        {field name: raw column text}. The last column keeps its trailing
        newline because the line is split without being stripped.

    Raises:
        IndexError: If a non-blank line has fewer columns than ``fields``.
    """
    lines = {}
    with open(filename, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # Blank lines carry no record; skip them instead of failing on a missing column.
            if not line.strip():
                continue
            values = line.split(" +++$+++ ")
            # Build the complete record first, then register it exactly once
            # (previously the registration was repeated for every field).
            line_objects = {field: values[i] for i, field in enumerate(fields)}
            lines[line_objects['lineID']] = line_objects
    return lines
            
# Parse movie_lines.txt into a dict of line records keyed by lineID.
lines=loadAndSeperatelines(os.path.join(corpus, "movie_lines.txt"),LINE_FIELDs)

## Group lines based on conversations

In [5]:
def groupConversations(filename,lines,conversation_fields):
    """Read the conversations file and attach the referenced line records.

    Args:
        filename: Path of the " +++$+++ "-separated conversations file
            (decoded as iso-8859-1).
        lines: dict of line records keyed by line ID, as returned by
            loadAndSeperatelines.
        conversation_fields: Column names in file order; must include
            'utteranceIDs'.

    Returns:
        list of dicts, one per conversation, holding the raw column text for
        every field plus a 'lines' list with the line records named in
        'utteranceIDs', in the order they appear there.

    Raises:
        IndexError: If a non-blank row has fewer columns than
            ``conversation_fields``.
        KeyError: If a referenced line ID is missing from ``lines``.
    """
    # Compiled once: the pattern does not change between rows.
    pattern_id = re.compile('L[0-9]+')
    conversations = []
    with open(filename, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # Blank rows carry no conversation; skip them instead of failing on a missing column.
            if not line.strip():
                continue
            values = line.split(" +++$+++ ")
            conversation_object = {field: values[i] for i, field in enumerate(conversation_fields)}

            # 'utteranceIDs' is a textual list such as "['L1', 'L2']"; pull out the IDs.
            line_ids = pattern_id.findall(conversation_object["utteranceIDs"])
            conversation_object['lines'] = [lines[line_id] for line_id in line_ids]
            conversations.append(conversation_object)
    return conversations
# Group the parsed line records into conversations using movie_conversations.txt.
conversations=groupConversations(os.path.join(corpus,"movie_conversations.txt"),lines,MOVIE_CONVERSATIONS_FIELDS)

## Extract sentence pairs from conversations and write them to a file

In [6]:
def getSentances(conversations):
    """Turn conversations into [question, answer] pairs of consecutive lines.

    Each line of a conversation is paired with the line that follows it.
    Both texts are stripped of surrounding whitespace, and a pair is dropped
    when either side is empty after stripping.

    Args:
        conversations: iterable of dicts that each hold a 'lines' list of
            records with a 'text' entry.

    Returns:
        list of two-element lists ``[question, answer]``.
    """
    pairs = []
    for convo in conversations:
        utterances = convo['lines']
        # Walk adjacent (line, next line) couples.
        for current, following in zip(utterances, utterances[1:]):
            prompt = current['text'].strip()
            reply = following['text'].strip()
            if not (prompt and reply):
                continue
            pairs.append([prompt, reply])
    return pairs
            
            
# Write every [question, answer] pair as one delimiter-separated row of the formatted file.
with open(formatter_movies_lines_file, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    writer.writerows(getSentances(conversations))

In [7]:
# Print lines
# def printLines(file, n=10):
#     with open(file, 'rb') as datafile:
#         lines = datafile.readlines()
#     for line in lines[:n]:
#         print(line)
# printLines(formatter_movies_lines_file)

## Word to vector conversions

In [8]:
# Reserved vocabulary indices for the special tokens
PAD_token = 0   # Padding token used to fill short sentences
SOS_token = 1   # Start-of-sentence token
EOS_token = 2   # End-of-sentence token

In [9]:
class Word_to_Vector:
    """Vocabulary that maps words to integer indices and counts occurrences.

    Indices PAD_token, SOS_token and EOS_token are reserved for the special
    tokens; ordinary words receive consecutive indices starting at 3.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word_to_index = {}
        self.index_to_word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
        self.word_count = {}
        self.word_number = 3  # next free index; the three special tokens are already present
        self.trimmed = False

    def addSentance(self, sentance):
        """Register every space-separated word of ``sentance``."""
        for word in sentance.split(' '):
            self.convertWord(word)

    def convertWord(self, word):
        """Assign the next free index to a new word, or bump the count of a known one."""
        if word not in self.word_to_index:
            self.word_to_index[word] = self.word_number
            self.word_count[word] = 1
            self.index_to_word[self.word_number] = word
            self.word_number += 1
        else:
            self.word_count[word] += 1

    def trim(self, min_count):
        """Drop every word seen fewer than ``min_count`` times and re-index the rest.

        Runs at most once per instance; later calls return immediately.
        Surviving words keep their relative order, are renumbered from 3, and
        have their count reset to 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word_count.items() if count >= min_count]

        # Rebuild the tables once, after the full list of survivors is known.
        # (The rebuild used to sit inside the counting loop, so it was redone
        # for every vocabulary entry with the same final result.)
        self.word_to_index = {}
        self.index_to_word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
        self.word_count = {}
        self.word_number = 3

        for word in keep_words:
            self.convertWord(word)


## Preprocess text

In [10]:
# Strip accents from a string
def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed with Unicode NFD normalisation and every
    character whose category is 'Mn' is dropped, so accented letters are
    reduced to their base letter. Other characters are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [11]:
# Normalize a string: lower-case it, separate punctuation, drop non-letter characters
def normalize(s):
    """Return a cleaned, lower-case version of ``s``.

    Steps, in order: lower-case and strip, remove accents with
    unicodeToAscii, put a space before each '.', '!' or '?', replace every
    run of characters other than letters and '.', '!', '?' with one space,
    collapse whitespace runs, and strip the result.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # detach sentence punctuation from the preceding word
        (r"[^a-zA-Z.!?]+", r" "),    # anything that is not a letter or .!? becomes a space
        (r"\s+", r" "),              # collapse repeated whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [12]:
# Read the formatted pairs file and create an empty vocabulary for it
def readtext(datafile,corpus_name):
    """Load normalised sentence pairs from ``datafile``.

    Args:
        datafile: Path of a UTF-8 text file with one pair per line and
            tab-separated fields.
        corpus_name: Name handed to the new Word_to_Vector vocabulary.

    Returns:
        tuple ``(vecs, pairs)`` where ``vecs`` is a new, empty Word_to_Vector
        and ``pairs`` holds, for every line, the list of its tab-separated
        fields after normalize() has been applied to each.
    """
    # Read from the path that was passed in (it used to be ignored in favour of
    # a notebook global) and close the file as soon as it has been read.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalize(s) for s in l.split('\t')] for l in lines]

    vecs = Word_to_Vector(corpus_name)
    return vecs, pairs
    

In [13]:
# Upper bound (exclusive) on the number of space-separated words per sentence.
max_length=10

def filterPair(p):
    """Return True when both sentences of pair ``p`` have fewer than max_length words."""
    if len(p[0].split(' ')) >= max_length:
        return False
    return len(p[1].split(' ')) < max_length

def filterPairs(pairs):
    """Return the pairs from ``pairs`` that satisfy filterPair, in their original order."""
    return list(filter(filterPair, pairs))

def loadAndPrepareData(corpus, corpus_name, datafile, save_dir):
    """Read the pairs file, filter the pairs by length and fill the vocabulary.

    Args:
        corpus: Not used by this function.
        corpus_name: Name passed on to readtext for the vocabulary.
        datafile: Path passed on to readtext.
        save_dir: Not used by this function.

    Returns:
        tuple ``(vecs, pairs)``: the vocabulary populated with the words of
        every kept pair, and the list of pairs that passed filterPairs.
    """
    vecs, all_pairs = readtext(datafile, corpus_name)
    pairs = filterPairs(all_pairs)

    # Feed the question, then the answer, of each remaining pair into the vocabulary.
    for pair in pairs:
        for position in (0, 1):
            vecs.addSentance(pair[position])

    return vecs, pairs

# Root directory under which training checkpoints are saved.
save_dir = os.path.join("data", "save")
# Build the vocabulary and the length-filtered sentence pairs from the formatted pairs file.
voc, pairs = loadAndPrepareData(corpus, name, formatter_movies_lines_file, save_dir)

## Threshold rare words

In [14]:
# Words seen fewer times than this are removed from the vocabulary.
minimum_count=3
def trimRareCountWords(voc,pairs,minimum_count):
    """Trim rare words from ``voc`` and drop every pair that uses one.

    Args:
        voc: Vocabulary exposing ``trim(min_count)`` and ``word_to_index``.
        pairs: Sequence of [question, answer] sentence pairs.
        minimum_count: Threshold handed to ``voc.trim``.

    Returns:
        list of the pairs whose question and answer consist only of words
        still present in ``voc.word_to_index`` after trimming.
    """
    voc.trim(minimum_count)

    def all_known(sentence):
        # True when every space-separated word survived the trim.
        return all(word in voc.word_to_index for word in sentence.split(' '))

    keep_pairs = []
    for pair in pairs:
        question, answer = pair[0], pair[1]
        if all_known(question) and all_known(answer):
            keep_pairs.append(pair)
    return keep_pairs

# Trim the vocabulary and keep only the pairs made entirely of surviving words.
pairs= trimRareCountWords(voc,pairs,minimum_count)

## Preparing data
### Performing Batching

In [15]:
# Size of the small sample batch built at the end of this cell.
batch_size= 5
def indexesFromSentence(voc, question):
    """Convert a sentence into its list of word indices followed by EOS_token.

    Raises:
        KeyError: If a space-separated word is missing from ``voc.word_to_index``.
    """
    indexes = []
    for word in question.split(' '):
        indexes.append(voc.word_to_index[word])
    indexes.append(EOS_token)
    return indexes

def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding short ones with ``fillvalue``.

    Args:
        l: Sequence of index sequences, one per sentence.
        fillvalue: Value used where a sentence is shorter than the longest one.

    Returns:
        list of tuples; tuple ``t`` holds the ``t``-th index of every
        sentence, so the result is indexed by position first, sentence second.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [column for column in columns]

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask marking which entries of ``l`` are not padding.

    Args:
        l: Sequence of token sequences.
        value: Token treated as padding. It is now honoured; previously the
            parameter was ignored and PAD_token was always used, which is
            still the behaviour when ``value`` is left at its default.

    Returns:
        list of lists with the same layout as ``l``: 0 where the token equals
        ``value``, 1 everywhere else.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

def questionsVar(questions,voc):
    """Turn a batch of question sentences into a padded index tensor.

    Returns:
        tuple ``(padVar, lengths)``: a LongTensor built from the output of
        zeroPadding (position first, sentence second) and a tensor with the
        number of indices of each sentence, EOS included.
    """
    indexes_batch = list(map(lambda sentence: indexesFromSentence(voc, sentence), questions))
    lengths = torch.tensor(list(map(len, indexes_batch)))
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths
    
def answerVar(answers, voc):
    """Turn a batch of answer sentences into a padded index tensor plus mask.

    Returns:
        tuple ``(padVar, mask, max_target_len)``: a LongTensor built from the
        output of zeroPadding, a BoolTensor of the same layout built from
        binaryMatrix, and the length of the longest index sequence
        (EOS included).
    """
    indexes_batch = list(map(lambda sentence: indexesFromSentence(voc, sentence), answers))
    max_target_len = max(map(len, indexes_batch))
    padList = zeroPadding(indexes_batch)
    mask = torch.BoolTensor(binaryMatrix(padList))
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

def convertBatches(voc, batch_pairs):
    """Convert a batch of [question, answer] pairs into model-ready tensors.

    The pairs are ordered by decreasing number of words in the question
    before conversion. The ordering is done on a copy, so the caller's
    ``batch_pairs`` is left untouched (it used to be sorted in place).

    Args:
        voc: Vocabulary passed on to questionsVar and answerVar.
        batch_pairs: Iterable of [question, answer] sentence pairs.

    Returns:
        tuple ``(quests, lengths, ans, mask, max_target_len)``: the outputs
        of questionsVar for the questions followed by the outputs of
        answerVar for the answers.
    """
    # sorted() is stable, so ties keep the same order the in-place sort produced.
    ordered = sorted(batch_pairs, key=lambda x: len(x[0].split(" ")), reverse=True)

    questions = [pair[0] for pair in ordered]
    answers = [pair[1] for pair in ordered]

    quests, lengths = questionsVar(questions, voc)
    ans, mask, max_target_len = answerVar(answers, voc)

    return quests, lengths, ans, mask, max_target_len

# Build one sample batch from randomly chosen pairs and unpack its five components.
batches = convertBatches(voc, [random.choice(pairs) for _ in range(batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

## Defining the model

### Encoder

In [16]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded input sequences.

    The GRU's input size is ``hidden_size``, so the embedding layer must
    produce vectors of that size.
    """

    def __init__(self,hidden_size,embeddings,n_layers=1,dropout=0):
        """
        Args:
            hidden_size: Size of the GRU input and hidden state.
            embeddings: Embedding module applied to the input indices.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout between GRU layers; forced to 0 when
                ``n_layers`` is 1.
        """
        super(EncoderRNN,self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings

        self.gru = nn.GRU(input_size = hidden_size,hidden_size = hidden_size,num_layers = n_layers,dropout= (0 if n_layers==1 else dropout),bidirectional = True)

    def forward(self,input_seq,input_lengths,hidden = None):
        """Encode a padded batch.

        Args:
            input_seq: Index tensor laid out (position, sentence), as the GRU
                is not created with batch_first.
            input_lengths: Length of each sequence, as a tensor or a list.
                pack_padded_sequence is called with its defaults, which
                expect the batch ordered by decreasing length.
            hidden: Optional initial hidden state for the GRU.

        Returns:
            tuple ``(outputs, hidden)``: the forward and backward GRU outputs
            summed into a tensor whose last dimension is ``hidden_size``, and
            the final GRU hidden state.
        """
        embedded = self.embeddings(input_seq)
        # pack_padded_sequence requires tensor lengths to live on the CPU, even
        # when the data is on the GPU; callers here move lengths to the device.
        if isinstance(input_lengths, torch.Tensor):
            input_lengths = input_lengths.cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)

        outputs, hidden = self.gru(packed, hidden)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum Outputs of bi-directional GRU

        return outputs, hidden

### Attention Layer

In [17]:
class Attn(nn.Module):
    """Attention layer scoring encoder outputs against a decoder state.

    Supports three scoring methods: 'dot', 'general' and 'concat'.
    """

    def __init__(self,method,hidden_size):
        """
        Args:
            method: One of 'dot', 'general', 'concat'.
            hidden_size: Size of the last dimension of the scored tensors.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        super(Attn,self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialised memory, which can hold
            # arbitrary values (including inf/nan). Allocate explicitly and
            # initialise within +/- 1/sqrt(hidden_size).
            self.v = nn.Parameter(torch.empty(hidden_size))
            bound = max(hidden_size, 1) ** -0.5
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Score by the element-wise product summed over the last dimension."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score like dot_score after a linear map of the encoder output."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from tanh(linear([hidden; encoder_output])) weighted by ``v``."""
        # hidden is expanded along dim 0 to match the encoder output before concatenation.
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the encoder outputs.

        The scores are transposed so dim 0 and dim 1 swap, normalised with
        softmax over dim 1, and given an extra dimension at position 1.
        """
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        attn_energies = attn_energies.t()

        return F.softmax(attn_energies, dim=1).unsqueeze(1)

### Decoder

In [18]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self,attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        """
        Args:
            attn_model: Scoring method name passed to Attn.
            embedding: Embedding module applied to the input token indices.
            hidden_size: Size of the GRU input and hidden state.
            output_size: Number of output classes of the final linear layer.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout on the embeddings and, for more than one layer,
                between GRU layers.
        """
        super(LuongAttnDecoderRNN,self).__init__()

        # Keep the constructor arguments for later inspection.
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self,input_step,last_hidden,encoder_outputs):
        """Run a single decoding step.

        Args:
            input_step: Index tensor holding one token per batch entry.
            last_hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            tuple ``(output, hidden)``: softmax probabilities over the
            ``output_size`` classes for each batch entry, and the new GRU
            hidden state.
        """
        embedded = self.embedding_dropout(self.embedding(input_step))

        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Weighted sum of the encoder outputs using the attention weights.
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = torch.bmm(attn_weights, encoder_outputs.transpose(0, 1)).squeeze(1)

        # Combine the GRU output with the context vector and project to the output classes.
        concat_input = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = self.concat(concat_input).tanh()
        probabilities = F.softmax(self.out(concat_output), dim=1)

        return probabilities, hidden

In [19]:
def maskBatchLLLoss(inp, target,mask):
    """Average negative log-likelihood of the target entries selected by ``mask``.

    Args:
        inp: 2-D tensor of probabilities, one row per batch entry.
        target: Index of the correct column for each row.
        mask: Boolean tensor choosing which rows contribute to the loss.

    Returns:
        tuple ``(loss, n)``: the mean of ``-log(inp[row, target[row]])`` over
        the selected rows, moved to ``device``, and the number of selected
        rows as a Python number.
    """
    total = mask.sum()
    # Probability assigned to the correct class of every row.
    picked = inp.gather(1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -picked.log()

    loss = crossEntropy.masked_select(mask).mean().to(device)

    return loss, total.item()

### Setting up train

In [20]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=max_length):
    """Run one optimisation step on a single batch.

    Args:
        input_variable: Padded question indices for the encoder.
        lengths: Length of every question in the batch.
        target_variable: Padded answer indices, indexed by time step first.
        mask: Boolean mask matching ``target_variable``.
        max_target_len: Number of decoding steps to run.
        encoder, decoder: The models being trained.
        embedding: Not used by this function.
        encoder_optimizer, decoder_optimizer: Optimisers stepped at the end.
        batch_size: Number of sentences in the batch.
        clip: Maximum gradient norm applied to each model.
        max_length: Not used by this function.

    Returns:
        The loss averaged over all non-masked target tokens of the batch.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # pack_padded_sequence (used by the encoder) needs the lengths on the CPU.
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Every sentence starts decoding from the SOS token.
    decoder_input = torch.full((1, batch_size), SOS_token, dtype=torch.long, device=device)

    # Seed the decoder with the first decoder.n_layers slices of the encoder's final hidden state.
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide once per batch whether the decoder is fed the ground truth or its own predictions.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for t in range(max_target_len):
        # Both modes attend over `encoder_outputs`; the non-teacher-forcing
        # branch used to reference an undefined name here.
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)

        if use_teacher_forcing:
            # Next input is the current target token.
            decoder_input = target_variable[t].view(1, -1)
        else:
            # Next input is the decoder's own most likely token.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.reshape(1, -1).to(device)

        mask_loss, nTotal = maskBatchLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    loss.backward()
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_losses) / n_totals
            

In [21]:
def training_iterations(model_name,voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename, checkpoint=None):
    """Train for ``n_iteration`` batches, saving a checkpoint every ``save_every``.

    Args:
        model_name, corpus_name: Used to build the checkpoint directory.
        voc: Vocabulary; its ``__dict__`` is stored in every checkpoint.
        pairs: Sentence pairs that batches are sampled from.
        encoder, decoder, embedding: Models whose state is trained and saved.
        encoder_optimizer, decoder_optimizer: Optimisers passed to train().
        encoder_n_layers, decoder_n_layers: Used in the checkpoint directory name.
        save_dir: Root directory for checkpoints.
        n_iteration: Index of the last iteration to run.
        batch_size: Number of pairs sampled per batch.
        print_every: Interval at which the running loss is averaged and reset.
        save_every: Interval, in iterations, between checkpoints.
        clip: Gradient clipping norm passed to train().
        loadFilename: Checkpoint path when resuming, otherwise a falsy value.
        checkpoint: Optional already-loaded checkpoint dict. When resuming
            without it, the file at ``loadFilename`` is loaded to read the
            iteration to continue from, so the function no longer depends on
            a ``checkpoint`` global.
    """
    # All batches are sampled up front; iteration i uses training_batches[i - 1].
    training_batches= [convertBatches(voc, [random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_iteration)]

    print("Initializing..")

    start_iteration=  1
    print_loss=0

    if loadFilename:
        if checkpoint is None:
            checkpoint = torch.load(loadFilename, map_location="cpu")
        start_iteration= checkpoint['iteration'] + 1

    print("Training")

    # Taken from the encoder instead of a notebook global; used in the checkpoint directory name.
    hidden_size = encoder.hidden_size

    for iteration in range(start_iteration,n_iteration + 1):
        training_batch = training_batches[iteration - 1]

        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)

        print_loss += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every

            # Progress output is currently disabled; uncomment to log the running average.
#             print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))

            os.makedirs(directory, exist_ok=True)

            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

### Defining Evaluation

In [22]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step feed back the single most likely token."""

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for one input sentence.

        Args:
            input_seq: Index tensor for a single sentence.
            input_length: Length tensor passed on to the encoder.
            max_length: Number of decoding steps to run.

        Returns:
            tuple ``(all_tokens, all_scores)``: the chosen token index at each
            step and the decoder output value that token received.
        """
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)

        # Use this module's own decoder (not a notebook-level `decoder` global)
        # to decide how many hidden-state slices to keep.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]

        # Create the working tensors on the same device as the input.
        run_device = input_seq.device
        decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_token
        all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
        all_scores = torch.zeros([0], device=run_device)

        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)

            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)

            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)

            # Restore the leading dimension the decoder expects for its next input.
            decoder_input = torch.unsqueeze(decoder_input, 0)

        return all_tokens, all_scores
        
    

## Evaluate

In [23]:
def evaluate(encoder,decoder,searcher,voc,sentance,max_length= max_length ):
    """Generate a reply for one already-normalised sentence.

    Args:
        encoder, decoder: Not used directly; decoding goes through ``searcher``.
        searcher: Callable taking (input_batch, lengths, max_length) and
            returning (tokens, scores).
        voc: Vocabulary used to index the sentence and to decode the reply.
        sentance: Space-separated input sentence.
        max_length: Number of tokens to decode.

    Returns:
        list of the decoded words, one per generated token.

    Raises:
        KeyError: If a word of ``sentance`` is not in the vocabulary.
    """
    index_batch =[indexesFromSentence(voc,sentance)]

    # Lengths stay on the CPU: pack_padded_sequence in the encoder requires that.
    lengths = torch.tensor([len(indexes) for indexes in index_batch])

    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(index_batch).transpose(0, 1)
    input_batch = input_batch.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)

    decoded_words = [voc.index_to_word[token.item()] for token in tokens]

    return decoded_words


In [24]:
def evaluateInput(encoder,decoder,searcher, voc):
    """Interactive chat loop: read a sentence, print the bot's reply, repeat.

    Typing 'q' or 'quit' ends the loop. When evaluation raises KeyError an
    error message is printed and the loop continues with the next prompt.
    """
    while True:
        input_sentence = input('> ')

        if input_sentence == 'q' or input_sentence == 'quit':
            break

        input_sentence = normalize(input_sentence)

        # Only the evaluation can raise the KeyError we want to report.
        try:
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
        except KeyError:
            print("Error: Encountered unknown word.")
            continue

        # Hide the special tokens from the printed reply.
        output_words = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]

        print('Bot:', ' '.join(output_words))

## Setting hyperparameters for running the model

In [25]:
# Model configuration
model_name= "chatbot"
attention_model = 'general' #"dot" # 'general' 'concat'

hidden_size = 500
encoder_n_layers = 4 #2
decoder_n_layers = 4 #2
dropout = 0.1
batch_size = 64

# Set loadFilename to a checkpoint path to resume from it; None starts from scratch.
loadFilename = None
checkpoint_iter = 4000

#loadFilename = os.path.join(save_dir, model_name, name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


if loadFilename:
    # map_location lets a checkpoint saved on a GPU machine load on whichever device is in use.
    checkpoint = torch.load(loadFilename, map_location=device)

    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']

    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']

print('Building encoder and decoder ...')

# One embedding layer shared by the encoder and the decoder.
embedding = nn.Embedding(voc.word_number, hidden_size)

if loadFilename:
    embedding.load_state_dict(embedding_sd)

encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attention_model, embedding, hidden_size, voc.word_number, decoder_n_layers, dropout)

if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built')

Building encoder and decoder ...
Models built


## Run Training

In [26]:
# Training configuration
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Put both models in training mode.
encoder.train()
decoder.train()

print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any optimizer state tensors to the device the models live on. Using
# `device` instead of an unconditional .cuda() keeps this working on
# CPU-only machines.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

print("Starting Training!")
training_iterations(model_name,voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, name, loadFilename)

Building optimizers ...
Starting Training!
Initializing..
Training


In [27]:
# Switch both models to evaluation mode before chatting.
encoder.eval()
decoder.eval()


# Wrap the models in the greedy decoder that generates replies.
searcher = GreedySearchDecoder(encoder, decoder)

# Start the interactive prompt; enter 'q' or 'quit' to stop.
evaluateInput(encoder, decoder, searcher, voc)

> Hi
Bot: hi . . . .
> What is your name?
Bot: my name . . . .
> Yup
Bot: what ? what ? ! !
> Nothing
Bot: what ? what ? ! !
> How are you?
Bot: i m fine . . .
> Good bye
Bot: good bye . .
> quit
