In [1]:
import json 
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import pandas as pd
import re
import unicodedata
import itertools
import random
from model import EncoderRNN,LuongAttnDecoderRNN 
from decoder import GreedySearchDecoder,BeamSearchDecoder
from train import train, trainIters 
from evals import evaluateInput
import config
from corpus import Corpus

In [2]:
# Path handed to Corpus as its second argument
# (presumably the directory with the wiki documents — confirm against corpus.py).
wiki_path = '../WikiData'

# Name of the conversation set; it is also used as the sub-directory under ../Conversations.
corpus_name = 'seq+att'
conv_path = os.path.join('../Conversations',corpus_name)
# Build the Corpus object from the conversation path and the wiki path.
# NOTE(review): later cells read `voc`, `pairs` and `wiki_strings`, which are not
# assigned here; presumably they are unpacked from `dataCor` in a cell not shown — confirm.
dataCor = Corpus(conv_path,wiki_path)

Starting preparing training data...
Read 72922 sentence pairs
Trimmed to 21265 sentence pairs
Counting words...
Counted words: 10178
keep_words 7675 / 10174 = 0.7544

pairs:
[5, 0, 'hey have you seen the inception ? ', 'no i have not but have heard of it . what is it about']
[5, 0, 'hey have you seen the inception ? no i have not but have heard of it . what is it about ', 'it s about extractors that perform experiments using military technology o n people to retrieve info about their targets .']
[5, 0, 'hey have you seen the inception ? no i have not but have heard of it . what is it about it s about extractors that perform experiments using military technology o n people to retrieve info about their targets . ', 'sounds interesting do you know which actors are in it ?']
[5, 0, 'hey have you seen the inception ? no i have not but have heard of it . what is it about it s about extractors that perform experiments using military technology o n people to retrieve info about their targets .

In [8]:
# Build the two encoders and the attention decoder, train them with
# trainIters, then put the models into eval mode.
#
# NOTE(review): `voc`, `pairs`, `wiki_strings` and `save_dir` are not assigned
# in this cell; they must already exist in the kernel — confirm the notebook
# survives Restart Kernel -> Run All.

#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Checkpoint restore is currently disabled; the commented code below is kept
# as the reference for the checkpoint keys.
# Load model if a loadFilename is provided
# if config.loadFilename:
#     # If loading on same machine the model was trained on
#     checkpoint = torch.load(config.loadFilename)
#     # If loading a model trained on GPU to CPU
#     #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
#     encoder_sd = checkpoint['en']
#     sec_encoder_sd = checkpoint['sec_en']
#     decoder_sd = checkpoint['de']
#     encoder_optimizer_sd = checkpoint['en_opt']
#     sec_encoder_optimizer_sd = checkpoint['sec_en_opt']
#     decoder_optimizer_sd = checkpoint['de_opt']
#     embedding_sd = checkpoint['embedding']
#     voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings; the same embedding module is passed to both
# encoders and to the decoder.
embedding = nn.Embedding(voc.num_words, config.embedding_size)
# if config.loadFilename:
#     embedding.load_state_dict(embedding_sd)

# Initialize encoder & decoder models
encoder = EncoderRNN(config.embedding_size, config.hidden_size, embedding,
                     config.encoder_n_layers, config.dropout)
sec_encoder = EncoderRNN(config.embedding_size, config.hidden_size, embedding,
                         config.encoder_n_layers, config.dropout)
decoder = LuongAttnDecoderRNN(config.attn_model, embedding, config.embedding_size,
                              config.encoder_n_layers, config.hidden_size,
                              voc.num_words, config.decoder_n_layers, config.dropout)

# if config.loadFilename:
#     encoder.load_state_dict(encoder_sd)
#     sec_encoder.load_state_dict(sec_encoder_sd)
#     decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(config.device)
sec_encoder = sec_encoder.to(config.device)
decoder = decoder.to(config.device)
print('Models built and ready to go!')


# Ensure dropout layers are in train mode
encoder.train()
sec_encoder.train()
decoder.train()

# Initialize optimizers (one Adam per model; the decoder's learning rate is
# scaled by config.decoder_learning_ratio).
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=config.learning_rate)
sec_encoder_optimizer = optim.Adam(sec_encoder.parameters(), lr=config.learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=config.learning_rate * config.decoder_learning_ratio)
# if config.loadFilename:
#     encoder_optimizer.load_state_dict(encoder_optimizer_sd)
#     sec_encoder_optimizer.load_state_dict(sec_encoder_optimizer_sd)
#     decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")
trainIters(voc, pairs, wiki_strings, encoder, sec_encoder, decoder,
           encoder_optimizer, sec_encoder_optimizer, decoder_optimizer,
           embedding, save_dir)

# The optimizers are intentionally NOT rebuilt after training: doing so would
# rebind the names to fresh Adam instances and discard the state accumulated
# during trainIters.

# Set dropout layers to eval mode
encoder.eval()
sec_encoder.eval()
decoder.eval()


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 100; Percent complete: 0.3%; Average loss: 6.2517
Iteration: 200; Percent complete: 0.7%; Average loss: 5.4831
Iteration: 300; Percent complete: 1.0%; Average loss: 5.3722
Iteration: 400; Percent complete: 1.3%; Average loss: 5.3216
Iteration: 500; Percent complete: 1.7%; Average loss: 5.2460
Iteration: 600; Percent complete: 2.0%; Average loss: 5.2104
Iteration: 700; Percent complete: 2.3%; Average loss: 5.1505
Iteration: 800; Percent complete: 2.7%; Average loss: 5.0602
Iteration: 900; Percent complete: 3.0%; Average loss: 5.0556
Iteration: 1000; Percent complete: 3.3%; Average loss: 4.9946
Iteration: 1100; Percent complete: 3.7%; Average loss: 4.8683
Iteration: 1200; Percent complete: 4.0%; Average loss: 4.8578
Iteration: 1300; Percent complete: 4.3%; Average loss: 4.8109
Iteration: 1400; Percent complete: 4.7%; Average loss: 4.8205
Iterat

Iteration: 13000; Percent complete: 43.3%; Average loss: 3.5369
Iteration: 13100; Percent complete: 43.7%; Average loss: 3.5020
Iteration: 13200; Percent complete: 44.0%; Average loss: 3.5995
Iteration: 13300; Percent complete: 44.3%; Average loss: 3.5778
Iteration: 13400; Percent complete: 44.7%; Average loss: 3.4643
Iteration: 13500; Percent complete: 45.0%; Average loss: 3.4766
Iteration: 13600; Percent complete: 45.3%; Average loss: 3.4846
Iteration: 13700; Percent complete: 45.7%; Average loss: 3.3843
Iteration: 13800; Percent complete: 46.0%; Average loss: 3.4807
Iteration: 13900; Percent complete: 46.3%; Average loss: 3.4453
Iteration: 14000; Percent complete: 46.7%; Average loss: 3.3627
Iteration: 14100; Percent complete: 47.0%; Average loss: 3.6278
Iteration: 14200; Percent complete: 47.3%; Average loss: 3.2918
Iteration: 14300; Percent complete: 47.7%; Average loss: 3.3650
Iteration: 14400; Percent complete: 48.0%; Average loss: 3.3241
Iteration: 14500; Percent complete: 48.3

Iteration: 25900; Percent complete: 86.3%; Average loss: 2.9932
Iteration: 26000; Percent complete: 86.7%; Average loss: 2.9506
Iteration: 26100; Percent complete: 87.0%; Average loss: 2.8324
Iteration: 26200; Percent complete: 87.3%; Average loss: 2.6330
Iteration: 26300; Percent complete: 87.7%; Average loss: 3.0212
Iteration: 26400; Percent complete: 88.0%; Average loss: 2.9558
Iteration: 26500; Percent complete: 88.3%; Average loss: 2.9124
Iteration: 26600; Percent complete: 88.7%; Average loss: 3.0091
Iteration: 26700; Percent complete: 89.0%; Average loss: 2.7433
Iteration: 26800; Percent complete: 89.3%; Average loss: 2.9448
Iteration: 26900; Percent complete: 89.7%; Average loss: 2.9430
Iteration: 27000; Percent complete: 90.0%; Average loss: 3.1407
Iteration: 27100; Percent complete: 90.3%; Average loss: 3.0311
Iteration: 27200; Percent complete: 90.7%; Average loss: 2.9282
Iteration: 27300; Percent complete: 91.0%; Average loss: 3.0451
Iteration: 27400; Percent complete: 91.3

LuongAttnDecoderRNN(
  (embedding): Embedding(7679, 100)
  (embedding_dropout): Dropout(p=0.3)
  (gru): GRU(400, 300)
  (concat): Linear(in_features=600, out_features=300, bias=True)
  (concatsec): Linear(in_features=1200, out_features=300, bias=True)
  (out): Linear(in_features=300, out_features=7679, bias=True)
  (attn): Attn()
)

In [None]:
# Call evaluateInput (imported from evals) with the trained models.
# NOTE(review): in the cells visible here, `searcher` is only assigned in a later
# cell, and `voc` / `wiki_strings` are not assigned in any cell shown above, so
# this cell relies on prior kernel state — confirm the intended execution order.
evaluateInput(encoder, sec_encoder, decoder, searcher, voc,wiki_strings)

In [11]:
from nltk.translate.bleu_score import sentence_bleu
from data_utils import normalizeString,indexesFromSentence

# def batch_bleu(ref_batch, candi_batch):
#     zip_batch = zip(ref_batch, candi_batch)
#     scores = [sentence_bleu([ref.split()],cand.split()) for ref,cand in zip_batch]
#     avg_score = sum(scores)/len(scores)
#     return avg_score

def evaluate(encoder, sec_encoder, decoder, searcher, voc, sentence, wikiSec):
    """Decode a response for one input sentence and one wiki section.

    Args:
        encoder, sec_encoder, decoder: not used directly in this function;
            decoding goes through ``searcher``. Kept for call compatibility.
        searcher: callable invoked as
            ``searcher(input_batch, lengths, sec_batch, sec_lengths, max_length)``
            and expected to return a ``(tokens, scores)`` pair.
        voc: vocabulary object; ``voc.index2word`` maps token ids to words and
            ``voc`` is forwarded to ``indexesFromSentence``.
        sentence: input sentence string.
        wikiSec: wiki section string encoded alongside the sentence.

    Returns:
        List of decoded words with EOS and PAD tokens removed.
    """
    max_length = config.MAX_LENGTH

    # words -> indexes, wrapped in a list so each input forms a batch of one
    indexes_batch = [indexesFromSentence(voc, sentence)]  # (1, L)
    sec_indexes = [indexesFromSentence(voc, wikiSec)]

    # Length of every sequence in the (single-element) batches
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])  # (1,)
    sec_lengths = torch.tensor([len(indexes) for indexes in sec_indexes])

    # Transpose dimensions of batch to match models' expectations: (1, L) -> (L, 1)
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    sec_batch = torch.LongTensor(sec_indexes).transpose(0, 1)

    # Use appropriate device
    input_batch = input_batch.to(config.device)
    sec_batch = sec_batch.to(config.device)
    lengths = lengths.to(config.device)
    sec_lengths = sec_lengths.to(config.device)

    # Decoding is inference only: disable autograd so no graph is recorded
    # for every decoding step. The per-token scores are not needed here.
    with torch.no_grad():
        tokens, _ = searcher(input_batch, lengths, sec_batch, sec_lengths, max_length)

    # indexes -> words, skipping the EOS / PAD markers
    decoded_words = [
        voc.index2word[token]
        for token in tokens
        if not (token == config.EOS_token or token == config.PAD_token)
    ]
    return decoded_words

def batch_bleu(encoder,sec_encoder,decoder,searcher,voc, wiki_strings, dev_batch, print_threshold=0.3):
    """Average sentence-level BLEU of decoded responses over a batch.

    Args:
        encoder, sec_encoder, decoder, searcher, voc: forwarded unchanged to
            ``evaluate``.
        wiki_strings: indexed as ``wiki_strings[doc_idx][sec_idx]`` to pick the
            wiki section passed to ``evaluate``.
        dev_batch: iterable of entries where ``conv[0]`` / ``conv[1]`` are the
            document and section indices, ``conv[2]`` is the input sentence and
            ``conv[3]`` is the reference response.
        print_threshold: candidate/reference pairs scoring strictly above this
            value are printed; pass ``None`` to print nothing. Defaults to 0.3.

    Returns:
        Mean of the per-entry ``sentence_bleu`` scores, or 0.0 when
        ``dev_batch`` is empty.
    """
    scores = []
    for conv in dev_batch:
        doc_idx = conv[0]
        sec_idx = conv[1]
        candidate_sentence = evaluate(encoder, sec_encoder, decoder, searcher, voc,
                                      conv[2], wiki_strings[doc_idx][sec_idx])
        # The reference is whitespace-tokenised; the candidate is already a word list.
        refs = conv[3].split()
        s = sentence_bleu([refs], candidate_sentence)
        if print_threshold is not None and s > print_threshold:
            print(' '.join(candidate_sentence))
            print(' '.join(refs))
        scores.append(s)
    # Guard against an empty batch instead of dividing by zero.
    if not scores:
        return 0.0
    return sum(scores)/len(scores)
# Initialize search module (beam search is active; the greedy alternative is kept for reference)
# searcher = GreedySearchDecoder(encoder, sec_encoder, decoder)
searcher = BeamSearchDecoder(encoder,sec_encoder,decoder)
# Average sentence-level BLEU over the first 1000 entries of `pairs`.
# NOTE(review): `pairs` is the same list that was passed to trainIters, so this
# presumably scores data the model was trained on — confirm whether a held-out split exists.
res = batch_bleu(encoder,sec_encoder,decoder,searcher,voc,wiki_strings,pairs[:1000])
print(res)

have you seen this movie ?
have you seen this movie ?
i am great ! how are you ?
i am good how are you ?
i thought it was a very underrated movie . i thought it was very good .
i think it was a very underrated movie . i think the critic scores given were very low .
no i have not .
i have not . what is it about ?
it is a biographical movie about leonardo dicaprio who are basically in their dream world where they can bend reality
it is a really trippy movie with leonardo dicaprio . they are basically in a dream world where they can bend reality
can you tell me the name of the movie ?
can you tell me more about the movie ?
no i have not
no i have not
it has a rotten tomatoes score of 6 . 8 10 . 8 10 .
i highly recommend it it has a rotten tomatoes score of 6 . 8 10 .
it s about the antics of jordan belfort .
it s about the antics of jordan belfort .
who is the main actor ?
who is the main character
what did you think about the rotten tomatoes score ?
it was ! what did you think about the 