# Using short QA pairs, we can achieve surprising results on the Cornell Dataset.

## Import train/val sets

In [1]:
from process_cornell import ENCODING
from process_cornell import TRAIN_PATH
from process_cornell import VAL_PATH
from utils import load_data


print('Preparing Data...')

# Read both splits through the same loader, using the encoding and the file
# locations exported by process_cornell. Training is loaded first, then val.
train, val = load_data(ENCODING, TRAIN_PATH), load_data(ENCODING, VAL_PATH)

Preparing Data...


## Create the vocabulary

In [2]:
from collections import Counter
from itertools import chain
from vocab import Vocab


# Upper bound on the number of distinct tokens kept in the vocabulary.
max_vocab_size = 20000

# Count token frequencies straight from the training pairs. Feeding the
# counter pair by pair (question first, then answer) visits tokens in the same
# order as concatenating everything first, without holding a second copy of
# the whole corpus in memory.
counts = Counter()
for question_tokens, answer_tokens in train:
    counts.update(question_tokens)
    counts.update(answer_tokens)

# Only the training split contributes to the vocabulary; tokens are added from
# most to least frequent, capped at max_vocab_size.
vocab = Vocab()
for token, count in counts.most_common(max_vocab_size):
    vocab.add_token(token)

## Convert strings to label encoded sequences

In [3]:
def encode_pairs(pairs, vocab):
    """Label-encode (question, answer) token pairs.

    Each question gets Vocab.EOS_TOKEN appended; each answer is wrapped as
    Vocab.SOS_TOKEN + answer + Vocab.EOS_TOKEN. Both are then passed through
    vocab.label_encode.

    The input token lists are copied rather than modified, so the caller's
    data is left untouched.

    Args:
        pairs: iterable of (question_tokens, answer_tokens).
        vocab: object providing label_encode(tokens).

    Returns:
        A new list of (encoded_question, encoded_answer) tuples, in input order.
    """
    encoded = []
    for question, answer in pairs:
        question_tokens = list(question) + [Vocab.EOS_TOKEN]
        answer_tokens = [Vocab.SOS_TOKEN] + list(answer) + [Vocab.EOS_TOKEN]
        encoded.append((vocab.label_encode(question_tokens),
                        vocab.label_encode(answer_tokens)))
    return encoded


# NOTE: `train` and `val` are rebound from token sequences to encoded
# sequences, so this cell must run exactly once after the data-loading cell.
train = encode_pairs(train, vocab)
val = encode_pairs(val, vocab)

## Train Model

In [57]:
import time
import random

import numpy as np
import torch.cuda
from torch.optim import Adam

from models import NCM
from vocab import Vocab
from train import get_loss


SEED = 42

# Seed every RNG this cell can draw from, not only CUDA: `random` picks the
# batches below, and the model is constructed before it is moved to the GPU,
# so its initial weights presumably come from the CPU generator.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

print_iters = 100   # report (and consider checkpointing) every this many iterations
iters = 15000       # total number of optimisation steps

batch_size = 64
hidden_size = 256
embedding_size = 32
num_layers = 2


start_time = time.time()

model = NCM(len(vocab), embedding_size, hidden_size, num_layers).cuda()
optimizer = Adam(model.parameters())
# Snapshot of the untrained weights, used later as the "before training" model.
torch.save(model.state_dict(), 'chat.init')

# One entry per iteration; consumed by the plotting cell.
train_losses = []
val_losses = []

# Lowest windowed validation loss seen so far; drives the 'chat.min' snapshot.
best_val_loss = float('inf')

iter_start_time = time.time()
for i in range(1, iters + 1):
    # Batches are sampled with replacement from the encoded pairs.
    train_batch = [random.choice(train) for _ in range(batch_size)]
    val_batch = [random.choice(val) for _ in range(batch_size)]

    train_loss = get_loss(model, train_batch)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Validation loss is measured on the weights just updated above.
    val_loss = get_loss(model, val_batch, inference_only=True)

    # NOTE(review): `.data[0]` is the scalar accessor of older PyTorch
    # releases; newer ones use `.item()` -- confirm against the installed version.
    train_losses.append(train_loss.data[0])
    val_losses.append(val_loss.data[0])

    if i % print_iters == 0:
        iter_end_time = time.time()

        # Mean over the most recent window of `print_iters` iterations.
        avg_train_loss = sum(train_losses[-print_iters:]) / print_iters
        avg_val_loss = sum(val_losses[-print_iters:]) / print_iters

        # Whole passes over the training set so far (floor division).
        epoch = (batch_size * i) // len(train)

        string = 'epoch: {}, iters: {}, train loss: {:.2f}, val loss: {:.2f}, time: {:.2f} s'
        print(string.format(epoch, i, avg_train_loss, avg_val_loss, iter_end_time - iter_start_time))

        # Keep the weights from the window with the lowest validation loss,
        # rather than relying on an iteration number read off an earlier run.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'chat.min')

        # Reset after checkpointing so saving is not counted in the next window.
        iter_start_time = time.time()

torch.save(model.state_dict(), 'chat.final')

end_time = time.time()
seconds_per_hour = 60.**2
print('\nTotal time: {:.2f} hours\n'.format((end_time - start_time) / seconds_per_hour))

epoch: 0, iters: 100, train loss: 4.81, val loss: 4.81, time: 4.87 s
epoch: 1, iters: 200, train loss: 3.97, val loss: 4.11, time: 4.77 s
epoch: 1, iters: 300, train loss: 3.78, val loss: 3.93, time: 4.73 s
epoch: 2, iters: 400, train loss: 3.67, val loss: 3.87, time: 4.82 s
epoch: 2, iters: 500, train loss: 3.54, val loss: 3.81, time: 4.73 s
epoch: 3, iters: 600, train loss: 3.49, val loss: 3.78, time: 4.68 s
epoch: 4, iters: 700, train loss: 3.40, val loss: 3.74, time: 4.76 s
epoch: 4, iters: 800, train loss: 3.34, val loss: 3.66, time: 4.74 s
epoch: 5, iters: 900, train loss: 3.27, val loss: 3.68, time: 4.77 s
epoch: 5, iters: 1000, train loss: 3.22, val loss: 3.70, time: 4.77 s
epoch: 6, iters: 1100, train loss: 3.17, val loss: 3.70, time: 5.11 s
epoch: 7, iters: 1200, train loss: 3.12, val loss: 3.72, time: 4.76 s
epoch: 7, iters: 1300, train loss: 3.06, val loss: 3.65, time: 4.76 s
epoch: 8, iters: 1400, train loss: 3.03, val loss: 3.63, time: 4.77 s
epoch: 8, iters: 1500, train 

epoch: 68, iters: 11700, train loss: 0.24, val loss: 6.23, time: 4.56 s
epoch: 69, iters: 11800, train loss: 0.23, val loss: 6.20, time: 4.49 s
epoch: 69, iters: 11900, train loss: 0.23, val loss: 6.25, time: 4.41 s
epoch: 70, iters: 12000, train loss: 0.23, val loss: 6.26, time: 4.48 s
epoch: 70, iters: 12100, train loss: 0.23, val loss: 6.34, time: 4.54 s
epoch: 71, iters: 12200, train loss: 0.24, val loss: 6.38, time: 4.41 s
epoch: 71, iters: 12300, train loss: 0.23, val loss: 6.38, time: 4.45 s
epoch: 72, iters: 12400, train loss: 0.23, val loss: 6.31, time: 4.40 s
epoch: 73, iters: 12500, train loss: 0.24, val loss: 6.42, time: 4.40 s
epoch: 73, iters: 12600, train loss: 0.28, val loss: 6.28, time: 4.44 s
epoch: 74, iters: 12700, train loss: 0.28, val loss: 6.38, time: 4.52 s
epoch: 74, iters: 12800, train loss: 0.26, val loss: 6.31, time: 4.41 s
epoch: 75, iters: 12900, train loss: 0.24, val loss: 6.48, time: 4.45 s
epoch: 76, iters: 13000, train loss: 0.23, val loss: 6.42, time:

## Plot losses

In [58]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()

# One x position per recorded loss, numbered from 1 like the training loop.
# Using the recorded length (rather than the configured `iters`) keeps the
# plot valid if training was stopped early. Both lists are appended to
# together in the training loop, so they share the same x positions.
steps = list(range(1, len(train_losses) + 1))

# The plotted values are the per-iteration losses collected during training.
p = figure(title='Train vs. val loss', y_axis_label='loss', x_axis_label='iters')
p.line(steps, train_losses, legend='train')
p.line(steps, val_losses, legend='val', color='orange')
show(p)

## Let's look at the chat results at 3 different points in the training: before training, at the val loss minimum and at the end of training.

**Note:** Each question has 3 answers, one from each stage of training. The first, second and third answers are from before training, at the val loss minimum and at the end of training, respectively.

## Wait, why do you care...

Perplexity is a rather simple/poor measure for this sort of task; just because a given answer doesn't match an expected answer word for word doesn't mean it isn't a good answer.

That being said, maybe the model's responses on the validation set do become better despite an increasing val loss; I wanted to find out.

In [56]:
from process_cornell import tokenize
from utils import chat

questions = ['Can you say no?', 'What is your name?', 'Where are we going?', 'What color is the sky?', 
            'How tall are you?', '2 plus 2 is', 'Will google hire me?', 'Tell me the alphabet', 
            'What is your gender?', 'What is love?', 'Tell me a joke.', 'What day is it?', 'Who are you?', 
            'How do you feel?', 'Are you tired?', 'Are you hungry?', 'Are you strong?']

# Checkpoints written by the training cell, in the order the answers are
# printed: before training, at the val loss minimum, at the end of training.
mdl_paths = ['chat.init', 'chat.min', 'chat.final']


def load_checkpoint(path):
    """Build an NCM with the training hyperparameters and load weights from `path`."""
    model = NCM(len(vocab), embedding_size, hidden_size, num_layers).cuda()
    model.load_state_dict(torch.load(path))
    return model


# Load each checkpoint once instead of once per question.
# NOTE(review): assumes chat() leaves no state on the model between calls -- confirm.
models = [load_checkpoint(path) for path in mdl_paths]

for q in questions:
    print('Q: {}'.format(q))
    for model in models:
        a = chat(q, tokenize, model, vocab).split()
        a = ' '.join(a[1:len(a) - 1])  # rm start & stop token
        print('A: {}'.format(a.encode('utf-8')))
    print('\n')

Q: Can you say no?
A: extra what- what- scars scars scars feature reputation feature reputation feature feature scars scars feature reputation feature reputation feature feature lady scars feature
A: no .
A: what ?


Q: What is your name?
A: cecil sutphin dead assignment scars scars reputation feature reputation feature reputation feature feature scars scars feature reputation feature reputation feature feature lady scars
A: i do n't know .
A: bedevere , my liege .


Q: Where are we going?
A: kansas kansas scars scars scars scars reputation feature reputation feature feature scars reputation feature feature reputation feature lady feature scars scars feature reputation
A: i do n't know .
A: nowhere .


Q: What color is the sky?
A: cecil sutphin assignment scars scars scars reputation feature reputation feature feature reputation feature lady feature scars scars feature reputation feature reputation feature feature
A: i do n't know .
A: nothing .


Q: How tall are you?
A: in scars scars

# Results

The results from the untrained model are clearly gibberish as expected.
Looking at the other two checkpoints, the responses seem to be roughly equally sensible. However, the responses at the end of training tend to be more specific and interesting. On the other hand, responses from the model at the min val loss tend to be evasive and limited.


# Where to go from here?

The next step would be to move to the OpenSubtitles dataset. This has two benefits:
  1. Larger/richer dataset
  2. Used in the original paper

# Just for fun, here is a particularly quirky model I came across.

In [47]:
# This checkpoint is not written by any cell in this notebook; it must already
# exist on disk and match the hyperparameters passed to NCM below.
path = 'chat_6174_32_256_2.mdl'

# Build and load the model once, rather than once per question.
# NOTE(review): assumes chat() leaves no state on the model between calls -- confirm.
model = NCM(len(vocab), embedding_size, hidden_size, num_layers).cuda()
model.load_state_dict(torch.load(path))

for q in questions:
    a = chat(q, tokenize, model, vocab).split()
    a = ' '.join(a[1:len(a) - 1])  # rm start & stop token
    print('Q: {}\nA: {}\n'.format(q, a.encode('utf-8')))

Q: Can you say no?
A: no .

Q: What is your name?
A: bedevere , sir .

Q: Where are we going?
A: to take a picture .

Q: What color is the sky?
A: i 'm not finished .

Q: How tall are you?
A: not good .

Q: 2 plus 2 is
A: ?

Q: Will google hire me?
A: for a second ?

Q: Tell me the alphabet
A: tell much .

Q: What is your gender?
A: a funeral .

Q: What is love?
A: for .

Q: Tell me a joke.
A: tell me .

Q: What day is it?
A: it .

Q: Who are you?
A: i 'm the creator .

Q: How do you feel?
A: i 'm fine .

Q: Are you tired?
A: what ?

Q: Are you hungry?
A: what ?

Q: Are you strong?
A: yes .

