# Deep Learning CS6073 Assignment 8

    By Akhil Kanna Devarashetti
    
Question:

    This programming assignment is based on https://github.com/pytorch/examples/tree/master/word_language_model
    But we will only run the Transformer.
    Download train.txt, valid.txt, and test.txt to ./data/wikitext-2/.
    You may need to run python main.py with specification of the selection of Transformer, or python main2.py, 
    which along with model2.py, is a simplified version only for the Transformer and with a few epochs.
    We need data.py to start
    and generate.py to show the learning result.
    Show that you indeed have spent time in studying and running the programs.

In [1]:
import math
import torch
import torch.nn as nn
import data
import model2

In [2]:
bptt = 20
loginterval = 200

In [7]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data


def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target

In [6]:
batch_size = 20
eval_batch_size = 10

corpus = data.Corpus('./data/wikitext-2')
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [8]:
emsize = 100
nhead = 2
nhid = 64
nlayers = 2
dropout = 0.2
ntokens = len(corpus.dictionary)

model = model2.TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)
criterion = nn.NLLLoss()

In [9]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [10]:
def train():
    model.train()
    total_loss = 0.
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        total_loss += loss.item()

        if batch % loginterval == 0 and batch > 0:
            cur_loss = total_loss / loginterval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                cur_loss, math.exp(cur_loss)))
            total_loss = 0

In [11]:

# Loop over epochs.
lr = 20
best_val_loss = None
epochs = 5
# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs + 1):
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open('model.pt', 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')


| epoch   1 |   200/ 5221 batches | lr 20.00 | loss 10.67 | ppl 42922.85
| epoch   1 |   400/ 5221 batches | lr 20.00 | loss 10.61 | ppl 40346.76
| epoch   1 |   600/ 5221 batches | lr 20.00 | loss 10.62 | ppl 40875.39
| epoch   1 |   800/ 5221 batches | lr 20.00 | loss 10.60 | ppl 40327.92
| epoch   1 |  1000/ 5221 batches | lr 20.00 | loss 10.61 | ppl 40474.58
| epoch   1 |  1200/ 5221 batches | lr 20.00 | loss 10.60 | ppl 40287.29
| epoch   1 |  1400/ 5221 batches | lr 20.00 | loss 10.61 | ppl 40642.39
| epoch   1 |  1600/ 5221 batches | lr 20.00 | loss 10.61 | ppl 40576.35
| epoch   1 |  1800/ 5221 batches | lr 20.00 | loss 10.61 | ppl 40575.26
| epoch   1 |  2000/ 5221 batches | lr 20.00 | loss 10.62 | ppl 40744.91
| epoch   1 |  2200/ 5221 batches | lr 20.00 | loss 10.61 | ppl 40584.58
| epoch   1 |  2400/ 5221 batches | lr 20.00 | loss 10.61 | ppl 40543.02
| epoch   1 |  2600/ 5221 batches | lr 20.00 | loss 10.60 | ppl 40302.77
| epoch   1 |  2800/ 5221 batches | lr 20.00 | loss

In [None]:
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)