<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/neural_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Language Modeling

In [1]:
import os
import sys
sys.path.append('utils/')
import loading_text_and_tokenization
import torch
import numpy as np
import torch.nn as nn
import random
import math

import utils.ngram_utils as ngram_utils
from utils.ngram_utils import NgramLM
from utils.amazon_dataset import AmazonDataset, pad, batchify
from torch.utils.data import DataLoader
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
import utils.global_variables as gl
import torch
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook

In [2]:
torch.manual_seed(1)


<torch._C.Generator at 0x7fef081067b0>

In [3]:
use_cuda = True
device = torch.device("cuda" if (torch.cuda.is_available() and use_cuda) else "cpu")


In [4]:
# Read data from .txt files and create lists of reviews
train_data = []
# create a list of all the reviews 
with open('../data/train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
    
valid_data = []
# create a list of all the reviews 
with open('../data/valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
    

In [5]:
len(train_data), len(valid_data)

(222919, 27867)

In [6]:
train_data[0], valid_data[0]
train_data = train_data#[:100]
valid_data = valid_data#[:10]
train_data[0], type(train_data), len(train_data), type(train_data[0])

("this is a great tutu and at a really great price . it doesn ' t look cheap at all . i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly . a + + ",
 list,
 222919,
 str)

In [7]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
train_data_tokenized[0], all_tokens_train[0]

(['this',
  'is',
  'a',
  'great',
  'tutu',
  'and',
  'at',
  'a',
  'really',
  'great',
  'price',
  '.'],
 'this')

In [9]:
N = 5

In [10]:
train_data_padded = ngram_utils.pad_dataset(train_data_tokenized, n=N)
valid_data_padded = ngram_utils.pad_dataset(valid_data_tokenized, n=N)

In [11]:
train_data_padded[0]

['<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 'this',
 'is',
 'a',
 'great',
 'tutu',
 'and',
 'at',
 'a',
 'really',
 'great',
 'price',
 '.',
 '<eos>',
 '<eos>',
 '<eos>',
 '<eos>']

In [12]:
vocab = ngram_utils.get_vocab(train_data_padded)
vocab_size = len(vocab)
vocab_size, vocab[:10]

(63993, ('<sos>', '<eos>', '.', 'the', 'i', ',', 'and', 'a', 'it', 'to'))

In [13]:
id2token, token2id = ngram_utils.get_dict(vocab)
len(id2token), len(token2id)

(63996, 63994)

In [14]:
train_data_ids = ngram_utils.get_ids(train_data_padded, token2id)
valid_data_ids = ngram_utils.get_ids(valid_data_padded, token2id)

In [15]:
train_dataset = AmazonDataset(train_data_ids, max_inp_length=None, use_cuda=True)
train_dataset_ngrams = []
for t in train_dataset:
    for i in range(len(t) - N):
        train_dataset_ngrams.append((t[i:i + N], t[i + N]))
train_loader = DataLoader(train_dataset_ngrams, batch_size=16384, collate_fn=batchify, shuffle=True)

100%|██████████| 1069724/1069724 [00:30<00:00, 34828.13it/s]


In [16]:
valid_dataset = AmazonDataset(valid_data_ids, max_inp_length=None, use_cuda=True)
valid_dataset_ngrams = []
for t in valid_dataset:
    for i in range(len(t) - N):
        valid_dataset_ngrams.append((t[i:i + N], t[i + N]))
valid_loader = DataLoader(valid_dataset_ngrams, batch_size=16384, collate_fn=batchify, shuffle=True)


100%|██████████| 124185/124185 [00:02<00:00, 44802.73it/s]


In [17]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7fee3b38b668>

In [18]:
num_train = len(train_dataset_ngrams)
num_valid = len(valid_dataset_ngrams)
num_train, num_valid

(19223314, 2162309)

In [None]:
encoder = BagOfNGrams(len(id2token), emb_dim=300, hidden_size=256, out_size=128, activation='ReLU', nlayers=2, reduce='mean', dropout=0.1, batch_norm=False)
encoder

BagOfNGrams(
  (embedding): EmbeddingBag(63996, 300, mode=mean)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1)
    (3): Linear(in_features=256, out_features=128, bias=True)
  )
)

In [None]:
decoder = DecoderMLP(input_size=128, output_size=len(id2token), hidden_size=256)
decoder

DecoderMLP(
  (linear): Linear(in_features=128, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=63996, bias=True)
  (log_softmax): LogSoftmax()
)

In [None]:
model = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
model

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(63996, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=128, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=63996, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

## Training

In [None]:
num_epochs = 10
log_interval = 10
best_eval_loss = np.inf

for epoch in range(num_epochs):
    # Train
    train_loss = 0   
    cur_loss = 0
    for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
        prediction, loss = model.train_step(data, labels)
        train_loss += len(data) * loss
        cur_loss += loss
        
        if i % log_interval == 0 and i > 0:
            cur_loss = cur_loss / log_interval
            print('| Epoch {:3d} | Train Loss {:5.2f} | Train PPL {:8.2f} | {:5d}/{:5d} Batches'.format(
                epoch, cur_loss, math.exp(cur_loss), i, int(num_train/len(data))))
            cur_loss = 0
    
    train_loss = train_loss / num_train
    print('| Epoch {:3d} | Train Loss {:5.2f} | Train PPL {:8.2f}'.format(
            epoch, train_loss, math.exp(train_loss)))
    

    # Eval
    if epoch % 1 == 0:        
        eval_loss = 0
        for i, (data, labels) in _tqdm(enumerate(valid_loader), disable=True):
            prediction, loss = model.eval_step(data, labels, eval_mode=True)
            eval_loss += len(data) * loss
        eval_loss = eval_loss / num_valid 
        print('-' * 89)
        print('| Epoch {:3d} | Valid Loss {:5.2f} | Valid PPL {:8.2f}'.format(
            epoch, eval_loss, math.exp(eval_loss)))
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        if not best_eval_loss or eval_loss < best_eval_loss:
            with open('neural_lm_amazon_model_large' + '.pt', 'wb') as f:
                torch.save(model, f)
            best_eval_loss = eval_loss        


| Epoch   0 | Train Loss 10.89 | Train PPL 53665.67 |    10/ 1173 Batches
| Epoch   0 | Train Loss  8.48 | Train PPL  4813.20 |    20/ 1173 Batches


In [None]:
### Load the Pretrained Model

In [None]:
path_to_model = 'neural_lm_amazon_model_large.pt'

pretrained_model = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
pretrained_model.load_state_dict(torch.load(path_to_model))
pretrained_model.eval()


In [None]:
def score_sentence(sent):
    tokenized, _ = ngram_utils.tokenize_dataset(sent)
    sent_ids = ngram_utils.get_ids(tokenized, token2id)
    sent_tensor = torch.LongTensor(sent_ids).to(device)
    generated, scores = model.eval_step(sent_tensor, score_only=True)
    ppl = math.exp(scores)
    return ppl

In [None]:
sent = ['this', 'is', 'a', 'great', 'tutu', 'and', 'at', 'a', 'really', 'great', 'price', '.']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['this is a great tutu']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['this is a really great price']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['this is a great tutu']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['i have an older coat']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['i like pandas']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['i like the color of this coat']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['my old watch was not']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['i do not like this tutu']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['i will request a refund']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['i will return this watch']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['i do not understand']
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['this is a very cool watch .']
type(sent), len(sent), type(sent[0]), sent[0]
ppl = score_sentence(sent)
ppl

In [None]:
sent = ['blah watch blah what']
type(sent), len(sent), type(sent[0]), sent[0]
ppl = score_sentence(sent)
ppl

In [None]:
valid_data[:10]

In [None]:
def generate_sentence(context=None):
    if context is None:
        dummy_context = torch.LongTensor([[0]]).to(device)
        generated, scores = model.eval_step(dummy_context, use_context=False)
    else:
        tokenized, _ = ngram_utils.tokenize_dataset(context)
        context_ids = ngram_utils.get_ids(tokenized, token2id)
        context_tensor = torch.LongTensor(context_ids).to(device)
        generated, scores = model.eval_step(context_tensor, use_context=True)
    
    ppl = math.exp(scores)
    return generated, scores

In [None]:
generated, scores = generate_sentence()
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(context=['i like the'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(context=['this is the best i'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(context=['this is not my'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(context=['this is not what i'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(context=['this fits'])
print(' '.join(word[0] for word in generated))