<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/neural_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Language Modeling

In [1]:
import os
import sys
sys.path.append('utils/')
import loading_text_and_tokenization
import torch
import numpy as np
import torch.nn as nn
import random
import math

import utils.ngram_utils as ngram_utils
from utils.ngram_utils import NgramLM
from utils.amazon_dataset import AmazonDataset, pad, batchify
from torch.utils.data import DataLoader
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
import utils.global_variables as gl
import torch
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook

In [2]:
torch.manual_seed(1)


<torch._C.Generator at 0x7fb37c1117b0>

In [3]:
# Read data from .txt files and create lists of reviews
train_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
    
valid_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
    

In [4]:
# train_data = split_into_sentences(train_data)
# valid_data = split_into_sentences(valid_data)

In [5]:
len(train_data), len(valid_data)

(222919, 27869)

In [6]:
train_data[0], valid_data[0]
train_data = train_data[:100]
valid_data = valid_data[:10]

In [7]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
train_data_tokenized[0], all_tokens_train[0]

(['this',
  'is',
  'a',
  'great',
  'tutu',
  'and',
  'at',
  'a',
  'really',
  'great',
  'price',
  '.'],
 'this')

In [9]:
N = 5

In [10]:
train_data_padded = ngram_utils.pad_dataset(train_data_tokenized, n=N)
valid_data_padded = ngram_utils.pad_dataset(valid_data_tokenized, n=N)

In [11]:
train_data_padded[0]

['<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 'this',
 'is',
 'a',
 'great',
 'tutu',
 'and',
 'at',
 'a',
 'really',
 'great',
 'price',
 '.',
 '<eos>',
 '<eos>',
 '<eos>',
 '<eos>']

In [12]:
vocab = ngram_utils.get_vocab(train_data_padded)
vocab_size = len(vocab)
vocab_size, vocab[:10]

(1980, ('<sos>', '<eos>', '.', ',', 'the', 'i', 'to', 'and', 'a', 'it'))

In [13]:
id2token, token2id = ngram_utils.get_dict(vocab)

In [14]:
train_data_ids = ngram_utils.get_ids(train_data_padded, token2id)
valid_data_ids = ngram_utils.get_ids(valid_data_padded, token2id)

In [15]:
train_dataset = AmazonDataset(train_data_ids, max_inp_length=None, use_cuda=True)
train_dataset_ngrams = []
for t in train_dataset:
    for i in range(len(t) - N):
        train_dataset_ngrams.append((t[i:i + N], t[i + N]))
train_loader = DataLoader(train_dataset_ngrams, batch_size=1024, collate_fn=batchify, shuffle=True)

100%|██████████| 844/844 [00:03<00:00, 271.37it/s]


In [16]:
# for t in train_dataset_ngrams:
#     print(t)
#     break

In [17]:
# train_data_ids[0]

In [18]:
# for i, (d, l) in enumerate(train_loader):
#     import pdb; pdb.set_trace()
#     print(d)
#     print(l)
#     break

In [19]:
valid_dataset = AmazonDataset(valid_data_ids, max_inp_length=None, use_cuda=True)
valid_dataset_ngrams = []
for t in valid_dataset:
    for i in range(len(t) - N):
        valid_dataset_ngrams.append((t[i:i + N], t[i + N]))
valid_loader = DataLoader(valid_dataset_ngrams, batch_size=1024, collate_fn=batchify, shuffle=True)


100%|██████████| 39/39 [00:00<00:00, 16439.99it/s]


In [20]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7fb38462d3c8>

In [21]:
num_train = len(train_dataset_ngrams)
num_valid = len(valid_dataset_ngrams)
num_train, num_valid

(18235, 659)

In [22]:
encoder = BagOfNGrams(len(id2token), emb_dim=300, hidden_size=512, out_size=256, activation='ReLU', nlayers=1, reduce='mean', dropout=0.1, batch_norm=False)
encoder

BagOfNGrams(
  (embedding): EmbeddingBag(1984, 300, mode=mean)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1)
    (3): Linear(in_features=512, out_features=256, bias=True)
  )
)

In [23]:
decoder = DecoderMLP(input_size=256, output_size=len(id2token), hidden_size=512)
decoder

DecoderMLP(
  (linear): Linear(in_features=256, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=1984, bias=True)
  (log_softmax): LogSoftmax()
)

In [24]:
model = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N)
model

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(1984, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=512, out_features=256, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=256, out_features=512, bias=True)
    (out): Linear(in_features=512, out_features=1984, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

## Training

In [25]:
num_epochs = 3
log_interval = 1
best_eval_loss = np.inf

for epoch in range(num_epochs):
    # Train
    train_loss = 0        
    for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
        prediction, loss = model.train_step(data, labels)
        train_loss += loss
    train_loss = train_loss / num_train
    print('| Epoch {:3d} | Train Loss {:5.2f} | Train PPL {:8.2f}'.format(
            epoch, train_loss, math.exp(train_loss)))

    # Eval
    if epoch % log_interval == 0:        
        eval_loss = 0
        for i, (data, labels) in _tqdm(enumerate(valid_loader), disable=True):
            prediction, loss = model.train_step(data, labels, eval_mode=True)
            eval_loss += loss
        eval_loss = eval_loss / num_valid
        print('-' * 89)
        print('| Epoch {:3d} | Valid Loss {:5.2f} | Valid PPL {:8.2f}'.format(
            epoch, eval_loss, math.exp(eval_loss)))
        print('-' * 89)
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        if not best_eval_loss or eval_loss < best_eval_loss:
            with open('neural_lm_amazon_model' + '.pt', 'wb') as f:
                torch.save(model, f)
            best_eval_loss = eval_loss        


| Epoch   0 | Train Loss  0.01 | Train PPL     1.01
-----------------------------------------------------------------------------------------
| Epoch   0 | Valid Loss  0.01 | Valid PPL     1.01
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| Epoch   1 | Train Loss  0.01 | Train PPL     1.01
-----------------------------------------------------------------------------------------
| Epoch   1 | Valid Loss  0.01 | Valid PPL     1.01
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| Epoch   2 | Train Loss  0.00 | Train PPL     1.00
-----------------------------------------------------------------------------------------
| Epoch   2 | Valid Loss  0.01 | Valid PPL     1.01
----------------------------------------------------------

## Scoring Sentences

In [None]:
use_context = True
score_only = True
K = 5
for i, (data, labels) in _tqdm(enumerate(valid_loader), disable=True):
    import pdb; pdb.set_trace()
    generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions
    for k in range(K):
        if use_context:
            context = [model.v2t(d) for d in data][k]
            context = [c[0] for c in context]
            print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
        generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
        if not score_only:
            print("Generated ", generated_str[k])  # print only one generated sentence out of the bsz 
        print("Score:    ", math.exp(scores[k]))  # print only one generated sentence out of the bsz 
        print("")

In [None]:
scores

## Generation

### No Context

In [None]:
use_context = False
score_only = False
for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
    generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions
    import pdb; pdb.set_trace()
    for k in range(K):
        if use_context:
            context = [model.v2t(d) for d in data][0]
            context = [c[0] for c in context]
            print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
        generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
        print("Generated ", generated_str[0])  # print only one generated sentence out of the bsz 
        print("Score:    ", math.exp(scores[0]))  # print only one generated sentence out of the bsz 
        print("")

### Context

In [None]:
use_context = True
score_only = False
for i, (data, labels) in _tqdm(enumerate(train_loader), disable=True):
    generated, scores = model.eval_step(data, use_context=use_context, score_only=score_only)            # batch predictions

    for k in range(K):
        if use_context:
            context = [model.v2t(d) for d in data][k]
            context = [c[0] for c in context]
            print("Context: ", ' '.join(context))  # print only one generated sentence out of the bsz 
        generated_str = [' '.join(g) for g in generated] # convert them to more readable strings     
        print("Generated ", generated_str[k])  # print only one generated sentence out of the bsz 
        print("Score:    ", math.exp(scores[k]))  # print only one generated sentence out of the bsz 
        print("")