# Neural Language Modeling

## 1. Data Processing
## 2. Encoder: Bag of Words
## 3. Decoder: MLP
## 4. Training Neural LM
## 5. Sentence Score
## 6. Sentence Generation

In [1]:
# !wget http://ikulikov.name/neural_lm_amazon_model_N10.pt
# !wget http://ikulikov.name/neural_lm_amazon_model_N7.pt
# !wget http://ikulikov.name/neural_lm_amazon_model_N5.pt
# !wget http://ikulikov.name/neural_lm_amazon_model_N3.pt

In [2]:
import os
import sys
sys.path.append('utils/')
import torch
import numpy as np
import torch.nn as nn
import random
import math

import utils.ngram_utils as ngram_utils
from utils.ngram_utils import NgramLM
from utils.amazon_dataset import AmazonDataset, pad, batchify
from torch.utils.data import DataLoader
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
import utils.global_variables as gl
import torch
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook

In [3]:
torch.manual_seed(1)


<torch._C.Generator at 0x7f809810b7d0>

In [4]:
use_cuda = True
device = torch.device("cuda" if (torch.cuda.is_available() and use_cuda) else "cpu")


# Data Processing

In [5]:
# Read data from .txt files and create lists of reviews
train_data = []
# create a list of all the reviews 
with open('../data/train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
    
valid_data = []
# create a list of all the reviews 
with open('../data/valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
    

In [6]:
train_data[0], valid_data[0]
train_data = train_data#[:100]
valid_data = valid_data#[:10]
train_data[0], type(train_data), len(train_data), type(train_data[0])

("this is a great tutu and at a really great price . it doesn ' t look cheap at all . i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly . a + + ",
 list,
 222919,
 str)

In [7]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
N = 5

In [9]:
train_data_padded = ngram_utils.pad_dataset(train_data_tokenized, n=N)
valid_data_padded = ngram_utils.pad_dataset(valid_data_tokenized, n=N)

In [10]:
vocab = ngram_utils.get_vocab(train_data_padded)
vocab_size = len(vocab)
vocab_size, vocab[:10]

(63993, ('<sos>', '<eos>', '.', 'the', 'i', ',', 'and', 'a', 'it', 'to'))

In [11]:
id2token, token2id = ngram_utils.get_dict(vocab)
len(id2token), len(token2id)

(63996, 63994)

In [12]:
train_data_ids = ngram_utils.get_ids(train_data_padded, token2id)
valid_data_ids = ngram_utils.get_ids(valid_data_padded, token2id)

In [13]:
train_dataset = AmazonDataset(train_data_ids, max_inp_length=None, use_cuda=True)
train_dataset_ngrams = []
for t in train_dataset:
    for i in range(len(t) - N):
        train_dataset_ngrams.append((t[i:i + N], t[i + N]))
train_loader = DataLoader(train_dataset_ngrams, batch_size=131072, collate_fn=batchify, shuffle=True)

100%|██████████| 1069724/1069724 [00:37<00:00, 28807.40it/s]


In [14]:
valid_dataset = AmazonDataset(valid_data_ids, max_inp_length=None, use_cuda=True)
valid_dataset_ngrams = []
for t in valid_dataset:
    for i in range(len(t) - N):
        valid_dataset_ngrams.append((t[i:i + N], t[i + N]))
valid_loader = DataLoader(valid_dataset_ngrams, batch_size=131072, collate_fn=batchify, shuffle=True)

100%|██████████| 124185/124185 [00:04<00:00, 29578.38it/s]


In [15]:
num_train = len(train_dataset_ngrams)
num_valid = len(valid_dataset_ngrams)
num_train, num_valid

(19223314, 2162309)

# Encoder

In [16]:
encoder = BagOfNGrams(len(id2token), emb_dim=300, hidden_size=256, out_size=128, activation='ReLU', nlayers=2, reduce='mean', dropout=0.1, batch_norm=False)
encoder

BagOfNGrams(
  (embedding): EmbeddingBag(63996, 300, mode=mean)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1)
    (3): Linear(in_features=256, out_features=128, bias=True)
  )
)

# Decoder

In [17]:
decoder = DecoderMLP(input_size=128, output_size=len(id2token), hidden_size=256)
decoder

DecoderMLP(
  (linear): Linear(in_features=128, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=63996, bias=True)
  (log_softmax): LogSoftmax()
)

# Model

In [18]:
model = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
model

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(63996, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=128, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=63996, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

In [23]:
TRAIN = True

# Training

In [None]:
if TRAIN:
    num_epochs = 10
    log_interval = 10
    best_eval_loss = np.inf

    for epoch in range(num_epochs):
        # Train
        cur_loss = 0
        for i, (data, labels) in enumerate(train_loader):
            prediction, loss = model.train_step(data, labels)
            cur_loss += loss

            if i % log_interval == 0 and i > 0:
                cur_loss = cur_loss / log_interval
                print('| Epoch {:3d} | Train Loss {:5.2f} | Train PPL {:8.2f} | {:5d}/{:5d} Batches'.format(
                    epoch, cur_loss, math.exp(cur_loss), i, int(num_train/len(data))))
                cur_loss = 0

            # Eval
            if epoch % 1 == 0:        
                eval_loss = 0
                for i, (data, labels) in enumerate(valid_loader):
                    prediction, loss = model.eval_step(data, labels)
                    eval_loss += len(data) * loss
                eval_loss = eval_loss / num_valid 
                print('-' * 89)
                print('| Epoch {:3d} | Valid Loss {:5.2f} | Valid PPL {:8.2f}'.format(
                    epoch, eval_loss, math.exp(eval_loss)))
                print('-' * 89)

                # Save the model if the validation loss is the best we've seen so far.
                if not best_eval_loss or eval_loss < best_eval_loss:
                    model.save_model('neural_lm_amazon_model_all_N{}'.format(N) + '.pt')
                    best_eval_loss = eval_loss
                    
    pretrained_model = model

else:
    # Load Pretrained Model
    pretrained_model = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
    pretrained_model.load_model('neural_lm_amazon_model_all_N{}'.format(N) + '.pt')
    pretrained_model

| Epoch   0 | Train Loss 10.63 | Train PPL 41420.58 |    10/ 2346 Batches
| Epoch   0 | Train Loss  8.11 | Train PPL  3332.61 |    20/ 2346 Batches
| Epoch   0 | Train Loss  7.12 | Train PPL  1241.78 |    30/ 2346 Batches
| Epoch   0 | Train Loss  6.51 | Train PPL   669.93 |    40/ 2346 Batches
| Epoch   0 | Train Loss  6.09 | Train PPL   442.74 |    50/ 2346 Batches
| Epoch   0 | Train Loss  5.81 | Train PPL   334.13 |    60/ 2346 Batches
| Epoch   0 | Train Loss  5.57 | Train PPL   261.79 |    70/ 2346 Batches
| Epoch   0 | Train Loss  5.43 | Train PPL   228.28 |    80/ 2346 Batches
| Epoch   0 | Train Loss  5.30 | Train PPL   200.19 |    90/ 2346 Batches
| Epoch   0 | Train Loss  5.16 | Train PPL   174.17 |   100/ 2346 Batches
| Epoch   0 | Train Loss  5.06 | Train PPL   157.95 |   110/ 2346 Batches
| Epoch   0 | Train Loss  5.00 | Train PPL   147.98 |   120/ 2346 Batches
| Epoch   0 | Train Loss  4.95 | Train PPL   141.23 |   130/ 2346 Batches
| Epoch   0 | Train Loss  4.90 | Train

## Loaders for Some N

In [None]:
def get_loader(train_data_tokenized, valid_data_tokenized, N):
    train_data_padded = ngram_utils.pad_dataset(train_data_tokenized, n=N)
    valid_data_padded = ngram_utils.pad_dataset(valid_data_tokenized, n=N)
    
    vocab = ngram_utils.get_vocab(train_data_padded)
    id2token, token2id = ngram_utils.get_dict(vocab)
    train_data_ids = ngram_utils.get_ids(train_data_padded, token2id)
    
    valid_data_ids = ngram_utils.get_ids(valid_data_padded, token2id)
    train_dataset = AmazonDataset(train_data_ids, max_inp_length=None, use_cuda=True)
    
    train_dataset_ngrams = []
    for t in train_dataset:
        for i in range(len(t) - N):
            train_dataset_ngrams.append((t[i:i + N], t[i + N]))
    train_loader = DataLoader(train_dataset_ngrams, batch_size=2048, collate_fn=batchify, shuffle=True)

    valid_dataset = AmazonDataset(valid_data_ids, max_inp_length=None, use_cuda=True)
    valid_dataset_ngrams = []
    for t in valid_dataset:
        for i in range(len(t) - N):
            valid_dataset_ngrams.append((t[i:i + N], t[i + N]))
    valid_loader = DataLoader(valid_dataset_ngrams, batch_size=2048, collate_fn=batchify, shuffle=True)

    num_train = len(train_dataset_ngrams)
    num_valid = len(valid_dataset_ngrams)
    
    return train_loader, num_train, valid_loader, num_valid

### Get Loader for Some N

In [None]:
def get_loader(train_data_tokenized, valid_data_tokenized, N):
    # Tokenize the Datasets
    # TODO: this takes a really long time !! why?
    train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
    valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)
    
    train_data_padded = ngram_utils.pad_dataset(train_data_tokenized, n=N)
    valid_data_padded = ngram_utils.pad_dataset(valid_data_tokenized, n=N)
    
    vocab = ngram_utils.get_vocab(train_data_padded)
    id2token, token2id = ngram_utils.get_dict(vocab)
    
    train_data_ids = ngram_utils.get_ids(train_data_padded, token2id)
    valid_data_ids = ngram_utils.get_ids(valid_data_padded, token2id)

    train_dataset = AmazonDataset(train_data_ids, max_inp_length=None, use_cuda=True)
    train_dataset_ngrams = []
    for t in train_dataset:
        for i in range(len(t) - N):
            train_dataset_ngrams.append((t[i:i + N], t[i + N]))
    train_loader = DataLoader(train_dataset_ngrams, batch_size=2048, collate_fn=batchify, shuffle=True)

    valid_dataset = AmazonDataset(valid_data_ids, max_inp_length=None, use_cuda=True)
    valid_dataset_ngrams = []
    for t in valid_dataset:
        for i in range(len(t) - N):
            valid_dataset_ngrams.append((t[i:i + N], t[i + N]))
    valid_loader = DataLoader(valid_dataset_ngrams, batch_size=2048, collate_fn=batchify, shuffle=True)
    
    num_train = len(train_dataset_ngrams)
    num_valid = len(valid_dataset_ngrams)
    
    return train_loader, num_train, valid_loader, num_valid

## Perplexity (Train + Valid Data)

In [None]:
def get_perplexity(loader, num_data, model):
    total_loss = 0
    for i, (data, labels) in enumerate(loader):
        prediction, loss = model.eval_step(data, labels)
        total_loss += len(data) * loss
    total_loss = total_loss / num_data 
    ppl = math.exp(total_loss)
    return ppl

In [None]:
# Load Pretrained Model
N = 3
print("Ngrams: ", N)

model_n3 = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
model_n3.load_model('neural_lm_amazon_model_all_N{}'.format(N) + '.pt')
print("Model: ", model_n3)

train_loader, num_train, valid_loader, num_valid = get_loader(train_data_tokenized, valid_data_tokenized, N)

valid_ppl = get_perplexity(valid_loader, num_valid, model_n3)
print("Validation Perplexity: ", valid_ppl)

train_ppl = get_perplexity(train_loader, num_train, model_n3)
print("Training Perplexity: ", valid_ppl)


In [None]:
# Load Pretrained Model
N = 5
print("Ngrams: ", N)

model_n5 = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
model_n5.load_model('neural_lm_amazon_model_all_N{}'.format(N) + '.pt')
print("Model: ", model_n5)

train_loader, num_train, valid_loader, num_valid = get_loader(train_data_tokenized, valid_data_tokenized, N)

valid_ppl = get_perplexity(valid_loader, num_valid, model_n3)
print("Validation Perplexity: ", valid_ppl)

train_ppl = get_perplexity(train_loader, num_train, model_n3)
print("Training Perplexity: ", valid_ppl)


In [None]:
# Load Pretrained Model
N = 7
print("Ngrams: ", N)

model_n7 = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
model_n7.load_model('neural_lm_amazon_model_all_N{}'.format(N) + '.pt')
print("Model: ", model_n7)

train_loader, num_train, valid_loader, num_valid = get_loader(train_data_tokenized, valid_data_tokenized, N)

valid_ppl = get_perplexity(valid_loader, num_valid, model_n3)
print("Validation Perplexity: ", valid_ppl)

train_ppl = get_perplexity(train_loader, num_train, model_n3)
print("Training Perplexity: ", valid_ppl)


In [None]:
# Load Pretrained Model
N = 10
print("Ngrams: ", N)

model_n10 = seq2seq(encoder, decoder, id2token, use_cuda=False, lr=0.1, size_ngrams=N) 
model_n10.load_model('neural_lm_amazon_model_all_N{}'.format(N) + '.pt')
print("Model: ", model_n10)

train_loader, num_train, valid_loader, num_valid = get_loader(train_data_tokenized, valid_data_tokenized, N)

valid_ppl = get_perplexity(valid_loader, num_valid, model_n3)
print("Validation Perplexity: ", valid_ppl)

train_ppl = get_perplexity(train_loader, num_train, model_n3)
print("Training Perplexity: ", valid_ppl)


## Score Sentences

In [None]:
def score_sentence(sent, model):
    tokenized, _ = ngram_utils.tokenize_dataset(sent)
    sent_ids = ngram_utils.get_ids(tokenized, token2id)
    sent_tensor = torch.LongTensor(sent_ids).to(device)
    generated, scores = model.evaluate(sent_tensor, score_only=True)
    ppl = math.exp(scores)
    return ppl

In [None]:
sentence = ['i like pandas']
ppl3 = score_sentence(sentence, model_n3)
ppl5 = score_sentence(sentence, model_n5)
ppl7 = score_sentence(sentence, model_n7)
ppl10 = score_sentence(sentence, model_n10)
ppl3, ppl5, ppl7, ppl10

In [None]:
sentence = ['tutu tutu is not my favorit']
ppl3 = score_sentence(sentence, model_n3)
ppl5 = score_sentence(sentence, model_n5)
ppl7 = score_sentence(sentence, model_n7)
ppl10 = score_sentence(sentence, model_n10)
ppl3, ppl5, ppl7, ppl10

In [None]:
sentence = ['i really like this watch']
ppl3 = score_sentence(sentence, model_n3)
ppl5 = score_sentence(sentence, model_n5)
ppl7 = score_sentence(sentence, model_n7)
ppl10 = score_sentence(sentence, model_n10)
ppl3, ppl5, ppl7, ppl10

In [None]:
sentence = ['training neural networks']
ppl3 = score_sentence(sentence, model_n3)
ppl5 = score_sentence(sentence, model_n5)
ppl7 = score_sentence(sentence, model_n7)
ppl10 = score_sentence(sentence, model_n10)
ppl3, ppl5, ppl7, ppl10

In [None]:
sentence = ['this is a great tutu']
ppl3 = score_sentence(sentence, model_n3)
ppl5 = score_sentence(sentence, model_n5)
ppl7 = score_sentence(sentence, model_n7)
ppl10 = score_sentence(sentence, model_n10)
ppl3, ppl5, ppl7, ppl10

In [None]:
sentence = ['my wife really likes the color of this dress']
ppl3 = score_sentence(sentence, model_n3)
ppl5 = score_sentence(sentence, model_n5)
ppl7 = score_sentence(sentence, model_n7)
ppl10 = score_sentence(sentence, model_n10)
ppl3, ppl5, ppl7, ppl10

## Generate Sentences

In [None]:
def generate_sentence(model, context=None):
    if context is None:
        dummy_context = torch.LongTensor([[0]]).to(device)
        generated, scores = model.evaluate(dummy_context, use_context=False)
    else:
        tokenized, _ = ngram_utils.tokenize_dataset(context)
        context_ids = ngram_utils.get_ids(tokenized, token2id)
        context_tensor = torch.LongTensor(context_ids).to(device)
        generated, scores = model.evaluate(context_tensor, use_context=True)
    
    ppl = math.exp(scores)
    return generated, scores

### No Context

In [None]:
generated, scores = generate_sentence(model_n3)
print(' '.join(word[0] for word in generated)), scores

In [None]:
generated, scores = generate_sentence(model_n5)
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n7)
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n10)
print(' '.join(word[0] for word in generated))

### With Context

In [None]:
generated, scores = generate_sentence(model_n3, context=['this dress'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n5, context=['this dress'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n7, context=['this dress'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n10, context=['this dress'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n3, context=['i like'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n5, context=['i like'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n7, context=['i like'])
print(' '.join(word[0] for word in generated))

In [None]:
generated, scores = generate_sentence(model_n10, context=['i like'])
print(' '.join(word[0] for word in generated))