<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/ngram_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Modeling

### Goal: compute a probabilty distribution over all possible sentences:


### $$p(W) = p(w_1, w_2, ..., w_T)$$

### This unsupervised learning problem can be framed as a sequence of supervised learning problems:

### $$p(W) = p(w_1) * p(w_2|w_1) * ... * p(w_T|w_1, ..., w_{T-1})$$

### If we have N sentences, each of them with T words / tokens, then we want to max:

### $$log p(W) = \sum_{n = 1}^N \sum_{i=1}^{T} log p(w_i | w_{<i})$$




# N-gram language model

### Goal: estimate the n-gram probabilities using counts of sequences of n consecutive words

### Given a sequence of words $w$, we want to compute

###  $$P(w_i|w_{i−1}, w_{i−2}, …, w_{i−n+1})$$

### Where $w_i$ is the i-th word of the sequence.

### $$P(w_i|w_{i−n+1}, ..., w_{i−2}, w_{i−1}) = \frac{p(w_{i−n+1}, ..., w_{i−2}, w_{i−1}, w_i)}{\sum_{w \in V} p(w_{i−n+1}, ..., w_{i−2}, w_{i−1}, w)}$$

### Key Idea: We can estimate the probabilities using counts of n-grams in our dataset 


In [1]:
# TODOs
#: implement the neural LM with concat instead of summation -- so that you have a fixed input etc.
# make a separate
# create some slides with pictures maybe explaining the model visualizations -- line by line
# get google cloud working
# make it work on gpu
# show them kenlm and how to use to do different stuff with it
# use the same sentences to generation and testing etc.
# explain perplexity
# ngram, ff, rnn, rnn+attention
# do sentence generation
# do long sentences
# compare different n-grams -- 2,3,more

### Install if needed

TODO: should we install as needed and import as needed or all at once?

In [2]:
# # run if you dont have it installed
# !pip install more_itertools
# !pip install spacy# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager\
# !python -m spacy download en_core_web_sm

# import sys
# sys.path.insert(0, "/home/roberta/ParlAI")
# print(sys.path)


### Imports

In [4]:
import numpy as np
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
import random
import numpy
import itertools
from operator import itemgetter 
from glob import glob
from tqdm import tqdm_notebook, tqdm
_tqdm = tqdm_notebook
from collections import Counter
import torch.nn as nn
import torch.nn.functional as F
import string
import re
import more_itertools as mit  # not built-in package
import torch
import torchtext
import torchtext.data as data
from torchtext import vocab
from collections import Counter
import re
from torchtext.data import TabularDataset 
import pandas
import altair
from parlai.core.torch_agent import TorchAgent, Output
from torch import optim


In [5]:
torch.manual_seed(1)


<torch._C.Generator at 0x7f5e081b38f0>

In [6]:
# # Create .txt files with the reviews

# with open('../data/amazon_reviews_clothing_train.txt', 'w') as f:
#     for review in train_reviews:
#         for token in review:
#             f.write("%s " % token) 
#         f.write("\n")
        
# with open('../data/amazon_reviews_clothing_test.txt', 'w') as f:
#     for review in test_reviews:
#         for token in review:
#             f.write("%s " % token) 
#         f.write("\n")
        
# with open('../data/amazon_reviews_clothing_valid.txt', 'w') as f:
#     for review in valid_reviews:
#         for token in review:
#             f.write("%s " % token) 
#         f.write("\n")

### Load Data from .txt Files

In [7]:
# Read data from .txt files and create lists of reviews

train_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_train.txt', 'r') as f:
    train_data = [review for review in f.read().split('\n') if review]
# split each review into the tokens that compose it
# for review in reviews:
#     train_data.append(review.split())
    
test_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_test.txt', 'r') as f:
    test_data = [review for review in f.read().split('\n') if review]
# split each review into the tokens that compose it
# for review in reviews:
#     test_data.append(review.split())
    
valid_data = []
# create a list of all the reviews 
with open('../data/amazon_reviews_clothing_valid.txt', 'r') as f:
    valid_data = [review for review in f.read().split('\n') if review]
# split each review into the tokens that compose it
# for review in reviews:
#     valid_data.append(review.split())


In [8]:
type(train_data), len(train_data), \
type(train_data[0]), len(train_data[0]), \
type(train_data[0][0]), len(train_data[0][0])

(list, 222919, str, 184, str, 1)

In [9]:
train_data[0], train_data[0][0]


("this is a great tutu and at a really great price . it doesn ' t look cheap at all . i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly . a + + ",
 't')

### Process the Data

In [10]:
# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')               
punctuations = string.punctuation
# punctuations = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~' 
TAG_RE = re.compile(r'<[^>]+>') # get rid off HTML tags from the data

def remove_tags(text):
    return TAG_RE.sub('', text)

def lower_case(parsed):
    return [token.text.lower() for token in parsed] #and (token.is_stop is False)]

def remove_punc(parsed):
    return [token.text for token in parsed if (token.text not in punctuations)]

def lower_case_remove_punc(parsed):
    return [token.text.lower() for token in parsed if (token.text not in punctuations)] #and (token.is_stop is False)]

def tokenize_dataset(dataset):
   # tokenize each sentence -- each tokenized sentence will be an element in token_dataset
    token_dataset = []
    # tokenize all words -- each token will be an item in all_tokens (in the order given by the list of sentences)
    all_tokens = []     # all the tokens -- 

    for sample in _tqdm(tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=1)):
#         tokens = lower_case_remove_punc(sample)
        tokens = lower_case(sample)       # make words lower case
#         tokens = remove_punct(tokens)     # remove punctuation
        token_dataset.append(tokens)    
        all_tokens += tokens
        
    return token_dataset, all_tokens

In [11]:
punctuations, punctuations[0], \
type(punctuations), len(punctuations), type(punctuations[0]), 

('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', '!', str, 32, str)

In [12]:
TAG_RE

re.compile(r'<[^>]+>', re.UNICODE)

In [13]:
# # TODO: for now only work with small subset of the data -- switch to all data later
# train_data = train_data[:800]
# test_data = test_data[:100]
# valid_data = valid_data[:100]

In [14]:
type(train_data), type(train_data[0]), type(train_data[0][0])

(list, str, str)

In [15]:
# Tokenize the Datasets
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = tokenize_dataset(train_data)
test_data_tokenized, all_tokens_test = tokenize_dataset(test_data)
valid_data_tokenized, all_tokens_valid = tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Let's look at the tokenized data!

In [16]:
# Number of All Tokens
len(all_tokens_train), all_tokens_train[0], \
len(train_data_tokenized), train_data_tokenized[0]

(16412250,
 'this',
 222919,
 ['this',
  'is',
  'a',
  'great',
  'tutu',
  'and',
  'at',
  'a',
  'really',
  'great',
  'price',
  '.',
  'it',
  'doesn',
  "'",
  't',
  'look',
  'cheap',
  'at',
  'all',
  '.',
  'i',
  "'",
  'm',
  'so',
  'glad',
  'i',
  'looked',
  'on',
  'amazon',
  'and',
  'found',
  'such',
  'an',
  'affordable',
  'tutu',
  'that',
  'isn',
  "'",
  't',
  'made',
  'poorly',
  '.',
  'a',
  '+',
  '+'])

#### Build the Vocabulary 


In [17]:
# Build a vocabulary using all the tokens found in train data (90% of most common ones)
voc = list(set(all_tokens_train))
print('Word vocabulary size: {} words'.format(len(voc)))        

Word vocabulary size: 71128 words


### CORPUS ANALYSIS (Train + Valid Data)

#### Number of Tokens in the Corpus Data


In [18]:
print("Number of All Tokens ", len(all_tokens_train))

Number of All Tokens  16412250


In [19]:
print("Number of All UNIQUE Tokens ", len(voc))

Number of All UNIQUE Tokens  71128


#### Number of Sentences in the Train Data


In [20]:
print("Number of Sentences ", len(train_data_tokenized))

Number of Sentences  222919


## N-grams

In [21]:
n = 3 # trigrams

### Function for padding the sentences with special markers sentence beginning and end, i.e. $<bos>$ and $<eos>$

In [22]:
def pad_sentences(input_list, n):
    result_list = []
    for l in input_list:
        padded = ["<bos>" for i in range((n - 1))] + l +["<eos>" for i in range((n - 1))]
        result_list.append(padded)
    return result_list

In [23]:
train_padded = pad_sentences(train_data_tokenized, n)
valid_padded = pad_sentences(valid_data_tokenized, n)
test_padded = pad_sentences(test_data_tokenized, n)

In [24]:
# train_padded[:2]

### Function for finding all N-grams

In [25]:
def find_ngrams(input_list, n):
    result_list = []
    for l in input_list:
        result_list.append(list(zip(*[l[i:] for i in range(n)])))
    return result_list

### Function for Getting N-gram counts for already tokenized data

In [28]:
def ngram_counts(data, n, frac_vocab=0.9):    
    all_train_tokens = list(mit.flatten(data))
    counted_tokens = Counter(all_train_tokens)
    max_vocab_size = int(frac_vocab * len(counted_tokens))

    vocab, count = zip(*counted_tokens.most_common(max_vocab_size))
    
    return vocab, count

In [29]:
n = 3
train_padded = pad_sentences(train_data_tokenized, n)
train_ngram = find_ngrams(train_padded, n)
vocab_ngram, count_ngram = ngram_counts(train_ngram, n)

In [30]:
# train_padded, train_ngram, vocab_ngram, count_ngram

#### Trigrams, Bigrams, Unigrams

In [31]:
train_padded_trigram = pad_sentences(train_data_tokenized, 3)
train_trigram = find_ngrams(train_padded_trigram, 3)
vocab_trigram, count_trigram = ngram_counts(train_trigram, 3)

train_padded_bigram = pad_sentences(train_data_tokenized, 2)
train_bigram = find_ngrams(train_padded_bigram, 2)
vocab_bigram, count_bigram = ngram_counts(train_bigram, 2)

train_padded_unigram = pad_sentences(train_data_tokenized, 1)
train_unigram = find_ngrams(train_padded_unigram, 1)
vocab_unigram, count_unigram = ngram_counts(train_unigram, 1)


In [32]:
# vocab_bigram[:3], count_bigram[:3]

In [33]:
# vocab_unigram[:3], count_unigram[:3]

In [34]:
PAD_IDX = 0
UNK_IDX = 1 
BOS_IDX = 2
EOS_IDX = 3

### Function for Getting N-gram Dict

In [35]:
def ngram_dict(vocab):
    PAD_IDX = 0
    UNK_IDX = 1 
    BOS_IDX = 2
    EOS_IDX = 3
    
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(4, 4+len(vocab)))) 
    id2token = ['<pad>', '<unk>', '<bos>', '<eos>'] + id2token

    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    token2id['<bos>'] = BOS_IDX 
    token2id['<eos>'] = EOS_IDX

    return id2token, token2id

In [36]:
id2token_ngram, token2id_ngram = ngram_dict(vocab_ngram)

In [37]:
id2token_ngram[:10], \
# token2id_ngram['<unk>'], token2id_ngram['<eos>'], token2id_ngram[('rosetta', 'stone')]

(['<pad>',
  '<unk>',
  '<bos>',
  '<eos>',
  ('.', '<eos>', '<eos>'),
  ('<bos>', '<bos>', 'i'),
  ('.', '.', '.'),
  ('it', "'", 's'),
  ('!', '<eos>', '<eos>'),
  ('i', "'", 'm')],)

In [38]:
random_token_id = random.randint(0, len(id2token_ngram) - 1)
random_token = id2token_ngram[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token_ngram[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id_ngram[random_token]))

Token id 970218 ; token ('could', 'walk', 'with')
Token ('could', 'walk', 'with'); token id 970218


In [39]:
# # Function that combines all the above and goes from tokenized data to the ngram dataset
# def create_id_dataset(data, n):
#     padded_data = pad_sentences(data, n)
#     ngram_data = find_ngrams(padded_data, n)
    
#     vocab, count = ngram_counts(ngram_data, n)    
#     id2token, token2id = ngram_dict(vocab)
    
#     data_id = create_data_id(ngram_data, token2id)
#     data_id_merged = create_data_id_merged(data_id, token2id, n)
    
#     return data_id, data_id_merged

In [40]:
# all_data_id, all_data_id_merged = create_id_dataset(train_data_tokenized, n)

### Ngram Counts

In [41]:
vocab_ngram[:10], count_ngram[:10]

((('.', '<eos>', '<eos>'),
  ('<bos>', '<bos>', 'i'),
  ('.', '.', '.'),
  ('it', "'", 's'),
  ('!', '<eos>', '<eos>'),
  ('i', "'", 'm'),
  ('don', "'", 't'),
  ('&', '#', '34'),
  ('#', '34', ';'),
  ('<bos>', '<bos>', 'this')),
 (161218, 77237, 47023, 38312, 32534, 30372, 28225, 26413, 26413, 23103))

In [42]:
def get_ngram_count(ngram, vocab, count):
    if ngram in vocab:
        ngram_idx = vocab.index(ngram)
        return count[ngram_idx] 
    else:
        return 0

In [43]:
c = get_ngram_count(('i', 'like', 'this'), vocab_ngram, count_ngram)
c

1081

In [44]:
c = get_ngram_count(('i', 'like', 'pandas'), vocab_ngram, count_ngram)
c

0

### Function for computing the probability of a sentence

## N-gram Probabilities

## $$P(w|w_{−n}, ..., w_{−2}, w_{−1}) \approx \frac{c(w_{−n}, ..., w_{−2}, w_{−1}, w)}{\sum_{w \in V} c(w_{−n}, ..., w_{−2}, w_{−1}, w)}$$


In [45]:
def get_ngram_prob(ngram, vocab, count):
    c = get_ngram_count(ngram, vocab, count)
    all_counts = 0
    for t in vocab:
        if t[:-1] == ngram[:-1]:
#             print(t, get_ngram_count(t, vocab, count))
            all_counts += get_ngram_count(t, vocab, count)
    if all_counts > 0:
        return c / all_counts
    else:
        return 0

## Bigram Probabilities

## $$p(w_i | w_{i-1}) = \frac{c(w_{i-1}, w_i)}{\sum_{w_i} c(w_{i-1}, w_i)} $$


In [46]:
p = get_ngram_prob(('rosetta', 'stone', 'is'), vocab_ngram, count_ngram)
p

# p = get_ngram_prob(('i', 'am', 'rosetta'), vocab_ngram, count_ngram)
# p

# p = get_ngram_prob(('it', "'", 's'), vocab_ngram, count_ngram)
# p

# p = get_ngram_prob(('i', "like", 'this'), vocab_ngram, count_ngram)
# p, 1/(2+1+1+1+1)

0.13043478260869565

In [49]:
p = get_ngram_prob(('am', 'rosetta', 'stone'), vocab_ngram, count_ngram)
p

0

## Additive Smoothing

In [50]:
def get_ngram_prob_addditive_smoothing(ngram, vocab, count, delta=0.5):
    c = get_ngram_count(ngram, vocab, count) + delta*1
    all_counts = 0
    for t in vocab:
        if t[:-1] == ngram[:-1]:
#             print(t, get_ngram_count(t, vocab, count))
            all_counts += get_ngram_count(t, vocab, count)
    all_counts += delta*len(voc)
    if all_counts > 0:
        return c / all_counts
    else:
        return 0

In [51]:
p = get_ngram_prob_addditive_smoothing(('am', 'rosetta', 'stone'), vocab_ngram, count_ngram, delta=0.5)
p

1.4059160949274547e-05

## Add-One Smoothing

In [52]:
def get_ngram_prob_add_one_smoothing(ngram, vocab, count):
    c = get_ngram_count(ngram, vocab, count) + 1
    all_counts = 0
    for t in vocab:
        if t[:-1] == ngram[:-1]:
#             print(t, get_ngram_count(t, vocab, count))
            all_counts += get_ngram_count(t, vocab, count)
    all_counts += len(voc)
    if all_counts > 0:
        return c / all_counts
    else:
        return 0

In [53]:
p = get_ngram_prob_add_one_smoothing(('am', 'rosetta', 'stone'), vocab_ngram, count_ngram)
p

1.4059160949274547e-05

### Linear Interpolation Smoothing

#### TODO: add formula

In [54]:
# train_padded_trigram = pad_sentences(train_data_tokenized, 3)
# train_trigram = find_ngrams(train_padded_trigram, 3)
# vocab_trigram, count_trigram = ngram_counts(train_trigram, 3)

# train_padded_bigram = pad_sentences(train_data_tokenized, 2)
# train_bigram = find_ngrams(train_padded_bigram, 2)
# vocab_bigram, count_bigram = ngram_counts(train_bigram, 2)

# train_padded_unigram = pad_sentences(train_data_tokenized, 1)
# train_unigram = find_ngrams(train_padded_unigram, 1)
# vocab_unigram, count_unigram = ngram_counts(train_unigram, 1)

KeyboardInterrupt: 

In [55]:
def get_ngram_prob_interpolation_smoothing(ngram, vocab, count, prev_vocab, prev_count, alpha=0.5):
    c = get_ngram_count(ngram, vocab, count)
    all_counts = 0
    for t in vocab:
        if t[:-1] == ngram[:-1]:
#             print(t, get_ngram_count(t, vocab, count))
            all_counts += get_ngram_count(t, vocab, count)
    if all_counts > 0:
        prob_ngram = c / all_counts
    else:
        prob_ngram = 0
    
    prev_ngram = tuple(list(ngram[1:]))
    prev_c = get_ngram_count(prev_ngram, prev_vocab, prev_count)
#     print(prev_c)
    prev_all_counts = 0
    for prev_t in prev_vocab:
        if prev_t[:-1] == prev_ngram[:-1]:
#             print(prev_t, get_ngram_count(prev_t, prev_vocab, prev_count))
            prev_all_counts += get_ngram_count(prev_t, prev_vocab, prev_count)
    if prev_all_counts > 0:
        prob_prev_ngram = prev_c / prev_all_counts
    else:
        0
    return alpha*(prob_ngram) + (1-alpha)*prob_prev_ngram

In [56]:
p = get_ngram_prob_interpolation_smoothing(('am', 'rosetta', 'stone'), vocab_trigram, count_trigram, vocab_bigram, count_bigram, alpha=0.8)
p

0.17468354430379743

### Smoothing: Linear Interpolation with Absolute Discounting

### $$p_{bi}(w|v) = max ({ \frac{N(v, w) - b_{bi}}{N(v)}, 0)  + b_{bi} \frac{V - N_0(v, \cdot)}{N(v)} p_{uni}(w) \large}$$

### $$p_{uni}(w) = max ({ \frac{N(w) - b_{uni}}{N}, 0)  + b_{uni} \frac{V - N_0(\cdot)}{N} \frac{1}{V}}$$

### $$b_{bi} = \frac{N_1(\cdot, \cdot)}{N_1(\cdot, \cdot) + 2*N_2(\cdot, \cdot)}$$

### $$b_{uni} = \frac{N_1(\cdot)}{N_1(\cdot) + 2*N_2(\cdot)}$$


### $$N_r(\cdot) = \sum_{w: N(w) = r} 1$$

### $$N_r(\cdot, \cdot) = \sum_{v, w: N(v, w) = r} 1$$

### $$N_r(v, \cdot) = \sum_{w: N(v, w) = r} 1$$

### V is the number of words in the vocabulary

### $N_r(\cdot, \cdot)$ and $N_r(\cdot)$  are the count-counts for bigrams and unigrams respectively $


In [57]:
def get_unigram_count(r):
    return np.sum([1 for i in range(len(vocab_unigram)) if count_unigram[i] == r])

def get_bigram_count(r):
    return np.sum([1 for i in range(len(vocab_bigram)) if count_bigram[i] == r])

def get_biunigram_count(r, token):
    cc = 0
    for other_token in vocab_unigram:
        bigram = tuple([token] + [other_token])
        if bigram in vocab_bigram:
            bigram_idx = vocab_bigram.index(bigram) 
            if count_bigram[bigram_idx] == r:
                cc += 1
                
#     for bigram in vocab_bigram:
#         print(token, bigram[0])
#         if token == bigram[0]:
#             bigram_idx = vocab_bigram.index(bigram) 
#             if count_bigram[bigram_idx] == r:
#                 cc += 1
    return cc

def get_b_bi():
    bbi = get_bigram_count(1) / (get_bigram_count(1) + 2 * get_bigram_count(2))
    return bbi
    
def get_b_uni():
    buni = get_unigram_count(1) / (get_unigram_count(1) + 2 * get_unigram_count(2))
    return buni

def get_p_uni(w):
    if w in vocab_unigram:
        w_idx = vocab_unigram.index(w)
        N_w = count_unigram[w_idx]
    else:
        N_w = 0
        
    b_uni = get_b_uni()
    
    W = len(voc)
    N_0 = get_unigram_count(0)
    
    
    N = len(all_tokens_train) # TODO: double check the meaning of N 
    
    p_uni = max((N_w - b_uni / N), 0) + b_uni * (W - N_0) / N * 1 / W
    
    return p_uni

def get_p_bi(w, v):   # w given v
    if tuple([v] + [w]) in vocab_bigram:
        vw_idx = vocab_bigram.index(tuple([v] + [w]))
        N_vw = count_bigram[vw_idx]
    else:
        N_vw = 0
        
    if tuple([v]) in vocab_unigram:
        v_idx = vocab_unigram.index(tuple([v]))
        N_v = count_unigram[v_idx]
    else:
        N_v = 0  
        
    b_bi = get_b_bi()
    b_uni = get_b_uni()
    
    p_uni = get_p_uni(tuple([w]))
    
    W = len(voc)
    N_0 = get_biunigram_count(0, v)
    
    
    p_bi =  max((N_vw - b_bi) / N_v,  0) + \
         b_bi * (W - N_0) / N_v * p_uni
    
    return p_bi
        

In [58]:
x = 'stone'
y = 'rosetta'

z = get_p_bi(y, x)
z

KeyboardInterrupt: 

### Let's check that the probabilities sum up to one
### $$\sum_w p_{bi}(w|v) = \sum_w p_{uni}(w) = 1$$



TODO: add this check or leave as homework

### Bigram LM
###  $$p(s) = \prod_{i = 1} ^ {N + 1} p(w_i | w_{i-1})$$

### Likelihood of a Sentence

In [61]:
def get_prob_sentence(sentence, vocab, count, n):
    padded_sentence = pad_sentences(sentence, n)  # needs a list
#     print(padded_sentence)
    ngram_sentence = find_ngrams(padded_sentence, n)[0] # only one element in list
#     print(ngram_sentence)
    prob = 1
    for ngram in ngram_sentence:
        prob_ngram = get_ngram_prob(ngram, vocab, count)
#         print(ngram, prob_ngram)
        prob *= prob_ngram
    return prob

In [60]:
n = 3
sentence = [train_data_tokenized[0]]
print(sentence)
ps = get_prob_sentence(sentence, vocab_ngram, count_ngram, n)
ps

[['this', 'is', 'a', 'great', 'tutu', 'and', 'at', 'a', 'really', 'great', 'price', '.', 'it', 'doesn', "'", 't', 'look', 'cheap', 'at', 'all', '.', 'i', "'", 'm', 'so', 'glad', 'i', 'looked', 'on', 'amazon', 'and', 'found', 'such', 'an', 'affordable', 'tutu', 'that', 'isn', "'", 't', 'made', 'poorly', '.', 'a', '+', '+']]


KeyboardInterrupt: 

In [62]:
n = 3
sentence = [['this', 'is', 'a', 'great', 'tutu']]
print(sentence)
ps = get_prob_sentence(sentence, vocab_ngram, count_ngram, n)
ps

[['this', 'is', 'a', 'great', 'tutu']]


KeyboardInterrupt: 

### Examples
### Bigram LM: $$ p(i \; love \; this \; light) = p(i|\cdot) \; p(love|i)\;  p(this|love)\;  p(light|this) \\
\approx \frac{c(i, \cdot)}{\sum_w c(\cdot, \; w)} \; \frac{c(love, i)}{\sum_wc(i, \; w)}\;  \frac{c(this, love)}{\sum_wc(love, \;w)}\;  \frac{c(light, this)}{\sum_wc(this, \;w)}$$ 

### Trigram LM: $$ p(i \; love \; this  \;light) = p(i|\cdot, \cdot) \; p(love|\cdot, i) \; p(this|i, love)\;  p(light|love, this)$$ 



In [None]:
def get_prob_distr_ngram(prev_tokens, vocab_ngram, count_ngram, voc, print_nonzero_probs=False):
    pd = [0 for v in voc]
    for idx, token in enumerate(voc):
#         print("token: ", token)
#         print("prev ngram: ", prev_tokens)
#         print("both: ", tuple(list(prev_tokens) + [token]))
#         print("")
        token_ngram = tuple(list(prev_tokens) + [token])
        pd[idx] = get_ngram_prob(token_ngram, vocab_ngram, count_ngram)
#         if pd[idx] > 0 and print_nonzero_probs:
#             print(token_ngram, " ", pd[idx])
    return pd

In [None]:
# prob distr for the word following prev_tokens (i.e. tutu) 
# over all the words in the vocabulary 

# prev_tokens = train_data_tokenized[0][4] #[0]
prev_tokens = vocab_ngram[3][1:] #[0]   # need frmo 1 on so that this is a correct prev token
print(prev_tokens)
pd = get_prob_distr_ngram(prev_tokens, vocab_ngram, count_ngram, voc, print_nonzero_probs=True)
sum(pd)#, pd

In [None]:
def sample_from_pd(prev_tokens, vocab_ngram, count_ngram, voc, print_nonzero_probs=False):
    pd = get_prob_distr_ngram(prev_tokens, vocab_ngram, count_ngram, voc, print_nonzero_probs=print_nonzero_probs)
    idx_next_token = np.random.choice(len(voc), 1, p=pd)[0]
    return voc[idx_next_token]
    

In [None]:
print(prev_tokens)
next_token = sample_from_pd(prev_tokens, vocab_ngram, count_ngram, voc, print_nonzero_probs=True)
next_token

### Sentence Generation

In [None]:
def generate_sentence(num_tokens, vocab_ngram, count_ngram, voc, n):
    sentence = []
    prev_tokens = tuple(['<bos>'] * (n - 1))
#     print(prev_tokens)
    for i in range(num_tokens):
        next_token = sample_from_pd(prev_tokens, vocab_ngram, count_ngram, voc)
#         print(i, next_token)
#         print(i, prev_tokens[1:])
        prev_tokens = tuple(list(prev_tokens[1:]) + [next_token])
#         print(i, prev_tokens)
        sentence.append(next_token)
        print(' '.join(sentence))
    return ' '.join(sentence)

In [None]:
num_tokens = 5
generated_sentence = generate_sentence(num_tokens, vocab_ngram, count_ngram, voc, n)
generated_sentence

In [None]:
num_tokens = 5
generated_sentence = generate_sentence(num_tokens, vocab_ngram, count_ngram, voc, n)
generated_sentence


In [None]:
# TODOs
# show rank for each word in a sentence
# explain perplexity 

### Log-Likelihood
### $LL = \sum_{k=1}^{K} \sum_{n=1}^{N_k + 1} log p_{bi}(w_{k,n} | w_{k,n-1})$

### Perplexity

### $PP = exp(-\frac{LL}{\sum_k(N_k + 1)})$

In [136]:
def get_perplexity(test_sentences, vocab_ngram, count_ngram):
    ll = 0
    num_tokens = 0
    for s in (test_sentences):
        ll += get_prob_sentence([s], vocab_ngram, count_ngram, n)
        num_tokens += len(s) + 1

    ppl = np.exp(-ll/num_tokens)
    return ppl

In [137]:
ppl_test = get_perplexity(test_data_tokenized, vocab_ngram, count_ngram)
ppl_valid = get_perplexity(valid_data_tokenized, vocab_ngram, count_ngram)
ppl_train = get_perplexity(train_data_tokenized, vocab_ngram, count_ngram)


KeyboardInterrupt: 

In [None]:
ppl_test, ppl_valid, ppl_train
# TODO check whether this makes sense -- maybe it seems too good?

#### Let's look at some examples and see if they make sense

# Neural Language Modeling

## Make a PyTorch Dataset out of our set of dicts

In [63]:
class AmazonDataset(Dataset):
    def __init__(self, data_list, max_inp_length=None, use_cuda=True):
        """
        data_list is a list of tuples: (x,y) where x is a list of ids and y is a label
        """
        self.data = data_list
        self.max_len = max_inp_length
        self.data_tensors = []
        device = torch.device("cuda" if (torch.cuda.is_available() and use_cuda) else "cpu")
        for (i, t) in tqdm_notebook(self.data):
            print(i, t)
            self.data_tensors.append((torch.LongTensor(i[:self.max_len]).to(device), \
                                        torch.LongTensor([t]).to(device)))
                
    def __getitem__(self, key):
        (inp, tgt) = self.data_tensors[key]
        
        return inp, tgt, len(inp)

    def __len__(self):
        return len(self.data)

def pad(tensor, length, dim=0, pad=0):
    """Pad tensor to a specific length.
    :param tensor: vector to pad
    :param length: new length
    :param dim: (default 0) dimension to pad
    :returns: padded tensor if the tensor is shorter than length
    """
    if tensor.size(dim) < length:
        return torch.cat(
            [tensor, tensor.new(*tensor.size()[:dim],
                                length - tensor.size(dim),
                                *tensor.size()[dim + 1:]).fill_(pad)],
            dim=dim)
    else:
        return tensor
    
def batchify(batch):
    maxlen = max(batch, key = itemgetter(2))[-1]
    batch_list = []
    target_list = []
    for b in batch:
        batch_list.append(pad(b[0], maxlen, dim=0, pad=PAD_IDX))
        target_list.append(b[1])
    input_batch = torch.stack(batch_list, 0)
    target_batch = torch.stack(target_list, 0)
    
    return input_batch, target_batch

In [64]:
def create_data_id(data, token2id):
    data_id = []
    for d in data:
        data_id.append(_text2id(d, token2id))
    return data_id

def create_data_id_merged(data, token2id, n):
    data_id_merged = []
    for d in data:
        for i in range(len(d) - n):
            data_id_merged.append((d[i:i+n], d[i+n]))
    return data_id_merged

n = 1
train_padded_uni = pad_sentences(train_data_tokenized, n)
train_unigram = find_ngrams(train_padded_uni, n)
train_vocab_unigram, train_count_unigram = ngram_counts(train_unigram, n)
train_id2token_unigram, train_token2id_unigram = ngram_dict(train_vocab_unigram)

n = 1
valid_padded_uni = pad_sentences(valid_data_tokenized, n)
valid_unigram = find_ngrams(valid_padded_uni, n)
valid_vocab_unigram, count_unigram = ngram_counts(valid_unigram, n)
valid_id2token_unigram, valid_token2id_unigram = ngram_dict(valid_vocab_unigram)

N = 10
train_data_id = create_data_id(train_unigram, train_token2id_unigram)
train_data_id_merged = create_data_id_merged(train_data_id, train_token2id_unigram, N)

valid_data_id = create_data_id(valid_unigram, valid_token2id_unigram)
valid_data_id_merged = create_data_id_merged(valid_data_id, valid_token2id_unigram, N)



NameError: name '_text2id' is not defined

In [None]:
train_dataset = AmazonDataset(train_data_id_merged, max_inp_length=None, use_cuda=True)
train_loader = DataLoader(train_dataset, batch_size=512, collate_fn=batchify, shuffle=True)

valid_dataset = AmazonDataset(valid_data_id_merged, max_inp_length=None, use_cuda=True)
valid_loader = DataLoader(valid_dataset, batch_size=512, collate_fn=batchify, shuffle=True)

In [None]:
train_data_id_merged

In [None]:
train_dataset[0], train_dataset[0][0].shape, \
valid_dataset[0], valid_dataset[0][0].shape

## Model

In [None]:
class BagOfNGrams(nn.Module):
    def init_layers(self):
        for l in self.layers:
            if getattr(l, 'weight', None) is not None:
                torch.nn.init.xavier_uniform_(l.weight)
    
    def __init__(self, vocab_size, emb_dim=300, hidden_size=256, out_size=128, reduce='sum', nlayers=2, activation='ReLU', dropout=0.1, batch_norm=False):
        super(BagOfNGrams, self).__init__()
       
        self.emb_dim = emb_dim
        self.reduce = reduce
        self.nlayers = nlayers
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.activation = getattr(nn, activation)
        
        self.embedding = nn.EmbeddingBag(num_embeddings=vocab_size, embedding_dim=emb_dim, mode=reduce)
        if batch_norm is True:
            self.batch_norm = nn.BatchNorm1d(self.emb_dim)
        self.layers = nn.ModuleList([nn.Linear(self.emb_dim, self.hidden_size)])
        self.layers.append(self.activation())
        self.layers.append(nn.Dropout(p=dropout))
        
        for i in range(self.nlayers-2):
            self.layers.append(nn.Linear(self.hidden_size, self.hidden_size))
            self.layers.append(self.activation())
            self.layers.append(nn.Dropout(p=dropout)) 
        self.layers.append(nn.Linear(self.hidden_size, self.out_size))
        self.init_layers()
    
    def forward(self, x):
        postemb = self.embedding(x)
        if hasattr(self, 'batch_norm'):
            x = self.batch_norm(postemb)
        else:
            x = postemb
        for l in self.layers:
            x = l(x)
        
        return x

In [None]:
class DecoderMLP(nn.Module):
    """Generates a token in response to context."""

    def __init__(self, input_size=128, output_size=1024, hidden_size=256):
        """Initialize decoder.
        :param input_size: size of embedding
        :param output_size: size of vocabulary
        :param hidden_size: size of the linear layers
        """
        super().__init__()
            
        self.linear = nn.Linear(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input):
        """Return encoded state.
        :param input: batch_size x 1 tensor of token indices.
        :param hidden: past (e.g. encoder) hidden state
        """
        output = F.relu(self.linear(input))
        scores = self.softmax(self.out(output))
        return scores 

In [None]:
class seq2seq(nn.Module):
    def __init__(self, bag_of_ngrams, decoder, lr = 1e-3, use_cuda = True, 
                        longest_label = 20, 
                        clip = 0.3):
        super(seq2seq, self).__init__()

        device = torch.device("cuda" if (torch.cuda.is_available() and use_cuda) else "cpu")
        self.device = device;
        self.bag_of_ngrams = bag_of_ngrams.to(device);
        self.decoder = decoder.to(device)

        self.longest_label = longest_label

        # set up the criterion
        self.criterion = nn.NLLLoss()

        self.optims = {
             'nmt': optim.SGD(self.parameters(), lr=lr, nesterov=True, momentum = 0.99)
        }

        self.longest_label = longest_label
        self.clip = clip;
        
    def zero_grad(self):
        """Zero out optimizer."""
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        if self.clip is not None:
            torch.nn.utils.clip_grad_norm_(self.bag_of_ngrams.parameters(), self.clip)
            torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), self.clip)
        for optimizer in self.optims.values():
            optimizer.step()
    
    def v2t(self, vector):
        return [train_id2token_unigram[i] for i in vector]
        
    def train_step(self, xs, ys):
        """Train model to produce ys given xs.
        :param batch: parlai.core.torch_agent.Batch, contains tensorized
                      version of observations.
        Return estimated responses, with teacher forcing on the input sequence
        (list of strings of length batchsize).
        """
        if xs is None:
            return
        xs = xs.to(self.device)
        ys = ys.to(self.device)

        self.zero_grad()
        self.bag_of_ngrams.train()
        self.decoder.train()
    
        bow_output = self.bag_of_ngrams(xs)
        decoder_output = self.decoder(bow_output)
            
        loss = self.criterion(decoder_output, ys.view(-1))

        loss.backward()
        self.update_params()

        _max_score, predictions = decoder_output.max(1)
        
        return self.v2t(predictions), loss.item() 

    def eval_step(self):
        """Generate a response to the input tokens.
        :param batch: parlai.core.torch_agent.Batch, contains tensorized
                      version of observations.
        Return predicted responses (list of strings of length batchsize).
        """
        # just predict
        self.bag_of_ngrams.eval()
        self.decoder.eval()

        predictions = []
        encoder_input = torch.LongTensor([BOS_IDX] * N).unsqueeze(0)

        for _ in range(self.longest_label):
            decoder_input = self.bag_of_ngrams(encoder_input)
            decoder_output = self.decoder(decoder_input)
            _max_score, next_token = decoder_output.max(1)
            
            predictions.append(next_token)
            
            prev_tokens = torch.cat([encoder_input.squeeze(0)[1:N]])

            encoder_input = torch.cat((prev_tokens, next_token), 0).unsqueeze(0)
            
            # stop if you've found the 
            if next_token.item() == EOS_IDX:
                break
                
        predictions = torch.cat(predictions, 0)
        return self.v2t(predictions)


In [None]:
num_epochs = 500
bag_of_ngrams = BagOfNGrams(len(train_id2token_unigram), emb_dim=300, hidden_size=256, out_size=128, activation='Tanh', nlayers=1, reduce='mean', dropout=0.0, batch_norm=False)
decoder = DecoderMLP(input_size=128, output_size=len(train_id2token_unigram), hidden_size=256)
model = seq2seq(bag_of_ngrams, decoder, use_cuda=False, lr=1e-1)

for epoch in range(num_epochs):
    # Train
    train_loss_epoch = 0
    for i, (data, labels) in _tqdm(enumerate(train_loader)):
        prediction, loss = model.train_step(data, labels)
        train_loss_epoch += loss
    if epoch % 10 == 0:
        print("Epoch {}: Loss {}".format(epoch, train_loss_epoch))
#         print("prediction ", prediction)

        generated = model.eval_step()
        generated_str = ' '.join([g[0] for g in generated])
        print("Generated Sentence: ", generated_str)
        


## Using KenLM