In [164]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm_notebook
from nltk.corpus import stopwords 

In [90]:
class NPLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, window_size, activation_size):
        super(NPLM, self).__init__()
        
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.window_size = window_size
        self.activation_size = activation_size
        
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(window_size * embed_dim, activation_size)
        self.tanh = nn.Tanh()
        
        self.fc2 = nn.Linear(activation_size + window_size * embed_dim, vocab_size)
        
        self.softmax = nn.Softmax()
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()
        
    def forward(self, x):
        X = self.embeddings(x).view(-1, self.embed_dim * self.window_size)
        tanh_X = self.tanh(self.fc(X))
        
        X = torch.cat((X, tanh_X), dim=1)
        X = self.fc2(X)
        
        return self.softmax(X)

# Data Preprocessing

In [18]:
from nltk.corpus import brown

In [21]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [168]:
stopwords = stopwords.words('english')

In [169]:
brown_texts, brown_cate = {}, brown.categories()
for category in brown_cate:
    brown_texts[category] = [word for word in brown.words(categories=category) if not word in stopwords] 

In [32]:
def generate_w2i(texts_dict):
    w2i, i2w = {}, {}
    i = 0
    for cate, words in brown_texts.items():
        for word in words:
            if word.lower() not in w2i:
                w2i[word.lower()] = i
                i2w[i] = word.lower()
                i += 1
    return w2i, i2w

In [171]:
w2i, i2w = generate_w2i(brown_texts)

In [85]:
def preprocessing_data_in_ngram(texts_dict, n, w2i):
    train_pairs, validation_pairs, test_pairs = [], [], []
    for category in texts_dict:
        print(category)
        cate_pairs = []
        for i in range(len(texts_dict[category])-n):
            cate_pairs.append(([w2i[w.lower()] for w in texts_dict[category][i:i+n]], 
                               w2i[texts_dict[category][i+n].lower()]))
        train_size = len(cate_pairs) * 7 // 10
        val_size = len(cate_pairs) * 9 // 10
        train_pairs.extend(cate_pairs[:train_size])
        validation_pairs.extend(cate_pairs[train_size:val_size])
        test_pairs.extend(cate_pairs[val_size:])
    return train_pairs, validation_pairs, test_pairs

In [172]:
train_set, validation_set, test_set = preprocessing_data_in_ngram(brown_texts, 3, w2i)

adventure
belles_lettres
editorial
fiction
government
hobbies
humor
learned
lore
mystery
news
religion
reviews
romance
science_fiction


In [173]:
train_set

[([0, 1, 2], 3),
 ([1, 2, 3], 4),
 ([2, 3, 4], 5),
 ([3, 4, 5], 6),
 ([4, 5, 6], 7),
 ([5, 6, 7], 8),
 ([6, 7, 8], 9),
 ([7, 8, 9], 10),
 ([8, 9, 10], 7),
 ([9, 10, 7], 8),
 ([10, 7, 8], 11),
 ([7, 8, 11], 12),
 ([8, 11, 12], 13),
 ([11, 12, 13], 14),
 ([12, 13, 14], 5),
 ([13, 14, 5], 7),
 ([14, 5, 7], 15),
 ([5, 7, 15], 16),
 ([7, 15, 16], 17),
 ([15, 16, 17], 18),
 ([16, 17, 18], 19),
 ([17, 18, 19], 20),
 ([18, 19, 20], 7),
 ([19, 20, 7], 21),
 ([20, 7, 21], 22),
 ([7, 21, 22], 7),
 ([21, 22, 7], 23),
 ([22, 7, 23], 24),
 ([7, 23, 24], 25),
 ([23, 24, 25], 26),
 ([24, 25, 26], 27),
 ([25, 26, 27], 5),
 ([26, 27, 5], 17),
 ([27, 5, 17], 28),
 ([5, 17, 28], 29),
 ([17, 28, 29], 30),
 ([28, 29, 30], 31),
 ([29, 30, 31], 7),
 ([30, 31, 7], 32),
 ([31, 7, 32], 33),
 ([7, 32, 33], 34),
 ([32, 33, 34], 35),
 ([33, 34, 35], 36),
 ([34, 35, 36], 37),
 ([35, 36, 37], 38),
 ([36, 37, 38], 39),
 ([37, 38, 39], 40),
 ([38, 39, 40], 7),
 ([39, 40, 7], 41),
 ([40, 7, 41], 42),
 ([7, 41, 42], 43),

In [174]:
train_X = torch.tensor([pair[0] for pair in train_set])
train_Y = torch.tensor([pair[1] for pair in train_set])

val_X = torch.tensor([pair[0] for pair in validation_set])
val_Y = torch.tensor([[pair[1]] for pair in validation_set])

In [175]:
def generate_batch(train_X, train_Y, size):
    a = np.random.choice(len(train_X), size, replace=False)
    return train_X[a], train_Y[a]

In [None]:
sentences = [ "i like dog", "i love coffee", "i hate milk"]

In [None]:
vocab_size = len(w2i)
embed_dim = 100
window_size = 3
activation_size = 150
size = 32

nplm = NPLM(vocab_size, embed_dim, window_size, activation_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nplm.parameters(), lr=0.01)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

epoch = 10

validation_loss = evaluate(val_X[:10000], val_Y[:10000])
print("Validation Loss Before training: {}".format(validation_loss))

for i in range(epoch):
    epoch_loss = 0
    for _ in tqdm_notebook(range(len(train_X) // size // 10)):
        optimizer.zero_grad()
        x, y = generate_batch(train_X, train_Y, size)
        pred_y = nplm(x)
        loss = criterion(pred_y, y)
        loss.backward()
        #print(loss)
        epoch_loss += loss.item()
        optimizer.step()
    #scheduler.step()
    validation_loss = evaluate(val_X[:10000], val_Y[:10000])
    print("Training Loss: {}, Validation Loss After epoch {}: {}".format(epoch_loss / (len(train_X)//size), i, validation_loss))

Validation Loss Before training: 10.815771451663972


HBox(children=(FloatProgress(value=0.0, max=1591.0), HTML(value='')))


Training Loss: 1.0728242405699149, Validation Loss After epoch 0: 10.716525092601776


HBox(children=(FloatProgress(value=0.0, max=1591.0), HTML(value='')))


Training Loss: 1.0722782241453053, Validation Loss After epoch 1: 10.716286082553864


HBox(children=(FloatProgress(value=0.0, max=1591.0), HTML(value='')))

In [135]:
def evaluate(val_X, val_Y):
    with torch.no_grad():
        validation_loss = 0
        for x, y in zip(val_X, val_Y):
            val_pred_y = nplm(x)
            loss = criterion(val_pred_y, y)
            validation_loss += loss.item()
    return validation_loss / len(val_X)