In [41]:
from typing import List
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
import pickle
import urllib
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandr.khvorov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def tokenize_to_words(text: str):
    """Split ``text`` into word-level tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def tokenize_to_sents(text: str):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(text)
    return sentences

def read_nips(path: str, documents_limit=None) -> List[List[str]]:
    """Read a gzip-compressed CSV of papers and tokenise it into sentences.

    Args:
        path: Location of the gzip-compressed, comma-separated CSV file. It
            must contain a ``paper_text`` column.
        documents_limit: If not ``None``, only the first ``documents_limit``
            rows are tokenised; ``None`` processes every row.

    Returns:
        A flat list with one entry per sentence (across all processed
        documents); each entry is the list of word tokens of that sentence.
    """
    df = pd.read_csv(path, compression='gzip', sep=',')
    docs = df['paper_text']
    # Apply the limit before any conversion so that only the documents that
    # are actually used get touched.
    if documents_limit is not None:
        docs = docs.iloc[:documents_limit]
    sents = []
    for doc in docs:
        # `np.str` was removed in NumPy 1.24; the builtin `str` is the direct
        # replacement and, like the old array cast, turns a missing value
        # (NaN) into the text 'nan' instead of failing in the tokenizer.
        text = str(doc)
        sents.extend(tokenize_to_words(s) for s in tokenize_to_sents(text))
    return sents

In [11]:
# Tokenise only the first 10 papers (documents_limit=10) into a flat list of
# word-token sentences.
data = read_nips("../resources/datasets/nips-papers.csv.gz", documents_limit=10)

In [12]:
# First tokenised sentence, followed by the number of sentences.
print(data[0])
print(len(data))
# Distinct tokens over every sentence of the corpus.
vocabulary = {w for s in data for w in s}
print(len(vocabulary))

['Boosting', 'Density', 'Estimation', 'Saharon', 'Rosset', 'Department', 'of', 'Statistics', 'Stanford', 'University', 'Stanford', ',', 'CA', ',', '94305', 'saharon', '@', 'stat.stanford.edu', 'Eran', 'Segal', 'Computer', 'Science', 'Department', 'Stanford', 'University', 'Stanford', ',', 'CA', ',', '94305', 'eran', '@', 'cs.stanford.edu', 'Abstract', 'Several', 'authors', 'have', 'suggested', 'viewing', 'boosting', 'as', 'a', 'gradient', 'descent', 'search', 'for', 'a', 'good', 'fit', 'in', 'function', 'space', '.']
2995
6032


In [17]:
def download_glove(directory):
    """Download the GloVe 6B archive into ``directory``.

    The directory is created if it does not exist and the archive is stored
    in it as ``glove.6B.zip``.

    Args:
        directory: Folder that should receive the downloaded zip file.

    Returns:
        The path of the downloaded archive.
    """
    import os
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly.
    import urllib.request

    os.makedirs(directory, exist_ok=True)
    # urlretrieve expects a file name as its second argument, not a folder.
    target = os.path.join(directory, "glove.6B.zip")
    urllib.request.urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", target)
    return target

def glove_embs(glove_path: str, vocab_size=400000, dim=50):
    """Load GloVe vectors from ``{glove_path}/glove.6B.{dim}d.txt``.

    Each non-empty line of the file is expected to hold a word followed by
    ``dim`` whitespace-separated numbers.

    Args:
        glove_path: Folder containing the ``glove.6B.<dim>d.txt`` files.
        vocab_size: Maximum number of rows to read from the start of the
            file. Files with fewer rows are loaded completely. ``None``
            disables the limit.
        dim: Embedding width; must be one of 50, 100, 200 or 300.

    Returns:
        A ``(words, word2idx, vectors)`` tuple: the list of words in file
        order, a dict mapping each word to its row index, and a float array
        of shape ``(len(words), dim)``.

    Raises:
        AssertionError: If ``dim`` is not one of the supported sizes.
        ValueError: If a row does not contain exactly ``dim`` numbers.
    """
    assert dim in {50, 100, 200, 300}
    words = []
    word2idx = {}
    rows = []
    with open(f'{glove_path}/glove.6B.{dim}d.txt', 'rb') as f:
        for raw in f:
            if vocab_size is not None and len(words) >= vocab_size:
                break
            parts = raw.decode('utf-8').split()
            if not parts:
                # Tolerate blank lines instead of failing on parts[0].
                continue
            word, values = parts[0], parts[1:]
            if len(values) != dim:
                raise ValueError(
                    f'expected {dim} values for {word!r}, found {len(values)}'
                )
            word2idx[word] = len(words)
            words.append(word)
            # `np.float` was removed in NumPy 1.24; the builtin float gives
            # the same float64 values.
            rows.append([float(v) for v in values])
    # reshape(-1, dim) keeps a well-formed (0, dim) array for an empty file
    # and no longer requires the file to have exactly `vocab_size` rows.
    vectors = np.array(rows, dtype=float).reshape(-1, dim)
    return words, word2idx, vectors
   
# Embedding width; glove_embs asserts that it is one of 50, 100, 200 or 300.
dim = 50
# Load the word list, the word -> row index map and the embedding matrix.
words, word2idx, vectors = glove_embs("../resources/models/glove.6B", dim=dim)    

In [20]:
# Placeholder written over every token that has no GloVe vector.
UNK = "UNK"
# Corpus tokens that also occur in the loaded GloVe word list.
vocab = vocabulary.intersection(words)
# NOTE(review): tokens are compared case-sensitively; glove.6B is, as far as
# I know, an uncased vocabulary, so capitalised tokens probably end up as
# UNK. Confirm whether lower-casing before the lookup is wanted.
for s in data:
    # Slice assignment rewrites each sentence list in place.
    s[:] = [w if w in vocab else UNK for w in s]

In [23]:
# Give the UNK placeholder its own embedding row: the mean of all loaded
# vectors, appended after the last existing row.
vocab.add(UNK)  # set.add is idempotent, so this is safe on every run
# Guard against re-running the cell: without it every execution would append
# one more UNK row to `vectors` and one more "UNK" entry to `words`.
if UNK not in word2idx:
    unk_vec = np.mean(vectors, axis=0)
    words.append(UNK)
    word2idx[UNK] = len(vectors)
    vectors = np.vstack((vectors, unk_vec.reshape(1, dim)))

In [43]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model on top of pretrained embeddings.

    The embeddings of the context words are concatenated, passed through a
    128-unit ReLU layer and projected to one score per vocabulary entry.

    Args:
        weights_matrix: Array of shape ``(vocab_size, embedding_dim)`` used
            to initialise the embedding layer via
            ``nn.Embedding.from_pretrained``.
        context_size: Number of context word indices per example.
    """

    def __init__(self, weights_matrix, context_size=2):
        super().__init__()
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix))
        vocab_size = weights_matrix.shape[0]
        embedding_dim = weights_matrix.shape[1]
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for the next word.

        Args:
            inputs: Long tensor of word indices, either a single context of
                shape ``(context_size,)`` or a batch of contexts of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch.
        """
        embeds = self.embeddings(inputs)
        if inputs.dim() <= 1:
            # Single context: keep the original behaviour of a batch of one.
            embeds = embeds.view(1, -1)
        else:
            # Batch of contexts: concatenate the embeddings of each row.
            embeds = embeds.view(inputs.shape[0], -1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [51]:
# Number of preceding words used to predict the next one.
CONTEXT_SIZE = 2
# One (context words, next word) pair for every position of every sentence
# that has a full context in front of it.
ngrams = [
    (sent[i:i + CONTEXT_SIZE], sent[i + CONTEXT_SIZE])
    for sent in data
    for i in range(len(sent) - CONTEXT_SIZE)
]

In [68]:
# Per-epoch loss history. It is only reset here; the training cell appends to
# it, so re-running that cell alone keeps extending this list.
losses = []
# Negative log-likelihood loss, applied to the log-softmax output of the model.
loss_function = nn.NLLLoss()
# Model built on the embedding matrix `vectors` with CONTEXT_SIZE context words.
model = NGramLanguageModeler(vectors, context_size=CONTEXT_SIZE)
# Plain SGD over the model parameters with learning rate 0.01.
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [70]:
# Train for 20 epochs on a fixed ten-example slice of the n-grams
# (ngrams[35:45]), one example per optimisation step.
# NOTE(review): `losses` and `model` are created in another cell, so running
# this cell again continues training and keeps extending `losses`.
for epoch in range(20):
    total_loss = 0
    for context, target in ngrams[35:45]:
        # Map the context words to their embedding row indices.
        context_idxs = torch.tensor([word2idx[w] for w in context], dtype=torch.long)
        # Clear gradients left over from the previous step.
        model.zero_grad()
        log_probs = model(context_idxs)
        # The target is wrapped in a one-element tensor to match the
        # single-row output of the model.
        loss = loss_function(log_probs, torch.tensor([word2idx[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        # .item() extracts a Python float so the sum is a plain number.
        total_loss += loss.item()
    losses.append(total_loss)
    print(epoch, total_loss)
print(losses)

0 73.68544435501099
1 60.876033306121826
2 49.5262188911438
3 37.369999408721924
4 27.38015604019165
5 20.479694843292236
6 15.609149932861328
7 11.868441581726074
8 9.150538444519043
9 7.463377952575684
10 6.3802995681762695
11 5.542942047119141
12 4.854039192199707
13 4.271095275878906
14 3.7719554901123047
15 3.347484588623047
16 2.9810791015625
17 2.6672191619873047
18 2.39827823638916
19 2.1705379486083984
[128.66961288452148, 126.67717933654785, 124.57047176361084, 122.18837261199951, 119.34571743011475, 115.8079195022583, 111.27924823760986, 105.3598518371582, 97.52157735824585, 87.12769889831543, 73.68544435501099, 60.876033306121826, 49.5262188911438, 37.369999408721924, 27.38015604019165, 20.479694843292236, 15.609149932861328, 11.868441581726074, 9.150538444519043, 7.463377952575684, 6.3802995681762695, 5.542942047119141, 4.854039192199707, 4.271095275878906, 3.7719554901123047, 3.347484588623047, 2.9810791015625, 2.6672191619873047, 2.39827823638916, 2.1705379486083984]


In [67]:
# Display the ten (context, target) pairs in the 35:45 slice, the same slice
# the training cell iterates over.
ngrams[35:45]

[(['authors', 'have'], 'suggested'),
 (['have', 'suggested'], 'viewing'),
 (['suggested', 'viewing'], 'boosting'),
 (['viewing', 'boosting'], 'as'),
 (['boosting', 'as'], 'a'),
 (['as', 'a'], 'gradient'),
 (['a', 'gradient'], 'descent'),
 (['gradient', 'descent'], 'search'),
 (['descent', 'search'], 'for'),
 (['search', 'for'], 'a')]

In [72]:
# Sanity check: print the index with the highest log-probability for the
# context ['suggested', 'viewing'], then the index of 'boosting' for
# comparison; the two numbers match when the model predicts 'boosting'.
print(model(torch.tensor([word2idx[w] for w in ['suggested', 'viewing']], dtype=torch.long)).argmax().item())
print(word2idx['boosting'])

7130
7130


In [None]:
class FCNN22(nn.Module):
    """Three-layer fully connected network.

    Maps inputs of width ``4 * dim`` through hidden layers of 120 and 84
    units (each followed by ReLU) to 10 output values.

    Args:
        dim: Base width; the first layer expects ``4 * dim`` input features.
    """

    def __init__(self, dim=50):
        super().__init__()
        in_features = 4 * dim
        self.fc1 = nn.Linear(in_features, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Apply the two ReLU hidden layers and the linear output layer to ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)