In [39]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import spacy
from spacy.lang.en.examples import sentences

In [45]:
def tokenize_to_words(text: str):
    """Split ``text`` into word-level tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def tokenize_to_sents(text: str):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(text)
    return sentences

def read_nips(path: str, documents_limit=None):
    """Load the NIPS papers CSV and return its text as tokenized sentences.

    Args:
        path: Path to a gzip-compressed, comma-separated CSV file that has a
            ``paper_text`` column.
        documents_limit: If not ``None``, only the first ``documents_limit``
            documents (taken after shuffling) are tokenized.

    Returns:
        A flat list of sentences, each sentence being a list of word tokens.
    """
    df = pd.read_csv(path, compression='gzip', sep=',')
    # ``np.str`` was only an alias of the builtin ``str`` and has been removed
    # from NumPy; the builtin produces the same string dtype.
    docs = df['paper_text'].values.astype(str)
    # Fixed seed: the document order (and therefore which documents survive
    # ``documents_limit``) is the same on every run.
    rng = np.random.RandomState(13)
    rng.shuffle(docs)
    selected = docs if documents_limit is None else docs[:documents_limit]
    sents = []
    for doc in selected:
        sents.extend(tokenize_to_words(s) for s in tokenize_to_sents(doc))
    return sents

In [51]:
# Tokenize the whole corpus; documents_limit=None keeps every document.
data = read_nips(
    path="../resources/datasets/nips-papers.csv.gz",
    documents_limit=None,
)

In [52]:
# Peek at the first tokenized sentence and report how many sentences there are.
print(data[0])
print(len(data))
# Distinct tokens across every sentence of the corpus.
vocabulary = {token for sentence in data for token in sentence}
print(len(vocabulary))

['Boosting', 'Density', 'Estimation', 'Saharon', 'Rosset', 'Department', 'of', 'Statistics', 'Stanford', 'University', 'Stanford', ',', 'CA', ',', '94305', 'saharon', '@', 'stat.stanford.edu', 'Eran', 'Segal', 'Computer', 'Science', 'Department', 'Stanford', 'University', 'Stanford', ',', 'CA', ',', '94305', 'eran', '@', 'cs.stanford.edu', 'Abstract', 'Several', 'authors', 'have', 'suggested', 'viewing', 'boosting', 'as', 'a', 'gradient', 'descent', 'search', 'for', 'a', 'good', 'fit', 'in', 'function', 'space', '.']
2895745
518761


In [59]:
def save_glove_embs(glove_path: str, vocab_size=400001, dim=50):
    """Parse GloVe 6B vectors of dimensionality ``dim`` from their text file.

    Despite the name, nothing is written to disk: the file is read and the
    parsed contents are returned.

    Args:
        glove_path: Directory that contains ``glove.6B.<dim>d.txt``.
        vocab_size: Kept only for backward compatibility. The number of rows
            is taken from the file itself, so this value is not enforced.
        dim: Dimensionality of the vectors; selects the file to read and is
            validated against every line.

    Returns:
        A tuple ``(words, word2idx, vectors)``: the words in file order, a
        mapping from word to its row index, and a float array of shape
        ``(len(words), dim)``.

    Raises:
        ValueError: If a line does not carry exactly ``dim`` components.
    """
    words = []
    word2idx = {}
    vectors = []
    # The file name follows ``dim`` instead of being pinned to the 50d file.
    with open(f'{glove_path}/glove.6B.{dim}d.txt', 'rb') as f:
        for line_no, raw in enumerate(f, start=1):
            parts = raw.decode('utf-8').split()
            if not parts:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word, components = parts[0], parts[1:]
            if len(components) != dim:
                raise ValueError(
                    f'line {line_no}: expected {dim} vector components, '
                    f'found {len(components)}'
                )
            word2idx[word] = len(words)
            words.append(word)
            # ``np.float`` was an alias of the builtin ``float`` and has been
            # removed from NumPy.
            vectors.append(np.array(components).astype(float))
    # Shape comes from what was actually read, not from a hard-coded row count.
    vectors = np.array(vectors, dtype=float).reshape((len(words), dim))
    return words, word2idx, vectors
   
# Dimensionality of the GloVe vectors used by the following cells.
dim = 50
words, word2idx, vectors = save_glove_embs(
    glove_path="../resources/models/glove.6B",
    dim=dim,
)

In [63]:
# Placeholder substituted for every token that has no GloVe vector.
UNK = "UNK"
# Corpus tokens that also appear in GloVe, plus the placeholder itself.
vocab = vocabulary.intersection(set(words))
vocab.add(UNK)
# Rewrite each sentence in place, swapping out-of-vocabulary tokens for UNK.
for sentence in data:
    for pos, token in enumerate(sentence):
        if token not in vocab:
            sentence[pos] = UNK

In [10]:
# Lookup table from each GloVe word to its embedding row.
glove = dict((token, vectors[word2idx[token]]) for token in words)

In [None]:
# Build the embedding matrix for the notebook vocabulary: one row per word.
# Sorting gives every word a stable row index; iterating the raw set of
# strings would change the row order from one interpreter run to the next.
vocab_words = sorted(vocab)
vocab2idx = {word: row for row, word in enumerate(vocab_words)}
weights_matrix = np.zeros((len(vocab_words), dim))
# Counts the vocabulary words that have a pretrained vector; it has to exist
# before the loop increments it.
words_found = 0
# Seeded so the randomly initialised rows are reproducible.
init_rng = np.random.RandomState(13)
for i, word in enumerate(vocab_words):
    try:
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        # No pretrained vector for this word (e.g. the UNK placeholder):
        # fall back to a random vector of the embedding dimensionality.
        weights_matrix[i] = init_rng.normal(scale=0.6, size=(dim, ))

In [None]:
class FCNN22(nn.Module):
    """Three-layer fully connected network with ReLU activations.

    The network maps an input of width ``4 * dim`` through hidden layers of
    120 and 84 units to 10 outputs. No activation is applied to the output.

    NOTE(review): the ``4 * dim`` input presumably is four concatenated word
    embeddings of size ``dim`` -- confirm against the training code.
    """

    def __init__(self, dim=50):
        """Create the layers.

        Args:
            dim: Size of one embedding; the first layer expects ``4 * dim``
                input features.
        """
        super(FCNN22, self).__init__()
        self.fc1 = nn.Linear(4 * dim, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run ``x`` (last dimension ``4 * dim``) through the three layers."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [14]:
# Load the spaCy pipeline named 'en_core_web_md' and keep it for later cells.
spacy_md = spacy.load('en_core_web_md')

In [None]:
# Fit a bigram maximum-likelihood language model.
# ``nltk.model`` no longer exists in NLTK 3; ``nltk.lm`` is its replacement
# and provides the ``MLE`` class used below.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

NGRAM_ORDER = 2
# NOTE(review): training on the tokenized sentences in ``data`` is assumed to
# be the intent -- the original call passed no training text at all.
# The pipeline yields padded n-grams per sentence plus the flat padded token
# stream from which the model builds its vocabulary.
train_ngrams, padded_vocab = padded_everygram_pipeline(NGRAM_ORDER, data)
lm = MLE(NGRAM_ORDER)
lm.fit(train_ngrams, padded_vocab)