## Load Google's pretrained word2vec model 

In [1]:
"""
imports
"""
import numpy as np
import gensim                             # belongs to gensim package
from smart_open import open as smart_open # belongs to smart_open package

"""
load model, located in a 'models' folder
download it yourself, and do not put it in the github repo
(i.e. put it in your .gitignore file) because it's YUGE
download: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
"""
# Open the pretrained word2vec binary and parse it into gensim KeyedVectors.
with smart_open(
    './models/GoogleNews-vectors-negative300.bin',
    'rb',
) as word2vec_file:
    model = gensim.models.KeyedVectors.load_word2vec_format(
        word2vec_file,
        binary=True,
    )

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [2]:
"""
Putting model to use
notes:
 - model is made to produce 300D vectors
 - can't turn vectors into words
"""
# Look up the embedding of a common word. This is not the cell's last
# expression, so its value is computed but not displayed.
model.get_vector("hi")
# Look up a proper name; as the last expression of the cell, this is the
# vector that gets rendered as the cell output.
model.get_vector("Yagmur")

array([ 0.0612793 , -0.01312256, -0.02307129, -0.04736328,  0.02270508,
        0.12695312, -0.06396484, -0.22949219, -0.01116943,  0.04931641,
       -0.01977539, -0.06738281, -0.18457031,  0.02856445, -0.06445312,
       -0.07861328, -0.04321289, -0.08300781, -0.10009766, -0.08496094,
       -0.01409912,  0.12109375,  0.01855469,  0.03051758,  0.00610352,
       -0.11865234,  0.08105469, -0.01220703,  0.09716797, -0.16992188,
       -0.1171875 , -0.12011719,  0.10302734,  0.05249023, -0.01904297,
       -0.01696777, -0.11669922,  0.16503906, -0.10986328,  0.04785156,
        0.13769531, -0.00488281, -0.0534668 , -0.08789062,  0.12890625,
       -0.05029297, -0.12890625, -0.13769531, -0.03833008, -0.07226562,
       -0.10351562,  0.03930664,  0.07714844,  0.03686523,  0.0402832 ,
        0.05834961,  0.03442383,  0.17578125,  0.0324707 , -0.04882812,
        0.00497437, -0.078125  ,  0.02050781, -0.01116943,  0.14160156,
        0.00282288,  0.22851562, -0.05078125,  0.08642578, -0.02

## Load the arXiv data

In [4]:
import json
DATAPATH = "data/articles.json"

# Read the complete list of article records from the JSON dump.
with open(DATAPATH, "r", encoding="utf-8") as f:
    articles = json.load(f)

# Group the articles by their "category" field.
#  - all_categories keeps the categories in order of first appearance
#  - articles_dict maps each category to the list of its articles
all_categories = []
articles_dict = {}
for article in articles:
    category = article["category"]
    # Both containers are filled in lockstep, so a key lookup in the dict
    # answers "has this category been seen before?".
    if category not in articles_dict:
        all_categories.append(category)
        articles_dict[category] = []
    articles_dict[category].append(article)

# Number of distinct categories found in the data.
n_categories = len(articles_dict)

## Turning abstracts into PyTorch tensors

In [None]:
import re
text = "Hi David, I'm just testing this. Please pay attention to me."


def y(x):
    """Split ``x`` into tokens made of word characters, apostrophes and hyphens.

    Every other character is replaced by a space before splitting on
    whitespace, so punctuation such as commas and periods is dropped.

    Args:
        x: the string to split.

    Returns:
        A list of token strings (empty for an empty or all-punctuation input).
    """
    # Raw string literal: in a normal string "\w" is an invalid escape
    # sequence, which Python reports with a warning.
    return re.sub(r"[^\w'-]", " ", x).split()


y(text)

In [7]:
import torch
import string
import unicodedata
import re

def tokenize(x):
    """Split ``x`` into tokens made of word characters, apostrophes and hyphens.

    Every other character is replaced by a space before splitting on
    whitespace, so punctuation such as commas and periods is dropped.

    Args:
        x: the string to split.

    Returns:
        A list of token strings (empty for an empty or all-punctuation input).
    """
    # Raw string literal: in a normal string "\w" is an invalid escape
    # sequence, which Python reports with a warning.
    return re.sub(r"[^\w'-]", " ", x).split()


def lineToTensor(text):
    """Convert a line of text into a tensor of word2vec embeddings.

    The text is split with ``tokenize`` and every token is looked up in the
    global word2vec ``model``. Tokens that are missing from the model's
    vocabulary are skipped.

    Args:
        text: the raw string to embed.

    Returns:
        A float tensor of shape (n_known_words, 1, vector_size): one
        embedding per known word with a singleton batch axis, the layout
        fed to the LSTM. If no token is known to the model, the result is
        an empty tensor of shape (0, 1, vector_size), so the rank is always 3.
    """
    vectors = []
    for word in tokenize(text):
        try:
            vectors.append(model.get_vector(word))
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other error is a real
            # problem and is allowed to propagate.
            continue

    if not vectors:
        return torch.empty(0, 1, model.vector_size)

    # Stack into one contiguous array first; building a tensor from a nested
    # list of ndarrays is very slow. unsqueeze(1) inserts the batch axis.
    return torch.from_numpy(np.stack(vectors)).unsqueeze(1)

print (lineToTensor("wow look at this hot potato").shape)

torch.Size([6, 1, 300])


## Creating the RNN architecture

In [None]:
import torch.nn as nn

class LSTM(nn.Module):
    """Sequence classifier: a single-layer LSTM followed by a linear read-out.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        output_size: number of classes scored by the read-out layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size

        # input -> hidden: one recurrent layer
        self.i2h = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=1)

        # hidden -> output: class scores, normalised to log-probabilities
        self.h2o = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input):
        """Return log-probabilities computed from the last time step.

        Args:
            input: tensor passed straight to the LSTM layer.

        Returns:
            Log-softmax (over dim 1) of the linear read-out applied to the
            final entry of the LSTM's output sequence.
        """
        # The LSTM also returns its final (hidden, cell) state; it is not used.
        step_outputs, _final_state = self.i2h(input)
        last_step = step_outputs[-1]
        return self.softmax(self.h2o(last_step))
    
# 300 input features (the word-vector width), 300 hidden units, and one
# output score per article category.
rnn = LSTM(input_size=300, hidden_size=300, output_size=n_categories)

## Preparing for training

In [None]:
def categoryFromOutput(out):
    """Translate a network output into its highest-scoring category.

    Args:
        out: output tensor of the network; the single best entry is taken
            with ``topk(1)`` and the first row is read.

    Returns:
        A ``(category, index)`` tuple: the entry of ``all_categories`` at the
        winning index, and that index as a Python number.
    """
    # topk returns (values, indices); only the indices are needed.
    _, best_index = out.topk(1)
    category_i = best_index[0].item()
    return all_categories[category_i], category_i

print(categoryFromOutput(rnn(lineToTensor("some random line here"))))

In [None]:
from random import choice

def randomTrainingExample():
    """Draw one random (category, abstract) training pair.

    A category is chosen uniformly from ``all_categories``, then an article
    is chosen uniformly from that category's list in ``articles_dict``.

    Returns:
        A tuple ``(category, abstract, category_tensor, abstract_tensor)``:
        the category, the article's "abstract" field, a one-element long
        tensor holding the category's index in ``all_categories``, and the
        abstract converted by ``lineToTensor``.
    """
    category = choice(all_categories)
    abstract = choice(articles_dict[category])["abstract"]

    label_index = all_categories.index(category)
    category_tensor = torch.tensor([label_index], dtype=torch.long)

    return category, abstract, category_tensor, lineToTensor(abstract)

randomTrainingExample()

## Training the network

In [None]:
import torch.optim as optim

# Step size used by the optimiser.
learning_rate = 0.01

# Negative log-likelihood loss, paired with the network's LogSoftmax output.
criterion = nn.NLLLoss()

# Stochastic gradient descent with momentum over all network parameters.
optimizer = optim.SGD(
    params=rnn.parameters(),
    lr=learning_rate,
    momentum=0.9,
)

def train(category_tensor, abstract_tensor):
    """Run a single optimisation step on one training example.

    Args:
        category_tensor: target passed to ``criterion`` alongside the output.
        abstract_tensor: input passed to the network ``rnn``.

    Returns:
        A tuple ``(output, loss)``: the network output for the example and
        the loss as a Python float.
    """
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    prediction = rnn(abstract_tensor)
    step_loss = criterion(prediction, category_tensor)

    # Backpropagate, then update the parameters.
    step_loss.backward()
    optimizer.step()

    return prediction, step_loss.item()

In [None]:
import time

# Training schedule: total number of steps, and how often (in steps) to
# print progress and to record a point for the loss plot.
n_iters = 10 ** 6
print_every, plot_every = 100, 10

# Loss bookkeeping for plotting: the recorded averages, and the running
# sum accumulated since the last recorded point.
all_losses = list()
current_loss = 0

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as minutes and seconds.

    Args:
        since: a start timestamp as returned by ``time.time()``.

    Returns:
        A string such as ``"2m 5s"``, with whole minutes and whole seconds.
    """
    # Truncate to whole seconds first so neither field is rendered as a
    # float (previously e.g. "2.0m 5.500123s").
    elapsed = int(time.time() - since)
    minutes, seconds = divmod(elapsed, 60)
    return "{}m {}s".format(minutes, seconds)

# Reference point for the elapsed-time column of the progress log.
start = time.time()

for it in range(1, n_iters + 1):
    # One optimisation step on a randomly drawn example.
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Progress report: step, percent done, elapsed time, loss, the guess,
    # and whether it was right ("V") or the true category ("X ...").
    if it % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        if guess == category:
            correct = "V"
        else:
            correct = "X {}".format(category)
        print("{} {}% ({}) {:.4} / {} {}".format(it, it/n_iters*100, timeSince(start), loss, guess, correct))

    # Record the average loss of the last `plot_every` steps, then reset
    # the running sum.
    if it % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded training loss. Each point in `all_losses` is the loss
# averaged over `plot_every` consecutive training steps.
fig, ax = plt.subplots()
ax.plot(all_losses)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    title="Training loss",
    xlabel="Recorded point (one per {} iterations)".format(plot_every),
    ylabel="Average NLL loss",
)
# Render the figure without echoing the list-of-lines repr as cell output.
plt.show()