## Load Google's pretrained word2vec model 

In [1]:
"""
imports
"""
import numpy as np
import gensim                             # belongs to gensim package
from smart_open import open as smart_open # belongs to smart_open package

"""
load model, located in a 'models' folder
download it yourself, and do not put it in the github repo
(i.e. put it in your .gitignore file) because it's YUGE
download: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
"""
with smart_open('./models/GoogleNews-vectors-negative300.bin', 'rb') as word2vec_file:
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [2]:
"""
Putting model to use
notes:
 - model is made to produce 300D vectors
 - can't turn vectors into words
"""
model.get_vector("hi")
model.get_vector("Yagmur")

array([ 0.0612793 , -0.01312256, -0.02307129, -0.04736328,  0.02270508,
        0.12695312, -0.06396484, -0.22949219, -0.01116943,  0.04931641,
       -0.01977539, -0.06738281, -0.18457031,  0.02856445, -0.06445312,
       -0.07861328, -0.04321289, -0.08300781, -0.10009766, -0.08496094,
       -0.01409912,  0.12109375,  0.01855469,  0.03051758,  0.00610352,
       -0.11865234,  0.08105469, -0.01220703,  0.09716797, -0.16992188,
       -0.1171875 , -0.12011719,  0.10302734,  0.05249023, -0.01904297,
       -0.01696777, -0.11669922,  0.16503906, -0.10986328,  0.04785156,
        0.13769531, -0.00488281, -0.0534668 , -0.08789062,  0.12890625,
       -0.05029297, -0.12890625, -0.13769531, -0.03833008, -0.07226562,
       -0.10351562,  0.03930664,  0.07714844,  0.03686523,  0.0402832 ,
        0.05834961,  0.03442383,  0.17578125,  0.0324707 , -0.04882812,
        0.00497437, -0.078125  ,  0.02050781, -0.01116943,  0.14160156,
        0.00282288,  0.22851562, -0.05078125,  0.08642578, -0.02

## Load the arxiv data

In [3]:
import json
DATAPATH = "data/articles.json"

with open(DATAPATH, "r", encoding="utf-8") as f:
    articles = json.load(f)
    
all_categories = []
articles_dict = dict()
for article in articles:
    category = article["category"]
    if category not in all_categories:
        all_categories.append(category)
        articles_dict[category] = []
    articles_dict[category].append(article)
    
n_categories = len(articles_dict.keys())

## Turning abstracts into pytorch tensors

In [4]:
import re
text = "Hi David, I'm just testing this. Please pay attention to me."
y = lambda x: re.sub("[^\w'-]", " ",  x).split()
y(text)

['Hi',
 'David',
 "I'm",
 'just',
 'testing',
 'this',
 'Please',
 'pay',
 'attention',
 'to',
 'me']

In [28]:
import torch
import string
import unicodedata
import re

tokenize = lambda x: re.sub("[^\w'-]", " ",  x).split()


def lineToTensor(text):
    words = tokenize(text)
    vectors = []
    for word in words:
        try:
            vector = model.get_vector(word)
            vectors.append(vector)
        except:
            pass
        
    #create 3D tensor with the shape which is proper for the LSTM
    if not vectors:
        raise Exception("This line did not produce any embedded words!")
    
    return torch.tensor([[v] for v in vectors])

print(lineToTensor("wow look at this hot potato").shape)

torch.Size([6, 1, 300])


## Creating the RNN architecture

In [29]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()

        self.hidden_size = hidden_size
        
        self.i2h = nn.LSTM(input_size, hidden_size, 1)
        
        
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input):
        hidden , (h_t, c_t) = self.i2h(input)
        output = self.h2o(hidden[-1])
        output = self.softmax(output)
        return output
    
rnn=LSTM(300, 300, n_categories)

## Preparing for training

In [30]:
def categoryFromOutput(out):
    top_n, top_i = out.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

print(categoryFromOutput(rnn(lineToTensor("some random line here"))))

('Electrical Engineering and Systems Science', 3)


In [31]:
from random import choice

def randomTrainingExample():
    category = choice(all_categories)
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    article = choice(articles_dict[category])
    abstract = article["abstract"]
    abstract_tensor = lineToTensor(abstract)
    return category, abstract, category_tensor, abstract_tensor

randomTrainingExample()

('Statistics',
 'In this study, we consider the problem of selecting explanatory variables of\nfixed effects in linear mixed models under covariate shift, which is when the\nvalues of covariates in the model for prediction differ from those in the model\nfor observed data. We construct a variable selection criterion based on the\nconditional Akaike information introduced by Vaida and Blanchard (2005). We\nfocus especially on covariate shift in small area estimation and demonstrate\nthe usefulness of the proposed criterion. In addition, numerical performance is\ninvestigated through simulations, one of which is a design-based simulation\nusing a real dataset of land prices.',
 tensor([7]),
 tensor([[[ 0.0322,  0.1221,  0.2256,  ...,  0.0493, -0.0518, -0.1245]],
 
         [[ 0.1094,  0.1406, -0.0317,  ...,  0.0077,  0.1201, -0.1797]],
 
         [[-0.0598, -0.0422, -0.0791,  ..., -0.1123,  0.1206, -0.1543]],
 
         ...,
 
         [[ 0.0247, -0.0110,  0.3535,  ..., -0.3008,  0.0317,

## Training the network

In [None]:
import torch.optim as optim

learning_rate = 0.005

criterion = nn.NLLLoss()
optimizer = optim.SGD(rnn.parameters(), lr = learning_rate, momentum = 0.9)

def train(category_tensor, abstract_tensor):
    optimizer.zero_grad()
    
    output = rnn(abstract_tensor)
            
    loss = criterion(output, category_tensor)
    loss.backward()
    optimizer.step()
        
    return output, loss.item()

In [None]:
import time

n_iters = 100000
print_every = 100
plot_every = 10

# keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = s//60
    s -= m*60
    return "{}m {}s".format(m, s)

start = time.time()

for it in range(1, n_iters+1):
    try:
        category, line, category_tensor, line_tensor = randomTrainingExample()
    except:
        continue
        
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss
    
    if not (it % print_every):
        guess, guess_i = categoryFromOutput(output)
        correct = "V" if guess == category else "X {}".format(category)
        print("{} {}% ({}) {:.4} / {} {}".format(it, it/n_iters*100, timeSince(start), loss, guess, correct))
        
    if not (it % plot_every):
        all_losses.append(current_loss / plot_every)
        current_loss = 0

100 0.1% (0.0m 21.116595029830933s) 2.073 / Computer Science X Quantitative Finance
200 0.2% (0.0m 41.806169509887695s) 2.114 / Quantitative Finance X Physics
300 0.3% (0.0m 59.503528356552124s) 1.976 / Physics X Electrical Engineering and Systems Science
400 0.4% (1.0m 17.612581968307495s) 2.077 / Quantitative Biology X Physics
500 0.5% (1.0m 34.4959933757782s) 1.967 / Computer Science V
600 0.6% (1.0m 51.91174864768982s) 2.176 / Electrical Engineering and Systems Science X Quantitative Finance
700 0.7000000000000001% (2.0m 9.275498390197754s) 2.232 / Economics X Computer Science
800 0.8% (2.0m 28.44747042655945s) 2.031 / Computer Science X Electrical Engineering and Systems Science
900 0.8999999999999999% (2.0m 48.95324516296387s) 2.112 / Computer Science X Mathematics
1000 1.0% (3.0m 5.708708763122559s) 2.312 / Mathematics X Statistics
1100 1.0999999999999999% (3.0m 30.709675073623657s) 2.058 / Quantitative Finance X Computer Science
1200 1.2% (3.0m 56.65896511077881s) 2.13 / Quanti

10200 10.2% (30.0m 57.08872175216675s) 1.096 / Electrical Engineering and Systems Science V
10300 10.299999999999999% (31.0m 14.055094480514526s) 1.608 / Physics V
10400 10.4% (31.0m 30.89304542541504s) 2.224 / Physics X Quantitative Biology
10500 10.5% (31.0m 49.58370327949524s) 1.412 / Economics V
10600 10.6% (32.0m 7.560282468795776s) 0.8005 / Quantitative Biology V
10700 10.7% (32.0m 24.49213719367981s) 0.7175 / Quantitative Biology V
10800 10.8% (32.0m 42.86898875236511s) 1.224 / Computer Science V
10900 10.9% (33.0m 0.8742561340332031s) 0.5153 / Quantitative Finance V
11000 11.0% (33.0m 18.770277738571167s) 0.7196 / Quantitative Finance V
11100 11.1% (33.0m 35.92694664001465s) 1.632 / Electrical Engineering and Systems Science V
11200 11.200000000000001% (33.0m 51.769145250320435s) 1.97 / Economics X Statistics
11300 11.3% (34.0m 9.047258377075195s) 5.303 / Quantitative Finance X Quantitative Biology
11400 11.4% (34.0m 25.901731967926025s) 2.636 / Economics X Statistics
11500 11.

20100 20.1% (61.0m 30.834396839141846s) 0.9691 / Physics V
20200 20.200000000000003% (61.0m 46.63622975349426s) 1.34 / Electrical Engineering and Systems Science V
20300 20.3% (62.0m 14.463703393936157s) 0.111 / Quantitative Biology V
20400 20.4% (62.0m 42.38825964927673s) 1.363 / Electrical Engineering and Systems Science X Computer Science
20500 20.5% (63.0m 15.064484596252441s) 2.094 / Computer Science X Statistics
20600 20.599999999999998% (63.0m 36.62802696228027s) 2.736 / Computer Science X Quantitative Biology
20700 20.7% (64.0m 0.020036697387695312s) 1.392 / Quantitative Biology V
20800 20.8% (64.0m 21.236376523971558s) 2.337 / Economics X Quantitative Biology
20900 20.9% (64.0m 55.33199453353882s) 1.738 / Statistics X Economics
21000 21.0% (65.0m 24.49144220352173s) 0.8905 / Economics V
21100 21.099999999999998% (66.0m 0.7660858631134033s) 0.742 / Statistics V
21200 21.2% (66.0m 18.331053018569946s) 1.275 / Quantitative Finance V
21300 21.3% (66.0m 35.050235748291016s) 1.716 /

In [None]:
import matplotlib.pyplot as plt

plt.figure()
plt.plot(all_losses)