# Loading Glove Vectors in Pytorch

In [1]:
import os

import numpy as np
import torch
from torch import nn
from torch.autograd import Variable

from datasets import SubjObjDataset
from vectorizers import IndexVectorizer

## Set Paths

In [25]:
# Root folder that holds the downloaded data. It defaults to ~/data; set the
# DATA_DIR environment variable to point elsewhere instead of editing the notebook.
data_dir = os.environ.get('DATA_DIR', os.path.expanduser('~/data'))
glove_path = os.path.join(data_dir, 'glove', 'glove.6B.50d.txt')  # pretrained GloVe vectors
data_path = os.path.join(data_dir, 'SUBJDATA', 'train.csv')       # subj/obj training set

## Glove helper functions
`load_glove` reads the GloVe text file line by line and creates a dictionary mapping words to vectors. For `glove.6B.50d.txt` this dictionary has 400k words, each mapped to a 50-dimensional vector. We can use it to check the values of our PyTorch embedding layer. When we use GloVe to initialize PyTorch embedding layers, we load only the words in our corpus vocabulary rather than the full 400k. For my corpus, I only needed 19k vectors.

`load_glove_embeddings` takes a dictionary mapping words to indexes (must be computed from your training corpus) and returns a matrix of embeddings which we can use to initialize a Pytorch embedding layer.

In [3]:
def load_glove(path):
    """
    Create a dictionary mapping words to vectors from a file in GloVe text format.

    Every non-empty line is split on whitespace: the first field is the word,
    the remaining fields are the components of its vector. If a word occurs on
    more than one line, the last occurrence wins.

    Parameters
    ----------
    path : str
        Location of the GloVe text file (e.g. ``glove.6B.50d.txt``).

    Returns
    -------
    dict
        Maps each word (str) to a 1-D ``float32`` numpy array.
    """
    glove = {}
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(path, encoding='utf-8') as f:
        # Iterate over the file lazily instead of holding every line in memory
        # at once with readlines().
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            glove[values[0]] = np.array(values[1:], dtype='float32')
    return glove

In [6]:
def load_glove_embeddings(path, word2idx, embedding_dim=50):
    """
    Build an embedding matrix for a vocabulary from a file in GloVe text format.

    Row ``i`` of the result holds the GloVe vector of the word that
    ``word2idx`` maps to ``i``. Vocabulary words that do not occur in the file
    keep an all-zero row.

    Parameters
    ----------
    path : str
        Location of the GloVe text file.
    word2idx : dict
        Maps each vocabulary word to its integer row index. The matrix has
        ``len(word2idx)`` rows, so indexes must be valid rows of that matrix.
    embedding_dim : int, optional
        Number of columns of the matrix; must match the vector length stored
        in the file.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(len(word2idx), embedding_dim)``. It is built from a
        default-dtype (float64) numpy array, so it is a double tensor.

    Notes
    -----
    The word at index 0 is loaded like any other word. If index 0 is reserved
    for padding, make sure the padding token is not itself a word in the GloVe
    file, otherwise its row will not be zero.
    """
    embeddings = np.zeros((len(word2idx), embedding_dim))
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(path, encoding='utf-8') as f:
        # Stream the file; only the rows for words in the vocabulary are kept.
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            index = word2idx.get(values[0])
            # Test against None, not truthiness: 0 is a valid row index but is falsy.
            if index is not None:
                embeddings[index] = np.array(values[1:], dtype='float32')
    return torch.from_numpy(embeddings)

In [None]:
%time glove = load_glove(glove_path)

In [26]:
len(glove)

400000

In [19]:
glove['cat']

array([ 0.45280999, -0.50107998, -0.53714001, -0.015697  ,  0.22191   ,
        0.54601997, -0.67300999, -0.68910003,  0.63493001, -0.19726001,
        0.33684999,  0.77350003,  0.90094   ,  0.38488001,  0.38367   ,
        0.26570001, -0.08057   ,  0.61088997, -1.28939998, -0.22313   ,
       -0.61578   ,  0.21697   ,  0.35613999,  0.44499001,  0.60885   ,
       -1.16330004, -1.15789998,  0.36118001,  0.10466   , -0.78324997,
        1.43519998,  0.18629   , -0.26111999,  0.83275002, -0.23123001,
        0.32481   ,  0.14485   , -0.44552001,  0.33497   , -0.95946002,
       -0.097479  ,  0.48137999, -0.43351999,  0.69454998,  0.91043001,
       -0.28173   ,  0.41637   , -1.26090002,  0.71278   ,  0.23782   ], dtype=float32)

## Toy example

In [31]:
corpus = 'the cat slept on the mat .'
# Distinct whitespace-separated tokens: 6 of them, since 'the' occurs twice.
vocab = set(corpus.split())
# Number the words in the set's iteration order. A set has no guaranteed order,
# so the index given to a particular word may differ from one run to the next.
word2idx = dict(zip(vocab, range(len(vocab))))

In [36]:
word2idx

{'.': 5, 'cat': 2, 'mat': 0, 'on': 4, 'slept': 3, 'the': 1}

In [33]:
toy_embeddings = load_glove_embeddings(glove_path, word2idx)

In [39]:
toy_embeddings # 6 words x 50 embedding dimensions
toy_embeddings.size()

torch.Size([6, 50])

### Init pytorch layer

In [20]:
# Create a layer with one row per word and one column per embedding dimension,
# taking both sizes straight from the pretrained matrix, then replace the
# layer's initial weights with the GloVe vectors.
toy_embedding = nn.Embedding(*toy_embeddings.size())
toy_embedding.weight = nn.Parameter(toy_embeddings)

In [21]:
toy_embedding

Embedding(6, 50)

Get the embedding for 'cat' (index 2 in the `word2idx` mapping printed above)

In [40]:
# Look the row up through word2idx instead of hard-coding it: the indexes come
# from iterating a set, so 'cat' is not guaranteed to land on the same row on
# every run.
toy_embedding(Variable(torch.LongTensor([word2idx['cat']])))

Variable containing:

Columns 0 to 9 
 0.4528 -0.5011 -0.5371 -0.0157  0.2219  0.5460 -0.6730 -0.6891  0.6349 -0.1973

Columns 10 to 19 
 0.3368  0.7735  0.9009  0.3849  0.3837  0.2657 -0.0806  0.6109 -1.2894 -0.2231

Columns 20 to 29 
-0.6158  0.2170  0.3561  0.4450  0.6089 -1.1633 -1.1579  0.3612  0.1047 -0.7832

Columns 30 to 39 
 1.4352  0.1863 -0.2611  0.8328 -0.2312  0.3248  0.1449 -0.4455  0.3350 -0.9595

Columns 40 to 49 
-0.0975  0.4814 -0.4335  0.6945  0.9104 -0.2817  0.4164 -1.2609  0.7128  0.2378
[torch.DoubleTensor of size 1x50]

check against glove vector

In [41]:
glove.get('cat')

array([ 0.45280999, -0.50107998, -0.53714001, -0.015697  ,  0.22191   ,
        0.54601997, -0.67300999, -0.68910003,  0.63493001, -0.19726001,
        0.33684999,  0.77350003,  0.90094   ,  0.38488001,  0.38367   ,
        0.26570001, -0.08057   ,  0.61088997, -1.28939998, -0.22313   ,
       -0.61578   ,  0.21697   ,  0.35613999,  0.44499001,  0.60885   ,
       -1.16330004, -1.15789998,  0.36118001,  0.10466   , -0.78324997,
        1.43519998,  0.18629   , -0.26111999,  0.83275002, -0.23123001,
        0.32481   ,  0.14485   , -0.44552001,  0.33497   , -0.95946002,
       -0.097479  ,  0.48137999, -0.43351999,  0.69454998,  0.91043001,
       -0.28173   ,  0.41637   , -1.26090002,  0.71278   ,  0.23782   ], dtype=float32)

### optionally freeze embeddings

In [64]:
# Turn off gradient tracking for the embedding matrix so the pretrained
# vectors are not updated during training.
toy_embedding.weight.requires_grad = False

## With the full-sized 2000-document subj/obj dataset

In [49]:
# Build the training dataset from the CSV at data_path. IndexVectorizer and
# SubjObjDataset are project-local helpers whose implementation is not shown here.
vectorizer = IndexVectorizer()
train = SubjObjDataset(data_path, vectorizer)
# NOTE: this rebinds word2idx, replacing the toy mapping defined earlier in the notebook.
# Presumably a word -> index dict built from the corpus; confirm in vectorizers.py.
word2idx = train.vectorizer.word2idx

In [59]:
word2idx['fast'], word2idx['sloth']

(799, 4623)

In [53]:
embeddings = load_glove_embeddings(glove_path, word2idx)

In [55]:
# Size the layer from the pretrained matrix (rows = vocabulary size,
# columns = embedding dimension) and mark index 0 as the padding index.
embedding = nn.Embedding(*embeddings.size(), padding_idx=0)
# Replace the layer's initial weights with the GloVe matrix.
# NOTE(review): after this assignment row 0 holds whatever `embeddings[0]`
# contains; confirm that index 0 really is the padding token and that its row
# is all zeros.
embedding.weight = nn.Parameter(embeddings)

In [60]:
# Indexes of 'fast' and 'sloth', looked up rather than copied from an earlier
# output so the cell stays correct if the vectorizer assigns different indexes.
sentence = Variable(torch.LongTensor([word2idx['fast'], word2idx['sloth']]))

In [61]:
embedding(sentence)

Variable containing:

Columns 0 to 9 
-0.2078 -0.8048  0.1001 -0.2891 -0.0268 -0.2786 -0.7156 -0.1995  0.8832  0.4722
 0.4544  0.0864 -0.7278  0.2280  0.7979  0.7398  0.4147 -1.0603 -0.4207 -0.3668

Columns 10 to 19 
-0.0274  0.1770 -0.6217  0.4158 -0.1281  0.4110  0.4076 -0.0434 -0.5830 -0.9055
 0.1635  1.1258 -0.0999 -0.4772  0.2275  0.3376  0.8762  0.8648 -0.8443  0.3187

Columns 20 to 29 
 0.0880  0.1283 -0.1299  0.5983  1.0866 -1.0565  0.4097 -0.1253  1.1902  0.5631
-1.3000 -0.8056 -0.0405 -0.4911 -0.3854  0.1839 -0.4309  0.8587  0.3448 -0.3127

Columns 30 to 39 
 3.2721  0.3893  0.3180  0.5927  0.3992 -0.1777 -0.0345  0.7122 -0.6254  0.1130
-0.9190  0.5490  0.6777  0.5153 -0.6939  0.6829 -0.9493 -1.2356  0.5153 -0.2031

Columns 40 to 49 
-0.3679  0.3658 -0.2309  0.3579  0.6080  0.3424  0.4804 -0.5778  0.1787  0.5947
-0.0951 -0.2065 -0.4152  0.9467  0.9807  0.0654  0.0235  0.5811  0.1710 -1.0500
[torch.DoubleTensor of size 2x50]

In [62]:
glove.get('fast')

array([-0.20784   , -0.80484998,  0.10014   , -0.28913   , -0.02678   ,
       -0.27864   , -0.71560001, -0.19953001,  0.88322002,  0.47222999,
       -0.027356  ,  0.17704   , -0.62175   ,  0.41580999, -0.12808   ,
        0.41097   ,  0.40763   , -0.043396  , -0.58298999, -0.90552002,
        0.087957  ,  0.12834001, -0.12993   ,  0.59827   ,  1.08659995,
       -1.05649996,  0.40970001, -0.12531   ,  1.19019997,  0.56308001,
        3.27209997,  0.38927999,  0.31797999,  0.59271997,  0.39923999,
       -0.17766   , -0.034477  ,  0.71217   , -0.62542999,  0.11305   ,
       -0.36787999,  0.36579001, -0.23086999,  0.35788   ,  0.60797   ,
        0.34241   ,  0.4804    , -0.57778001,  0.17871   ,  0.59465998], dtype=float32)

In [63]:
glove.get('sloth')

array([ 0.45442   ,  0.086417  , -0.72779   ,  0.22804999,  0.79790002,
        0.73975998,  0.41474   , -1.06029999, -0.42067999, -0.36678001,
        0.16350999,  1.12580001, -0.099862  , -0.47716999,  0.22751001,
        0.33763   ,  0.87620997,  0.86475998, -0.84430999,  0.31871   ,
       -1.29999995, -0.80563003, -0.040519  , -0.49107999, -0.38536999,
        0.18391   , -0.43094   ,  0.85870999,  0.3448    , -0.31274   ,
       -0.91903001,  0.54904997,  0.67769003,  0.51529998, -0.69395   ,
        0.68287998, -0.94928998, -1.23559999,  0.51529998, -0.2031    ,
       -0.095143  , -0.20654   , -0.41523999,  0.94669998,  0.98071003,
        0.06539   ,  0.023472  ,  0.58108997,  0.17095999, -1.04999995], dtype=float32)