In [335]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm
import numpy as np
import collections
import zipfile
import random
import pickle

In [235]:
class Dataset:
    """Vocabulary, token-id stream and co-occurrence matrix built from a zipped corpus.

    Construction runs the whole pipeline: read the first member of the zip
    archive, keep the ``word_num - 1`` most frequent words (index 0 is reserved
    for ``'UNK'``), map every token to its index, and accumulate a
    distance-weighted co-occurrence matrix.

    Parameters
    ----------
    filename : str
        Path to a zip archive; its first member is decoded as UTF-8 and split
        on whitespace.
    word_num : int
        Vocabulary size including the ``'UNK'`` slot.
    context_size : int
        Number of neighbours considered on each side of a token.

    Attributes
    ----------
    words : tokens of the corpus, in order.
    count : ``[['UNK', n_unknown], (word, frequency), ...]``.
    word_dict / rev_word_dict : word -> index and index -> word mappings.
    data : list of vocabulary indices, one per token.
    comat : ``(word_num, word_num)`` float matrix of weighted co-occurrences.
    comat_nz : ``(n, 2)`` array holding the (row, col) of every non-zero cell.
    """

    def __init__(self, filename='./text8.zip', word_num=100, context_size=3):
        self.filename = filename
        self.word_num = word_num
        self.context_size = context_size
        self.words = list()
        self.count = [['UNK', -1]]
        self.word_dict = dict()
        self.rev_word_dict = dict()
        self.data = list()
        self.comat = np.zeros((word_num, word_num))
        self.comat_nz = np.array([])
        self.read_file()
        self.build_dataset()
        self.build_comat()

    def read_file(self):
        """Load the whitespace-separated tokens of the archive's first member into ``self.words``."""
        print('reading words ... ...')
        with zipfile.ZipFile(self.filename) as f:
            self.words = np.array(f.read(f.namelist()[0]).decode(encoding='utf-8').split())

    def build_dataset(self):
        """Build the vocabulary tables and convert ``self.words`` into indices in ``self.data``."""
        print('counting words ... ...')
        self.count.extend(collections.Counter(self.words).most_common(self.word_num-1))
        # construct word_dict: 'UNK' gets index 0, then words by descending frequency
        print('constructing word dict ... ...')
        for w, _ in self.count:
            self.word_dict[w] = len(self.word_dict)
        # transfer word into number, store in list 'data'; unknown words map to 0
        print('word to index ... ...')
        indices = [self.word_dict.get(w, 0) for w in self.words]
        self.data.extend(indices)
        self.count[0][1] = indices.count(0)
        # reverse word_dict
        print('reverse word dict')
        self.rev_word_dict = {index: word for word, index in self.word_dict.items()}

    def build_comat(self):
        """Accumulate the co-occurrence matrix over a symmetric context window.

        Every pair of tokens at distance ``j`` (``1 <= j <= context_size``)
        adds ``1 / j`` to ``comat[a, b]`` and to ``comat[b, a]``, so each token
        sees both its left and its right neighbours, including the very first
        token of the corpus.
        """
        print('building co-occurrences matrix')
        ids = np.asarray(self.data, dtype=np.int64)
        for j in range(1, self.context_size+1):
            if j >= len(ids):
                # No two tokens are this far apart; larger distances are empty too.
                break
            later, earlier = ids[j:], ids[:-j]
            weight = 1.0/j
            # np.add.at accumulates correctly when the same (row, col) repeats,
            # which plain fancy-index `+=` would not.
            np.add.at(self.comat, (later, earlier), weight)   # left context
            np.add.at(self.comat, (earlier, later), weight)   # right context
        self.comat_nz = np.transpose(np.nonzero(self.comat))

    def gen_batch(self, batch):
        """Sample ``batch`` distinct non-zero cells of the co-occurrence matrix.

        Returns
        -------
        (x, y) : two lists of ints with the row and column indices of the cells.

        Raises
        ------
        ValueError
            If ``batch`` exceeds the number of non-zero cells (sampling is done
            without replacement).
        """
        total = len(self.comat_nz)
        if batch > total:
            raise ValueError(
                'cannot sample %d distinct co-occurrence pairs: only %d are non-zero'
                % (batch, total))
        batch_idx = np.random.choice(np.arange(total), size=batch, replace=False)
        pairs = self.comat_nz[batch_idx]
        return pairs[:, 0].tolist(), pairs[:, 1].tolist()

In [336]:
class GloVe:
    """GloVe-style embedding trainer driven by a co-occurrence matrix.

    Minimises ``sum f(X_ij) * (w_i . w_j + b_i + b_j - log X_ij) ** 2`` over
    sampled non-zero cells ``X_ij`` of ``dataset.comat``. A single embedding
    table and a single bias vector are shared by both words of a pair.

    Parameters
    ----------
    dataset : object or None
        A ready dataset exposing ``comat``, ``data`` and ``gen_batch``. When it
        is ``None`` (or has no ``comat`` attribute) a new ``Dataset`` is built
        from ``filename``, ``word_num`` and ``context_size``.
    filename, word_num, context_size :
        Only used when a dataset has to be built. When a dataset is supplied,
        the vocabulary size is taken from ``dataset.comat`` instead.
    batch : int
        Number of co-occurrence cells sampled per optimisation step.
    x_max, alpha : float
        Parameters of the weighting function ``f``.
    embed_dim : int
        Dimensionality of the word vectors.
    epoch : int
        Multiplier on the number of optimisation steps (see ``train``).
    lr : float
        Adam learning rate.
    """

    def __init__(self, dataset, filename='./text8.zip', word_num=100, context_size=3, batch=8,
                 x_max=3, alpha=0.75, embed_dim=10, epoch=10, lr=0.001):
        self.batch = batch
        self.x_max = x_max
        self.alpha = alpha
        self.epoch = epoch
        self.lr = lr
        if dataset is None or not hasattr(dataset, 'comat'):
            # Nothing usable was supplied: build the dataset from the corpus file.
            dataset = Dataset(filename, word_num, context_size)
        else:
            # Reuse the caller's dataset and size the tables to match it.
            word_num = dataset.comat.shape[0]
        self.dataset = dataset
        self.embed = Variable(torch.from_numpy(np.random.normal(0, 0.01, (word_num, embed_dim))), requires_grad=True)
        self.bias = Variable(torch.from_numpy(np.random.normal(0, 0.01, word_num)), requires_grad=True)
        self.optimizer = optim.Adam([self.embed, self.bias], lr=lr)

    def f(self, xx):
        """Weighting function: ``(x / x_max) ** alpha`` below ``x_max``, otherwise 1.

        ``xx`` is an array-like of co-occurrence values; the result is a
        double-precision tensor with one weight per value.
        """
        values = np.atleast_1d(np.asarray(xx, dtype=np.float64))
        weights = np.where(values < self.x_max, (values / self.x_max) ** self.alpha, 1.0)
        return torch.from_numpy(weights)

    def forward(self, x, y):
        """Return the summed weighted squared error for the index pairs ``(x[k], y[k])``.

        ``x`` and ``y`` are equal-length 1-D integer index tensors (``train``
        passes ``LongTensor`` batches). Every pair must address a non-zero cell
        of ``dataset.comat``, because the target is ``log`` of that cell.
        """
        # Convert to NumPy index arrays explicitly: indexing the matrix directly
        # with one-element tensors treats them as scalar indices and drops the
        # batch axis.
        rows = np.atleast_1d(np.asarray(x))
        cols = np.atleast_1d(np.asarray(y))
        counts = self.dataset.comat[rows, cols]
        prediction = (self.embed[x] * self.embed[y]).sum(dim=1) + self.bias[x] + self.bias[y]
        squared_error = (prediction - torch.from_numpy(np.log(counts))) ** 2
        return torch.sum(self.f(counts) * squared_error)

    def train(self, log_every=10000, checkpoint_path='GloVe'):
        """Run the optimisation loop.

        Performs ``epoch * (len(dataset.data) // batch)`` Adam steps on freshly
        sampled batches. Every ``log_every`` steps (including step 0) the
        current batch loss is printed and the embedding table is pickled to
        ``checkpoint_path``.
        """
        # NOTE(review): the step count is derived from the number of corpus
        # tokens, not from the number of non-zero co-occurrence cells.
        batch_num = len(self.dataset.data)//self.batch
        for i in range(self.epoch * batch_num):
            x, y = self.dataset.gen_batch(self.batch)
            x = Variable(torch.LongTensor(x))
            y = Variable(torch.LongTensor(y))
            self.optimizer.zero_grad()
            loss = self.forward(x, y)
            loss.backward()
            self.optimizer.step()
            if i % log_every == 0:
                print("loss: %0.8f" % loss.data)
                # Close the checkpoint file deterministically after each dump.
                with open(checkpoint_path, 'wb') as checkpoint:
                    pickle.dump(self.embed, checkpoint)

In [337]:
# `dataset` is never assigned in any cell of this notebook, so on a fresh kernel
# `GloVe(dataset)` raises NameError. Pass None explicitly: the model then loads
# the corpus itself from its default `filename`.
glove = GloVe(None)

In [338]:
# Run the optimisation loop; it prints a "loss: ..." line every 10000 steps and
# pickles the embedding table to the file 'GloVe' at the same moments.
glove.train()

loss: 168.97591698
loss: 8.19773423
loss: 4.18474745
loss: 2.50789056
loss: 6.96148531
loss: 5.02068088
loss: 9.84709086
loss: 2.91712105
loss: 16.22386195
loss: 5.23861526
loss: 7.14759923
loss: 9.93169313
loss: 3.03838092
loss: 9.09990096
loss: 5.36941801
loss: 9.36936930
loss: 17.68256018
loss: 4.10081534
loss: 3.89361619
loss: 2.83067324
loss: 17.81718377
loss: 8.24241190
loss: 5.26297510
loss: 3.54882998
loss: 3.96574006
loss: 6.40680882
loss: 5.47632699
loss: 15.19574197
loss: 3.81581298
loss: 10.62344888
loss: 6.93386667
loss: 8.69797757
loss: 4.75956658
loss: 3.27243788
loss: 6.68955872
loss: 2.63537668
loss: 5.63278802
loss: 5.94215877
loss: 3.88564234
loss: 6.13853237
loss: 8.33609799
loss: 5.44853396
loss: 2.53350559
loss: 2.98063592
loss: 5.19190615
loss: 3.62611778
loss: 5.24675856
loss: 6.26717868
loss: 17.12289633
loss: 14.31008109
loss: 11.14480082
loss: 7.63105003
loss: 5.71897481
loss: 10.06286735
loss: 2.02568094
loss: 3.08183974
loss: 4.58682467
loss: 10.99883410
lo

KeyboardInterrupt: 