In [None]:
# https://rguigoures.github.io/word2vec_pytorch/

In [53]:
import re
import nltk
nltk.download('brown')
from nltk.corpus import brown
import itertools
corpus = []

for cat in ['news']:
    for text_id in brown.fileids(cat):
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        text.replace('\n', ' ')
        text = re.sub('[^a-z ]+', '', text)
        corpus.append([w for w in text.split() if w != ''])
print(corpus)



[nltk_data] Downloading package brown to /home/openpose/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [56]:
from collections import defaultdict
def remove_frequent_items(book_word_list, percentage):
    """
    Remove frequently occured words

    Param_1: List of list containing strings
    Param_2: Above x percentage of occurance will be removed
    Output_1: Cleaned list
    """
    treshold = int(len(book_word_list) * percentage / 100)
    DF = defaultdict(int)
    for cleaned_list in book_word_list:
        for word in set(cleaned_list):
                DF[word] += 1
    words_to_remove = {k:v for k,v in DF.items() if v > treshold }
    # A new dictionary of items that only has count above treshold
    words_to_remove_as_list = set(words_to_remove.keys())
    freq_items_removed_book_word_list = []
    for book in book_word_list:
        freq_items_removed_list = [word for word in book if word not in words_to_remove_as_list]
        freq_items_removed_book_word_list.append(freq_items_removed_list)
    return freq_items_removed_book_word_list

In [57]:
#Subsample frequent words

from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(list(word_counts.values()))
    word_counts = {word: word_counts[word]/float(sum_word_counts) for word in word_counts}
    for text in corpus:
        filtered_corpus.append([])
        for word in text:
            if random.random() < (1+math.sqrt(word_counts[word] * 1e3)) * 1e-3 / float(word_counts[word]):
                filtered_corpus[-1].append(word)
    return filtered_corpus


In [58]:
corpus = remove_frequent_items(corpus, 75)
vocabulary = set(itertools.chain.from_iterable(corpus))

word_to_index = {w: idx for (idx, w) in enumerate(vocabulary)}
index_to_word = {idx: w for (idx, w) in enumerate(vocabulary)}


In [59]:
corpus

[['fulton',
  'county',
  'grand',
  'jury',
  'friday',
  'investigation',
  'atlantas',
  'recent',
  'primary',
  'election',
  'produced',
  'evidence',
  'any',
  'irregularities',
  'took',
  'place',
  'jury',
  'further',
  'termend',
  'presentments',
  'city',
  'executive',
  'committee',
  'overall',
  'charge',
  'election',
  'deserves',
  'praise',
  'thanks',
  'city',
  'atlanta',
  'manner',
  'election',
  'conducted',
  'septemberoctober',
  'term',
  'jury',
  'charged',
  'fulton',
  'superior',
  'court',
  'judge',
  'durwood',
  'pye',
  'investigate',
  'reports',
  'possible',
  'irregularities',
  'hardfought',
  'primary',
  'won',
  'mayornominate',
  'ivan',
  'allen',
  'jr',
  'relative',
  'handful',
  'such',
  'reports',
  'received',
  'jury',
  'considering',
  'widespread',
  'interest',
  'election',
  'number',
  'voters',
  'size',
  'city',
  'jury',
  'did',
  'find',
  'many',
  'georgias',
  'registration',
  'election',
  'laws',
  'outmod

In [60]:
#Bag of words
import numpy as np

context_tuple_list = []
w = 4

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                context_tuple_list.append((word, text[j]))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))



There are 349478 pairs of target and context words


In [5]:
# import torch
# import torch.nn  as  nn
# import torch.autograd as autograd
# import torch.optim as optim
# import torch.nn.functional as F


# class Word2Vec(nn.Module):

#     def __init__(self, embedding_size, vocab_size):
#         super(Word2Vec, self).__init__()
#         self.embeddings = nn.Embedding(vocab_size, embedding_size)
#         self.linear = nn.Linear(embedding_size, vocab_size)
        
#     def forward(self, context_word):
#         emb = self.embeddings(context_word)
#         hidden = self.linear(emb)
#         out = F.log_softmax(hidden)
#         return out


In [61]:
# Stop loss if it is not improoving
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

In [62]:
#Learning
# vocabulary_size = len(vocabulary)

# net = Word2Vec(embedding_size=2, vocab_size=vocabulary_size)
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(net.parameters())
# early_stopping = EarlyStopping()
# context_tensor_list = []

# for target, context in context_tuple_list:
#     target_tensor = autograd.Variable(torch.LongTensor([word_to_index[target]]))
#     context_tensor = autograd.Variable(torch.LongTensor([word_to_index[context]]))
#     context_tensor_list.append((target_tensor, context_tensor))
    
# while True:
#     losses = []
#     for target_tensor, context_tensor in context_tensor_list:
#         net.zero_grad()
#         log_probs = net(context_tensor)
#         loss = loss_function(log_probs, target_tensor)
#         loss.backward()
#         optimizer.step()
#         losses.append(loss.data)
#     print("Loss: ", np.mean(losses))
#     early_stopping.update_loss(np.mean(losses))
#     if early_stopping.stop_training():
#         break


In [63]:

import random

def get_batches(context_tuple_list, batch_size=100):
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context, batch_negative = [], [], []
    for i in range(len(context_tuple_list)):
        batch_target.append(word_to_index[context_tuple_list[i][0]])
        batch_context.append(word_to_index[context_tuple_list[i][1]])
        batch_negative.append([word_to_index[w] for w in context_tuple_list[i][2]])
        if (i+1) % batch_size == 0 or i == len(context_tuple_list)-1:
            tensor_target = autograd.Variable(torch.from_numpy(np.array(batch_target)).long())
            tensor_context = autograd.Variable(torch.from_numpy(np.array(batch_context)).long())
            tensor_negative = autograd.Variable(torch.from_numpy(np.array(batch_negative)).long())
            batches.append((tensor_target, tensor_context, tensor_negative))
            batch_target, batch_context, batch_negative = [], [], []
    return batches

In [64]:
#artificial negative examples
from numpy.random import multinomial

def sample_negative(sample_size):
    sample_probability = {}
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                 word_list.append(words[index])
        yield word_list


In [65]:
# make this threaded
import numpy as np

context_tuple_list = []
w = 4
negative_samples = sample_negative(8)

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                context_tuple_list.append((word, text[j], next(negative_samples)))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))


There are 349478 pairs of target and context words


In [11]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class Word2Vec(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        emb_product = torch.mul(emb_target, emb_context)
        emb_product = torch.sum(emb_product, dim=1)
        out = torch.sum(F.logsigmoid(emb_product))
        emb_negative = self.embeddings_context(negative_example)
        emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
        emb_product = torch.sum(emb_product, dim=1)
        out += torch.sum(F.logsigmoid(-emb_product))
        return -out


In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [17]:
import time

vocabulary_size = len(vocabulary)

loss_function = nn.CrossEntropyLoss()
net = Word2Vec(embedding_size=200, vocab_size=vocabulary_size)
net.to(device)
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

while True:
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for i in range(len(context_tuple_batches)):
        net.zero_grad()
        target_tensor, context_tensor, negative_tensor = context_tuple_batches[i]
        loss = net(target_tensor.to(device), context_tensor.to(device), negative_tensor.to(device))
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        
    print("Loss: ", np.mean(losses))
    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break


Loss:  42464.12479397416
Loss:  37695.19969672996
Loss gain: 11.23%
Loss:  33361.513243341244
Loss gain: 21.44%
Loss:  29348.918949433017
Loss gain: 30.89%
Loss:  25671.315491495254
Loss gain: 39.55%
Loss:  22374.98783211366
Loss gain: 40.64%
Loss:  19485.28477304193
Loss gain: 41.59%
Loss:  16999.16738775712
Loss gain: 42.08%
Loss:  14876.15226546018
Loss gain: 42.05%
Loss:  13064.581314280063
Loss gain: 41.61%
Loss:  11516.553422501318
Loss gain: 40.9%
Loss:  10188.228062368144
Loss gain: 40.07%
Loss:  9041.926502340453
Loss gain: 39.22%
Loss:  8048.3942901997625
Loss gain: 38.4%
Loss:  7183.739513284547
Loss gain: 37.62%
Loss:  6428.678305066587
Loss gain: 36.9%
Loss:  5765.574144580696
Loss gain: 36.24%
Loss:  5180.688827836564
Loss gain: 35.63%
Loss:  4661.983903200818
Loss gain: 35.1%
Loss:  4200.678068136867
Loss gain: 34.66%
Loss:  3789.1840294946596
Loss gain: 34.28%
Loss:  3420.4725934121175
Loss gain: 33.98%
Loss:  3089.112342802281
Loss gain: 33.74%
Loss:  2790.507594112605

In [38]:
import numpy as np

def get_closest_word1(word, topn=10):
    word_distance = []
    emb = net.embeddings_target
    
    pdist = nn.PairwiseDistance()
    i = word_to_index[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long).to(device)
    
    v_i = emb(lookup_tensor_i)
    for j in range(len(vocabulary)):
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long).to(device)
            v_j = emb(lookup_tensor_j)
            word_distance.append((index_to_word[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]

In [46]:
get_closest_word1("candidate")

[('the', 14.757811546325684),
 ('of', 15.457853317260742),
 ('in', 15.514660835266113),
 ('he', 15.621011734008789),
 ('and', 15.776084899902344),
 ('to', 15.858137130737305),
 ('a', 16.031057357788086),
 ('his', 16.076339721679688),
 ('that', 16.206310272216797),
 ('is', 16.230836868286133)]