In [40]:
from gensim.corpora import Dictionary
import numpy as np

In [4]:
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

In [10]:
corpus_token = [[line.split()[i] for i in range(len(line.split()))] for line in corpus]

In [11]:
corpus_token

[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital']]

In [12]:
dictionary = Dictionary(corpus_token)

In [25]:
word2idx = dictionary.token2id
idx2word = {value: key for key, value in word2idx.items()}

In [28]:
vocabulary_size = len(dictionary)
vocabulary_size

15

# Make pairs data

In [63]:
def make_pairs(window_size, corpus_token, word2idx):
    idx_pairs = []

    #for each sentence
    for line in corpus_token:
        idxs = [word2idx[indx] for indx in line]

        #for each word, as center
        for center_word_pos in range(len(idxs)):
            #check window size:
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w

                if context_word_pos < 0 or context_word_pos >= len(idxs) or center_word_pos == context_word_pos:
                    continue
                #append (center_wobrd_idx, context_word_idx)
                idx_pairs.append((idxs[center_word_pos], idxs[context_word_pos]))
    return np.array(idx_pairs)   
    
idx_pairs = make_pairs(2, corpus_token, word2idx)
idx_pairs[:5]

array([[1, 2],
       [1, 0],
       [2, 1],
       [2, 0],
       [2, 3]])

In [64]:
len(idx_pairs)

70

# Model

In [200]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

## Data

In [201]:
class CustomDataset(Dataset):
    def __init__(self, pairs):
        self.x = pairs[:, 0]
        self.y = pairs[:, 1]
        self.size = len(pairs)
    
    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])
    
    def __len__(self):
        return self.size

In [202]:
dataset = CustomDataset(idx_pairs)

In [203]:
dataset[0]

(1, 2)

# Make real Custom

In [244]:
train_data = CustomDataset(idx_pairs)
train_loader = DataLoader(train_data, pin_memory = True)

In [245]:
for x, y in train_loader:
    break

In [246]:
x, y

(tensor([1]), tensor([2]))

## Net

In [257]:
class Net(nn.Module):
    def __init__(self, v_size, dim_embed = 5):
        super().__init__()
        
        self.embed = nn.Embedding(v_size, dim_embed)
        self.fc1 = nn.Linear(dim_embed, v_size)
        
    #x is a word
    def forward(self, x):
        embed = self.embed(x)
        out = self.fc1(embed)
        out = F.log_softmax(out)
        return out
    
    
    def get_embed(self, idx):
        return self.embed.weight.data[idx]

In [258]:
net = Net(v_size = vocabulary_size)
net

Net(
  (embed): Embedding(15, 5)
  (fc1): Linear(in_features=5, out_features=15, bias=True)
)

In [259]:
for data, target in train_loader:
    break
    
data

tensor([1])

In [260]:
net(data)

  if sys.path[0] == '':


tensor([[-2.8366, -2.9667, -3.4827, -3.0368, -3.2549, -2.4193, -2.6510, -2.6643,
         -2.7437, -2.3619, -2.2570, -2.5692, -3.3585, -2.9064, -2.1706]],
       grad_fn=<LogSoftmaxBackward>)

In [261]:
net.embed.weight.shape

torch.Size([15, 5])

70

# Train

In [325]:
epochs = 1000
embed_size = 5
learning_rate = 0.001

net = Net(v_size = vocabulary_size, dim_embed=embed_size)
net.cuda()

criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(net.parameters(), lr = 0.001)


losses = []
for epoch in range(epochs):
    loss_val = 0
    for context, target in train_loader:
        context = context.cuda()
        target = target.cuda()
        
        y_pred = net(context)
        loss = criterion(y_pred, target)
        loss_val += loss.item()
        
        #parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
    loss_val/= train_data.size
    losses.append(loss_val)
    
    if epoch%100 == 0:
        
    
        print(f'Epoch: {epoch}  Loss: {loss_val}')
    

  if sys.path[0] == '':


Epoch: 0  Loss: 2.82853593315397
Epoch: 100  Loss: 2.3131393619946072
Epoch: 200  Loss: 2.1598001599311827
Epoch: 300  Loss: 2.0655481048992703
Epoch: 400  Loss: 1.9917369825499398
Epoch: 500  Loss: 1.9294971261705671
Epoch: 600  Loss: 1.8757747207369124
Epoch: 700  Loss: 1.8294639331953866
Epoch: 800  Loss: 1.7899069036756243
Epoch: 900  Loss: 1.7563524637903487


In [326]:
net.get_embed(0)

tensor([-1.7693,  1.5804,  1.1324,  0.0974, -0.6036], device='cuda:0')

In [327]:
net.embed.weight.data

tensor([[-1.7693,  1.5804,  1.1324,  0.0974, -0.6036],
        [-0.5919, -1.7285, -0.6342, -0.9270,  0.3097],
        [ 1.4267,  1.2348, -0.3616, -1.7160, -0.5362],
        [-0.6727, -1.0930, -0.0364, -1.3229, -0.2836],
        [ 0.1836,  0.3944, -0.5896,  0.1432,  1.7754],
        [-0.1902, -0.4395, -1.0029, -0.1735,  0.8215],
        [-1.2302,  0.5870,  0.0616, -0.9032,  0.7505],
        [-0.5739, -1.2524, -0.0195, -1.3695,  0.5792],
        [ 0.9533, -2.0017,  1.4673,  0.0081,  0.5492],
        [-1.0563, -0.8481,  0.0905, -0.3114, -2.1589],
        [-0.0128,  0.2989, -0.0523,  0.2534, -0.1739],
        [ 0.6727, -1.5024, -0.8055,  0.6373, -0.1328],
        [ 0.5118, -0.6419, -0.9790,  0.4873,  0.4657],
        [ 0.0172, -0.1245, -0.5040, -0.4147, -0.7439],
        [ 1.5319, -0.6502,  0.4326,  1.2661,  0.5440]], device='cuda:0')

In [328]:
weights = net.embed.weight.data

# Save weights and retrain

In [330]:
epochs = 1000
embed_size = 5
learning_rate = 0.001

net = Net(v_size = vocabulary_size, dim_embed=embed_size)
net.embed.weight.data = weights
net.cuda()

criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(net.parameters(), lr = 0.001)


losses = []
for epoch in range(epochs):
    loss_val = 0
    for context, target in train_loader:
        context = context.cuda()
        target = target.cuda()
        
        y_pred = net(context)
        loss = criterion(y_pred, target)
        loss_val += loss.item()
        
        #parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    loss_val/= train_data.size
        
    losses.append(loss_val)
    
    if epoch%100 == 0:
        
    
        print(f'Epoch: {epoch}  Loss: {loss_val}')
    

  if sys.path[0] == '':


Epoch: 0  Loss: 2.971930813789368
Epoch: 100  Loss: 2.147501311983381
Epoch: 200  Loss: 1.927611494064331
Epoch: 300  Loss: 1.8337325726236615
Epoch: 400  Loss: 1.779344265801566
Epoch: 500  Loss: 1.7423444373267039
Epoch: 600  Loss: 1.7147527166775294
Epoch: 700  Loss: 1.692982372215816
Epoch: 800  Loss: 1.6751613514763968
Epoch: 900  Loss: 1.6601911698068892


# Generate dictionary of word2vec


In [331]:
idx2vec = {key: net.get_embed(key).cpu().numpy() for key in idx2word.keys()}
word2vec = {key: idx2vec[value] for key, value in word2idx.items()}

In [332]:
list(word2vec)[:3]

['a', 'he', 'is']

# Closest

In [333]:
def find_closest(word, k = 5, idx2vec = idx2vec):
    vector_word = word2vec[word]
    
    vectors = [value for key, value in word2vec.items()]
    target_word = np.tile(vector_word, (len(vectors), 1))
    dist = np.linalg.norm(vectors - target_word, axis = 1 )
    
    #k_closers
    k_indexes = np.argsort(dist)[1:k]
    k_words = [idx2word[index] for index in k_indexes]
    
    print(k_words)


In [335]:
find_closest('he', k = 5)

['woman', 'king', 'she', 'man']
