<a href="https://colab.research.google.com/github/mamintoosi/Text-Mining/blob/master/code/Word2Vec_GloVe/medium_word_embeddings_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd

from IPython.display import display, clear_output
import time

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Toy training corpus: six four-word sentences of the form "<he|she> is a <noun>".
# Each sentence is tokenised later by splitting on whitespace.
text_for_training = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'she is a daughter',
    'he is a son'
]

In [None]:
def create_word_list(corpus):
    """Return the distinct whitespace-separated tokens of ``corpus``.

    Tokens are listed in the order in which they first appear. A ``set`` would
    also de-duplicate, but its iteration order for strings changes between
    interpreter runs (hash randomisation), which would give every word a
    different index each time the notebook is restarted.

    Arguments:
        corpus -- iterable of sentences (strings)

    Returns:
        list of unique tokens, in first-seen order
    """
    # dict keys are unique and keep insertion order, so this de-duplicates
    # while remaining deterministic.
    first_seen = dict.fromkeys(
        token for sentence in corpus for token in sentence.split()
    )
    return list(first_seen)

# Build the vocabulary (list of distinct words) from the toy corpus.
vocabulary = create_word_list(text_for_training)

# Bare expression on the last line: shown as the cell's output.
"vocabulary list with {} elements: {}".format(len(vocabulary), vocabulary)

"vocabulary list with 10 elements: ['he', 'is', 'a', 'queen', 'daughter', 'she', 'son', 'woman', 'man', 'king']"

In [None]:
# Two lookup tables over `vocabulary`: position -> word, and word -> position
idx2word = dict(enumerate(vocabulary))
word2idx = {word: position for position, word in idx2word.items()}

word2idx, idx2word

({'a': 2,
  'daughter': 4,
  'he': 0,
  'is': 1,
  'king': 9,
  'man': 8,
  'queen': 3,
  'she': 5,
  'son': 6,
  'woman': 7},
 {0: 'he',
  1: 'is',
  2: 'a',
  3: 'queen',
  4: 'daughter',
  5: 'she',
  6: 'son',
  7: 'woman',
  8: 'man',
  9: 'king'})

In [None]:
# Data classes that will be used to structure training data
class TrainingWord:
    """A word together with the integer index it is stored under."""
    idx: int
    word: str

    def __init__(self, idx, word):
        self.idx, self.word = idx, word

    def __repr__(self):
        return 'TrainingWord [idx: {}, word: {}]'.format(self.idx, self.word)

class TrainingElement:
    """One training example: a context word and the target word it is paired with."""
    context: TrainingWord
    target: TrainingWord

    def __init__(self, context, target):
        self.context, self.target = context, target

    def __repr__(self):
        # The target is printed first although the constructor takes the context first.
        return 'TrainingElement [target: {}, context: {}]'.format(self.target, self.context)

# Training data will be a list of TrainingElement pairs, each holding a target word and one word from its context window
# This is a very simple approach to training a neural network by looking at a lot of examples
def create_training_data(sentences, window_size=2):
    """Build (context, target) training pairs from whitespace-tokenised sentences.

    Every word of every sentence is taken in turn as the centre (target) word
    and paired with each other word that lies at most ``window_size``
    positions to its left or right within the same sentence.

    Arguments:
        sentences -- iterable of sentences (strings)
        window_size -- maximum distance between target and context word
                       (defaults to 2)

    Returns:
        list of TrainingElement, ordered by sentence, then centre position,
        then context position (left to right)

    Raises:
        KeyError -- if a sentence contains a word that is missing from ``word2idx``
    """
    training_elements = []

    for sentence in sentences:
        indices = [word2idx[word] for word in sentence.split()]

        for center_pos, center_idx in enumerate(indices):
            # Clamp the window to the sentence so no position falls outside it.
            window_start = max(0, center_pos - window_size)
            window_stop = min(len(indices), center_pos + window_size + 1)

            for context_pos in range(window_start, window_stop):
                # A word is not its own context.
                if context_pos == center_pos:
                    continue

                context_idx = indices[context_pos]
                target_word = TrainingWord(center_idx, idx2word[center_idx])
                context_word = TrainingWord(context_idx, idx2word[context_idx])
                training_elements.append(TrainingElement(context_word, target_word))

    return training_elements

training_elements = create_training_data(text_for_training)
training_elements

[TrainingElement [target: TrainingWord [idx: 0, word: he], context: TrainingWord [idx: 1, word: is]],
 TrainingElement [target: TrainingWord [idx: 0, word: he], context: TrainingWord [idx: 2, word: a]],
 TrainingElement [target: TrainingWord [idx: 1, word: is], context: TrainingWord [idx: 0, word: he]],
 TrainingElement [target: TrainingWord [idx: 1, word: is], context: TrainingWord [idx: 2, word: a]],
 TrainingElement [target: TrainingWord [idx: 1, word: is], context: TrainingWord [idx: 9, word: king]],
 TrainingElement [target: TrainingWord [idx: 2, word: a], context: TrainingWord [idx: 0, word: he]],
 TrainingElement [target: TrainingWord [idx: 2, word: a], context: TrainingWord [idx: 1, word: is]],
 TrainingElement [target: TrainingWord [idx: 2, word: a], context: TrainingWord [idx: 9, word: king]],
 TrainingElement [target: TrainingWord [idx: 9, word: king], context: TrainingWord [idx: 1, word: is]],
 TrainingElement [target: TrainingWord [idx: 9, word: king], context: TrainingWor

In [None]:
# This is the first layer of our neural network; it gets the context word as an input
# To feed it to the neural network we need a vector that represents the context word's index
# For example, if the vocabulary size is 10 and the index of the context word is 3, then the vector is
# (0 0 0 1 0 0 0 0 0 0)
def get_one_hot_encoded_word_idx(word_idx, vocabulary):
    """Return a float32 vector with ``len(vocabulary)`` entries: 1.0 at ``word_idx``, 0.0 elsewhere."""
    one_hot = torch.zeros(len(vocabulary), dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

# Example input used by the following cells: the one-hot vector for vocabulary index 3.
one_hot_encoded_vector = get_one_hot_encoded_word_idx(3, vocabulary)

In [None]:
def create_connections_between_layers(row_count, col_count, initial_value):
    """Create a trainable weight matrix filled with standard-normal random values.

    Arguments:
        row_count -- number of rows of the matrix
        col_count -- number of columns of the matrix
        initial_value -- accepted for backward compatibility only; it is NOT
                         used, the weights are always drawn from ``torch.randn``

    Returns:
        tensor of shape (row_count, col_count) with ``requires_grad=True``
    """
    # Creating the tensor directly with requires_grad=True yields the same leaf
    # tensor as wrapping it in the deprecated torch.autograd.Variable.
    return torch.randn(row_count, col_count, requires_grad=True)

In [None]:
# input layer -> hidden layer

# the embedding size is important to define how much meaning can be encoded in word embeddings
# to get a certain embedding vector size we can set the hidden layer size
embedding_size = 5

# W1 has one row per embedding dimension and one column per vocabulary word.
W1 = create_connections_between_layers(embedding_size, len(vocabulary), 0.1)
# Multiplying by a one-hot vector picks out the W1 column at the vector's hot index.
z1 = torch.matmul(W1, one_hot_encoded_vector)

W1, z1

(tensor([[ 1.9063,  1.0872, -1.0287, -1.2518, -1.7934, -1.3401,  0.4342,  0.8470,
           0.5749,  1.0749],
         [ 0.1473, -0.4498,  0.9945,  1.3327, -0.6022, -1.2650, -0.5767, -0.0976,
           0.4298,  0.7898],
         [-1.5625,  1.6560, -0.8751,  0.5024, -0.3000, -0.4764,  0.4342, -0.1882,
           1.4419,  0.1132],
         [ 1.1845,  0.6651, -1.7722,  1.7793, -1.2839, -0.1214,  2.3869, -0.1618,
          -0.5196,  0.5470],
         [ 0.2431, -0.7124, -0.4566,  1.3189,  1.1945, -0.3801, -0.9234,  1.1798,
          -0.5479, -1.3284]], requires_grad=True),
 tensor([-1.2518,  1.3327,  0.5024,  1.7793,  1.3189], grad_fn=<MvBackward>))

In [None]:
# hidden layer -> output layer
# we need to map our hidden layer to an output to train the model to give us
# target word predictions
# W2 has one row per vocabulary word and one column per embedding dimension.
W2 = create_connections_between_layers(len(vocabulary), embedding_size, 0.1)

# One score per vocabulary word, turned into probabilities by the softmax.
z2 = torch.matmul(W2, z1)
output = F.softmax(z2, dim=0)

# The predicted word is the one with the highest probability.
prediction_probability, prediction_idx = torch.max(output, 0)
prediction_word = idx2word[prediction_idx.item()]

prediction_word, W2

('woman', tensor([[-2.3251, -2.6886, -0.0734,  0.8497,  1.0210],
         [ 1.0094, -0.0777,  0.7146,  0.1369,  0.2719],
         [-0.0478, -1.7538,  1.3308,  0.8908, -0.9964],
         [ 0.0624,  0.0561, -1.3981,  0.3419,  1.2341],
         [ 0.3496,  1.6610, -1.3633, -0.0212,  1.4733],
         [ 1.0205, -1.2951,  0.3399, -0.8108, -0.9458],
         [ 0.2198, -1.7058,  0.8295,  0.3008,  0.3217],
         [ 1.4985,  1.0535,  2.9125,  1.8740,  0.6487],
         [-1.6549, -0.2782,  0.5403,  0.0856,  1.1854],
         [-2.2175, -0.7362,  1.2824,  0.9653,  0.3898]], requires_grad=True))

In [None]:
def update_model_using_backprob(learning_rate, W1, W2):
    """Apply one plain gradient-descent step to both weight matrices, in place.

    Each matrix is decreased by ``learning_rate`` times its accumulated
    gradient; afterwards both gradients are reset to zero so the next
    ``backward()`` call starts from a clean slate.

    Arguments:
        learning_rate -- step size of the update
        W1 -- weight tensor whose ``.grad`` has been populated by ``backward()``
        W2 -- weight tensor whose ``.grad`` has been populated by ``backward()``

    Returns:
        None; ``W1`` and ``W2`` are modified in place
    """
    # no_grad() keeps the in-place updates out of the autograd graph without
    # going through `.data`, which bypasses autograd's correctness checks.
    with torch.no_grad():
        W1.sub_(learning_rate * W1.grad)
        W2.sub_(learning_rate * W2.grad)

        W1.grad.zero_()
        W2.grad.zero_()

def predict(training_element, W1, W2):
    """Run the forward pass for one training example.

    The context word of ``training_element`` is one-hot encoded, multiplied by
    ``W1`` and then by ``W2``, and the resulting scores are normalised with a
    softmax.

    Arguments:
        training_element -- object exposing ``context.idx`` (index of the input word)
        W1 -- input-to-hidden weight matrix
        W2 -- hidden-to-output weight matrix

    Returns:
        1-D tensor of probabilities (NOT logits), one entry per row of ``W2``
    """
    x = get_one_hot_encoded_word_idx(training_element.context.idx, vocabulary)

    hidden = torch.matmul(W1, x)
    scores = torch.matmul(W2, hidden)

    return F.softmax(scores, dim=0)

def store_parameters_to_file(name, df):
    """Draw ``df`` as an annotated heatmap on a fresh figure, then close the figure.

    Nothing is written to disk at the moment: the ``savefig`` call that would
    have used ``name`` for the file name is disabled, so ``name`` is unused.
    """
    heatmap_figure = plt.figure(facecolor='w', edgecolor='k')
    sns.heatmap(df, annot=True, cmap='viridis', cbar=False, ax=heatmap_figure.gca())
    # plt.savefig(f'/content/drive/My Drive/Colab Notebooks/medium-word-embeddings/weights/{name}.png')
    plt.close(heatmap_figure)


num_epochs = 10000
learning_rate = 0.1
# predict() already returns softmax probabilities, so the loss has to be the
# negative log-likelihood of their logarithm. nn.CrossEntropyLoss expects raw
# scores and would apply a second (log-)softmax on top of the first one, which
# flattens the gradients.
calculate_loss = nn.NLLLoss()

for epo in range(num_epochs):
    loss_val = 0.0
    for training_element in training_elements:

        prediction = predict(training_element, W1, W2)

        y_true = torch.tensor([training_element.target.idx], dtype=torch.long)

        # The clamp keeps log() finite should a probability underflow to exactly 0.
        log_probabilities = torch.log(prediction.clamp(min=1e-12)).view(1, -1)
        loss = calculate_loss(log_probabilities, y_true)
        loss.backward()

        update_model_using_backprob(learning_rate, W1, W2)

        # .item() adds a plain float; adding the tensor itself would keep
        # every step's autograd graph alive for the whole epoch.
        loss_val += loss.item()

    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val / len(training_elements)}')
        # Transposed so that each printed row is the embedding of one word.
        df = pd.DataFrame(W1.detach().transpose(0, 1).numpy())
        print(df)


          0         1         2         3         4
0  1.918688  0.121109 -1.523312  1.186724  0.220336
1  0.972538 -0.677316  1.531357  0.580886 -0.849397
2 -1.030513  0.903067 -0.818386 -1.777384 -0.509315
3 -1.256797  1.330122  0.497224  1.776434  1.319246
4 -1.791117 -0.594822 -0.297724 -1.285573  1.194215
5 -1.323865 -1.250576 -0.463785 -0.122373 -0.393563
6  0.408443 -0.639606  0.407299  2.368208 -0.966417
7  0.854152 -0.108268 -0.173106 -0.163214  1.168369
8  0.563593  0.406103  1.420216 -0.535521 -0.560808
9  1.061219  0.762879  0.086494  0.527801 -1.342015
          0         1         2         3         4
0  2.341826 -0.184863 -1.134574  1.193705 -0.159644
1  0.817761 -1.108093  1.467372  0.566922 -1.214457
2 -0.777691  0.408100 -0.576102 -1.922977 -0.945769
3 -1.306295  1.301733  0.449770  1.751798  1.317928
4 -1.772674 -0.528086 -0.276671 -1.299512  1.191518
5 -0.914064 -1.095496 -0.145812 -0.080510 -0.795975
6  0.408613 -0.757512  0.398003  2.358387 -1.087765
7  1.076115 

In [None]:
def get_word_embedding_from_weight_matrix(word, weight_matrix):
    """Return the embedding vector of ``word`` stored in ``weight_matrix``.

    The orientation of the matrix is detected from the vocabulary size instead
    of a hard-coded embedding size, so the lookup keeps working when
    ``embedding_size`` is changed:

    * if the matrix has one column per vocabulary word (the layout of ``W1``),
      the word's column is returned;
    * otherwise the matrix is treated as having one row per word (the layout
      of ``W2``) and the word's row is returned.

    A square matrix with one row AND one column per word is ambiguous; it is
    read column-wise.

    Arguments:
        word -- a word contained in ``word2idx``
        weight_matrix -- 2-D weight tensor

    Returns:
        1-D numpy array, detached from the autograd graph

    Raises:
        KeyError -- if ``word`` is not in ``word2idx``
    """
    idx = word2idx[word]

    _, cols = weight_matrix.shape

    if cols == len(word2idx):
        return weight_matrix[:, idx].detach().numpy()
    return weight_matrix[idx, :].detach().numpy()

# Look up the learned embeddings of four words in the input weight matrix W1.
# W1 must already exist in the kernel, i.e. the cells above have to be run first.
king = get_word_embedding_from_weight_matrix('king', W1)
queen = get_word_embedding_from_weight_matrix('queen', W1)
he = get_word_embedding_from_weight_matrix('he', W1)
she = get_word_embedding_from_weight_matrix('she', W1)

print("King: {}".format(king))
print("Queen: {}".format(queen))

NameError: ignored

In [None]:
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v:

        cos(u, v) = (u . v) / (||u|| * ||v||)

    Arguments:
        u -- a word vector of shape (n,)
        v -- a word vector of shape (n,)

    Returns:
        the cosine of the angle between u and v. No guard is applied for
        zero-length vectors: they make the denominator 0.
    """
    dot = np.dot(u, v)

    # L2 norms, computed as the square root of each vector's dot product with itself
    norm_u = np.sqrt(np.dot(u, u))
    norm_v = np.sqrt(np.dot(v, v))

    return dot / (norm_u * norm_v)

In [None]:
# Pairwise cosine similarity of the embeddings looked up above.
print("cosine_similarity(king, queen) = ", cosine_similarity(king, queen))
print("cosine_similarity(he, she) = ", cosine_similarity(he, she))
print("cosine_similarity(king, she) = ", cosine_similarity(king, she))
print("cosine_similarity(king, he) = ", cosine_similarity(king, he))

In [None]:
def complete_analogy(word_a, word_b, word_c, weight_matrix):
    """
    Performs the word analogy : a is to b as c is to ____.

    Arguments:
    word_a -- a word, string
    word_b -- a word, string
    word_c -- a word, string
    weight_matrix -- weight tensor the word embeddings are read from

    Returns:
    best_word --  the word such that v_b - v_a is close to v_best_word - v_c, as measured by cosine similarity;
                  None if no candidate word is left or no similarity could be ranked
    """
    # convert words to lower case
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()

    e_a = get_word_embedding_from_weight_matrix(word_a, weight_matrix)
    e_b = get_word_embedding_from_weight_matrix(word_b, weight_matrix)
    e_c = get_word_embedding_from_weight_matrix(word_c, weight_matrix)

    # The direction a -> b is the same for every candidate, so compute it once.
    analogy_direction = e_b - e_a

    input_words = {word_a, word_b, word_c}
    max_cosine_sim = -100              # lower than any possible cosine similarity
    best_word = None

    # loop over the whole vocabulary
    for candidate in vocabulary:
        # the answer must not be one of the input words
        if candidate in input_words:
            continue

        e_candidate = get_word_embedding_from_weight_matrix(candidate, weight_matrix)
        cosine_sim = cosine_similarity(analogy_direction, e_candidate - e_c)

        # strict '>' keeps the first of several equally good candidates
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = candidate

    return best_word

In [None]:
# Each triad (a, b, c) asks "a is to b as c is to ?", answered from the embeddings in W1.
triads_to_try = [('queen', 'woman', 'king')]
for triad in triads_to_try:
    print ('{} -> {} :: {} -> {}'.format( *triad, complete_analogy(*triad, W1)))