# Word2Vec

Let's work on skipgram-based implementation of word2vec.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Load Data

In [2]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Ekkar\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
from nltk.corpus import brown
corpus = brown.sents(categories='news')

In [4]:
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [5]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
# vocab

In [6]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [7]:
#vocab size
voc_size = len(vocab)
print(voc_size)

14394


In [8]:
#append UNK
vocab.append('<UNK>')

In [9]:
# vocab

In [10]:
word2index['<UNK>'] = 0

In [11]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Prepare train data

In [12]:
# for c in corpus:
#     print(c)

In [13]:
def random_batch(batch_size, word_sequence):
    
    window_size = 2
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in corpus:
        for i in range(window_size, len(sent) - window_size):
            target = word2index[sent[i]]

            context = []
            for j in range(1, window_size):
                context.append(word2index[sent[i - j]])
                context.append(word2index[sent[i + j]])

            # for each outside word, append to a skip_grams
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

### Testing the method

In [14]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus)

print("Input: ", input_batch)
print("Target: ", target_batch)

#we will convert them to tensor during training, so don't worry...

Input:  [[14327]
 [ 3389]]
Target:  [[ 8079]
 [14187]]


## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [15]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size, word2index):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
        self.word2index  = word2index
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood
    
    def get_embed(self, word):
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.embedding_v(word)
        embed_o = self.embedding_u(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()
        

## 4. Training

In [16]:
batch_size     = 2 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Skipgram(voc_size, embedding_size, word2index)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

torch.Size([2, 14395])

In [18]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [19]:
import time

# Training
num_epochs = 100

start = time.time()
for epoch in range(num_epochs):

    #get batch
    input_batch, target_batch = random_batch(batch_size, corpus)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    #predict
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    #backprogate
    loss.backward()

    #update alpha
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    #print the loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs:.6f}s")

print(f"\nComplete: \nTotal Loss: {loss:2.2f} | Time Taken: {epoch_mins} minutes and {epoch_secs} seconds")


Epoch: 10 | cost: 10.656777 | time: 0m 4.000000s
Epoch: 20 | cost: 12.350922 | time: 0m 8.000000s
Epoch: 30 | cost: 11.702185 | time: 0m 13.000000s
Epoch: 40 | cost: 10.465929 | time: 0m 17.000000s
Epoch: 50 | cost: 9.686190 | time: 0m 21.000000s
Epoch: 60 | cost: 10.120502 | time: 0m 26.000000s
Epoch: 70 | cost: 10.984463 | time: 0m 30.000000s
Epoch: 80 | cost: 11.165031 | time: 0m 34.000000s
Epoch: 90 | cost: 9.309835 | time: 0m 39.000000s
Epoch: 100 | cost: 10.027914 | time: 0m 43.000000s

Complete: 
Total Loss: 10.03 | Time Taken: 0 minutes and 43 seconds


## 5. Testing

In [20]:
def open_file(path_to_file):
    content = []  # Initialize content to an empty list to avoid returning None
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()  # Read all lines of the file into a list
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")  # File not found error
    except Exception as e:
        print(f"An error occurred: {e}")  # Handle any other exceptions (e.g., permission issues)

    return content  # Return content even if it's empty, but not None


In [21]:
file_path = "file/word-test.v1.1.txt"

content = open_file(file_path)

semantic = []
syntatic = []

current_test = semantic
for sent in content:
    if sent[0] == ':':
        current_test = syntatic
        continue
    
    current_test.append(sent.strip())

In [22]:
vector_space = []

for word in vocab:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [23]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    norm_a = a / np.linalg.norm(a)  # Normalize vector a
    norm_b = b / np.linalg.norm(b)  # Normalize vector b
    return 1 - spatial.distance.cosine(norm_a, norm_b)  # Cosine similarity after normalization


def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

def similarity(model, test_data):
    words = test_data.split(" ")
    embeddings = [np.array(model.get_embed(word)) for word in words[:3]]  # Precompute embeddings for all words
    embed0, embed1, embed2 = embeddings  # Unpack embeddings
    similar_vector = embed1 - embed0 + embed2  # Perform vector arithmetic

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    return similar_word == words[3]  # Directly return the result

Semantic accuracy

In [24]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [25]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


Syntactic Accuracy

In [26]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [27]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


Similarity Accuracy

In [28]:
file_path = "file/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [29]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [45]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [46]:
from scipy.stats import spearmanr

correlation = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset similarity metrics and models’ dot product is {correlation:2.2f}.")

Correlation between the dataset similarity metrics and models’ dot product is -0.07.


## 6. Save model

In [40]:
import torch
import pickle

# Define the folder where want to save the files
model_folder = 'model'  # Change this to your desired folder path

# Save the model's state_dict
torch.save(model.state_dict(), f'{model_folder}/skipgram.model')

# Save the arguments (such as voc_size, emb_size, word2index)
skipgram_args = {
    'vocab_size': voc_size,
    'emb_size': embedding_size,
    'word2index': word2index,
}
with open(f'{model_folder}/skipgram.args', 'wb') as f:
    pickle.dump(skipgram_args, f)

print(f"Model and arguments saved to {model_folder}")


Model and arguments saved to model


In [43]:
# Save the word2index dictionary as a pickle file
with open('model/word2index.pkl', 'wb') as f:
    pickle.dump(word2index, f)

print("word2index.pkl saved successfully!")

word2index.pkl saved successfully!


In [41]:
import torch
import pickle

# Define the folder where the files are saved
model_folder = 'model'  # Change this to the folder where you saved the files

# Load the arguments from the pickle file
with open(f'{model_folder}/skipgram.args', 'rb') as f:
    skipgram_args = pickle.load(f)

# Define the model class and initialize it with the loaded arguments
# Make sure the model class and arguments match the training code
model_skipgram = Skipgram(**skipgram_args)  # Assuming you have a skipgram model class

# Now, load the model weights (this should be from skipgram.model, not skipgram.args)
model_skipgram.load_state_dict(torch.load(f'{model_folder}/skipgram.model'))

# Now the model is loaded with the arguments and weights, and you're ready to use it
model_skipgram.eval()  # Set the model to evaluation mode if you're not training

print("Model loaded successfully.")


Model loaded successfully.


  model_skipgram.load_state_dict(torch.load(f'{model_folder}/skipgram.model'))


In [48]:
model_skipgram.get_embed('the')

(-0.09687146544456482, -0.5613371133804321)