In [1]:
# Word2Vec Implementation

# Import necessary libraries

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import nltk
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno 104] Connection
[nltk_data]     reset by peer>


False

In [3]:

# Define the Word2Vec model class

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)
    
    def forward(self, target_word, context_word):
        target_embed = self.in_embed(target_word)
        context_embed = self.out_embed(context_word)
        return target_embed, context_embed

In [4]:
# Define the training function

def train_word2vec(corpus, window_size, embedding_dim, num_epochs, learning_rate):
    # Preprocess the corpus and build the vocabulary
    sentences = [sent.strip().replace('!', '') for sent in corpus.split('.')]
    sentences = [f'<bos> {sent} <eos>' for sent in sentences]
    
    vocab = []
    for sent in sentences:
        vocab += word_tokenize(sent)
    
    vocab = list(set(vocab))
    stoi = {v:k for k, v in enumerate(vocab)}
    itos = {k:v for k, v in enumerate(vocab)}
    
    # Create the target-context word pairs
    training_pairs = []
    for sent in sentences:
        sent_toks = word_tokenize(sent)
        
        for i in range(len(sent_toks)-window_size):
            window_toks = sent_toks[i:i+window_size]
            
            target = window_toks[window_size//2]
            for context in window_toks:
                if context == target:
                    continue
                training_pairs.append((torch.tensor(stoi[target]), torch.tensor(stoi[context])))
            
    # Initialize the Word2Vec model
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = Word2Vec(len(vocab), embedding_dim)
    model = model.to(device)
    
    # Define the loss function and optimizer
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), learning_rate)
    
    for epoch in range(num_epochs):
        total_loss = 0.0
        for target_word, context_word in training_pairs:
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            target_word = target_word.to(device)
            context_word = context_word.to(device)
            target_emb, context_emb = model(target_word, context_word)
            
            # Compute the loss
            loss = loss_func(target_emb, context_emb)
            
            # Backward pass
            loss.backward()
            
            # Update the model parameters
            optimizer.step()
            
            # Accumulate the loss
            total_loss += loss.item()
            
        # Print the average loss for the epoch
        print(f"Epoch {epoch+1} Loss: {round(total_loss/len(training_pairs), 3)}")
        
    # Return the trained Word2Vec model
    return model, vocab, stoi

In [5]:
def most_similar(word, embeds, stoi):
    word_embed = embeds[stoi[word]]
    best_word = ''
    best_score = -1
    
    for w in stoi.keys():
        i = stoi[w]
        if w == word:
            continue
        
        # cosine sim
        s = np.dot(word_embed, embeds[i]) / (np.linalg.norm(word_embed, 2) * np.linalg.norm(embeds[i], 2))
        
        if s > best_score:
            best_score = s
            best_word = w

    return best_word

In [10]:
# Define the main function

def main():
    # Set hyperparameters
    corpus = "I love to learn deep learning. It is fascinating!"
    window_size = 3
    embedding_dim = 10
    num_epochs = 50
    lr = 0.01

    # Load and preprocess the corpus
    ### DONE in training function
    
    # Train the Word2Vec model
    model, vocab, stoi = train_word2vec(corpus, window_size, embedding_dim, num_epochs, lr)
    embeds = model.in_embed.weight.detach().cpu().numpy()
    
    # Evaluate the trained model using word similarity or analogy tasks
    w = most_similar('deep', embeds, stoi)
    print(f'most similar words to learn: {w}')
    w = most_similar('love', embeds, stoi)
    print(f'most similar words to love: {w}')
    
    # Print the learned word embeddings
    print('\n')
    for w in stoi.keys():
        i = stoi[w]
        print(f'{w}: {embeds[i]}')
    
    # Save the trained model
    torch.save(model.state_dict(), 'word2vec_model.pt')
    
# Run the main function
if __name__ == "__main__":
    main()


Epoch 1 Loss: 1.621
Epoch 2 Loss: -1.014
Epoch 3 Loss: -3.616
Epoch 4 Loss: -6.285
Epoch 5 Loss: -9.044
Epoch 6 Loss: -11.919
Epoch 7 Loss: -14.944
Epoch 8 Loss: -18.152
Epoch 9 Loss: -21.583
Epoch 10 Loss: -25.275
Epoch 11 Loss: -29.266
Epoch 12 Loss: -33.591
Epoch 13 Loss: -38.281
Epoch 14 Loss: -43.365
Epoch 15 Loss: -48.865
Epoch 16 Loss: -54.8
Epoch 17 Loss: -61.189
Epoch 18 Loss: -68.045
Epoch 19 Loss: -75.382
Epoch 20 Loss: -83.21
Epoch 21 Loss: -91.54
Epoch 22 Loss: -100.379
Epoch 23 Loss: -109.736
Epoch 24 Loss: -119.618
Epoch 25 Loss: -130.028
Epoch 26 Loss: -140.965
Epoch 27 Loss: -152.422
Epoch 28 Loss: -164.393
Epoch 29 Loss: -176.868
Epoch 30 Loss: -189.835
Epoch 31 Loss: -203.285
Epoch 32 Loss: -217.208
Epoch 33 Loss: -231.592
Epoch 34 Loss: -246.427
Epoch 35 Loss: -261.704
Epoch 36 Loss: -277.41
Epoch 37 Loss: -293.534
Epoch 38 Loss: -310.063
Epoch 39 Loss: -326.987
Epoch 40 Loss: -344.294
Epoch 41 Loss: -361.975
Epoch 42 Loss: -380.021
Epoch 43 Loss: -398.424
Epoch 44 