# Word Embeddings: CBOW

In [1]:
# # connect to google drive and move to the selected directory
# from google.colab import drive
# drive.mount('/content/gdrive')
# %cd "/content/gdrive/Myµ Drive" #select the current working derectory
# !pwd 
# !ls 

In [82]:
#REQUIRED: ACTIVATE the gpu: (In the menu tabs, "Runtime" => "Change runtime type and select gpu) 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import trange
import matplotlib.pyplot as plt
torch.manual_seed(1)
CUDA = torch.cuda.is_available()
print("avaible GPUs:",torch.cuda.device_count())
# print("GPU name:",torch.cuda.get_device_name())

avaible GPUs: 0


In [104]:
text = '''We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells'''

with open("data/en.txt", "r") as f:
    text=f.read()


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 469: character maps to <undefined>

In [None]:

class CBOW(nn.Module):
    """"
    Word2Vec CBOW model
    """
    def __init__(self, vocab_size, embedding_size, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        
        if CUDA:
            self.embeddings = self.embeddings.cuda()
        self.linear1 = nn.Linear(embedding_size, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        
    def forward(self, inputs):
        out = self.embeddings(inputs).mean( dim=0).view((1, -1))
        
        out=self.linear1(out)
        out=F.relu(out)
        out=self.linear2(out)
        out=F.log_softmax(out,dim=1)
        return out

    def predict(self,input):
        ##
        return 0

def preprocess_text(text, context_size):
    '''
    Convert text to data:(context, target) for training cbow model

    Parameters:
        text (String): text to preprocess
        context_size (int): the context window 
    
    Return:
        data (tuple): data in form of (context, target)
        words_to_idx(dict): dict containing a mapping word->index
    '''
    text=text.lower().split()
    #build contexts and targets
    data = list()   
    for i in range(context_size, len(text) - context_size):
        context = [text[i+j] for j in range(-context_size, context_size+1) if i+j != i]
        target = text[i]  
        data.append((context, target))
    
    # map words to index
    vocab=set(text)
    words_to_idx = {w: i for i, w in enumerate(vocab)}
    return data,words_to_idx

def words_to_tensor(words,word_to_idx):
    '''
    Retrieve the indexes of given words

    Parameters:
        words (list of string): 
    
    Return:
        tensor (Tensor): tensor of indexes 
    '''

    tensor =torch.LongTensor([word_to_idx[word] for word in words])

    if CUDA:
        tensor = tensor.cuda()

    return tensor   


In [None]:
CONTEXT_SIZE=2
EMBEDDING_SIZE=300
EPOCHS=5
LEARNING_RATE = 0.001

model=CBOW(len(words_to_idx),EMBEDDING_SIZE,CONTEXT_SIZE) 
data,words_to_idx=preprocess_text(text,context_size=2)
idx_to_words = {v: k for k, v in words_to_idx.items()}

def train(model,data,words_to_idx):
    '''
    Train a model 

    Parameters:
        model (nn.module):
        data(list of tuples):  
        words_to_idx() :dict containing a mapping word->index
    Return:
        tensor (Tensor): 
    '''
    loss_func = torch.nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    losses=[]

    for epoch in trange(EPOCHS):
        total_loss=0
        for context,target in data:
            context_idx=words_to_tensor(context,words_to_idx)
            target_idx=words_to_tensor([target],words_to_idx)
            model.zero_grad
            output=model(context_idx)
            loss = loss_func(output, target_idx)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        losses.append(total_loss)
    
    
    plt.figure()
    plt.plot(losses)
    plt.show()
train(model,data,words_to_idx)

In [None]:

def get_prediction(context, model,words_to_idx,idx_to_words):
    """"
    Predict a word from a given context

    Parameters: 
        context(List of string): 
        model:
        words_to_idx(Tensor):
        idx_to_words(dict):dict containing a mapping index->word
    Return:
        (String): predicted word
    """
    model.eval()
    prediction = model(words_to_tensor(context, words_to_idx))
    _, index = torch.max(prediction, 1)
    return idx_to_words[index.item()]

def check_accuracy(model,data,words_to_idx,idx_to_words):

    """"
    Check accuracy

    Parameters: 
        data(list): list of tuples(context,target) 
        model:
        words_to_idx(dict):dict containing a mapping word->index
        idx_to_words(dict):dict containing a mapping index->word
    Return:
        (String): predicted word
    """

    correct = 0
    for context, target in data:
        prediction = get_prediction(context, model,words_to_idx,idx_to_words)
        if prediction == target:
            correct += 1
    return correct/len(data)

print(check_accuracy(model,data,words_to_idx,idx_to_words))

In [None]:
print(get_prediction(['as' ,'the','strange' ,'beings'],model,words_to_idx,idx_to_words))