# Natural Language Processing

## Word2Vec
### CBOW

Let's work on skipgram-based implementation of word2vec.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pickle

## 1. Define some very simple data for understanding

In [2]:
with open('wordtotrain_use.atikeep','rb') as pic:
    corpus,vocab,word2index,index2word = pickle.load(pic)
flatten = lambda l: [item for sublist in l for item in sublist]
voc_size = len(vocab)


batch_size = 2

def prepare_seqeunce(seq, word2index):
    #map(fucntion, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_seqeunce(list(vocab),word2index).expand(batch_size, voc_size)
all_vocabs.shape


torch.Size([2, 5820])

## 2. Prepare train data

In [3]:
def random_batch(batch_size, corpus, window_size=1,architecture='skipgram',corpus_tokenized = corpus):
    skipgrams = []
    cbows = []
    #for each corpus
    for sent in corpus_tokenized:
        #for each sent ('apple', 'banana', 'fruit')
        for i in range(window_size,len(sent)-window_size): #start from 2 to second last
            context_word = []
            # print(sent[i])
            center_word = word2index[sent[i]]
            for j in range(window_size):
                outside_word = [word2index[sent[i-j-1]],word2index[sent[i+j+1]]] #window_size adjustable
                #here we want to create (banana, apple), (banana, fruit) append to some list
                for o in outside_word:
                    context_word.append(o)
                    skipgrams.append([center_word,o])
                cbows.append([context_word,center_word])

    if architecture == 'skipgram':
        arch = skipgrams
    else:
        arch = cbows
        
    #only get a batch, not the entire lsit
    random_index = np.random.choice(range(len(arch)),batch_size,replace=False)
    #appending some list of inputs and labels
    random_inputs, random_labels = [] , []
    for index in random_index:
        # print(arch[index])
        random_inputs.append([arch[index][0]]) #center words, this will be as shape of (1,) -> (1,1) for modeling
        random_labels.append([arch[index][1]])

    return np.array(random_inputs),np.array(random_labels)

### Model blueprint 

In [4]:
class CBOW(nn.Module): #same as skipgram
    def __init__(self,voc_size, emb_size):
        super(CBOW,self).__init__()
        self.embedding_center_word = nn.Embedding(voc_size, emb_size) #is a lookup table mapping all ids in voc_size, into some vector of size emb_size
        self.embedding_outside_word = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center_word, outside_word, all_vocabs):
        #center_word, outside_word: (batch_size,1)
        #all_vocabs : (batch_size, voc_size)
        #convert them into embedding
        center_word_embed = self.embedding_center_word(center_word)     #v_c (batch_size,1, emb_size)
        outside_word_embed = self.embedding_outside_word(outside_word)  #u_o (batch_size,1, emb_size)
        all_vocabs_embed = self.embedding_outside_word(all_vocabs)      #u_w (batch_size,voc_size, emb_size)
        # print(center_word_embed.shape,outside_word_embed.shape,all_vocabs_embed.shape)
        #bmm is basically @ or .dot but across batches (ie., ignore the batch dimension)
        top_term = outside_word_embed.bmm(center_word_embed.transpose(1,2)).squeeze(2)
        #(batch_size,1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) ===> (batch_size, 1)
        top_term_exp = torch.exp(top_term) #exp(uo vc)
        #(batch_size, 1)
        lower_term = all_vocabs_embed.bmm(center_word_embed.transpose(1,2)).squeeze(2)
        #(batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) ===> (batch_size, voc_size)
        lower_term_sum = torch.sum(torch.exp(lower_term)) #sum exp(uw, vc)
        #(batch_size, 1)
        loss_fn = -torch.mean(torch.log(top_term_exp/lower_term_sum))
        #(batc_size,1) / (batch_size,1) ==mena==> scalar
        return loss_fn

In [7]:
batch_size = 2 #why? no reason
emb_size = 2 #why? no reason; usually 50,100, 300 but 2 so we can plot (50 can also plot, but need PCA)
model = CBOW(voc_size,emb_size)
window_size=2
criterion = nn.CrossEntropyLoss() #-log
optimizer = optim.Adam(model.parameters(), lr=0.001)
input_batch, label_batch = random_batch(batch_size,corpus,window_size=2,architecture='cbow')
input_batch = torch.LongTensor(input_batch).view(batch_size,window_size*2)
label_batch = torch.LongTensor(label_batch)
input_batch.shape,label_batch.shape,all_vocabs.shape


(torch.Size([2, 4]), torch.Size([2, 1]), torch.Size([2, 5820]))

## 4. Training

In [8]:
import time
num_epochs = 500
#for epoch
start = time.time()
for epoch in range(num_epochs):
    
    #get random batch
    input_batch, label_batch = random_batch(batch_size,corpus,window_size=2,architecture='cbow')
    input_batch = torch.LongTensor(input_batch).view(batch_size,window_size*2)
    label_batch = torch.LongTensor(label_batch).view(-1,1)

    # print(input_batch.shape,label_batch.shape,all_vocabs.shape)
    # break

    #loss = model
    loss = model(input_batch,label_batch,all_vocabs)
    
    #backpropagate
    loss.backward()
    #update alpha
    optimizer.step()

    end = time.time()
    #print epoch loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {end-start}")
    # break

Epoch: 100 | cost: 11.299652 | time: 17.72466206550598
Epoch: 200 | cost: 10.979570 | time: 35.26954507827759
Epoch: 300 | cost: 11.599943 | time: 55.95752930641174
Epoch: 400 | cost: 11.446724 | time: 75.82228302955627
Epoch: 500 | cost: 11.278444 | time: 94.97708511352539


In [9]:
import pickle
with open('myhob_C_Bro.atikeep', 'wb') as handle:
    pickle.dump((model), handle)

## 5. Plotting the embeddings