In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt



# 1.Load data


In [5]:
# specify the sentences /corpus

corpus = ['apple banana fruit', 'banana apple fruit', 'banana fruit apple','dog cat animal', 'cat dog animal','cat animal dog']

# tokenize 
corpus_tokened = [sent.split(' ') for sent in corpus]
print(corpus_tokened)

# numericalize
faltten_func = lambda l: [item for sublist in l for item in sublist]

print('***'*15)
vocabs = list(set(faltten_func(corpus_tokened)))
print(vocabs)



[['apple', 'banana', 'fruit'], ['banana', 'apple', 'fruit'], ['banana', 'fruit', 'apple'], ['dog', 'cat', 'animal'], ['cat', 'dog', 'animal'], ['cat', 'animal', 'dog']]
*********************************************
['fruit', 'cat', 'dog', 'apple', 'animal', 'banana']


In [11]:
# assign id to all these vocab
word2idx = {v:idx for idx,v in enumerate(vocabs)}
vocabs.append('<UNK>')
word2idx['<UNK>'] = -1

In [12]:

idx2word = {v:k for k,v in word2idx.items()}

In [13]:
print(word2idx)
print(idx2word)


{'fruit': 0, 'cat': 1, 'dog': 2, 'apple': 3, 'animal': 4, 'banana': 5, '<UNK>': -1}
{0: 'fruit', 1: 'cat', 2: 'dog', 3: 'apple', 4: 'animal', 5: 'banana', -1: '<UNK>'}


## 2. Prep train data

In [17]:

# # ## move along the corpus 

# # skipgrams = []
# # for sent in corpus_tokened:
# #     for i in range(1,len(sent)-1):
# #         center_word = sent[i]
# #         outside_word = [sent[i-1],sent[i+1]]
# #         for o in outside_word:
# #             skipgrams.append([center_word,o]) 
# def random_batch(batch_size, word_sequence):
    
#     # Make skip gram of one size window
#     skip_grams = []
#     # loop each word sequencei
#     # we starts from 1 because 0 has no context
#     # we stop at second last for the same reason
#     for sent in corpus:
#         for i in range(1, len(sent) - 1):
#             target = word2idx[sent[i]]
#             context = [word2idx[sent[i - 1]], word2idx[sent[i + 1]]]
#             for w in context:
#                 skip_grams.append([target, w])
    
#     random_inputs = []
#     random_labels = []
#     random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
#     for i in random_index:
#         random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
#         random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
#     return np.array(random_inputs), np.array(random_labels)

In [33]:
# print(skipgrams)
#let's make what we have made into a function (batch function)
#return a batches of data, e.g., =2 --> ['banana', 'apple'], ['banana', 'fruit']
#also i want these batches to be id, NOT token   --> [5, 4]

def random_batch(batch_size, corpus_tokenized,word2index = word2idx):
    
    skipgrams = []

    #for each corpus
    for sent in corpus_tokenized:
        #for each sent ["apple", "banana", "fruit"]
        for i in range(1, len(sent) - 1): #start from 1 to second last
            center_word = word2index[sent[i]]
            outside_words = [word2index[sent[i-1]], word2index[sent[i+1]]]  #window_size = 1
            for o in outside_words:
                skipgrams.append([center_word, o])
                
    #only get a batch, not the entire list
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
             
    #appending some list of inputs and labels
    random_inputs, random_labels = [], []   
    for index in random_index:
        random_inputs.append([skipgrams[index][0]])  #center words, this will be a shape of (1, ) --> (1, 1) for modeling
        random_labels.append([skipgrams[index][1]])
        
    return np.array(random_inputs), np.array(random_labels)
    

In [27]:
random_batch(10, corpus_tokened,word2idx)

(array([[4],
        [2],
        [3],
        [3],
        [5],
        [5],
        [4],
        [1],
        [0],
        [1]]),
 array([[1],
        [1],
        [5],
        [0],
        [0],
        [3],
        [2],
        [4],
        [3],
        [2]]))

In [28]:
input, label = random_batch(5, corpus_tokened,word2idx)

print(f"{input=}")
print(f"{label=}")

input=array([[2],
       [3],
       [4],
       [2],
       [3]])
label=array([[1],
       [5],
       [1],
       [4],
       [0]])


## 3. model 

In [29]:
# #the model will accept three vectors - u_o, v_c, u_w
# #u_o - vector for outside words
# #v_c - vector for center word
# #u_w - vectors of all vocabs

# class Skipgram(nn.Module):
    
#     def __init__(self, voc_size, emb_size):
#         super(Skipgram, self).__init__()
#         self.embedding_center_word = nn.Embedding(voc_size, emb_size)  #is a lookup table mapping all ids in voc_size, into some vector of size emb_size
    
#     def forward(self, center_word, outside_word):
#         #center_word, outside_word: (batch_size, 1)
#         #all_vocabs: (batch_size, voc_size)
        
#         #convert them into embedding
#         center_word_embed = self.embedding_center_word(center_word)   #(batch_size, 1, emb_size)
        
#         return center_word_embed

In [6]:
#the model will accept three vectors - u_o, v_c, u_w
#u_o - vector for outside words
#v_c - vector for center word
#u_w - vectors of all vocabs

class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center_word  = nn.Embedding(voc_size, emb_size)  #is a lookup table mapping all ids in voc_size, into some vector of size emb_size
        self.embedding_outside_word = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center_word, outside_word, all_vocabs):
        #center_word, outside_word: (batch_size, 1)
        #all_vocabs: (batch_size, voc_size)
        
        #convert them into embedding
        center_word_embed  = self.embedding_center_word(center_word)     #(batch_size, 1, emb_size)
        outside_word_embed = self.embedding_outside_word(outside_word)   #(batch_size, 1, emb_size)
        all_vocabs_embed   = self.embedding_outside_word(all_vocabs)     #(batch_size, voc_size, emb_size)
        
        #bmm is basically @ or .dot , but across batches (i.e., ignore the batch dimension)
        top_term = outside_word_embed.bmm(center_word_embed.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) ===> (batch_size, 1)
        
        top_term_exp = torch.exp(top_term)  #exp(uo vc)
        #(batch_size, 1)
        
        lower_term = all_vocabs_embed.bmm(center_word_embed.transpose(1, 2)).squeeze(2)
         #(batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size)
         
        lower_term_sum = torch.sum(torch.exp(lower_term), 1) #sum exp(uw vc)
        #(batch_size, 1)
        
        loss_fn = -torch.mean(torch.log(top_term_exp / lower_term_sum))
        #(batch_size, 1) / (batch_size, 1) ==mean==> scalar
        
        return loss_fn

In [None]:
#preparing all_vocabs

batch_size = 1

def prepare_sequence(seq, word2index):
    #map(function, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs.shape