# Project 4: **Implementing Word2Vec**
- Creating embedding layer weights to be used in the RNN models

In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize

In [2]:
df_corpus = pd.read_csv("DataFrames/df_corpus_pfams.csv")
df_corpus.head()                      

Unnamed: 0,text,label
0,PF02518 PF00512 PF13561 PF00106 PF08659 PF0050...,0
1,PF07728 PF00004 PF13165 PF13353 PF04055 PF1339...,0
2,PF00072 PF00501 PF00550 PF00890 PF13450 PF0137...,0
3,PF00005 PF13561 PF00106 PF08659 PF00501 PF0010...,1
4,PF00440 PF08541 PF01551 PF01613 PF03050 PF0162...,0


In [3]:
df_corpus.shape

(444, 2)

In [4]:
type(df_corpus.text[0]), len(df_corpus.text[0])

(str, 303)

### Each row of `df_corpus` is One Big string consists of words
- It's not a list of words

### Let's check for duplicates

In [5]:
# finds the number of occurance of each element in a given row
def duplicate_finder(df, row_number):
    from collections import Counter

    pfams2  = []
    for txt in df["text"]:
        row = []
        row.extend(word_tokenize(txt))
        pfams2.append(row)

    return Counter(pfams2[row_number]).most_common()

In [6]:
# all numbers should be 1 otherwise, we have duplicates which is not correct!
duplicate_finder(df_corpus, 200)

[('PF00072', 1),
 ('PF00005', 1),
 ('PF00501', 1),
 ('PF13450', 1),
 ('PF13193', 1),
 ('PF02775', 1),
 ('PF04397', 1),
 ('PF14501', 1),
 ('PF07992', 1),
 ('PF00070', 1),
 ('PF14691', 1),
 ('PF00682', 1),
 ('PF00037', 1),
 ('PF13738', 1),
 ('PF04647', 1),
 ('PF01195', 1),
 ('PF02559', 1),
 ('PF04851', 1),
 ('PF00270', 1),
 ('PF00271', 1),
 ('PF03461', 1),
 ('PF01558', 1),
 ('PF01855', 1),
 ('PF02776', 1),
 ('PF01063', 1),
 ('PF09992', 1),
 ('PF01095', 1)]

In [8]:
def find_duplicates(df, numbers_to_check = 3, max_rows = 100):
    
    print(f"Checking {numbers_to_check} random rows:")    
    for i in range(numbers_to_check):
            print()
            row_number = np.random.randint(max_rows)
            print("checking row number:", row_number, end = " ") 
            for j in range(len(duplicate_finder(df, row_number))):
                if j%10 == 0:
                    print(".", end = " " )
                if duplicate_finder(df, row_number)[j][1] > 1:
                    print("duplicate found")
                    break
    print()
    print()
    print("No duplicate found")        

In [9]:
find_duplicates(df_corpus, numbers_to_check=3, max_rows=444)

Checking 3 random rows:

checking row number: 152 . . . . . . . 
checking row number: 164 . . . 
checking row number: 385 . . . . . 

No duplicate found


### Let's create a `corpus` which is a list of lists
- It should have 444 elements which each is a list of words in each article
- Each row is called an artice

In [9]:
import itertools
import re
import nltk
from nltk import word_tokenize

corpus = []
corpus = [word_tokenize(article) for article in df_corpus.text]

In [10]:
len(corpus)

444

In [11]:
len(corpus[100])

21

In [12]:
corpus[100]

['PF13165',
 'PF13353',
 'PF04055',
 'PF00890',
 'PF02502',
 'PF13394',
 'PF13186',
 'PF00156',
 'PF07992',
 'PF14681',
 'PF13616',
 'PF00639',
 'PF00849',
 'PF01624',
 'PF00488',
 'PF05192',
 'PF05190',
 'PF05188',
 'PF06133',
 'PF01938',
 'PF00919']

### Vocabulary is a list of all unique words in corpus
- We have 1456 unique words in our vocabulary

In [13]:
vocabulary = set(itertools.chain.from_iterable(corpus))

In [14]:
len(vocabulary)

1456

### `word_counts` is a dictionary of all words and their frequencies 
- (number of occrance)

In [15]:
from collections import Counter
word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))

len(word_counts)

1456

In [16]:
# test an example
# print the numbr of occurance of an example
print(f"The frequency (number of occrance) for 'PF13394' is: {word_counts['PF13394']}")

The frequency (number of occrance) for 'PF13394' is: 70


In [17]:
# show the index of a given word
word_to_index = {w: idx for (idx, w) in enumerate(vocabulary)}

# show the actual word for a given index
index_to_word = {idx: w for (idx, w) in enumerate(vocabulary)}

In [18]:
word_to_index['PF13394']

1398

In [81]:
index_to_word[1398]

'PF13394'

In [20]:
# to check these 2 lengthes are equal
assert len(word_to_index) == len(index_to_word), print("something's wrong!")

In [82]:
# first 20 elemets in word_to_index
dict(itertools.islice(word_to_index.items(), 20))

{'PF07993': 0,
 'PF06397': 1,
 'PF01032': 2,
 'PF13604': 3,
 'PF13384': 4,
 'PF01434': 5,
 'PF02837': 6,
 'PF05594': 7,
 'PF00588': 8,
 'PF16347': 9,
 'PF00122': 10,
 'PF00772': 11,
 'PF08843': 12,
 'PF10423': 13,
 'PF00216': 14,
 'PF00441': 15,
 'PF06874': 16,
 'PF02588': 17,
 'PF08659': 18,
 'PF03061': 19}

In [89]:
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
#         if gain < self.min_percent_gain:
#             return True
        if min(self.loss_list) < 1:
            return True
        
#         else:
#             return False
        

In [90]:
import random

def get_batches(context_tuple_list, batch_size=100):
    
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context, batch_negative = [], [], []
    
    for i in range(len(context_tuple_list)):
        batch_target.append(word_to_index[context_tuple_list[i][0]])
        batch_context.append(word_to_index[context_tuple_list[i][1]])
        batch_negative.append([word_to_index[w] for w in context_tuple_list[i][2]])
        if (i+1) % batch_size == 0 or i == len(context_tuple_list)-1:
            tensor_target = torch.from_numpy(np.array(batch_target)).long()
            tensor_context = torch.from_numpy(np.array(batch_context)).long()
            tensor_negative = torch.from_numpy(np.array(batch_negative)).long()
            batches.append((tensor_target, tensor_context, tensor_negative))
            batch_target, batch_context, batch_negative = [], [], []
    return batches

### Creating negative samples

In [91]:
from numpy.random import multinomial

def sample_negative(sample_size):
    sample_probability = {}
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                 word_list.append(words[index])
        yield word_list

### Creating tuples of `(target, context)` and 8 negative samples

In [92]:
import numpy as np

context_tuple_list = []
w = 4 # window size of 4
negative_samples = sample_negative(8) # cerates 8 random samples for negative sampling

print("Adding to the context_tuple_list:")
for article in corpus:
    for i, word in enumerate(article):
        if i%5000 == 0 :
            print(".", end = ' ')
        
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(article))
        
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:                
                context_tuple_list.append((word, article[j], next(negative_samples)))
print()                
print("There are {} pairs of target and context words".format(len(context_tuple_list)))


Adding to the context_tuple_list:
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
There are 80277 pairs of target and context words


### Save and Load a list
- `context_tuple_list` is a list of tuples of target, context and 8 negative samples

In [93]:
import pickle

with open("context_tuple_list.txt", "wb") as fp:   #Pickling
    pickle.dump(context_tuple_list, fp)

In [94]:
import pickle

with open("context_tuple_list.txt", "rb") as fp:   # Unpickling
    context_tuple_list = pickle.load(fp)

In [95]:
context_tuple_list[20]
# the 1st element is the target wird
# the 2nd element is the context word
# the 3rd element is a list of 8 negative samples (incorrect contex words)

('PF08659',
 'PF13561',
 ['PF13279',
  'PF03444',
  'PF00037',
  'PF13673',
  'PF01068',
  'PF02901',
  'PF07486',
  'PF02361'])

In [86]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class Word2Vec(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        emb_product = torch.mul(emb_target, emb_context) # element-wise multipicatipn(batch_size, emb)
        emb_product = torch.sum(emb_product, dim=1) # sum of all elements in each row (1, batch_dize)
        out = torch.sum(F.logsigmoid(emb_product))
        emb_negative = self.embeddings_context(negative_example)
        
        # torch.bmm: Performs a batch matrix-matrix product of matrices
        # both must be 3-D tensors each containing the same number of matrices.
        emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
        emb_product = torch.sum(emb_product, dim=1)
        out += torch.sum(F.logsigmoid(-emb_product))
        return -out

In [87]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### We set the `embedding_size = 200`

In [None]:
import time

vocabulary_size = len(vocabulary)

net = Word2Vec(embedding_size=200, vocab_size=vocabulary_size).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())

early_stopping = EarlyStopping(patience=5, min_percent_gain=0.5)

while True:
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for i in range(len(context_tuple_batches)):
        net.zero_grad()
        target_tensor, context_tensor, negative_tensor = context_tuple_batches[i]
        
        target_tensor = target_tensor.to(device)
        context_tensor = context_tensor.to(device)
        negative_tensor = negative_tensor.to(device)
        
        loss = net(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print("Loss: ", np.mean(losses))
    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break

### Save the the calculated weights of the model `state_dict`


In [None]:
torch.save(net.state_dict(), "linear_model.pkl")

In [26]:
vocabulary_size = len(vocabulary)

# re-create the model with the same parameters
net_loaded = Word2Vec(embedding_size=200, vocab_size=vocabulary_size)


# load the savded state_dict
net_loaded.load_state_dict(torch.load('linear_model.pkl'))

print(net_loaded)

Word2Vec(
  (embeddings_target): Embedding(1456, 200)
  (embeddings_context): Embedding(1456, 200)
)


In [27]:
import numpy as np

def get_closest_word(word, topn=5):
    word_distance = []
    emb = net_loaded.embeddings_target.to('cpu') # embeddings_target is a layer's name in net 
    pdist = nn.PairwiseDistance()
    i = word_to_index[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long)
    v_i = emb(lookup_tensor_i)
    for j in range(len(vocabulary)):
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j)
            word_distance.append((index_to_word[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]

In [28]:
# the 10th word in the 6th element (row) of the corpus
corpus[6][10]

'PF07993'

In [29]:
# this function gives the embedding of any given tensor
def get_embedding(example):  
    word_idx = word_to_index[example]
    tensor_i = torch.tensor([word_idx], dtype=torch.long)
    emb = net_loaded.embeddings_target.to('cpu')
    v_i = emb(tensor_i)
    print(v_i.shape)
    print(v_i)

In [30]:
example = corpus[55][0]
example

'PF00501'

In [31]:
get_embedding('PF00501')

torch.Size([1, 200])
tensor([[ 0.7189, -0.2708,  0.7607, -0.4706,  0.2762,  0.1573, -1.1624,  2.3537,
          0.5249, -1.9238,  0.9707,  0.6295, -0.0648, -1.4088,  0.0157, -0.1950,
         -0.6292, -0.3629, -0.7835,  1.2085,  0.7593, -0.1814, -0.8467,  0.7955,
         -0.7397, -0.5660, -0.3464, -1.8314, -0.1965, -0.0588,  0.0325, -0.0115,
          0.0462, -0.6450, -1.6439,  0.1170,  0.4583, -0.6088, -0.8033,  0.0345,
          0.1159, -0.7268, -0.9968,  1.1516, -0.4881, -0.9967, -0.0648,  0.2730,
          0.4092,  0.6889, -0.3140,  0.6734,  2.0125, -0.2978,  0.5582,  1.1533,
          0.1928,  0.2110,  0.7036, -0.7814, -1.3970, -0.4693, -0.3137,  0.0097,
          1.0269,  0.7430, -0.6164, -0.7070,  1.3978,  0.6560,  0.4312,  0.9657,
          1.4866,  1.1634, -1.3779, -0.4978, -0.3533,  0.6932,  1.4032, -1.6677,
         -1.1841, -0.3967,  1.1742, -0.0981, -0.0533,  0.0421,  1.3512,  1.0210,
          0.8692, -0.3518, -1.5903, -1.0461, -1.2040, -0.9503,  0.0405,  0.2029,
       

In [33]:
net_loaded.embeddings_target

Embedding(1456, 200)

In [32]:
index_to_word[1]

'PF06397'

In [34]:
import torch
import torch.nn as nn

# Get embeddings for the word at index 1 which is 'PF13742'
given_tensor = torch.LongTensor([1]) 

emb = net_loaded.embeddings_target.to('cpu')
emb(given_tensor)

tensor([[ 0.5359, -0.7684,  0.7282, -0.7458,  0.2250,  0.8369, -0.1053,  0.6144,
         -1.5434,  1.3837, -0.0328,  0.8713,  1.0123,  1.0426, -1.1505, -0.0632,
         -0.1059, -0.7121,  0.0374, -0.4777, -0.5480,  1.1887, -0.1980,  2.3879,
         -0.0540, -0.2098, -1.9747,  0.5113,  0.0321,  0.3041, -0.2379, -1.8359,
          0.9093,  0.5847, -2.1586, -1.2363,  0.9626, -1.5322,  0.1414, -1.0482,
         -0.1414, -1.6646,  0.0410,  0.2592,  0.3366, -0.2584,  1.6519, -0.3203,
         -0.8890,  1.0372,  1.7368, -0.4734,  0.5459,  0.5644, -0.5998, -0.8241,
          0.2712,  0.2275, -0.8104,  0.4923, -0.3675, -0.7963, -0.3968,  0.7664,
         -0.1714, -0.2046,  0.8938, -0.4721,  1.4047, -1.3800,  1.3890, -0.0399,
          0.0359,  0.8458,  0.5668, -1.0664,  0.6646,  1.3218,  0.1220, -1.8032,
          0.9963, -1.0555,  1.2964, -0.7797,  0.5181, -0.4212, -0.0137, -1.5138,
          0.6439,  1.1263, -0.2956,  1.1352,  0.6468,  0.1048,  0.5438,  0.3626,
         -0.9292, -0.4250,  

In [44]:
# m.weight contains the embedding weights.
embed_weights = net_loaded.embeddings_target.weight.detach().numpy()
embed_weights

array([[-1.1012517 , -0.73667127, -0.88576293, ..., -0.28950164,
        -1.2180898 , -1.6013262 ],
       [ 0.5359053 , -0.7683729 ,  0.72822714, ..., -0.77044475,
         0.42743716, -0.08898509],
       [ 0.12467647, -0.25790042,  0.16013083, ..., -0.21123819,
        -2.0667732 , -1.626157  ],
       ...,
       [-0.60910374,  0.6216965 , -0.8842806 , ..., -0.01349114,
         0.7565897 , -0.4376295 ],
       [ 0.02750395, -1.6208653 , -0.01751652, ..., -0.21836942,
        -0.3727686 , -0.03830582],
       [-0.3886597 , -0.9766208 , -0.46216154, ..., -0.43965217,
        -0.79844314,  0.03470441]], dtype=float32)

In [58]:
# vocab_dim
len(embed_weights)

1456

In [60]:
# embed_dim
len(embed_weights[0])

200

In [67]:
print(f"""We have {len(embed_weights)} embeddings which is equal to the total number of rows  
and each embedding has 200 values (a tensor of size (1x 200)) which is equal to the embedding_dim we set before.""")

We have 1456 embeddings which is equal to the total number of pfams
and each embedding has 200 values (a tensor of size (1x 200)) which is equal to the embedding_dim we set before.


### Save and Load embedding weights

In [45]:
np.savetxt('embed_weights.txt', embed_weights)

In [46]:
embed_weights = np.loadtxt('embed_weights.txt')
embed_weights

array([[-1.10125172, -0.73667127, -0.88576293, ..., -0.28950164,
        -1.21808982, -1.60132623],
       [ 0.5359053 , -0.76837289,  0.72822714, ..., -0.77044475,
         0.42743716, -0.08898509],
       [ 0.12467647, -0.25790042,  0.16013083, ..., -0.21123819,
        -2.06677318, -1.62615705],
       ...,
       [-0.60910374,  0.62169647, -0.88428062, ..., -0.01349114,
         0.75658971, -0.43762949],
       [ 0.02750395, -1.62086535, -0.01751652, ..., -0.21836942,
        -0.37276861, -0.03830582],
       [-0.38865969, -0.97662079, -0.46216154, ..., -0.43965217,
        -0.79844314,  0.03470441]])

### Let's test the embedding weights for a given tensor

In [79]:
import torch
import torch.nn as nn

vocabulary_size = len(vocabulary) # 1456

# FloatTensor containing pretrained weights
tensor_weight = torch.FloatTensor(embed_weights)
embedding = nn.Embedding(vocabulary_size, 200).from_pretrained(tensor_weight)

# Get embeddings for index 100
given_tensor = torch.LongTensor([100])
embedding(given_tensor)

tensor([[ 1.9294, -0.4915,  1.0318, -0.4365, -0.5282, -1.0763, -1.0245, -0.3804,
         -0.1479, -1.5536,  1.7433, -1.4383, -0.4809,  1.1201,  1.1437,  0.0459,
          0.3696, -2.1861, -0.5929,  1.8557, -1.2914,  1.4966,  0.8013, -0.5334,
         -0.9920, -0.0042,  0.1584,  0.4800, -0.6065, -0.1880, -0.1205, -0.3557,
          0.8604,  0.1648,  1.1942,  0.5635, -0.1705,  0.9610, -1.2785, -0.4699,
          1.1572,  2.0063, -0.3878,  0.6736, -0.4867, -1.7404, -0.5976,  0.0538,
          0.3461,  0.8936, -0.4225, -0.2287, -0.6770, -1.5320,  1.0000,  0.8466,
         -0.8438, -0.1719,  0.8197,  0.7720, -0.6393, -0.7313,  0.0736,  0.9093,
         -1.1983, -0.6063,  0.8366, -0.1000,  0.4303,  0.1646,  0.6053,  0.2938,
          0.7185,  1.8824,  0.7304, -0.1334, -0.5072,  0.1319, -0.4822, -0.5414,
          1.6454,  0.5448, -0.8725,  0.7131, -0.2712,  0.7965,  0.9743,  1.2016,
          0.3105,  1.3030, -0.8468,  0.2280, -0.8793,  1.2977, -0.1493, -0.5487,
         -1.7064, -0.1393, -

### Now we use these weights in the embedding layer of our model in the next project