In [6]:
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
import random
import json
import re
from sklearn.manifold import TSNE
from scipy import spatial
import matplotlib.pyplot as plt
import pickle
import torch.nn.functional as F
import random
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)

<torch._C.Generator at 0x7e5d62e4fc90>

# Processing data and gold file

In [7]:
# preprocess data and gold file +  convert bi and trigrams to a underscore seperated word

subtask = "2B.music"
phase = "training"

f = open(f'/kaggle/input/inlp-project/{subtask}.{phase}.data.txt','r')
data = f.read()
f.close()
f = open(f'/kaggle/input/inlp-project/{subtask}.{phase}.gold.txt','r')
gold = f.read()
f.close()
f = open(f'/kaggle/input/inlp-project/{subtask}.vocabulary.txt','r')
Vocab = f.read()
f.close()

data = data.split('\n')
gold = gold.split('\n')
Vocab = Vocab.split('\n')

In [8]:
w2i = {}
i2w = {}
vocab = []
ind = 1

w2i['UNK'] = 0
i2w[0] = 'UNK'
vocab.append('UNK')


for line in tqdm(Vocab):
    line = line.lower()
    line = line.split(' ') 
    joined_word = ""
    for w in line:
        joined_word += w +"_"
    joined_word = joined_word[:-1]
    
    w2i[joined_word] = ind
    i2w[ind] = joined_word
    vocab.append(joined_word)
    ind += 1
    
        

100%|██████████| 69119/69119 [00:00<00:00, 606815.49it/s]


In [9]:
hyponyms = []

for line in data:
    line = line.lower()
    line = line.split("\t")
    line = line[0]
    line = line.split(" ")
    if len(line)>1:
        joined_word = ""
        for word in line:
            joined_word += word + "_"
        joined_word = joined_word[:-1]
        if joined_word not in w2i.keys():
            l = len(w2i.keys())
            w2i[joined_word] = l
            i2w[l] = joined_word
            vocab.append(joined_word)
        hyponyms.append(joined_word)
    else:
        hyponyms.append(line[0])
        joined_word = line[0]
        if joined_word not in w2i.keys():
            l = len(w2i.keys())
            w2i[joined_word] = l
            i2w[l] = joined_word
            vocab.append(joined_word)
hyponyms = hyponyms[:-1]

In [12]:
hyponyms[-10:]

['melodic_phrase',
 'hot_issue',
 'gavotte',
 'antiphon',
 'recapitulation',
 'fugazi',
 'nightshift',
 'solfeggio',
 'dance_pop',
 'zydeco']

In [28]:
hypernyms = []
for line in gold:
    line = line.lower()
    line = line.split("\t")
    temp_hypernyms = []
    for word in line:
        word = word.split(" ")
        if len(word)>1:
            joined_word = ""
            for w in word:
                joined_word += w + "_"
            joined_word = joined_word[:-1]
            if joined_word not in w2i.keys():
                l = len(w2i.keys())
                w2i[joined_word] = l
                i2w[l] = joined_word
                vocab.append(joined_word)
            temp_hypernyms.append(joined_word)
        else:
            temp_hypernyms.append(word[0])
            joined_word = word[0]
            if joined_word not in w2i.keys():
                l = len(w2i.keys())
                w2i[joined_word] = l
                i2w[l] = joined_word
                vocab.append(joined_word)
    hypernyms.append(temp_hypernyms)
    

In [29]:
w2i['exodus']

19507

In [30]:
all_hypernyms = set()

for line in hypernyms:
    for word in line:
        all_hypernyms.add(word)

all_hypernyms = list(all_hypernyms)
# all_hypernyms[:10]


### vocab

In [31]:
# a function for finding negative hypernyms of given hyponyms
# this function will return hyponym positive and negative hpyernyms in following manner
# given hyponym - 'ayush'
''' function should return - 
[

    [['man'],['neg11','neg12','neg13','neg14','neg15']],
    [['boy'],['neg21','neg22','neg23','neg24','neg25']],
    [['person'],['neg31','neg32','neg33','neg34','neg35']],
    [['student'],['neg41','neg42','neg43','neg44','neg45']],
    
    ]
    
    
    '''

num_neg_hypernyms = 5

def pos_neg_hypernyms(hyponym):

    try:
        index_in_data = hyponyms.index(hyponym)
        
    except:
        print(ind,len(hyponyms),hyponym)
    
    hypernyms_temp = hypernyms[index_in_data]
    num_hypernyms = len(hypernyms_temp)
    neg_hypernyms = []
    for i in range(num_hypernyms*num_neg_hypernyms):
        neg_h = all_hypernyms[random.randint(0,len(all_hypernyms)-1)]
        while neg_h in neg_hypernyms or neg_h == hyponym or neg_h in hypernyms_temp: 
            neg_h = all_hypernyms[random.randint(0,len(all_hypernyms)-1)]
            
        neg_hypernyms.append(neg_h)
    
    ans = []
    for i in range(num_hypernyms):
        ans_temp = []
        h_ind = w2i[hypernyms_temp[i]]
        ans_temp.append([h_ind])
        
                    
        neg_temp = []
        for j in range(i*5,i*5+5):
            neg_temp.append(w2i[neg_hypernyms[j]])
            
        ans_temp.append(neg_temp)
        
        ans.append(ans_temp)
    return ans
        

In [32]:
pos_neg_hypernyms(w2i['maliciousness'])

KeyError: 'maliciousness'

In [33]:
print( hyponyms.index('maliciousness'))
print(hypernyms[0])
print(w2i['malignity'])

ValueError: 'maliciousness' is not in list

# Step 1 -  Glove se embedding le rhe

In [34]:
# embed_dict = {}

# with open('/kaggle/input/glove-embeddings/glove.6B.300d.txt','r') as f:
#     for line in f:
#         values = line.split()
#         word = values[0]
#         vector = np.asarray(values[1:],'float32')
#         embed_dict[word]=vector

# embed_dict['oov'] = np.zeros(300)


f = open('/kaggle/input/word2vec/model.txt','r')
word2vec_pretrained = f.read()
word2vec_pretrained = word2vec_pretrained.split('\n')
word_emb = {}
for i,sent in tqdm(enumerate(word2vec_pretrained)):
    if i == 0 or i == len(word2vec_pretrained)-1:
        continue
    sent = sent.split(' ')
    word_tag = sent[0]
    word_tag = word_tag.split('_')
    word = word_tag[0]
    tag = word_tag[1]
    emb = sent[1:]
    word_emb[word] = emb

163475it [00:06, 23552.59it/s]


In [35]:
my_embed = torch.empty((len(w2i.keys()),300),dtype=torch.float32).to(device)

for i in tqdm(range(len(w2i.keys()))):
    try:
        my_embed[i] = tensor.torch(word_emb[i2w[i]])
    except:
        my_embed[i] = torch.randn(300) - 0.5
#     my_embed.append(x)
    
# my_embed = np.array(my_embed)

100%|██████████| 69214/69214 [00:02<00:00, 31704.77it/s]


In [36]:
print(type(my_embed[0]))
ayush = torch.tensor(my_embed).to(device)

<class 'torch.Tensor'>


  


## Model architecture

In [37]:
my_embed[0]

tensor([-2.3473, -0.1376,  1.3339, -0.1941, -0.0558, -0.9553, -0.2165, -0.7405,
        -0.2102, -2.5986, -0.6938, -0.9715,  0.2765, -0.3417, -2.3303, -1.9035,
         1.5834,  0.6114, -0.2227, -1.4244, -0.8406, -0.1344,  0.4327, -0.8914,
        -0.0413, -1.9956,  0.4505, -1.9739, -0.0778, -1.5297, -1.2487, -1.7162,
        -0.7949, -0.1387, -0.5110, -1.6256, -1.8286,  1.5264, -0.1992, -0.3642,
        -1.0611, -1.0926, -0.1016,  0.1592, -1.3246,  0.1706, -0.8517, -0.8840,
        -0.1951, -2.3112, -0.1781,  0.5027, -0.4036, -0.1619, -0.4024, -1.2881,
         0.9208,  0.5303,  0.7317, -2.1539,  0.1457, -2.0435,  0.2250, -0.9976,
         1.7259,  0.6078,  0.8222, -0.5802, -0.7488, -1.6067, -0.2956, -1.5322,
         0.1158, -0.4531, -0.0584, -0.1054, -0.5861, -0.5074, -1.0755, -0.3392,
        -2.1925, -1.3966,  0.3693, -0.4955,  0.0185, -0.3899,  0.9854, -0.0216,
        -0.3865, -0.8611, -1.3445, -1.2779,  1.6939,  0.7684, -0.3829,  0.4310,
        -0.5873, -1.6402, -0.2029, -1.34

In [38]:
class w2v_HH_embeddings(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(w2v_HH_embeddings, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.embedding.weight.data.copy_(my_embed) #to do
        
        self.linear1 = nn.Linear(embedding_size, 1)
        self.linear2 = nn.Linear(embedding_size, 1)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, hyponym, hypernym, neg_hypernym):
        # (bs,1) (bs,1) (bs,neg_sam)
        hyponym_embeddings = self.embedding(hyponym) # bs,1,300
        hypernym_embeddings = self.embedding(hypernym) # bs,1,300
        neg_hypernym_embeddings = self.embedding(neg_hypernym) # bs,5,300
        
#         similarity between hyponym and true hypernym
        pos_score = torch.mul(hyponym_embeddings, hypernym_embeddings) #bs,1,300
        pos_score = torch.squeeze(pos_score, 1)#bs,300  
        pos_score = self.linear1(pos_score)#bs,1        
        pos_score = -F.logsigmoid(pos_score) #bs,1
        pos_score = torch.squeeze(pos_score,1) #bs

#         similarity between hyponym and true neg hypernym
        hyponym_embeddingsT = torch.transpose(hyponym_embeddings, 1, 2) #bs,300, 1
        neg_score = torch.bmm(neg_hypernym_embeddings, hyponym_embeddingsT) #bs,5,1
        neg_score = torch.squeeze(neg_score, 2)#bs,5
        neg_score = -F.logsigmoid(-neg_score) #bs,5
        neg_score = torch.sum(neg_score,dim=1) # bs
        total_score = torch.mean(pos_score + neg_score)
        return total_score

# Parameters

In [39]:
vocab_size = len(i2w.keys())
epochs = 50
batch_size = 32

In [40]:
model = w2v_HH_embeddings(vocab_size,300)
model.to(device)
optimizer = optim.Adam(model.parameters(),lr=0.0001)

# Training the embeddings

In [41]:
print(len(w2i.keys()),len(vocab))

69214 69214


In [42]:
for epoch in range(epochs):
    hypernym_batch = []
    hyponym_batch = []
    neg_hypernym_batch = []
    running_loss = []
    
    for i,hyponym in tqdm(enumerate(hyponyms)):
#         ind = w2i[hyponyms[i]]
        temp = pos_neg_hypernyms(hyponym)
        
        for a_list in temp:
            # (bs,1) (bs,1) (bs,neg_sam)
            hyponym_batch.append([ind]) #bs*1
            
            hypernym_batch.append(a_list[0]) # bs*1
            neg_hypernym_batch.append(a_list[1]) # bs*5
            
            if len(hyponym_batch) == batch_size:
                a = torch.tensor(hyponym_batch).to(device) # bs*1
                
                b = torch.tensor(hypernym_batch).to(device) # bs*1
                
                c = torch.tensor(neg_hypernym_batch).to(device) # bs*5
                
                loss = model(a,b,c)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                running_loss.append(loss.item())
                
                hyponym_batch.clear()
                hypernym_batch.clear()
                neg_hypernym_batch.clear()
                
    epoch_loss = np.mean(running_loss)
    print("training epoch_loss is", epoch_loss)
                
#         a = torch.tensor(hyponym_batch)
#         b = torch.tensor(hypernym_batch)
#         c = torch.tensor(neg_hypernym_batch)
#         similarity = model(a,b,c)
''' function should return - 
[

    [['man'],['neg11','neg12','neg13','neg14','neg15']],
    [['boy'],['neg21','neg22','neg23','neg24','neg25']],
    [['person'],['neg31','neg32','neg33','neg34','neg35']],
    [['student'],['neg41','neg42','neg43','neg44','neg45']],
    
    ]
    
    
    '''

499it [00:01, 431.15it/s]


training epoch_loss is 336.03765061322383


499it [00:01, 436.15it/s]


training epoch_loss is 313.1902713551241


499it [00:01, 432.86it/s]


training epoch_loss is 291.8517249612247


499it [00:01, 438.70it/s]


training epoch_loss is 269.1547970042509


499it [00:01, 442.03it/s]


training epoch_loss is 248.03088387882008


499it [00:01, 437.95it/s]


training epoch_loss is 227.13322358972886


499it [00:01, 434.15it/s]


training epoch_loss is 204.3816203397863


499it [00:01, 435.37it/s]


training epoch_loss is 182.01185204001035


499it [00:01, 424.57it/s]


training epoch_loss is 162.69990369011373


499it [00:01, 433.38it/s]


training epoch_loss is 141.3122401966768


499it [00:01, 432.63it/s]


training epoch_loss is 121.30799663768096


499it [00:01, 435.98it/s]


training epoch_loss is 101.20090103149414


499it [00:01, 438.31it/s]


training epoch_loss is 84.5836877710679


499it [00:01, 437.60it/s]


training epoch_loss is 69.41216924330767


499it [00:01, 431.81it/s]


training epoch_loss is 55.03534128525678


499it [00:01, 443.35it/s]


training epoch_loss is 43.96806541891659


499it [00:01, 443.60it/s]


training epoch_loss is 34.33574177237118


499it [00:01, 438.59it/s]


training epoch_loss is 26.541035079956053


499it [00:01, 423.17it/s]


training epoch_loss is 20.085487898658304


499it [00:01, 414.80it/s]


training epoch_loss is 14.573886910606833


499it [00:01, 399.85it/s]


training epoch_loss is 10.814496332056382


499it [00:01, 440.57it/s]


training epoch_loss is 7.898683971517226


499it [00:01, 439.74it/s]


training epoch_loss is 5.584952235221863


499it [00:01, 439.86it/s]


training epoch_loss is 4.256281128350426


499it [00:01, 438.15it/s]


training epoch_loss is 3.0173813285196527


499it [00:01, 436.25it/s]


training epoch_loss is 2.1856285459855025


499it [00:01, 437.48it/s]


training epoch_loss is 1.5370405879090814


499it [00:01, 423.59it/s]


training epoch_loss is 1.1260290913283826


499it [00:01, 432.02it/s]


training epoch_loss is 0.8013097006608458


499it [00:01, 438.03it/s]


training epoch_loss is 0.6005954576108385


499it [00:01, 438.97it/s]


training epoch_loss is 0.505729165213073


499it [00:01, 439.22it/s]


training epoch_loss is 0.4913382186280454


499it [00:01, 440.24it/s]


training epoch_loss is 0.3614407949587878


499it [00:01, 438.80it/s]


training epoch_loss is 0.34079958696247026


499it [00:01, 439.98it/s]


training epoch_loss is 0.2754429548047483


499it [00:01, 441.08it/s]


training epoch_loss is 0.2503763465046444


499it [00:01, 437.01it/s]


training epoch_loss is 0.2048667786478558


499it [00:01, 422.53it/s]


training epoch_loss is 0.2052698367521824


499it [00:01, 440.14it/s]


training epoch_loss is 0.14010505785015137


499it [00:01, 438.72it/s]


training epoch_loss is 0.15102387710255297


499it [00:01, 438.16it/s]


training epoch_loss is 0.10606016990838243


499it [00:01, 440.25it/s]


training epoch_loss is 0.09949690085452269


499it [00:01, 441.11it/s]


training epoch_loss is 0.06850692567360751


499it [00:01, 439.71it/s]


training epoch_loss is 0.04045850287246353


499it [00:01, 439.44it/s]


training epoch_loss is 0.021113137025660013


499it [00:01, 434.20it/s]


training epoch_loss is 0.010848208074457943


499it [00:01, 422.37it/s]


training epoch_loss is 0.007108109299203053


499it [00:01, 360.81it/s]


training epoch_loss is 0.005947790813961011


499it [00:01, 440.10it/s]


training epoch_loss is 0.005055907646687154


499it [00:01, 438.87it/s]

training epoch_loss is 0.004283288653994746





" function should return - \n[\n\n    [['man'],['neg11','neg12','neg13','neg14','neg15']],\n    [['boy'],['neg21','neg22','neg23','neg24','neg25']],\n    [['person'],['neg31','neg32','neg33','neg34','neg35']],\n    [['student'],['neg41','neg42','neg43','neg44','neg45']],\n    \n    ]\n    \n    \n    "

In [43]:
torch.save(model, '/kaggle/working/hypernym-hyponym-embeddings_training.pt')

In [45]:
saved_embeddings = {}
for i in tqdm(range(1,len(i2w.keys()))):#(len(index2word)):
    word = i2w[i]
    saved_embeddings[word] = model.embedding.weight[i].detach().cpu().numpy()

with open('hypernym-hyponym-embeddings_2B.pkl','wb') as f:
    pickle.dump(saved_embeddings,f)

100%|██████████| 69213/69213 [00:02<00:00, 23812.00it/s]


In [46]:
parameters = {}

parameters['vocab'] = vocab
parameters['i2w'] = i2w
parameters['w2i'] = w2i
parameters['hypernyms'] = hypernyms
parameters['hyponyms'] = hyponyms

with open('hypernym-hyponym-dictionaries_2B.pkl','wb') as f:
    pickle.dump(parameters,f)

In [None]:
a = torch.tensor([[1,2,3]])
b = torch.tensor([[1,1,1],[2,2,2]])
ans = torch.mul(b,a)
print(a)
print(b)
print(ans)

In [None]:
embedding = nn.Embedding(100, 30)
x = torch.tensor([[5],[3],[4]])
z = embedding(x)
y = embedding(x)
sim = torch.mul(z,y)
sim = torch.squeeze(sim, 1)#bs,300
sim = torch.sum(sim,dim = 1)
print(sim)
a = nn.Sigmoid()
sim = a(torch.tensor([[-9]]))
print(sim.shape)
print(sim)


In [None]:
a = torch.randn(10,5)
b = torch.randn(10,5)
print(a.shape)

print(torch.sum(a,dim = 1).shape)

In [None]:
x = np.random.rand(300) - 0.5
x