# Imports

In [2]:
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter

import random
import json
import re
from sklearn.manifold import TSNE
from scipy import spatial
import matplotlib.pyplot as plt
import pickle
import copy
from tqdm import tqdm
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Seed torch's default generator; as the last expression of the cell this
# also echoes the returned Generator object.
torch.manual_seed(0)

<torch._C.Generator at 0x7766a1a81e30>

# Parameters

In [10]:
# no of projection matrices
k = 24

# no of dimentions in embedding
dim = 300

# no of negative samples
neg_sample_count = 5

# learning rate
lr = 0.001

batch_size = 32

vocab_size = len(vocab)

# 1A 2A 2B
subtask = "1A"

# training test
phase = "training"

# datafile
dataFilePath = f"/kaggle/input/inlp-project/{subtask}.english.{phase}.data.txt"

# goldfile
goldFilePath = f"/kaggle/input/inlp-project/{subtask}.english.{phase}.gold.txt"

# vocab
vocabFilePath = f"/kaggle/input/inlp-project/{subtask}.english.vocabulary.txt"


# Data loading

In [25]:
file = open(f"/kaggle/input/inlp-project/hypernym-hyponym-dictionaries_{subtask}.pkl",'rb')
parameters = pickle.load(file)
file.close()

vocab = parameters['vocab']
w2i = parameters['w2i']
i2w = parameters['i2w']


In [14]:
# Read the query terms: one line per query, tab-separated, query in the first
# column. Text is lower-cased here.
# NOTE(review): the original comment mentions one-/two-/three-gram
# preprocessing — presumably done upstream of this file; confirm.

data = []
with open(dataFilePath) as dataset:
    for line in tqdm(dataset):
        # Drop the line terminator before splitting so that a line without a
        # tab does not keep its trailing "\n" in the stored query.
        fields = line.rstrip('\n').lower().split('\t')
        data.append(fields[0])

print(len(data))

1500it [00:00, 595218.16it/s]

1500





In [16]:
# Gold hypernyms: each line of the gold file is lower-cased, stripped of
# surrounding whitespace and split on tabs, giving one list of strings per line.
with open(goldFilePath) as gold_file:
    gold = [
        gold_line.lower().strip().split('\t')
        for gold_line in tqdm(gold_file)
    ]
        

1500it [00:00, 402267.01it/s]


# Model Architecture

In [17]:
class HHD(nn.Module):
    """Projection-based scorer for (query, candidate hypernym) pairs.

    A query embedding is multiplied by each of the projection matrices in
    ``proj_mats``; the dot products between those projections and a candidate
    embedding give one feature per projection, which a linear layer maps to a
    single score.

    ``forward`` returns raw linear-layer outputs (no sigmoid), whereas
    ``similarity`` applies the sigmoid.

    The methods read shapes and the device from ``self.proj_mats`` instead of
    from notebook globals or new attributes, so they also work on instances
    restored by unpickling (which does not run ``__init__``).
    """

    def __init__(self, vocab_size, embedding_size, num_projections=None, proj_device=None):
        """Build the embedding table, projection matrices and output layer.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_size: embedding dimensionality; also the side length of
                every (square) projection matrix.
            num_projections: number of projection matrices. Defaults to the
                notebook-level ``k`` when omitted.
            proj_device: device on which ``proj_mats`` is created. Defaults to
                the notebook-level ``device`` when omitted.
        """
        super(HHD, self).__init__()
        # Fall back to the notebook globals only when no explicit value is
        # given, so existing HHD(vocab_size, dim) calls behave as before.
        if num_projections is None:
            num_projections = k
        if proj_device is None:
            proj_device = device

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # TODO: initialise from pretrained vectors, e.g.
        #       self.embedding.weight.data.copy_(trained_embeddings)

        self.output = nn.Linear(num_projections, 1)

        var = 2 / (embedding_size + embedding_size)

        # Each projection matrix starts as identity + small Gaussian noise.
        # NOTE(review): Tensor.normal_ takes (mean, std), so `var` is used as
        # the standard deviation here; a Glorot-style normal init would use
        # sqrt(var). Left unchanged to keep the initialisation identical.
        # NOTE(review): proj_mats is a plain tensor attribute, so it is not
        # returned by parameters() and is not part of state_dict(). If the
        # projections are meant to be trained it should become an
        # nn.Parameter; left as is to stay compatible with saved models.
        self.proj_mats = torch.zeros(
            [num_projections, embedding_size, embedding_size], dtype=torch.float32
        ).to(proj_device)
        self.proj_mats.normal_(0, var)
        # Broadcasting adds the identity to every one of the matrices.
        self.proj_mats += torch.eye(embedding_size, device=self.proj_mats.device)
        self.sigmoid = nn.Sigmoid()

    def similarity(self, query, cand_hypernym, bs):
        """Score one query against a batch of candidates.

        Args:
            query: index tensor holding a single word index (shape ``(1,)``).
            cand_hypernym: index tensor of candidate indices (shape ``(n,)``).
            bs: number of candidates; unused, kept for interface
                compatibility.

        Returns:
            Tensor of shape ``(n, 1)`` with sigmoid scores.
        """
        proj_mats = self.proj_mats
        query_emb = self.embedding(query)  # 1*d
        cand_emb = self.embedding(cand_hypernym)  # n*d

        # Project the query through every matrix: k*d*d x d*1 = k*d*1 -> k*d
        query_col = torch.transpose(query_emb, 0, 1).to(proj_mats.device)  # d*1
        projections = torch.squeeze(torch.matmul(proj_mats, query_col), 2)  # k*d

        # Dot product of each projection with each candidate: k*d x d*n = k*n
        features = torch.matmul(projections, torch.transpose(cand_emb, 0, 1))
        features = torch.transpose(features, 0, 1)  # n*k

        return self.sigmoid(self.output(features))  # n*1

    def forward(self, query, cand_hypernym, neg_hypernyms):
        """Score positive and negative candidates for a batch of queries.

        Args:
            query: index tensor of shape ``(bs,)``.
            cand_hypernym: index tensor of shape ``(bs,)`` (one positive
                candidate per query).
            neg_hypernyms: index tensor of shape ``(bs, ns)`` (negative
                samples per query).

        Returns:
            Tuple ``(simPosOutput, simNegsOutput)`` of shapes ``(bs, 1)`` and
            ``(bs, ns)``; both are raw linear outputs without sigmoid.
        """
        query_emb = self.embedding(query)  # bs*d
        cand_emb = self.embedding(cand_hypernym)  # bs*d
        neg_emb = self.embedding(neg_hypernyms)  # bs*ns*d

        # Project every query through every matrix in one broadcast matmul
        # instead of a per-sample Python loop with repeated torch.cat:
        # k*d*d x d*bs = k*d*bs, then reorder to bs*k*d.
        batch_proj = torch.matmul(
            self.proj_mats, torch.transpose(query_emb, 0, 1)
        ).permute(2, 0, 1)

        # Positive candidates: bs*k*d x bs*d*1 = bs*k*1 -> bs*k
        simPos = torch.squeeze(torch.bmm(batch_proj, torch.unsqueeze(cand_emb, 2)), 2)
        simPosOutput = self.output(simPos)  # bs*1

        # Negative samples: bs*ns*d x bs*d*k = bs*ns*k
        simNegs = torch.bmm(neg_emb, torch.transpose(batch_proj, 1, 2))
        simNegsOutput = torch.squeeze(self.output(simNegs), 2)  # bs*ns

        return simPosOutput, simNegsOutput
    
            

# Projection model

In [26]:
# torch.load unpickles the complete saved model object, so the HHD class must
# be defined above, but building a fresh HHD instance first is unnecessary:
# it would be overwritten immediately.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE(review): unpickling executes code from the file; only load checkpoints
# from a trusted source. Recent PyTorch releases default torch.load to
# weights_only=True, which rejects a pickled nn.Module; pass
# weights_only=False there.
projection_model = torch.load(
    "/kaggle/input/inlp-project/HH_Projection_model_1A.pt",
    map_location=device,
)
projection_model.eval()

HHD(
  (embedding): Embedding(219246, 300)
  (output): Linear(in_features=24, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [30]:
def predict(query, top_n=100):
    """Return the highest-scoring candidate words for a query word.

    Every vocabulary index from 1 to ``vocab_size - 1`` is scored against the
    query with ``projection_model.similarity``; index 0 is skipped
    (NOTE(review): presumably a padding/unknown slot — confirm).

    Args:
        query: the query word; must be a key of ``w2i``.
        top_n: maximum number of words to return (default 100).

    Returns:
        A list of at most ``top_n`` words ordered by descending score (ties
        broken by descending word), or the string
        ``"word not found in vocab"`` when the query is not in ``w2i``.
    """
    try:
        query_index = w2i[query]
    except (KeyError, TypeError):
        # Unknown (or unhashable) query: keep the original string return so
        # existing callers are unaffected.
        return "word not found in vocab"

    q = torch.tensor([query_index]).to(device)
    candidate_ids = torch.arange(1, vocab_size, device=device)

    # Inference only: skip building an autograd graph over the whole vocabulary.
    with torch.no_grad():
        scores = projection_model.similarity(q, candidate_ids, candidate_ids.shape[0])  # n*1

    # One transfer to Python floats instead of one float() call per candidate.
    score_list = scores.reshape(-1).tolist()

    ranked = sorted(
        ([score, vocab[index]] for index, score in enumerate(score_list, start=1)),
        reverse=True,
    )
    return [word for _, word in ranked[:max(top_n, 0)]]

In [29]:
# Example query: as the last expression of the cell, this displays whatever
# predict() returns for the word "figure".
predict("figure")

['foodborne_illness',
 'supraocular',
 'incurrence',
 'chinaberry_tree',
 'vegetate',
 'schwalbea',
 'dialectal',
 'iodine-131',
 'style_of_architecture',
 'vedic',
 'inlight',
 'losel',
 'semi-dry',
 'haliotis_rufescens',
 'sawgrass',
 'naslund',
 'unvaluable',
 'fly',
 'morphotactics',
 '185',
 'orchard',
 'quinquennial',
 'housewifization',
 'korotkoff_sounds',
 'monoarticular',
 'tryptamine',
 'pi',
 'string_along',
 'nonlinear_optics',
 'microbotryum',
 'sister-in-laws',
 'sodium_stibogluconate',
 'overking',
 'blind_person',
 'spener',
 'ferromagnetic',
 'archi',
 'bloodthirstily',
 'langenbach',
 'botrytis_cinerea',
 'batrachospermum',
 'apocrine_gland',
 'abscond',
 'buenoa',
 'meeuwsen',
 'second_deck',
 'sulzberger',
 'change_course',
 'population_commission',
 'nusselt_number',
 'vicar',
 'sulh',
 'charax',
 'moocher',
 'garishness',
 'racquet',
 'pillow_talk',
 'catenet',
 'nationalist_leader',
 'shot_hole',
 'myrtaceae',
 'splicer',
 'hemicrania',
 'practical_application',

In [None]:
# NOTE(review): `vo` is not defined anywhere in the visible notebook, so this
# cell would raise NameError if run — presumably leftover scratch code;
# confirm and remove.
print(vo)