In [52]:
#importing req modules
import numpy as np
import math
import pandas as pd
from gensim.models import Word2Vec

In [53]:
#reading every line of the similarity dataset (utf8 text, one comma separated record per line)
with open('hindi.txt','r',encoding="utf8") as fp:
    l=list(fp)
#splitting each record on commas; the very last line of the file is left out on purpose
#NOTE(review): presumably the last line is a blank/trailer line - confirm against hindi.txt
#the last field of each record still carries its line ending
pairs=[record.split(',') for record in l[:-1]]
#similarity thresholds used for the evaluation runs
thresholds=[0.4,0.5,0.6,0.7,0.8]

In [54]:
#function to load the glove embeddings
def load_glove_embeddings(File):
    """Load word vectors from a whitespace separated text file.

    Every non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as the components of its vector.
    The file is opened in binary mode, so the dictionary keys are ``bytes``
    objects (look a word up with ``word.encode()``).

    Parameters
    ----------
    File : str or path-like
        Path of the embeddings file.

    Returns
    -------
    dict
        Maps each word (``bytes``) to a 1-D ``numpy`` array of ``float64``.
        If a word occurs on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    #to store embeddings
    glove_embeddings = {}
    #opening glove file
    with open(File,'rb') as f:
        for line in f:
            #splitting on any whitespace
            split_line = line.split()
            #blank or whitespace-only lines have no word token; skipping them
            #avoids an IndexError on e.g. a trailing empty line
            if not split_line:
                continue
            #first token is the word, the rest is its embedding
            word = split_line[0]
            glove_embeddings[word] = np.array(split_line[1:], dtype=np.float64)
    #returning the embeddings
    return glove_embeddings

In [55]:
#function to compute cosine similarity when give two vectors of same dim
def cos_sim(v1,v2):
    """Return the cosine similarity of two 1-D vectors of equal length.

    Parameters
    ----------
    v1, v2 : sequence of numbers or 1-D numpy array
        The two vectors; they are converted to ``float64`` before computing.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero norm
        (including empty vectors) the similarity is undefined and ``0.0`` is
        returned instead of dividing by zero.

    Raises
    ------
    ValueError
        If the inputs are not 1-D or do not have the same length.
    """
    a1=np.asarray(v1,dtype=np.float64)
    a2=np.asarray(v2,dtype=np.float64)
    #a length mismatch would otherwise be silently truncated or fail with an unrelated error
    if a1.ndim!=1 or a1.shape!=a2.shape:
        raise ValueError('cos_sim expects two 1-D vectors of the same length, got shapes '
                         +str(a1.shape)+' and '+str(a2.shape))
    #getting the denominator
    denom=float(np.linalg.norm(a1)*np.linalg.norm(a2))
    #zero vector: cosine is undefined, report "no similarity"
    if denom==0.0:
        return 0.0
    #returning cos sim value
    return float(np.dot(a1,a2)/denom)

In [56]:
#function to calculate accuracy when given the embedding model,embedding data,threshold as parameters
def cal_acc(m_type,data,threshold,dimension):
    """Score one set of embeddings on the word pairs and write the result to a CSV file.

    For every ``(word1, word2, score)`` record in the module level ``pairs``
    list the cosine similarity of the two word vectors is computed with
    ``cos_sim``. A pair is labelled 1 when the similarity and the ground truth
    fall on the same side of the threshold (similarity is compared with
    ``threshold``, the ground truth with ``threshold*10``), otherwise 0.
    One row per pair plus a final ``accuracy is:<value>`` row is written to
    ``Q1_<m_type>_similarity_<int(threshold*10)>_<dimension>.csv``.

    Parameters
    ----------
    m_type : str
        ``'cbow'``, ``'sg'`` or ``'ft'``: ``data`` is ``[word_to_index, vectors]``
        where ``vectors[word_to_index[word]]`` is the vector of ``word``.
        ``'glove'``: ``data`` is a dict keyed by the encoded (``bytes``) word.
    data : list or dict
        Embedding data in the layout described above.
    threshold : float
        Decision threshold for the cosine similarity.
    dimension : str
        Label used only in the output file name (e.g. ``'100d'``).

    Raises
    ------
    ValueError
        If ``m_type`` is not one of the supported embedding types.
    KeyError
        If a word of a pair is missing from the embedding data.
    ZeroDivisionError
        If ``pairs`` is empty (the accuracy is ``correct / len(pairs)``).
    """
    #choosing how a word is turned into its vector for this embedding type
    if m_type in ['cbow','sg','ft']:
        #getting word to index mapping and the word vectors array
        indexes_dict=data[0]
        word_vects=data[1]
        def get_vec(word):
            return word_vects[indexes_dict[word]]
    elif m_type=='glove':
        #in the dict the words are stored as encoded values
        def get_vec(word):
            return data[word.encode()]
    else:
        #previously an unknown type silently produced no output at all
        raise ValueError('unknown embedding type: '+repr(m_type))

    columns=['word1','word2','similarity_score','ground_truth_similarity_score','label']
    #rows are collected in a list and turned into a dataframe once:
    #DataFrame.append is gone in pandas 2.0 and was quadratic when used in a loop
    rows=[]
    #to store the number of correctly classified pairs
    acc=0
    #for each pair
    for i,j,k in pairs:
        #strip() removes the line ending without eating a digit when the line has none
        truth=float(k.strip())
        #calculating the similarity score
        sim=cos_sim(get_vec(i),get_vec(j))
        #creating a dict for current pair
        d={'word1':i,'word2':j,'similarity_score':sim*10,'ground_truth_similarity_score':truth,'label':0}
        #prediction and ground truth agree when both are above or both are below the threshold
        if (sim>=threshold and truth>=threshold*10)or(sim<threshold and truth<threshold*10):
            acc+=1
            d['label']=1
        rows.append(d)
    #adding the accuracy score as the last row
    rows.append({'word1':'accuracy is:'+str(acc/len(pairs))})
    df=pd.DataFrame(rows,columns=columns)
    #writing to csv file
    file_name='Q1_'+m_type+'_similarity_'+str(int(threshold*10))+'_'+dimension+'.csv'
    df.to_csv(file_name, index=False)


In [57]:
#loading the saved gensim Word2Vec model from 'hi-d100-m2-cbow.model'
#(the file name suggests 100-dimensional cbow vectors - not verified here)
#the model is only used further down to relate vocabulary words to rows of the vectors array
model = Word2Vec.load('hi-d100-m2-cbow.model')

In [58]:
#loading the cbow word embeddings array from its .npy file
#(file name suggests the 100 dimensional vectors stored alongside the cbow model - TODO confirm)
n=np.load('hi-d100-m2-cbow.model.wv.vectors.npy')

In [59]:
#dict to store the word to its corresponding row index in the model's embedding matrix
#the index is read straight from the model instead of being recovered by matching
#every vector against every vocabulary entry: that reverse lookup needed a tuple copy
#of each vector and mapped words with identical vectors onto a single entry
#NOTE(review): assumes the separately loaded *.wv.vectors.npy arrays use the model's row order - confirm
wv=model.wv
if hasattr(wv,'key_to_index'):
    #gensim >= 4.0: word -> row index mapping is exposed directly
    ind=dict(wv.key_to_index)
else:
    #gensim < 4.0: each vocab entry carries its row index
    ind={word:entry.index for word,entry in wv.vocab.items()}
#getting vocab
vocab=list(ind)

In [60]:
#bundling the word -> index mapping with the embedding array in the [mapping, vectors] layout read by cal_acc
data = [ind, n]
#running the word similarity evaluation on the cbow embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='cbow', data=data, threshold=threshold, dimension='100d')

In [61]:
#reading the cbow embedding array from the 'd50' file
n = np.load('hi-d50-m2-cbow.model.wv.vectors.npy')
#NOTE(review): `ind` is reused as-is here; assumes this array follows the same word order - confirm
data = [ind, n]
#running the word similarity evaluation on the cbow embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='cbow', data=data, threshold=threshold, dimension='50d')

In [62]:
#reading the skipgram embedding array from the 'd100' file
s = np.load('hi-d100-m2-sg.model.wv.vectors.npy')
#NOTE(review): `ind` is reused as-is here; assumes this array follows the same word order - confirm
data = [ind, s]
#running the word similarity evaluation on the skipgram embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='sg', data=data, threshold=threshold, dimension='100d')

In [63]:
#reading the skipgram embedding array from the 'd50' file
s = np.load('hi-d50-m2-sg.model.wv.vectors.npy')
#NOTE(review): `ind` is reused as-is here; assumes this array follows the same word order - confirm
data = [ind, s]
#running the word similarity evaluation on the skipgram embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='sg', data=data, threshold=threshold, dimension='50d')

In [64]:
#reading the fasttext embedding array from the 'd100' file
f = np.load('hi-d100-m2-fasttext.model.wv.vectors.npy')
#NOTE(review): `ind` is reused as-is here; assumes this array follows the same word order - confirm
data = [ind, f]
#running the word similarity evaluation on the fasttext embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='ft', data=data, threshold=threshold, dimension='100d')

In [65]:
#reading the fasttext embedding array from the 'd50' file
f = np.load('hi-d50-m2-fasttext.model.wv.vectors.npy')
#NOTE(review): `ind` is reused as-is here; assumes this array follows the same word order - confirm
data = [ind, f]
#running the word similarity evaluation on the fasttext embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='ft', data=data, threshold=threshold, dimension='50d')

In [66]:
#reading the glove embeddings from the 'd100' text file into a word -> vector dict
g = load_glove_embeddings('hi-d100-glove.txt')
#running the word similarity evaluation on the glove embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='glove', data=g, threshold=threshold, dimension='100d')

In [67]:
#reading the glove embeddings from the 'd50' text file into a word -> vector dict
g = load_glove_embeddings('hi-d50-glove.txt')
#running the word similarity evaluation on the glove embeddings, once per threshold
for threshold in thresholds:
    cal_acc(m_type='glove', data=g, threshold=threshold, dimension='50d')