## Evaluating the performance of our models on the Gujarati word-similarity task

In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import fasttext
import fasttext.util

In [2]:
# Load the tab-separated evaluation file; it has no header row, so columns get integer labels.
df = pd.read_csv(
    "../../Word-Similarity-Datasets-for-Indian-Languages/Gujarati-WS.txt",
    sep="\t",
    header=None,
)
# Discard the column labelled 3, then name the three columns that remain.
df = df.drop(columns=[3])
df.columns = ["word1", "word2", "sim_score"]

In [3]:
# Results table for the different run configurations, loaded from summary.csv.
# The commented-out line below builds an empty table with the expected columns instead
# (useful when summary.csv does not exist yet).
# NOTE(review): the loaded frame shows "Unnamed: 0" / "Unnamed: 0.1" columns, presumably
# index columns written out by earlier saves -- confirm and consider dropping them.
# stats_df = pd.DataFrame(columns=["decomp", "comp", "filename", "score"])
stats_df = pd.read_csv("summary.csv")

In [4]:

# Display the results recorded so far, before adding the current run
stats_df

Unnamed: 0.1,Unnamed: 0,decomp,comp,filename,score,size
0,0,bpe,mtxatt,gu.sent.1m.bpe.mtxatt.ep15.lr0.01.bs100.vec.txt,0.353574,5L
1,1,charn,add,gu.sent.1m.charn.add.ep15.lr0.01.bs100.vec.txt,0.402194,5L
2,2,morf,add,gu.sent.1m.morf.add.ep15.lr0.01.bs100.vec.txt,0.386497,5L
3,3,bpe,add,gu.sent.1m.bpe.add.ep15.lr0.01.bs100.vec.txt,0.393351,2M
4,4,bpe,add,gu_5L_add.vec.txt,0.335785,5L
5,5,bpe,att,gu.sent.1m.bpe.att.ep15.lr0.01.bs100.vec.txt,0.335785,5L
6,6,bpe,wwadd,gu.sent.1m.bpe.wwadd.ep15.lr0.01.bs100.vec.txt,0.381871,5L
7,7,charn,wwmtxatt,gu.sent.1m.charn.wwmtxatt.ep15.lr0.01.bs100.ve...,0.390828,5L


In [5]:
# Configuration of the run being evaluated.
# decomp / comp / corpus_size are the labels written to the "decomp", "comp" and "size"
# columns of the results table; vector_file_path is the embedding file that gets scored.
decomp, comp = "charn", "wwatt"
corpus_size = "5L"
vector_file_path = "gu.sent.1m.charn.wwatt.ep15.lr0.01.bs100.vec.txt"

In [6]:
# Reading the vectors into a {word: embedding} dictionary.
# Layout consumed here: the first line is skipped, then every non-empty line is
# "<word> <v1> <v2> ..." separated by whitespace.
vector_dict = {}
# Explicit UTF-8 so the Gujarati tokens decode identically on every platform,
# instead of depending on the locale's default encoding.
with open(vector_file_path, encoding="utf-8") as f:
    # Skip the first line (presumably a "<count> <dim>" header -- confirm against the file).
    next(f, None)
    # Stream line by line rather than loading the whole file into memory.
    for line in f:
        parts = line.split()
        if not parts:
            # Blank line (e.g. at end of file): nothing to parse.
            continue
        # Every remaining line is kept, so the final vector is not lost when the
        # file does not end with a newline.
        vector_dict[parts[0]] = np.array([float(v) for v in parts[1:]])

In [9]:
# vocabulary of our model: a keys view over vector_dict, used below for membership tests
words = vector_dict.keys()

In [10]:
# Score every evaluation pair with our embeddings: cosine similarity scaled by 10.
# A pair gets a score of 0 when either word is missing from the model vocabulary.
scores = []
for word1, word2, _ in df.itertuples(index=False, name=None):
    if word1 not in words or word2 not in words:
        scores.append(0)
        continue
    # cosine_similarity expects 2-D inputs, so turn each embedding into a single-row matrix
    vec1 = vector_dict[word1].reshape(1, -1)
    vec2 = vector_dict[word2].reshape(1, -1)
    scores.append((cosine_similarity(vec1, vec2) * 10)[0][0])
    

In [11]:
# Pearson correlation between the dataset's sim_score column and the scores computed above;
# the second value returned by pearsonr is discarded
corr, _ = pearsonr(df["sim_score"], scores)

In [12]:
# Show the correlation obtained for the current configuration
print(corr)

0.3657216353199176


In [13]:
# Add the current run to the results table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; concatenating a
# one-row frame gives the same result. Columns missing from the new row are filled with NaN.
new_row = {"decomp": decomp, "comp": comp, "filename": vector_file_path, "score": corr, "size": corpus_size}
stats_df = pd.concat([stats_df, pd.DataFrame([new_row])], ignore_index=True)

In [14]:
# Display the results table including the row just added
stats_df

Unnamed: 0.1,Unnamed: 0,decomp,comp,filename,score,size
0,0.0,bpe,mtxatt,gu.sent.1m.bpe.mtxatt.ep15.lr0.01.bs100.vec.txt,0.353574,5L
1,1.0,charn,add,gu.sent.1m.charn.add.ep15.lr0.01.bs100.vec.txt,0.402194,5L
2,2.0,morf,add,gu.sent.1m.morf.add.ep15.lr0.01.bs100.vec.txt,0.386497,5L
3,3.0,bpe,add,gu.sent.1m.bpe.add.ep15.lr0.01.bs100.vec.txt,0.393351,2M
4,4.0,bpe,add,gu_5L_add.vec.txt,0.335785,5L
5,5.0,bpe,att,gu.sent.1m.bpe.att.ep15.lr0.01.bs100.vec.txt,0.335785,5L
6,6.0,bpe,wwadd,gu.sent.1m.bpe.wwadd.ep15.lr0.01.bs100.vec.txt,0.381871,5L
7,7.0,charn,wwmtxatt,gu.sent.1m.charn.wwmtxatt.ep15.lr0.01.bs100.ve...,0.390828,5L
8,,charn,wwatt,gu.sent.1m.charn.wwatt.ep15.lr0.01.bs100.vec.txt,0.365722,5L


### Calculating the performance of fastText

In [45]:
# Load the fastText model from the local file 'cc.gu.300.bin'.
# The commented-out call below would download the 'gu' model first; it is left disabled,
# so the .bin file must already be present in the working directory.
#fasttext.util.download_model('gu', if_exists='ignore')
ft = fasttext.load_model('cc.gu.300.bin')



In [49]:
# Remove a leftover "predicted_score" column so df is back to (word1, word2, sim_score),
# which the 3-way row unpacking in the scoring loop relies on.
# No visible cell creates this column, so tolerate its absence: errors="ignore" keeps
# a fresh Restart & Run All from failing with a KeyError.
df = df.drop(columns=["predicted_score"], errors="ignore")

In [54]:
# Score every evaluation pair with the fastText model: cosine similarity scaled by 10
scores = []
for word1, word2, _ in df.itertuples(index=False, name=None):
    # cosine_similarity expects 2-D inputs, so turn each word vector into a single-row matrix
    vec1 = ft.get_word_vector(word1).reshape(1, -1)
    vec2 = ft.get_word_vector(word2).reshape(1, -1)
    cosine_sim = cosine_similarity(vec1, vec2)[0][0] * 10
    scores.append(cosine_sim)

In [56]:
# Pearson correlation between the dataset's sim_score column and the fastText scores;
# the second value returned by pearsonr is discarded
corr, _ = pearsonr(df["sim_score"], scores)
print("Correlation: ", corr)

Correlation:  0.46852529426143874
