**----LOAD THE PRETRAINED FastText MODEL AND BUILD THE DICTIONARIES FOR WORD-TO-INDEX AND INDEX-TO-WORD LOOKUP---**

In [2]:
import fasttext
import io

# Maps each vocabulary word to its embedding, stored as a plain list of floats.
embedding_index = {}


def LoadFastText():
    """Fill ``embedding_index`` from the ``crawl-300d-2M.vec`` text file.

    The first line of the file is parsed as two integers: the word count and
    the vector size. Every following line is split on single spaces; the
    first token is the word and the remaining tokens are parsed as floats.

    The function returns nothing: results are written into the module-level
    ``embedding_index`` dict as ``word -> list[float]``.

    Raises:
        AssertionError: if a line does not carry exactly ``vector_size``
            numeric components.
    """
    # The context manager guarantees the (very large) file is closed even if
    # parsing fails part-way through.
    with open('crawl-300d-2M.vec', 'r', encoding='utf-8', newline='\n',
              errors='ignore') as input_file:
        # The word count from the header is read but not needed afterwards.
        _no_of_words, vector_size = map(int, input_file.readline().split())
        for line in input_file:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            vector = list(map(float, tokens[1:]))
            assert len(vector) == vector_size, (
                "unexpected vector length for word %r" % word
            )
            embedding_index[word] = vector

In [None]:
# Load the embeddings, then number the vocabulary in the order the words were
# inserted into embedding_index.
LoadFastText()
# Iterating a dict yields its keys only; enumerate supplies the running index.
# (Unpacking each key as ``word, v`` would try to split the word string itself.)
word2idx = {word: idx for idx, word in enumerate(embedding_index)}
idx2word = {idx: word for word, idx in word2idx.items()}

In [None]:
#type(matrix[0][0])

**--------Word_Vector FUNCTION TAKES A WORD AS INPUT AND RETURNS THE CORRESPONDING WORD VECTOR-------**

In [None]:
def Word_Vector(word):
    """Return the embedding stored for ``word`` in ``embedding_index``.

    Raises:
        KeyError: if ``word`` is not a key of ``embedding_index``.
    """
    vector = embedding_index[word]
    return vector

**------ exists FUNCTION TAKES A WORD AS PARAMETER AND RETURNS WHETHER THE WORD EXISTS IN THE DICTIONARY (1) OR NOT (0)---**

In [None]:
def exists(word):
    """Report whether ``word`` is a key of ``embedding_index``.

    Returns:
        int: ``1`` if the word is present, ``0`` otherwise. Integers rather
        than booleans are returned because callers compare the result
        against ``1``.
    """
    try:
        embedding_index[word]
    except (KeyError, TypeError):
        # KeyError: the word is not in the vocabulary.
        # TypeError: the argument is unhashable and so cannot be a key.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return 0
    return 1

In [None]:
import import_ipynb
import pandas as pd
from Data_Loader import MEN_Dataloader, CC_Dataloader, gender_Dataloader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
import numpy as np

**--------SEMANTIC RELATEDNESS------**

In [None]:
# Score every MEN word pair with the cosine similarity of its two embeddings.
# cos_sim stays index-aligned with gold_std; a word missing from the
# vocabulary raises KeyError from Word_Vector rather than being skipped.
wordList_1, wordList2, gold_std = MEN_Dataloader()
cos_sim = []
for w1, w2 in zip(wordList_1, wordList2):
    # The loader stores embeddings as plain lists, which have no .reshape();
    # np.asarray accepts lists and ndarrays alike.
    v1 = np.asarray(Word_Vector(w1), dtype=np.float64).reshape(1, -1)
    v2 = np.asarray(Word_Vector(w2), dtype=np.float64).reshape(1, -1)
    # cosine_similarity returns a 1x1 matrix; pull out the scalar explicitly
    # instead of relying on float() applied to a 2-D array.
    sim = float(cosine_similarity(v1, v2)[0][0])
    cos_sim.append(sim)

In [None]:
# Put the gold scores and the model's cosine similarities side by side, then
# report how well they agree under two correlation measures.
similarity_df = pd.DataFrame({'Gold': gold_std, 'Cos Sim': cos_sim})

print("---- Experiment 2: MEN dataset-------")

pearson_table = similarity_df.corr(method='pearson')
print("Pearson Co-relation: \n", pearson_table)

spearman_table = similarity_df.corr(method='spearman')
print("\nSpearman Co-relation: \n", spearman_table)

**------ N_Nearest_Neighbhour TAKES A WORD-VECTOR AND A COUNT N AS INPUT AND RETURNS THE N NEAREST NEIGHBOURS -----**

In [None]:
def N_Nearest_Neighbhour(v1, n):
    """Return the ``n`` words in ``embedding_index`` closest to ``v1``.

    Closeness is cosine distance, i.e. ``1 - cosine_similarity``.

    Args:
        v1: query vector (list or array); it is flattened to a single row.
        n: number of neighbours to return.

    Returns:
        list[str]: up to ``n`` words ordered from nearest to farthest. Among
        equally distant words, the one met first while iterating
        ``embedding_index`` comes first. Empty when ``n <= 0``.
    """
    import heapq  # function-scope import keeps this block self-contained

    if n <= 0:
        return []

    query = np.asarray(v1, dtype=np.float64).reshape(1, -1)

    def _distance(item):
        # item is a (word, vector) pair. Stored vectors may be plain lists,
        # so convert before reshaping.
        candidate = np.asarray(item[1], dtype=np.float64).reshape(1, -1)
        # cosine_similarity returns a 1x1 matrix; reduce it to a scalar so
        # distances compare as ordinary floats.
        return 1.0 - float(cosine_similarity(query, candidate)[0][0])

    # nsmallest keeps only n candidates at a time and evaluates the key once
    # per entry, replacing the manual "find the current farthest" rescan.
    closest = heapq.nsmallest(n, embedding_index.items(), key=_distance)
    return [word for word, _vector in closest]

**----------CONCEPT CATEGORIZATION --------------**

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
# Concept categorization: cluster the test words' embeddings with k-means and
# measure purity against the gold-standard category labels.
word_list, gold_standard_labels = CC_Dataloader()

# Collect embeddings and their gold labels together so that a word missing
# from the vocabulary drops BOTH its row and its label. A fixed-size zero
# buffer would leave all-zero rows and shift rows away from their labels.
# NOTE(review): assumes word_list and gold_standard_labels are parallel
# sequences of equal length - confirm against CC_Dataloader.
found_vectors = []
found_labels = []
missed = 0  # counted across the whole loop, not reset per word
for w, label in zip(word_list, gold_standard_labels):
    w = w.lower()
    if exists(w):
        found_vectors.append(np.asarray(Word_Vector(w), dtype=np.float64))
        found_labels.append(label)
    else:
        missed += 1

# Shape is (words found, embedding dimension); the dimension now follows the
# loaded vectors instead of being hard-coded.
test_word_embeddings = np.array(found_vectors)

print(len(test_word_embeddings))
kmeans = KMeans(init="random", n_clusters=6, n_init=20, max_iter=100)
result = kmeans.fit(test_word_embeddings)
clustering_labels = result.labels_  # get the cluster ID assigned to each word embedding
print(found_labels)
print(clustering_labels)
contingency_matrix = metrics.cluster.contingency_matrix(found_labels, clustering_labels)

# Purity: for each cluster (column) take the count of its most frequent gold
# class, sum those counts, and divide by the number of clustered points.
max_each_cluster = np.amax(contingency_matrix, axis=0)
total_number_datapoints = np.sum(contingency_matrix)

purity = np.sum(max_each_cluster) / total_number_datapoints
print("Contingency Matrix:\n", contingency_matrix)
print("Purity:", purity)

**---- GENDER BIASES WITH T-TEST (P-VALUES)-----**

In [None]:
from scipy.stats import ttest_ind
# Gender-bias probe: compare each test word's similarity to paired male and
# female anchor words, count which side it leans towards, and print a t-test
# p-value for the two similarity samples.
word_list = gender_Dataloader()
male_list = ["he", 'him', 'himself', 'male', 'boy', 'man', 'masculine']
female_list = ["she", 'her', 'herself', 'female', 'girl', 'woman', 'feminine']

# The anchor vectors do not depend on the probe word, so build them once
# instead of once per probe word. Pairs are kept only if both words exist.
gender_pair_vectors = [
    (
        np.asarray(Word_Vector(him), dtype=np.float64).reshape(1, -1),
        np.asarray(Word_Vector(her), dtype=np.float64).reshape(1, -1),
    )
    for him, her in zip(male_list, female_list)
    if exists(him) and exists(her)
]

biased = 0
non_biased = 0
for word in word_list:
    if not exists(word):
        continue  # words outside the vocabulary are not counted at all
    v = np.asarray(Word_Vector(word), dtype=np.float64).reshape(1, -1)
    list1 = []  # |similarity| to each male anchor, scaled by 100
    list2 = []  # |similarity| to each female anchor, scaled by 100
    m = 0  # pairs where the male anchor wins by more than 0.5
    f = 0  # pairs where the female anchor wins by more than 0.5
    for v1, v2 in gender_pair_vectors:
        sim1 = abs(cosine_similarity(v, v1)[0][0] * 100)
        sim2 = abs(cosine_similarity(v, v2)[0][0] * 100)
        dif = sim1 - sim2
        if dif > 0.5:
            m += 1
        elif dif < -0.5:
            f += 1
        list1.append(sim1)
        list2.append(sim2)
    # A word is called biased when one side wins at least two more pairs.
    if abs(m - f) > 1:
        biased += 1
    else:
        non_biased += 1
    print(m, f, ttest_ind(list1, list2).pvalue)
print(biased, non_biased)

**----- This part is just rough code-----**

**--------------ANALOGY TESTING---------**

In [None]:
def AN_Dataloader():
    """Read the analogy test file into four parallel word lists.

    Each data line of ``./dataset/analogy_test_reduced_2.txt`` is split on
    single spaces and its first four tokens go to the four lists in order.
    Lines whose first token is ``:`` are skipped, as are lines with fewer
    than four tokens (for example blank lines).

    Returns:
        tuple[list[str], list[str], list[str], list[str]]: the first, second,
        third and fourth words of every data line. Tokens are returned
        exactly as split, so a fourth word at the end of a line keeps its
        trailing newline.
    """
    word_list_1 = []
    word_list_2 = []
    word_list_3 = []
    word_list_4 = []
    # The context manager closes the file even if reading fails.
    with open('./dataset/analogy_test_reduced_2.txt') as dataset:
        for line in dataset:
            tokens = line.split(' ')  # split once, reuse for all four words
            if tokens[0] == ':':
                continue
            if len(tokens) < 4:
                # Not a complete analogy; indexing would raise IndexError.
                continue
            word_list_1.append(tokens[0])
            word_list_2.append(tokens[1])
            word_list_3.append(tokens[2])
            word_list_4.append(tokens[3])
    return word_list_1, word_list_2, word_list_3, word_list_4

In [None]:
def analogy_test(n_neighbours=5):
    """Run the "w1 is to w2 as w3 is to ?" analogy evaluation.

    For every quadruple from ``AN_Dataloader`` the words are lower-cased (the
    fourth is also stripped of surrounding whitespace). If all four are in
    the vocabulary, the vector ``v2 - v1 + v3`` is formed and the analogy
    counts as successful when the fourth word is among its nearest
    neighbours.

    Args:
        n_neighbours: how many nearest neighbours to accept a hit from.
            Defaults to 5.

    Returns:
        tuple[int, int, int, int]: ``(total_test, missed, successful,
        unsuccessful)`` where ``missed`` counts quadruples with at least one
        out-of-vocabulary word.
    """
    word_list_1, word_list_2, word_list_3, word_list_4 = AN_Dataloader()
    total_test = len(word_list_1)
    missed = successful = unsuccessful = 0
    for w1, w2, w3, w4 in zip(word_list_1, word_list_2, word_list_3, word_list_4):
        w1, w2, w3 = w1.lower(), w2.lower(), w3.lower()
        # The fourth token may still carry the line's trailing newline.
        w4 = w4.lower().strip()
        if not all(exists(w) for w in (w1, w2, w3, w4)):
            missed += 1
            continue
        v1, v2, v3 = (
            np.asarray(Word_Vector(w), dtype=np.float64) for w in (w1, w2, w3)
        )
        v = v2 - v1 + v3
        if w4 in N_Nearest_Neighbhour(v, n_neighbours):
            successful += 1
        else:
            unsuccessful += 1
    return total_test, missed, successful, unsuccessful

In [None]:
# Run the analogy evaluation and report two accuracy figures.
total_test, missed, successful, unsuccessful = analogy_test()
attempted = successful + unsuccessful
# Accuracy1 ignores quadruples with out-of-vocabulary words.
# Accuracy2 counts them as failures.
# Both fall back to 0.0 when the denominator is zero instead of raising.
accuracy1 = successful / attempted if attempted else 0.0
accuracy2 = successful / (attempted + missed) if (attempted + missed) else 0.0
print("Accuracy1: ", accuracy1)
print("Accuracy2: ", accuracy2)
print(successful, unsuccessful, missed)