# import libraries

In [4]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import ttest_ind as t
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import v_measure_score as v_score
# Toy 2-D sample and a 2-component Gaussian mixture fitted to it.
# NOTE(review): X and this module-level gm are not referenced by any other visible
# cell -- looks like a leftover smoke test of GaussianMixture; confirm before removing.
X = np.array([[1, 2], [1, 4], [1, 0], [10, 666], [10, 4], [10, 0]])
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
# Extraction types to process. Each name is used as a key of embed_dict, as part of
# the pickle file name, and as the suffix of result columns (dist_<t>, acc_<t>, file_dist_<t>).
types = ['h1','h2']

In [65]:
# Category label for every target word, written as a word -> label mapping.
# NOTE(review): labels presumably encode part of speech plus mono/poly/hom
# (monoseme / polyseme / homonym) -- confirm with the annotation scheme.
aa = {
    'Switzerland': 'mono',
    'thousands': 'mono',
    'pencil': 'mono',
    'transistor': 'mono',
    'computer': 'mono',
    'door': 'nn.figr.poly',
    'potato': 'nn.plfd.poly',
    'questions': 'nv.poly',
    'mistakes': 'nv.poly',
    'thought': 'nv.poly',
    'fish': 'nn.anime.poly',
    'salmon': 'nn.anime.poly',
    'like': 'vprep.hom',
    'tomatoes': 'nn.plfd.poly',
    'lamb': 'nn.anime.poly',
    'chicken': 'nn.anime.poly',
    'can': 'nv.hom',
    'power': 'poly',
    'bank': 'nn.hom',
    'rock': 'nn.hom',
    'books': 'nv.hom',
    'duck': 'nn.anime.poly',
    'dates': 'nv.hom',
    'even': 'adjadv.hom',
    'pupil': 'nn.hom',
    'tears': 'nv.hom',
    'moves': 'nv.poly',
    'book': 'nv.hom',
    'form': 'nv.hom',
    'watch': 'nv.hom',
    'wind': 'nv.hom',
}
# Turn the mapping into a two-column table: 'target' (the word) and 'type' (its label).
aa = (
    pd.DataFrame.from_dict(aa, orient='index')
    .reset_index()
    .rename(columns={'index': 'target', 0: 'type'})
)

In [66]:
def euclidean(x,y):
    """Return the Euclidean (L2) distance between x and y, rounded to 2 decimals.

    x, y: vectors of the same shape; any array-like (NumPy array, list, tuple)
        is accepted.
    Returns a NumPy float scalar.
    """
    # Coerce first: plain Python sequences do not support elementwise subtraction.
    diff = np.asarray(x) - np.asarray(y)
    return np.linalg.norm(diff).round(2)
def cos(x,y):
    """Return the cosine of the angle between vectors x and y, rounded to 2 decimals.

    Note that this is a similarity (1.0 for parallel vectors, 0.0 for
    orthogonal ones), not a distance.
    """
    inner = np.dot(x,y)
    norm_product = np.linalg.norm(x)*np.linalg.norm(y)
    similarity = inner/norm_product
    return similarity.round(2)
def single_type_word_dist(df,t):
    """Per target word, distance between the mean embeddings of file '1' and file '2'.

    df: table with 'target', 'file' and 'tensors' columns (one embedding per row).
    t: extraction-type name; only used to name the output column file_dist_{t}.
    Returns a DataFrame with columns 'target' and file_dist_{t}, one row per
    target word (the words 'went' and 'line' are left out).
    """
    skipped_words = {'went', 'line'}
    res = {}
    for word, dfx in df.groupby(by='target'):
        if word in skipped_words:
            continue
        # Compare file labels as strings so that both '1'/'2' and 1/2 select rows;
        # a str-vs-int comparison would silently match nothing.
        file_label = dfx['file'].astype(str)
        mean_file_1 = dfx[file_label == '1']['tensors'].agg('mean')
        mean_file_2 = dfx[file_label == '2']['tensors'].agg('mean')
        res[word] = euclidean(mean_file_1, mean_file_2)
    file_dis = pd.DataFrame.from_dict(res, orient='index', columns=[f'file_dist_{t}'])
    return file_dis.reset_index().rename(columns={"index": "target"})
def all_types_of_dist(embed_dict,types=types):
    """Collect the between-file distance of every target word for each extraction type.

    embed_dict: maps an extraction-type name to the table containing all
        embeddings of all target words for that type.
    types: extraction types to include, in output column order.
    Returns one row per target word with a file_dist_{t} column per type. The
    tables are combined with an inner merge on 'target', so a word is kept only
    if it is present for every type.
    Raises ValueError if types is empty.
    """
    merged = None
    for t in types:
        delta_df = single_type_word_dist(embed_dict[t],t)
        # Track "first table" by state rather than by t == types[0], so a repeated
        # first name cannot restart the merge and drop the columns collected so far.
        merged = delta_df if merged is None else pd.merge(merged,delta_df, on='target')
    if merged is None:
        raise ValueError('types must contain at least one extraction type')
    return merged
# calculate l2 normalisation factors for each extraction type
# def l2_normalisation_factors(embed_dict,types=types):
#     norm_factors = pd.DataFrame(columns = ['target'] + [f'l2_{t}' for t in types])
#     word_set = set()
#     for t in types:
#         a = []
#         for word,group in embed_dict[t].groupby(by='target'):
#             # exclude the weird words in the data set
#             if word =='went' or word == 'line':
#                 continue
#             word_set.add(word)
#             a.append(np.linalg.norm(group['tensors'].apply('mean')))
#         norm_factors[f'l2_{t}']=a
#     norm_factors['target'] = list(word_set)
#     return norm_factors

In [57]:
# load pickled data
# embed_dict maps each extraction type to the object unpickled from
# contextual_embeddings/all_sent_<type>.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# NOTE(review): the loop variable rebinds the module-level name `t`, which the
# imports bind to scipy's ttest_ind.
embed_dict = {}
for t in types:
    with open(f'contextual_embeddings/all_sent_{t}','rb') as f:
        embed_dict[t] = pickle.load(f)

In [58]:
# Preview the first rows of the 'h1' embeddings table.
embed_dict['h1'].head()

Unnamed: 0,tensors,labels,file,prev,target
0,"[0.018521532, 0.045597155, 0.002335213, -0.058...",Looks like2,2,Looks,like
1,"[0.007343838, 0.03755672, 0.0031507332, -0.077...",floor like2,2,floor,like
2,"[0.07417224, 0.027875945, -0.006848427, -0.153...",yourself like2,2,yourself,like
3,"[0.003018174, 0.016517585, 0.025704604, -0.028...",support like2,2,support,like
4,"[0.11227927, 0.03317975, 0.001646313, -0.10016...",OS like2,2,OS,like


In [59]:
# NOTE(review): l2_normalisation_factors appears only as commented-out code in the
# visible part of this notebook; re-running this cell in a fresh kernel would raise
# NameError unless that definition is restored.
norm_factors = l2_normalisation_factors(embed_dict)

In [39]:
# Between-file distance of every target word for each extraction type, indexed by word.
all_types_dist_all_words = all_types_of_dist(embed_dict).set_index('target')
all_types_dist_all_words

Unnamed: 0_level_0,file_dist_h1,file_dist_h2
target,Unnamed: 1_level_1,Unnamed: 2_level_1
Switzerland,0.84,1.11
bank,1.44,3.16
book,1.94,2.76
books,1.87,2.55
can,1.78,2.29
chicken,1.39,2.77
computer,0.48,0.92
dates,2.12,3.53
door,0.99,1.53
duck,0.99,2.2


In [40]:
def _majority_accuracy(truth, preds):
    """Best agreement rate between two binary labelings, allowing the labels to be swapped."""
    total = len(truth)
    agree = np.sum(truth == preds)
    # Mixture component ids are arbitrary, so the swapped assignment is equally
    # valid; the score therefore never drops below 0.5.
    return (max(agree, total - agree) / total).round(2)


def gaussian_prediction_accuracy(random_seed,df,n,t='h1',metric='euclidean'):
    """Fit a 2-component Gaussian mixture per target word and score it against the file labels.

    random_seed: random_state passed to GaussianMixture.
    df: embeddings table of one extraction type, with 'target', 'file' and
        'tensors' columns.
    n: n_init passed to GaussianMixture.
    t: extraction-type name (must be in `types`); only used to name the output columns.
    metric: 'euclidean' (distance between the two component means) or 'cosine'
        (cosine similarity between them).
    Returns a DataFrame with columns 'target', dist_{t} and acc_{t}. acc_{t} is
    the fraction of rows whose predicted component agrees with the file label
    (file '2' counts as 1), taking the better of the two possible label
    assignments. The words 'line' and 'went' are left out.
    Raises ValueError for an unknown t or metric.
    """
    if t not in types:
        raise ValueError(f'unknown extraction type {t!r}; expected one of {types}')
    measures = {'euclidean': euclidean, 'cosine': cos}
    if metric not in measures:
        # Fail early: silently skipping the distance would only surface later as a
        # column-length mismatch when the result table is built.
        raise ValueError(f'unknown metric {metric!r}; expected one of {sorted(measures)}')
    measure = measures[metric]

    words, dists, accs = [], [], []
    for word, group in df.groupby(by='target'):
        # exclude these two words
        if word in ('line', 'went'):
            continue
        dfx = group.sort_values(by='file')
        # Labels are compared as strings so that both '2' and 2 count as file 2.
        truth = np.array([int(str(label) == '2') for label in dfx['file']])
        data = np.array(dfx['tensors'].to_list())
        gm = GaussianMixture(n_components=2, random_state=random_seed, n_init=n).fit(data)
        preds = gm.predict(data)
        words.append(word)
        dists.append(measure(gm.means_[0], gm.means_[1]))
        # Author's note: the accuracy approaches 0.5 for a monoseme because this is
        # a binary choice problem; a V-measure was considered but not needed.
        accs.append(_majority_accuracy(truth, preds))
    return pd.DataFrame({'target': words, f'dist_{t}': dists, f'acc_{t}': accs})

In [41]:
# Single run (seed 1, n_init=1) on the 'h1' embeddings, ordered by the distance
# between the two fitted component means.
x = gaussian_prediction_accuracy(1,embed_dict['h1'],1,t='h1',metric='euclidean').sort_values(by='dist_h1')
x.head()

Unnamed: 0,target,dist_h1,acc_h1
27,tomatoes,1.13,0.76
9,duck,1.16,0.64
22,rock,1.21,0.7
23,salmon,1.24,0.67
19,power,1.28,0.77


In [42]:
# Same single-run table, ordered by clustering accuracy (ascending).
x.sort_values(by='acc_h1')

Unnamed: 0,target,dist_h1,acc_h1
17,pencil,1.58,0.5
18,potato,1.37,0.5
0,Switzerland,1.7,0.51
10,even,2.37,0.51
2,book,2.19,0.53
20,pupil,1.44,0.54
6,computer,1.76,0.54
28,transistor,1.46,0.54
11,fish,2.21,0.57
26,thousands,1.46,0.57


In [23]:
def ngauss(n_random,df,n_init,t='h1',metric='euclidean'):
    """Average gaussian_prediction_accuracy over n_random seeds (0 .. n_random-1).

    n_random: number of repetitions; each repetition uses its index as random seed.
    df: embeddings table of one extraction type.
    n_init: n_init forwarded to the Gaussian mixture.
    t: extraction-type name, used for the dist_{t} / acc_{t} column names.
    metric: forwarded to gaussian_prediction_accuracy.
    Returns a DataFrame with 'target', dist_{t} and acc_{t}, where the two value
    columns are means over all repetitions rounded to 2 decimals.
    Raises ValueError if n_random is smaller than 1.
    """
    if n_random < 1:
        # Without at least one run there are no columns to average.
        raise ValueError('n_random must be at least 1')
    dist_col, acc_col = f'dist_{t}', f'acc_{t}'
    res = None
    for seed in tqdm(range(n_random)):
        delta_df = gaussian_prediction_accuracy(seed,df,n_init,t=t,metric=metric)
        if res is None:
            res = delta_df
        else:
            # Rows line up by index: every run groups the same df by 'target', so
            # the words come out in the same order each time.
            res[dist_col] += delta_df[dist_col]
            res[acc_col] += delta_df[acc_col]
    res[dist_col] = (res[dist_col] / n_random).round(2)
    res[acc_col] = (res[acc_col] / n_random).round(2)
    return res
def all_types_ngauss(n_random,embed_dict,n_init,types=types,metric='euclidean'):
    """Run ngauss once per extraction type and join the results on 'target'.

    n_random, n_init, metric: forwarded to ngauss.
    embed_dict: maps an extraction-type name to its embeddings table.
    types: extraction types to process, in output column order.
    Returns one row per target word with dist_{t} and acc_{t} columns for every
    type (inner merge, so only words present for all types are kept).
    Raises ValueError if types is empty.
    """
    res = None
    for t in types:
        averaged = ngauss(n_random,embed_dict[t],n_init,t=t,metric=metric)
        # Track "first table" by state rather than by t == types[0], so a repeated
        # first name cannot restart the merge and drop the columns collected so far.
        res = averaged if res is None else pd.merge(res, averaged, on='target')
    if res is None:
        raise ValueError('types must contain at least one extraction type')
    return res

In [24]:
# Average over 3 seeds (0, 1, 2) with n_init=1, for every extraction type in `types`.
two_ng = all_types_ngauss(3,embed_dict,1)

100%|█████████████████████████████████████████████| 3/3 [00:18<00:00,  6.29s/it]
100%|█████████████████████████████████████████████| 3/3 [00:26<00:00,  8.89s/it]


In [43]:
# Averaged results ordered by the 'h1' clustering accuracy (ascending).
two_ng.sort_values(by='acc_h1')

Unnamed: 0,target,dist_h1,acc_h1,dist_h2,acc_h2
17,pencil,1.38,0.53,2.27,0.57
6,computer,1.76,0.54,2.56,0.54
18,potato,1.52,0.56,2.45,0.62
26,thousands,1.24,0.56,1.61,0.56
0,Switzerland,2.12,0.57,2.23,0.6
22,rock,1.16,0.6,2.58,0.74
28,transistor,1.51,0.61,2.72,0.51
11,fish,1.72,0.62,2.34,0.58
23,salmon,1.31,0.64,2.33,0.71
20,pupil,1.29,0.64,2.51,0.79


In [44]:
# new.sort_values(by ='file_dist_h1').to_csv('Results/4_distances')

In [46]:
# get the results
# Join the averaged mixture results with the between-file distances on 'target'
# (default inner merge: only words present in both tables are kept).
results= pd.DataFrame.merge(two_ng,all_types_dist_all_words,on='target')
results.sort_values(by='acc_h2')

Unnamed: 0,target,dist_h1,acc_h1,dist_h2,acc_h2,file_dist_h1,file_dist_h2
28,transistor,1.51,0.61,2.72,0.51,0.8,1.2
6,computer,1.76,0.54,2.56,0.54,0.48,0.92
26,thousands,1.24,0.56,1.61,0.56,0.48,0.89
17,pencil,1.38,0.53,2.27,0.57,0.72,1.39
11,fish,1.72,0.62,2.34,0.58,0.7,1.74
0,Switzerland,2.12,0.57,2.23,0.6,0.84,1.11
18,potato,1.52,0.56,2.45,0.62,0.85,2.1
27,tomatoes,1.13,0.64,2.5,0.64,0.86,1.58
8,door,1.3,0.76,2.08,0.67,0.99,1.53
19,power,1.24,0.72,2.5,0.68,1.2,2.35


In [47]:
# apply pearson's r to dis and acc
# For every extraction type:
#   a = pearsonr(mixture-mean distance, accuracy)  -> stored in corr[type]
#   c = pearsonr(between-file distance, accuracy)  -> stored in p[type]
# Each stored entry is [correlation coefficient, p-value]. Despite its name, `p`
# holds both values for the file-distance correlation, not only p-values.
# NOTE(review): `a`, `c` and `i` stay bound at module level after the loop; a later
# cell indexes `a` as if it were a DataFrame.
corr = {}
p = {}
for i in types:
    a = pearsonr(results[f'dist_{i}'],results[f'acc_{i}'])
#     b = pearsonr(results[f'dist_{i}'],results[f'file_dist_{i}'])
    c = pearsonr(results[f'file_dist_{i}'],results[f'acc_{i}'])
    corr[i] = [a[0],a[1]]
    p[i] = [c[0],c[1]]
corr = pd.DataFrame.from_dict(corr,orient='index',columns =['corr_dist_acc','p_dist_acc']).reset_index()
p = pd.DataFrame.from_dict(p,orient='index', columns =['corr_file_dist_acc','p_file_dist_acc']).reset_index()
# One row per extraction type, with both correlations side by side.
corr_p = pd.DataFrame.merge(corr,p, on = 'index').rename(columns = {'index':'types'}).set_index('types')
corr_p

Unnamed: 0_level_0,corr_dist_acc,p_dist_acc,corr_file_dist_acc,p_file_dist_acc
types,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
h1,0.669319,3.8e-05,0.899343,6.177115e-12
h2,0.675338,3.1e-05,0.886458,3.252648e-11


In [49]:
# NOTE(review): the sorted frame on the next line is neither assigned nor the last
# expression of the cell, so it has no effect.
results.sort_values(by='acc_h1') #.drop(['acc_h0','acc_c1','acc_c0','acc_h1',],axis=1)
# Subset of columns for side-by-side comparison (dist_h2 is not included).
comparison = results[['target','acc_h1','acc_h2','dist_h1','file_dist_h1','file_dist_h2']]
# Display ordered by acc_h2 with a fresh 0..n-1 index; `comparison` itself is unchanged.
comparison.sort_values(by='acc_h2').reset_index().drop(['index'],axis=1)

Unnamed: 0,target,acc_h1,acc_h2,dist_h1,file_dist_h1,file_dist_h2
0,transistor,0.61,0.51,1.51,0.8,1.2
1,computer,0.54,0.54,1.76,0.48,0.92
2,thousands,0.56,0.56,1.24,0.48,0.89
3,pencil,0.53,0.57,1.38,0.72,1.39
4,fish,0.62,0.58,1.72,0.7,1.74
5,Switzerland,0.57,0.6,2.12,0.84,1.11
6,potato,0.56,0.62,1.52,0.85,2.1
7,tomatoes,0.64,0.64,1.13,0.86,1.58
8,door,0.76,0.67,1.3,0.99,1.53
9,power,0.72,0.68,1.24,1.2,2.35


In [1353]:
# t(comparison['acc_h0'],comparison['acc_h1'])

In [1354]:
# l2 normalise the four ngauss results
# df contains only distances
# def normalise_results(df,norm_factors,types = types):
#     new = df.copy()
#     for t in types:
#         new[f'dist_{t}'] =  (new[f'dist_{t}']/norm_factors[f'l2_{t}']).round(2)
#         new[f'file_dist_{t}'] =  (new[f'file_dist_{t}']/norm_factors[f'l2_{t}']).round(2)
#     return new

In [60]:
# Keep only the distance columns (mixture-mean and between-file) next to the word.
dists = results[['target','dist_h1','dist_h2','file_dist_h1','file_dist_h2',]]

In [61]:
# Attach the word-category label ('type' column of aa) to every word; the default
# inner merge drops words that have no entry in aa.
dists = pd.merge(dists,aa,on='target')
# dists['sort'] = dists['type'].apply(lambda x: x.split('.')[-1])

In [62]:
# Words ordered by the 'h1' mixture-mean distance, shown with their category label.
dists.sort_values(by='dist_h1')[['target','type','dist_h1','dist_h2']]

Unnamed: 0,target,type,dist_h1,dist_h2
27,tomatoes,nn.plfd.poly,1.13,2.5
9,duck,nn.anime.poly,1.15,2.35
22,rock,nn.hom,1.16,2.58
19,power,poly,1.24,2.5
26,thousands,mono,1.24,1.61
20,pupil,nn.hom,1.29,2.51
8,door,nn.figr.poly,1.3,2.08
23,salmon,nn.anime.poly,1.31,2.33
13,lamb,nn.anime.poly,1.32,2.63
17,pencil,mono,1.38,2.27


In [63]:
# NOTE(review): normalise_results exists only as commented-out code in the visible
# part of this notebook, so this call fails with NameError (see the recorded output)
# and `a` is not rebound here.
a = normalise_results(dists,norm_factors)

NameError: name 'normalise_results' is not defined

In [64]:
# NOTE(review): stale cell. Because the assignment in the previous cell failed, `a`
# is presumably still the last pearsonr result from the correlation cell, hence the
# recorded TypeError. The dist_c0 / dist_c1 / dist_h0 columns are also not produced
# anywhere in the visible code, where `types` is ['h1', 'h2'].
a[['target','dist_c0','dist_c1','dist_h0','dist_h1']].sort_values(by='dist_h1')

TypeError: tuple indices must be integers or slices, not list

In [1333]:
# NOTE(review): the l2_c0 / l2_c1 / l2_h0 columns belong to an earlier four-type
# setup; with types = ['h1', 'h2'] they would not exist -- the recorded output
# presumably comes from an older session.
(norm_factors[['target','l2_c0','l2_c1','l2_h0','l2_h1',]]).round(2).sort_values(by='target')

Unnamed: 0,target,l2_c0,l2_c1,l2_h0,l2_h1
3,Switzerland,10.38,6.71,2.07,2.22
6,bank,10.94,7.81,2.72,3.58
28,book,10.24,6.38,2.38,2.43
4,books,10.68,7.68,2.69,2.88
9,can,9.12,7.78,2.28,3.28
19,chicken,10.44,7.91,2.57,3.12
23,computer,11.04,8.43,2.81,3.67
0,dates,10.12,7.95,2.44,2.59
12,door,9.86,6.22,2.19,2.03
24,duck,11.1,7.73,2.92,3.11


In [1258]:
# Scratch arithmetic; the result is displayed but not stored.
5.3/2.0

2.65