# import libraries

In [5]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import ttest_ind as t
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import v_measure_score as v_score
X = np.array([[1, 2], [1, 4], [1, 0], [10, 666], [10, 4], [10, 0]])
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
types = ['h1','h2']

In [6]:
aa ={'Switzerland':'mono','pencil':'mono','transistor':'mono','computer':'mono','door':'nn.figr.poly','potato':\
     'nn.plfd.poly','questions':'nv.poly','mistakes':'nv.poly','thought':'nv.poly','fish':'nn.anime.poly',\
     'salmon':'nn.anime.poly','like':'vprep.hom','tomatoes':'nn.plfd.poly','lamb':'nn.anime.poly',\
     'chicken':'nn.anime.poly','can':'nv.hom','power':'poly','bank':'nn.hom','rock':'nn.hom',\
     'books':'nv.hom','duck':'nn.anime.poly','dates':'nv.hom','even':'adjadv.hom',\
     'pupil':'nn.hom','tears':'nv.homg','moves':'nv.poly','book':'nv.hom','form':'nv.hom','watch':'nv.hom','wind':'nv.homg'} 
aa =pd.DataFrame.from_dict(aa,orient = 'index').reset_index().rename(columns = {'index':'target',0:'type'})
aa

Unnamed: 0,target,type
0,Switzerland,mono
1,pencil,mono
2,transistor,mono
3,computer,mono
4,door,nn.figr.poly
5,potato,nn.plfd.poly
6,questions,nv.poly
7,mistakes,nv.poly
8,thought,nv.poly
9,fish,nn.anime.poly


In [60]:
def euclidean(x,y):
    dist = np.linalg.norm(x-y)
    return dist.round(2)
def cos(x,y):
    dist = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
    return dist.round(2)
#calculate the distance between two files
def single_type_word_dist(df,t):
    # t is the extraction type
    #four embedding is a dict key = c0 h0 c1 h1, val =  6 columns
    res = {}
    for word,dfx in df.groupby(by='target'):
        if word == 'went' or word =='line':
            continue
        dfa = dfx[dfx['file']=='1']
        dfb = dfx[dfx['file']=='2']
        x1 = dfa['tensors'].agg('mean')
        x2 = dfb['tensors'].agg('mean')
        #res[word] = euclidean(x1,x2)
        res[word] = cos(x1,x2)
    file_dis = pd.DataFrame.from_dict(res, orient='index',columns = [f'file_dist_{t}']).reset_index().rename(columns={"index": "target"})
    return file_dis
# calculate the ground truth distance between file
# input df contatin all embeddings of all target words
# extraction_type can be c0 h0 c1 h1
# output the distances of all target words
def all_types_of_dist(embed_dict,types=types):
    for t in types:
        delta_df = single_type_word_dist(embed_dict[t],t)
        if t == types[0]:
            df = delta_df
        else:
            df = pd.merge(df,delta_df, on='target')
    return df
# calculate l2 normalisation factors for each extraction type
def l2_normalisation_factors(embed_dict,types=types):
    norm_factors = pd.DataFrame(columns = ['target'] + [f'l2_{t}' for t in types])
    word_set = set()
    for t in types:
        a = []
        for word,group in embed_dict[t].groupby(by='target'):
            #exlude the werd words in the data set
            if word =='went' or word == 'line':
                continue
            word_set.add(word)
            a.append(np.linalg.norm(group['tensors'].apply('mean')))
        norm_factors[f'l2_{t}']=a
    norm_factors['target'] = list(word_set)
    return norm_factors

SyntaxError: invalid syntax (4293052508.py, line 41)

In [55]:
# load pickled data
embed_dict = {}
for t in types:
    with open(f'contextual_embeddings/all_sent_{t}','rb') as f:
        embed_dict[t] = pickle.load(f)

In [58]:
embed_dict['h1'].head()

Unnamed: 0,tensors,labels,file,prev,target
0,"[0.018521532, 0.045597155, 0.002335213, -0.058...",Looks like2,2,Looks,like
1,"[0.07884548, 0.033203643, 0.01734861, -0.12496...",yourself like2,2,yourself,like
2,"[0.0039849347, 0.016776048, 0.024410218, -0.02...",support like2,2,support,like
3,"[0.11227927, 0.03317975, 0.001646313, -0.10016...",OS like2,2,OS,like
4,"[0.007854236, 0.0059105917, 0.009929255, -0.06...",around like2,2,around,like


In [59]:
norm_factors = l2_normalisation_factors(embed_dict)
norm_factors

Unnamed: 0,target,l2_h1,l2_h2
0,chicken,2.436679,2.586743
1,lamb,2.746895,2.977511
2,mistakes,1.890011,2.198264
3,wind,2.066962,2.2191
4,bank,2.693521,2.875046
5,Switzerland,2.52775,3.696086
6,form,2.720585,3.577686
7,computer,3.013174,2.923898
8,book,2.651863,3.070725
9,moves,2.27753,3.279745


In [46]:
all_types_dist_all_words = all_types_of_dist(embed_dict).set_index('target')
all_types_dist_all_words.head()

Unnamed: 0_level_0,file_dist_h1,file_dist_h2
target,Unnamed: 1_level_1,Unnamed: 2_level_1
Switzerland,0.94,0.91
bank,0.9,0.61
book,0.58,0.44
books,0.66,0.51
can,0.8,0.74


In [47]:
# fit 2 component Gaussian Mixture n times and return the result dist and acc dictionaries
# input random_seed,df is of one type [c0, h0, c1, h1], factor of the corresponding type, n_init in gmm 
# returns a df with target, dist and acc
# extraction_type is t
def gaussian_prediction_accuracy(random_seed,df,n,t='h1',metric='euclidean'):
    assert t in types
    #euclidean_dist = {}
    dist = {}
    v_of_all_targets = {}
    for word,group in df.groupby(by='target'):
        # exclude these two words
        if word == 'line' or word=='went':
            continue
        dfx = group.sort_values(by='file')
        truth = np.array(dfx['file'].apply(lambda x: int(x=='2')).to_list())
        data = np.array((dfx['tensors']).to_list())
        gm = GaussianMixture(n_components=2, random_state=random_seed,n_init=n).fit(data)
        means = gm.means_
        if metric == 'cosine':
            dist[word] = cos(means[0],means[1])
        if metric =='euclidean':
            dist[word] = euclidean(means[0],means[1])
        preds = gm.predict(data)
        # the accuracy approaches 0.5 for monoseme becuase it is a binary choice problem
        # we could have used v test here but there is no need.
        # v = v_score(preds,truth)
        # v_of_all_targets[word] = round(v,2)
        acc_1 = (np.sum(truth==preds)/len(truth)).round(2)
        acc_2 = (np.sum(truth!=preds)/len(truth)).round(2)
        if acc_1>= acc_2:
            v_of_all_targets[word] = acc_1
        else:
            v_of_all_targets[word] = acc_2
    return pd.DataFrame({'target':dist.keys(), f'dist_{t}':dist.values(), f'acc_{t}':v_of_all_targets.values()})

In [48]:
x = gaussian_prediction_accuracy(1,embed_dict['h1'],1,t='h1',metric='euclidean').sort_values(by='dist_h1')
x.head()

Unnamed: 0,target,dist_h1,acc_h1
27,tomatoes,1.13,0.76
10,duck,1.16,0.64
23,rock,1.21,0.7
24,salmon,1.24,0.67
20,power,1.28,0.77


In [49]:
# repeat the gaussian distance and clustering for n times and get the average
# extraction_type is t
def ngauss(n_random,df,n_init,t='h1',metric='euclidean'):
    # do the gaussian n_random times for an average value, return the result
    res = pd.DataFrame()
    for i in tqdm(range(n_random)):
        delta_df = gaussian_prediction_accuracy(i,df,n_init,t=t,metric=metric)
        if i==0:
            res = delta_df
        else:
            res[f'dist_{t}']+=delta_df[f'dist_{t}']
            res[f'acc_{t}']+= delta_df[f'acc_{t}']
    res[f'dist_{t}'] = res[f'dist_{t}'].apply(lambda x: (x/n_random)).round(2)
    res[f'acc_{t}'] = res[f'acc_{t}'].apply(lambda x: (x/n_random)).round(2)
    return res
# repeat the ngauss for [c0 h0 c1 h1], 4 times
def all_types_ngauss(n_random,embed_dict,n_init,types=types,metric='euclidean'):
    for t in types:
        if t==types[0]:
            res = ngauss(n_random,embed_dict[t],n_init,t=t,metric=metric)
        else:
            res = pd.DataFrame.merge(res,ngauss(n_random,embed_dict[t],n_init,t=t,metric = metric),on='target')
    return res

In [50]:
two_ng = all_types_ngauss(1,embed_dict,1)

100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.01s/it]
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.91s/it]


In [51]:
two_ng.sort_values(by='acc_h1')

Unnamed: 0,target,dist_h1,acc_h1,dist_h2,acc_h2
23,rock,1.21,0.52,2.24,0.61
19,potato,2.07,0.54,2.86,0.58
6,computer,1.76,0.54,2.54,0.56
24,salmon,1.12,0.54,2.06,0.71
20,power,1.19,0.54,2.66,0.62
18,pencil,1.28,0.55,2.32,0.55
27,tomatoes,1.09,0.56,2.31,0.68
0,Switzerland,2.33,0.6,1.76,0.66
12,fish,1.48,0.65,1.73,0.57
28,transistor,1.5,0.66,2.59,0.51


In [1277]:
# new.sort_values(by ='file_dist_h1').to_csv('Results/4_distances')

In [54]:
# get the results
results= pd.DataFrame.merge(two_ng,all_types_dist_all_words,on='target')
results

Unnamed: 0,target,dist_h1,acc_h1,dist_h2,acc_h2,file_dist_h1,file_dist_h2
0,Switzerland,2.33,0.6,1.76,0.66,0.94,0.91
1,bank,1.47,0.83,3.14,0.92,0.9,0.61
2,book,1.94,1.0,2.76,1.0,0.58,0.44
3,books,1.89,0.98,2.6,0.95,0.66,0.51
4,can,1.94,0.92,2.38,0.92,0.8,0.74
5,chicken,1.52,0.92,2.89,0.95,0.86,0.76
6,computer,1.76,0.54,2.54,0.56,0.98,0.97
7,dates,2.14,0.95,3.53,1.0,0.8,0.5
8,deduce,2.44,0.9,3.04,0.9,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
9,door,1.28,0.81,1.67,0.74,0.93,0.88


In [53]:
# apply pearson's r to dis and acc
corr = {}
p = {}
for i in types:
    a = pearsonr(results[f'dist_{i}'],results[f'acc_{i}'])
#     b = pearsonr(results[f'dist_{i}'],results[f'file_dist_{i}'])
    c = pearsonr(results[f'file_dist_{i}'],results[f'acc_{i}'])
    corr[i] = [a[0],a[1]]
    p[i] = [c[0],c[1]]
corr = pd.DataFrame.from_dict(corr,orient='index',columns =['corr_dist_acc','p_dist_acc']).reset_index()
p = pd.DataFrame.from_dict(p,orient='index', columns =['corr_file_dist_acc','p_file_dist_acc']).reset_index()
corr_p = pd.DataFrame.merge(corr,p, on = 'index').rename(columns = {'index':'types'}).set_index('types')
corr_p

AttributeError: 'bool' object has no attribute 'all'

In [1352]:
results.sort_values(by='acc_h0') #.drop(['acc_h0','acc_c1','acc_c0','acc_h1',],axis=1)
comparison = results[['target','acc_c0','acc_c1','acc_h0','acc_h1','dist_c0','dist_c1','dist_h0','dist_h1','file_dist_c0','file_dist_h0','file_dist_c1','file_dist_h1',]]
comparison.sort_values(by='acc_c1').reset_index().drop(['index'],axis=1)

Unnamed: 0,target,acc_c0,acc_c1,acc_h0,acc_h1,dist_c0,dist_c1,dist_h0,dist_h1,file_dist_c0,file_dist_h0,file_dist_c1,file_dist_h1
0,potato,0.5,0.5,0.54,0.58,4.93,5.06,2.07,2.86,0.97,0.96,0.9,0.88
1,computer,0.54,0.51,0.54,0.56,2.35,5.16,1.76,2.54,0.99,0.98,0.97,0.97
2,fish,0.65,0.52,0.65,0.57,5.08,3.35,1.48,1.73,0.97,0.97,0.9,0.88
3,rock,0.52,0.55,0.52,0.61,4.23,6.92,1.21,2.24,0.96,0.94,0.8,0.78
4,pencil,0.5,0.55,0.55,0.55,4.85,5.16,1.28,2.32,0.97,0.96,0.94,0.91
5,Switzerland,0.69,0.57,0.6,0.66,4.91,5.3,2.33,1.76,0.97,0.94,0.96,0.91
6,transistor,0.66,0.57,0.66,0.51,4.94,8.38,1.5,2.59,0.97,0.96,0.96,0.95
7,power,0.59,0.62,0.54,0.62,5.05,6.08,1.19,2.66,0.93,0.9,0.82,0.76
8,tomatoes,0.63,0.63,0.56,0.68,4.95,6.21,1.09,2.31,0.97,0.96,0.93,0.89
9,door,0.79,0.64,0.81,0.74,4.4,4.05,1.28,1.67,0.95,0.93,0.92,0.88


In [1353]:
# t(comparison['acc_h0'],comparison['acc_h1'])

In [1354]:
# l2 normalise the four ngauss results
# df contains only distantces
def normalise_results(df,norm_factors,types = types):
    new = df.copy()
    for t in types:
        new[f'dist_{t}'] =  (new[f'dist_{t}']/norm_factors[f'l2_{t}']).round(2)
        new[f'file_dist_{t}'] =  (new[f'file_dist_{t}']/norm_factors[f'l2_{t}']).round(2)
    return new

In [1355]:
dists = results[['target','dist_c0','dist_c1','dist_h0','dist_h1','file_dist_c0','file_dist_c1','file_dist_h0','file_dist_h1',]]

In [1356]:
dists = pd.merge(dists,aa,on='target')
# dists['sort'] = dists['type'].apply(lambda x: x.split('.')[-1])

In [1357]:
dists.sort_values(by='dist_h1')[['target','type','dist_c0','dist_c1','dist_h0','dist_h1']]

Unnamed: 0,target,type,dist_c0,dist_c1,dist_h0,dist_h1
8,door,nn.figr.poly,4.4,4.05,1.28,1.67
11,fish,nn.anime.poly,5.08,3.35,1.48,1.73
0,Switzerland,mono,4.91,5.3,2.33,1.76
23,salmon,nn.anime.poly,4.57,5.64,1.12,2.06
14,like,vprep.hom,6.07,4.98,2.02,2.11
22,rock,nn.hom,4.23,6.92,1.21,2.24
26,tomatoes,nn.plfd.poly,4.95,6.21,1.09,2.31
17,pencil,mono,4.85,5.16,1.28,2.32
4,can,nv.hom,5.79,6.56,1.94,2.38
9,duck,nn.anime.poly,4.16,6.29,1.03,2.46


In [1358]:
a = normalise_results(dists,norm_factors)

In [1359]:
a[['target','dist_c0','dist_c1','dist_h0','dist_h1']].sort_values(by='dist_h1')

Unnamed: 0,target,dist_c0,dist_c1,dist_h0,dist_h1
11,fish,0.48,0.4,0.56,0.5
8,door,0.44,0.51,0.48,0.54
23,salmon,0.41,0.67,0.4,0.56
22,rock,0.41,0.82,0.43,0.65
0,Switzerland,0.48,0.67,0.96,0.68
18,potato,0.46,0.54,0.68,0.68
27,transistor,0.48,0.97,0.53,0.7
26,tomatoes,0.43,0.62,0.37,0.7
13,lamb,0.46,0.66,0.52,0.7
6,computer,0.21,0.66,0.65,0.71


In [1333]:
(norm_factors[['target','l2_c0','l2_c1','l2_h0','l2_h1',]]).round(2).sort_values(by='target')

Unnamed: 0,target,l2_c0,l2_c1,l2_h0,l2_h1
3,Switzerland,10.38,6.71,2.07,2.22
6,bank,10.94,7.81,2.72,3.58
28,book,10.24,6.38,2.38,2.43
4,books,10.68,7.68,2.69,2.88
9,can,9.12,7.78,2.28,3.28
19,chicken,10.44,7.91,2.57,3.12
23,computer,11.04,8.43,2.81,3.67
0,dates,10.12,7.95,2.44,2.59
12,door,9.86,6.22,2.19,2.03
24,duck,11.1,7.73,2.92,3.11


In [1258]:
5.3/2.0

2.65