# import libraries

In [27]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import f_oneway
from scipy.stats import ttest_ind as ttest
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import v_measure_score as v_score
X = np.array([[1, 2], [1, 4], [1, 0], [10, 666], [10, 4], [10, 0]])
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
types = ['h1','h2']

In [28]:
aa ={'Switzerland':'mono','thousands':'mono','pencil':'mono','transistor':'mono','computer':'mono',\
     'door':'nn.poly','potato':'nn.poly','questions':'nv.poly','mistakes':'nv.poly',\
     'thought':'nv.poly','fish':'nn.poly','salmon':'nn.poly','like':'vprep.hom',\
     'tomatoes':'nn.poly','lamb':'nn.poly','chicken':'nn.poly','can':'nv.hom',\
     'power':'nn.poly','bank':'nn.hom','rock':'nn.hom','books':'nv.hom',\
     'duck':'nn.poly','dates':'nv.hom','even':'adjadv.hom','pupil':'nn.hom',\
     'tears':'nv.hom','moves':'nv.poly','book':'nv.hom','form':'nv.hom','watch':'nv.hom','wind':'nv.hom'} 
aa =pd.DataFrame.from_dict(aa,orient = 'index').reset_index().rename(columns = {'index':'target',0:'type'})
aa

Unnamed: 0,target,type
0,Switzerland,mono
1,thousands,mono
2,pencil,mono
3,transistor,mono
4,computer,mono
5,door,nn.poly
6,potato,nn.poly
7,questions,nv.poly
8,mistakes,nv.poly
9,thought,nv.poly


In [29]:
def euclidean(x,y):
    dist = np.linalg.norm(x-y)
    return dist.round(2)
def cos(x,y):
    dist = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
    return dist.round(2)
#calculate the distance between two files
def single_type_word_dist(df,t):
    # t is the extraction type
    #four embedding is a dict key = c0 h0 c1 h1, val =  6 columns
    res = {}
    for word,dfx in df.groupby(by='target'):
        if word == 'went' or word =='line':
            continue
        dfa = dfx[dfx['file']=='1']
        dfb = dfx[dfx['file']=='2']
        x1 = dfa['tensors'].agg('mean')
        x2 = dfb['tensors'].agg('mean')
        #res[word] = euclidean(x1,x2)
        res[word] = euclidean(x1,x2)
    file_dis = pd.DataFrame.from_dict(res, orient='index',columns = [f'file_dist_{t}']).reset_index().rename(columns={"index": "target"})
    return file_dis
# calculate the ground truth distance between file
# input df contatin all embeddings of all target words
# output the distances of all target words
def all_types_of_dist(embed_dict,types=types):
    for t in types:
        delta_df = single_type_word_dist(embed_dict[t],t)
        if t == types[0]:
            df = delta_df
        else:
            df = pd.merge(df,delta_df, on='target')
    return df

In [30]:
# load pickled data
embed_dict = {}
for t in types:
    with open(f'contextual_embeddings/all_sent_{t}','rb') as f:
        embed_dict[t] = pickle.load(f)

In [5]:
embed_dict['h2'].head()

Unnamed: 0,tensors,labels,file,prev,target
0,"[0.100887455, 0.01806052, 0.0100426935, 0.0538...",Looks like2,2,Looks,like
1,"[0.03834144, 0.028573925, 0.029391237, 0.07486...",floor like2,2,floor,like
2,"[0.025371015, 0.0147670405, 0.0077385097, 0.05...",yourself like2,2,yourself,like
3,"[0.10195144, 0.014568253, 0.0050676605, 0.1015...",support like2,2,support,like
4,"[0.053712066, 0.012888801, -0.01040047, 0.1084...",OS like2,2,OS,like


In [17]:
all_types_dist_all_words = all_types_of_dist(embed_dict).set_index('target')
dists = pd.merge(aa,all_types_dist_all_words,on='target')
dists['crit']=dists['type'].apply(lambda x:x.split('.')[-1])
dists = dists.sort_values(by='file_dist_h1')
dists
# dists.sort_values(by='file_dist_h1').to_csv('tables/distance.csv')

Unnamed: 0,target,type,file_dist_h1,file_dist_h2,crit
1,thousands,mono,0.48,0.89,mono
4,computer,mono,0.48,0.92,mono
10,fish,nn.poly,0.7,1.74,poly
2,pencil,mono,0.72,1.39,mono
3,transistor,mono,0.8,1.2,mono
0,Switzerland,mono,0.84,1.11,mono
6,potato,nn.poly,0.85,2.1,poly
13,tomatoes,nn.poly,0.86,1.58,poly
11,salmon,nn.poly,0.89,2.06,poly
5,door,nn.poly,0.99,1.53,poly


In [18]:
h1 = {}
h2 = {}
for label,group in dists.groupby(by='crit'):
    h1[label]=group['file_dist_h1'].to_numpy()
    h2[label]=group['file_dist_h2'].to_numpy()

In [19]:
f_oneway(h1['mono'],h1['poly'],h1['hom'])

F_onewayResult(statistic=9.41891886637677, pvalue=0.0007444863018499986)

In [20]:
f_oneway(h2['mono'],h2['poly'],h2['hom'])

F_onewayResult(statistic=20.17870188011014, pvalue=3.7429549763751104e-06)

In [23]:
ttest(dists['file_dist_h2'],dists['file_dist_h1'])

Ttest_indResult(statistic=5.123594926377686, pvalue=3.358582153265804e-06)

In [25]:
ttest(h1['hom'],h1['poly'])

Ttest_indResult(statistic=1.9050688139074101, pvalue=0.0688216867308275)

In [26]:
ttest(h2['hom'],h2['poly'])

Ttest_indResult(statistic=2.0866358368559057, pvalue=0.04771860730413095)

In [107]:
outdists=dists.copy().drop(['crit'],axis=1)
outdists['file_dist_h1']=outdists['file_dist_h1'].apply(lambda x:str(x)[:4])
outdists['file_dist_h2']=outdists['file_dist_h2'].apply(lambda x:str(x)[:4])
outdists.sort_values(by='file_dist_h1').to_csv('tables/distance.csv')
outdists

Unnamed: 0,target,type,file_dist_h1,file_dist_h2
1,thousands,mono,0.47,0.88
4,computer,mono,0.47,0.92
10,fish,nn.poly,0.69,1.74
2,pencil,mono,0.72,1.38
3,transistor,mono,0.8,1.2
0,Switzerland,mono,0.83,1.11
6,potato,nn.poly,0.85,2.09
13,tomatoes,nn.poly,0.86,1.58
11,salmon,nn.poly,0.88,2.05
5,door,nn.poly,0.99,1.52
