In [1]:
%matplotlib inline

import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np

import json
import os
import glob
import datetime
from scipy.stats import wilcoxon
from sklearn.metrics import mean_squared_error
from scipy.stats import mannwhitneyu

### Organization

#### di_shuffle

In [14]:
# Run output directories for the Organization / DI-shuffle comparison. Each one is
# handed to get_gold_score / get_prdct_score, which read prediction_f{i}.json from it.
# NOTE(review): "base", "finetune" and "no_finetune" presumably name the model
# variants being compared — confirm which hash belongs to which training setup.
org_base_di = "/home/mim/repo/Document_representation/output/e885c38c446af586e887fb790dd90930"
org_finetune_di = "/home/mim/repo/Document_representation/output/0aa2570ced889c4e88ae1554253cb412"
org_no_finetune_di ="/home/mim/repo/Document_representation/output/46f970b1c6379f85b7ccc1fe68a8af14"

In [15]:
def get_prdct_score(file, n_folds=5):
    """Collect the system predictions from every fold file of one run.

    Reads ``prediction_f0.json`` ... ``prediction_f{n_folds-1}.json`` inside
    ``file`` and concatenates each file's ``"system"`` list in fold order.

    Parameters
    ----------
    file : str
        Path of the run's output directory (a directory, despite the name).
    n_folds : int, optional
        How many fold files to read. Defaults to 5, the count that used to be
        hard-coded.

    Returns
    -------
    list
        All ``"system"`` entries, fold 0 first.

    Raises
    ------
    FileNotFoundError
        If one of the expected fold files does not exist.
    KeyError
        If a fold file has no ``"system"`` entry.
    """
    scores = []
    for fold in range(n_folds):
        fold_path = os.path.join(file, "prediction_f{}.json".format(fold))
        with open(fold_path) as f:
            fold_data = json.load(f)
        # Keep fold order so that index i refers to the same item across runs.
        scores.extend(fold_data["system"])

    return scores

In [16]:
def get_gold_score(file, n_folds=5):
    """Collect the gold labels from every fold file of one run.

    Reads ``prediction_f0.json`` ... ``prediction_f{n_folds-1}.json`` inside
    ``file`` and concatenates each file's ``"gold"`` list in fold order.

    Parameters
    ----------
    file : str
        Path of the run's output directory (a directory, despite the name).
    n_folds : int, optional
        How many fold files to read. Defaults to 5, the count that used to be
        hard-coded.

    Returns
    -------
    list
        All ``"gold"`` entries, fold 0 first.

    Raises
    ------
    FileNotFoundError
        If one of the expected fold files does not exist.
    KeyError
        If a fold file has no ``"gold"`` entry.
    """
    gold_scores = []
    for fold in range(n_folds):
        fold_path = os.path.join(file, "prediction_f{}.json".format(fold))
        with open(fold_path) as f:
            fold_data = json.load(f)
        # Keep fold order so that index i refers to the same item across runs.
        gold_scores.extend(fold_data["gold"])

    return gold_scores

In [17]:
def get_mse(gold, prdct):
    """Return the squared error of every (gold, prediction) pair.

    Despite the name, nothing is averaged across items: entry ``i`` of the
    result is the squared error of item ``i`` alone, so the list can be fed to
    a paired per-item test. If the items are themselves vectors, the squared
    error is averaged over each item's components.

    Parameters
    ----------
    gold : sequence of numbers (or of equal-length numeric sequences)
        Reference values.
    prdct : sequence of the same shape as ``gold``
        Predicted values.

    Returns
    -------
    list of float
        One squared error per item, in input order.

    Raises
    ------
    ValueError
        If ``gold`` and ``prdct`` do not have the same shape. Mismatched
        lengths would otherwise pair the wrong items (or silently drop the
        surplus predictions).
    """
    gold_arr = np.asarray(gold, dtype=float)
    prdct_arr = np.asarray(prdct, dtype=float)

    if gold_arr.shape != prdct_arr.shape:
        raise ValueError(
            "gold and prdct must have the same shape, got {} and {}".format(
                gold_arr.shape, prdct_arr.shape
            )
        )

    # One vectorised pass instead of one metric call per item.
    squared = (gold_arr - prdct_arr) ** 2
    if squared.ndim > 1:
        # Vector-valued items: average over everything except the item axis.
        squared = squared.mean(axis=tuple(range(1, squared.ndim)))

    return squared.tolist()

In [18]:
# Base run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
org_gold_base_di, org_prdct_base_di = (
    get_gold_score(org_base_di),
    get_prdct_score(org_base_di),
)
org_mse_base_di = get_mse(org_gold_base_di, org_prdct_base_di)

In [19]:
# no_finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
org_gold_no_finetune_di, org_prdct_no_finetune_di = (
    get_gold_score(org_no_finetune_di),
    get_prdct_score(org_no_finetune_di),
)
org_mse_no_finetune_di = get_mse(org_gold_no_finetune_di, org_prdct_no_finetune_di)

In [20]:
# finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
org_gold_finetune_di, org_prdct_finetune_di = (
    get_gold_score(org_finetune_di),
    get_prdct_score(org_finetune_di),
)
org_mse_finetune_di = get_mse(org_gold_finetune_di, org_prdct_finetune_di)

In [21]:
# Signed-rank comparison of per-item squared errors: base run vs. no_finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Organization: di_shuffle', 'no_finetune', sep='\n')
stat, p = wilcoxon(org_mse_base_di, org_mse_no_finetune_di)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Organization: di_shuffle
no_finetune
Statistics=240871.000, p=0.236
Same distribution (fail to reject H0)


In [22]:
# Signed-rank comparison of per-item squared errors: base run vs. finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Organization: di shuffle', 'finetune', sep='\n')
stat, p = wilcoxon(org_mse_base_di, org_mse_finetune_di)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Organization: di shuffle
finetune
Statistics=242273.000, p=0.302
Same distribution (fail to reject H0)


#### sentence shuffle

In [32]:
# Run output directories for the Organization / sentence-shuffle comparison.
# org_base_sent is the same directory as org_base_di, so both comparisons share one base run.
# NOTE(review): "finetune" / "no_finetune" presumably name the model variants —
# confirm which hash belongs to which training setup.
org_base_sent = "/home/mim/repo/Document_representation/output/e885c38c446af586e887fb790dd90930"
org_finetune_sent = "/home/mim/repo/Document_representation/output/0cb7a2429b6cc73297413c20570c824f"
org_no_finetune_sent ="/home/mim/repo/Document_representation/output/752aa6160e706a6ffe0f91a1e423b40a"

In [33]:
# Base run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
org_gold_base_sent, org_prdct_base_sent = (
    get_gold_score(org_base_sent),
    get_prdct_score(org_base_sent),
)
org_mse_base_sent = get_mse(org_gold_base_sent, org_prdct_base_sent)

In [34]:
# no_finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
org_gold_no_finetune_sent, org_prdct_no_finetune_sent = (
    get_gold_score(org_no_finetune_sent),
    get_prdct_score(org_no_finetune_sent),
)
org_mse_no_finetune_sent = get_mse(org_gold_no_finetune_sent, org_prdct_no_finetune_sent)

In [35]:
# finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
org_gold_finetune_sent, org_prdct_finetune_sent = (
    get_gold_score(org_finetune_sent),
    get_prdct_score(org_finetune_sent),
)
org_mse_finetune_sent = get_mse(org_gold_finetune_sent, org_prdct_finetune_sent)

In [36]:
# Signed-rank comparison of per-item squared errors: base run vs. no_finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Organization: sent_shuffle', 'no_finetune', sep='\n')
stat, p = wilcoxon(org_mse_base_sent, org_mse_no_finetune_sent)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Organization: sent_shuffle
no_finetune
Statistics=223678.000, p=0.002
Different distribution (reject H0)


In [37]:
# Signed-rank comparison of per-item squared errors: base run vs. finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Organization: sent shuffle', 'finetune', sep='\n')
stat, p = wilcoxon(org_mse_base_sent, org_mse_finetune_sent)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Organization: sent shuffle
finetune
Statistics=224649.000, p=0.003
Different distribution (reject H0)


### Argument strength

#### di_shuffle

In [38]:
# Run output directories for the Argument strength / DI-shuffle comparison. Each one is
# handed to get_gold_score / get_prdct_score, which read prediction_f{i}.json from it.
# NOTE(review): "base", "finetune" and "no_finetune" presumably name the model
# variants being compared — confirm which hash belongs to which training setup.
arg_base_di = "/home/mim/repo/Document_representation/output/c223f9171d2065520643cf6d654dc138"
arg_finetune_di = "/home/mim/repo/Document_representation/output/b1809650ecdefed27f07e32a05dc3ade"
arg_no_finetune_di ="/home/mim/repo/Document_representation/output/cee23dc2ed54aed0911230d84151441e"

In [39]:
# Base run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
arg_gold_base_di, arg_prdct_base_di = (
    get_gold_score(arg_base_di),
    get_prdct_score(arg_base_di),
)
arg_mse_base_di = get_mse(arg_gold_base_di, arg_prdct_base_di)

In [40]:
# no_finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
arg_gold_no_finetune_di, arg_prdct_no_finetune_di = (
    get_gold_score(arg_no_finetune_di),
    get_prdct_score(arg_no_finetune_di),
)
arg_mse_no_finetune_di = get_mse(arg_gold_no_finetune_di, arg_prdct_no_finetune_di)

In [41]:
# finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
arg_gold_finetune_di, arg_prdct_finetune_di = (
    get_gold_score(arg_finetune_di),
    get_prdct_score(arg_finetune_di),
)
arg_mse_finetune_di = get_mse(arg_gold_finetune_di, arg_prdct_finetune_di)

In [42]:
# Signed-rank comparison of per-item squared errors: base run vs. no_finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Argument Strength: di_shuffle', 'no_finetune', sep='\n')
stat, p = wilcoxon(arg_mse_base_di, arg_mse_no_finetune_di)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Argument Strength: di_shuffle
no_finetune
Statistics=246339.000, p=0.669
Same distribution (fail to reject H0)


In [43]:
# Signed-rank comparison of per-item squared errors: base run vs. finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Argument Strength: di shuffle', 'finetune', sep='\n')
stat, p = wilcoxon(arg_mse_base_di, arg_mse_finetune_di)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Argument Strength: di shuffle
finetune
Statistics=242302.000, p=0.384
Same distribution (fail to reject H0)


#### sentence shuffle

In [44]:
# Run output directories for the Argument strength / sentence-shuffle comparison.
# arg_base_sent is the same directory as arg_base_di, so both comparisons share one base run.
# NOTE(review): "finetune" / "no_finetune" presumably name the model variants —
# confirm which hash belongs to which training setup.
arg_base_sent = "/home/mim/repo/Document_representation/output/c223f9171d2065520643cf6d654dc138"
arg_finetune_sent = "/home/mim/repo/Document_representation/output/51a2e8727c1fe1fa27d190f879ce078d"
arg_no_finetune_sent ="/home/mim/repo/Document_representation/output/fcbeada6ec3cb1984984fe3ec9cb664e"

In [45]:
# Base run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
arg_gold_base_sent, arg_prdct_base_sent = (
    get_gold_score(arg_base_sent),
    get_prdct_score(arg_base_sent),
)
arg_mse_base_sent = get_mse(arg_gold_base_sent, arg_prdct_base_sent)

In [46]:
# no_finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
arg_gold_no_finetune_sent, arg_prdct_no_finetune_sent = (
    get_gold_score(arg_no_finetune_sent),
    get_prdct_score(arg_no_finetune_sent),
)
arg_mse_no_finetune_sent = get_mse(arg_gold_no_finetune_sent, arg_prdct_no_finetune_sent)

In [47]:
# finetune run: pool gold labels and system outputs over the fold files,
# then compute one squared error per item.
arg_gold_finetune_sent, arg_prdct_finetune_sent = (
    get_gold_score(arg_finetune_sent),
    get_prdct_score(arg_finetune_sent),
)
arg_mse_finetune_sent = get_mse(arg_gold_finetune_sent, arg_prdct_finetune_sent)

In [50]:
# Signed-rank comparison of per-item squared errors: base run vs. no_finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Argument Strength: sent shuffle', 'no_finetune', sep='\n')
stat, p = wilcoxon(arg_mse_base_sent, arg_mse_no_finetune_sent)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Argument Strength: sent shuffle
no_finetune
Statistics=232016.000, p=0.046
Different distribution (reject H0)


In [51]:
# Signed-rank comparison of per-item squared errors: base run vs. finetune run.
# NOTE(review): the test pairs element i of both lists — this assumes both runs
# list the same items in the same order; confirm the fold splits are identical.
print('Wilcoxon Signed-Rank Test', 'Argument Strength: sent shuffle', 'finetune', sep='\n')
stat, p = wilcoxon(arg_mse_base_sent, arg_mse_finetune_sent)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05  # significance threshold for the verdict below
print('Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)')

Wilcoxon Signed-Rank Test
Argument Strength: sent shuffle
finetune
Statistics=248003.000, p=0.806
Same distribution (fail to reject H0)
