In [1]:
import os
import pandas as pd

from statsmodels.stats.inter_rater import fleiss_kappa
from sklearn.metrics import cohen_kappa_score

In [2]:
# Load the two annotators' spreadsheets with identical options; the
# 'Appropriateness' column is dropped from both because it is not analysed here.
df1, df2 = (
    pd.read_excel(path, engine="openpyxl", index_col="Unnamed: 0").drop(columns=['Appropriateness'])
    for path in ("ForManualEval.xlsx", "ForManualEval1.xlsx")
)

In [3]:
# Preview the first five rows of the first annotator's sheet.
df1.head(5)

Unnamed: 0,id,title_orig_case,source,headline,masked,predicted,new span,predicted_score,method,Humorousness,Grammaticality
0,163896,Is This Going To Make Me A Different Person? G...,AllTheNewsComponentsOne,Is this going to make me a different person? g...,Is this going to make me a different person ? ...,Is this going to make me a different person ? ...,monkey,0.999713,bert_humedit,0,2
1,143206,Rio has turned Hope Solo into public enemy No. 1,AllTheNewsComponentsOne,Rio has turned hope solo into public enemy no. 1,Rio has turned hope solo into public enemy [MA...,Rio has turned hope solo into public enemy chi...,chihuahua,0.999743,bert_humedit,1,2
2,1605,WHO says new drugs urgently needed to fight su...,Harvard,Who says new drugs urgently needed to fight Su...,Who says new drugs urgently [MASK] to fight Su...,Who says new drugs urgently going to fight Sup...,going,0.951083,bert_humedit,0,2
3,69263,A top investor nails why Microsoft is getting ...,AllTheNewsComponentsOne,A top investor nails why Microsoft is getting ...,A top investor nails [MASK] Microsoft is getti...,A top investor nails microsoft Microsoft is ge...,microsoft,0.817679,bert_humedit,0,1
4,212657,"As he moves campaign to battlegrounds, which D...",AllTheNewsKaggle,"As he moves campaign to battlegrounds, which D...","As he moves campaign to battlegrounds , which ...","As he moves campaign to battlegrounds , which ...",duck,0.98156,bert_humedit,3,3


In [4]:
# Preview the first five rows of the second annotator's sheet.
df2.head(5)

Unnamed: 0,id,title_orig_case,source,headline,masked,predicted,new span,predicted_score,method,Humorousness,Grammaticality
0,163896,Is This Going To Make Me A Different Person? G...,AllTheNewsComponentsOne,Is this going to make me a different person? g...,Is this going to make me a different person ? ...,Is this going to make me a different person ? ...,monkey,0.999713,bert_humedit,2,3
1,143206,Rio has turned Hope Solo into public enemy No. 1,AllTheNewsComponentsOne,Rio has turned hope solo into public enemy no. 1,Rio has turned hope solo into public enemy [MA...,Rio has turned hope solo into public enemy chi...,chihuahua,0.999743,bert_humedit,1,3
2,1605,WHO says new drugs urgently needed to fight su...,Harvard,Who says new drugs urgently needed to fight Su...,Who says new drugs urgently [MASK] to fight Su...,Who says new drugs urgently going to fight Sup...,going,0.951083,bert_humedit,0,3
3,69263,A top investor nails why Microsoft is getting ...,AllTheNewsComponentsOne,A top investor nails why Microsoft is getting ...,A top investor nails [MASK] Microsoft is getti...,A top investor nails microsoft Microsoft is ge...,microsoft,0.817679,bert_humedit,0,0
4,212657,"As he moves campaign to battlegrounds, which D...",AllTheNewsKaggle,"As he moves campaign to battlegrounds, which D...","As he moves campaign to battlegrounds , which ...","As he moves campaign to battlegrounds , which ...",duck,0.98156,bert_humedit,2,3


In [5]:
# Place the second annotator's two rating columns next to the first
# annotator's frame; the '_b' suffix marks the second annotator's columns.
df_merge = pd.concat(
    [df1, df2[['Humorousness', 'Grammaticality']].add_suffix('_b')],
    axis=1,
)

In [6]:
# Spearman rank correlation between the two annotators, per rating dimension.
hum_spearman = df_merge['Humorousness'].corr(df_merge['Humorousness_b'], method='spearman')
gram_spearman = df_merge['Grammaticality'].corr(df_merge['Grammaticality_b'], method='spearman')
print('Hum corr: ', hum_spearman, 'Gram corr: ', gram_spearman)

Hum corr:  0.3583235600305203 Gram corr:  0.22007040433219197


In [7]:
# Kendall's tau between the two annotators, per rating dimension.
hum_kendall = df_merge['Humorousness'].corr(df_merge['Humorousness_b'], method='kendall')
gram_kendall = df_merge['Grammaticality'].corr(df_merge['Grammaticality_b'], method='kendall')
print('Hum corr: ', hum_kendall, 'Gram corr: ', gram_kendall)

Hum corr:  0.32760024774116947 Gram corr:  0.1877074128002507


In [13]:
# Correlation of annotator #1's humorousness with the classifier score;
# 'pearson' is what Series.corr uses when no method is given.
df_merge["Humorousness"].corr(df_merge["predicted_score"], method="pearson")

-0.00404663938995562

In [14]:
# Correlation of annotator #2's humorousness with the classifier score;
# 'pearson' is what Series.corr uses when no method is given.
df_merge["Humorousness_b"].corr(df_merge["predicted_score"], method="pearson")

0.1714272338006116

In [15]:
# Means of the numeric columns only. The frame also holds text columns
# (titles, headlines, method names); pandas >= 2.0 raises TypeError on a bare
# .mean() over them instead of silently skipping them.
# NOTE(review): the recorded output lists mean_humorousness and
# mean_grammaticality, which are only created in a later cell. Run this cell
# after that one (or move it below) to reproduce the output on a fresh kernel.
df_merge.mean(numeric_only=True)

id                     126315.080000
predicted_score             0.178145
Humorousness                0.425000
Grammaticality              2.245000
Humorousness_b              0.725000
Grammaticality_b            1.245000
mean_humorousness           0.575000
mean_grammaticality         1.745000
dtype: float64

In [16]:
# Per-item average of the two annotators' ratings. Column arithmetic replaces
# the row-by-row apply(axis=1): same values, no Python call per row.
df_merge['mean_humorousness'] = (df_merge['Humorousness'] + df_merge['Humorousness_b']) / 2
df_merge['mean_grammaticality'] = (df_merge['Grammaticality'] + df_merge['Grammaticality_b']) / 2

In [17]:
# Summary statistics of the averaged humorousness rating.
df_merge.loc[:, 'mean_humorousness'].describe()

count    200.000000
mean       0.575000
std        0.690477
min        0.000000
25%        0.000000
50%        0.500000
75%        1.000000
max        3.000000
Name: mean_humorousness, dtype: float64

In [18]:
# Summary statistics of the averaged grammaticality rating.
df_merge.loc[:, 'mean_grammaticality'].describe()

count    200.000000
mean       1.745000
std        0.795701
min        0.000000
25%        1.000000
50%        1.750000
75%        2.500000
max        3.000000
Name: mean_grammaticality, dtype: float64

Попробуем по методам отдельно:

In [19]:
# Distinct generation methods present in the evaluation set.
pd.unique(df_merge['method'])

array(['bert_humedit', 'bert_jokes', 'colloc', 'jokecolloc'], dtype=object)

In [21]:
def analyze_method(method, data=None):
    """Print rating statistics and annotator agreement for one generation method.

    Parameters
    ----------
    method : str
        Value of the ``method`` column; only rows with this value are analysed.
    data : pandas.DataFrame, optional
        Frame to analyse. It must provide the columns ``method``,
        ``Humorousness``, ``Humorousness_b``, ``mean_humorousness``,
        ``Grammaticality``, ``Grammaticality_b``, ``mean_grammaticality`` and
        ``predicted_score``. When omitted, the notebook-level ``df_merge`` is
        used, so existing ``analyze_method(name)`` calls keep working.

    Prints
    ------
    For humorousness and then grammaticality: ``describe()`` of each
    annotator's rating and of the mean rating, followed by Cohen's kappa
    between the two annotators. Finally, the correlation (``Series.corr`` with
    its default method) of each annotator's humorousness with
    ``predicted_score``.
    """
    # Resolve the frame at call time so the function does not silently depend
    # on whatever ``df_merge`` happened to be when it was defined.
    frame = df_merge if data is None else data
    subs = frame[frame['method'] == method]

    # Both rating dimensions share the same report layout.
    for dimension, mean_col in (('Humorousness', 'mean_humorousness'),
                                ('Grammaticality', 'mean_grammaticality')):
        second_col = f"{dimension}_b"
        for feat in (dimension, second_col, mean_col):
            print(f"{feat}:")
            print(subs[feat].describe())
        print(f"{dimension} kappa: {cohen_kappa_score(subs[dimension], subs[second_col])}")

    print(f"Annot#1 corr with ColBert: {subs['Humorousness'].corr(subs['predicted_score'])}")
    print(f"Annot#2 corr with ColBert: {subs['Humorousness_b'].corr(subs['predicted_score'])}")


In [22]:
# Report for the 'bert_humedit' method.
analyze_method(method="bert_humedit")

Humorousness:
count    50.000000
mean      0.860000
std       1.010355
min       0.000000
25%       0.000000
50%       0.500000
75%       2.000000
max       3.000000
Name: Humorousness, dtype: float64
Humorousness_b:
count    50.000000
mean      1.040000
std       1.068281
min       0.000000
25%       0.000000
50%       1.000000
75%       2.000000
max       3.000000
Name: Humorousness_b, dtype: float64
mean_humorousness:
count    50.000000
mean      0.950000
std       0.904919
min       0.000000
25%       0.000000
50%       0.750000
75%       1.875000
max       3.000000
Name: mean_humorousness, dtype: float64
Humorousness kappa: 0.23932124049151537
Grammaticality:
count    50.000000
mean      2.600000
std       0.728431
min       0.000000
25%       2.000000
50%       3.000000
75%       3.000000
max       3.000000
Name: Grammaticality, dtype: float64
Grammaticality_b:
count    50.00000
mean      1.70000
std       1.28174
min       0.00000
25%       0.00000
50%       2.00000
75%       3.

In [23]:
# Report for the 'bert_jokes' method.
analyze_method(method='bert_jokes')

Humorousness:
count    50.000000
mean      0.220000
std       0.581694
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       3.000000
Name: Humorousness, dtype: float64
Humorousness_b:
count    50.000000
mean      0.660000
std       0.871546
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       3.000000
Name: Humorousness_b, dtype: float64
mean_humorousness:
count    50.00000
mean      0.44000
std       0.52138
min       0.00000
25%       0.00000
50%       0.00000
75%       1.00000
max       1.50000
Name: mean_humorousness, dtype: float64
Humorousness kappa: 0.1329479768786127
Grammaticality:
count    50.000000
mean      2.320000
std       0.867556
min       0.000000
25%       2.000000
50%       3.000000
75%       3.000000
max       3.000000
Name: Grammaticality, dtype: float64
Grammaticality_b:
count    50.00000
mean      1.28000
std       1.03095
min       0.00000
25%       0.00000
50%       1.00000
75%       2.00000
max

In [24]:
# Report for the 'colloc' method.
analyze_method(method='colloc')

Humorousness:
count    50.000000
mean      0.280000
std       0.607437
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       2.000000
Name: Humorousness, dtype: float64
Humorousness_b:
count    50.000000
mean      0.580000
std       0.758355
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       3.000000
Name: Humorousness_b, dtype: float64
mean_humorousness:
count    50.000000
mean      0.430000
std       0.505177
min       0.000000
25%       0.000000
50%       0.500000
75%       0.500000
max       1.500000
Name: mean_humorousness, dtype: float64
Humorousness kappa: 0.05063291139240511
Grammaticality:
count    50.000000
mean      2.000000
std       1.010153
min       0.000000
25%       1.250000
50%       2.000000
75%       3.000000
max       3.000000
Name: Grammaticality, dtype: float64
Grammaticality_b:
count    50.000000
mean      0.900000
std       0.974156
min       0.000000
25%       0.000000
50%       1.000000
75%   

In [25]:
# Report for the 'jokecolloc' method.
analyze_method(method='jokecolloc')

Humorousness:
count    50.000000
mean      0.340000
std       0.688388
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       3.000000
Name: Humorousness, dtype: float64
Humorousness_b:
count    50.000000
mean      0.620000
std       0.779586
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       3.000000
Name: Humorousness_b, dtype: float64
mean_humorousness:
count    50.000000
mean      0.480000
std       0.630516
min       0.000000
25%       0.000000
50%       0.500000
75%       0.500000
max       2.500000
Name: mean_humorousness, dtype: float64
Humorousness kappa: 0.24642049736247174
Grammaticality:
count    50.000000
mean      2.060000
std       1.057722
min       0.000000
25%       2.000000
50%       2.000000
75%       3.000000
max       3.000000
Name: Grammaticality, dtype: float64
Grammaticality_b:
count    50.000000
mean      1.100000
std       0.974156
min       0.000000
25%       0.000000
50%       1.000000
75%   

In [25]:
# Correlation of the averaged humorousness rating with the classifier score;
# 'pearson' is what Series.corr uses when no method is given.
df_merge['mean_humorousness'].corr(df_merge['predicted_score'], method='pearson')

0.10828450041925756

In [26]:
# Annotator #1's humorousness vs. the classifier score (Series.corr default
# method, written out explicitly).
df_merge['Humorousness'].corr(df_merge['predicted_score'], method='pearson')

-0.00404663938995562

In [27]:
# Annotator #2's humorousness vs. the classifier score (Series.corr default
# method, written out explicitly).
df_merge['Humorousness_b'].corr(df_merge['predicted_score'], method='pearson')

0.1714272338006116

In [10]:
# Cohen's kappa between the two annotators' humorousness ratings, all methods pooled.
cohen_kappa_score(y1=df_merge["Humorousness"], y2=df_merge["Humorousness_b"])

0.1900671977215077

In [11]:
# Cohen's kappa between the two annotators' grammaticality ratings, all methods pooled.
cohen_kappa_score(y1=df_merge["Grammaticality"], y2=df_merge["Grammaticality_b"])

0.04099584008129298

In [12]:
# Number of items on which the annotators gave identical ratings for both
# humorousness and grammaticality (count of True values in the combined mask).
int(
    (
        df_merge['Humorousness'].eq(df_merge['Humorousness_b'])
        & df_merge['Grammaticality'].eq(df_merge['Grammaticality_b'])
    ).sum()
)

24

In [13]:
# Rows where the annotators agree on both humorousness and grammaticality.
full_agreement = df_merge.loc[
    df_merge['Humorousness'].eq(df_merge['Humorousness_b'])
    & df_merge['Grammaticality'].eq(df_merge['Grammaticality_b'])
]

In [14]:
# Write the fully agreed rows as a tab-separated file.
full_agreement.to_csv(path_or_buf="FullAgreement.tsv", sep='\t')

In [5]:
# Rows where the annotators agree on humorousness but differ on grammaticality.
only_humour_agreement = df_merge.loc[
    df_merge["Humorousness"].eq(df_merge["Humorousness_b"])
    & df_merge["Grammaticality"].ne(df_merge["Grammaticality_b"])
]

In [9]:
# Write the humorousness-only agreement rows as a tab-separated file.
only_humour_agreement.to_csv(path_or_buf="OnlyHumAgreement.tsv", sep='\t')

In [10]:
# Rows where the annotators agree on grammaticality but differ on humorousness.
only_grammaticality_agreement = df_merge.loc[
    df_merge["Humorousness"].ne(df_merge["Humorousness_b"])
    & df_merge["Grammaticality"].eq(df_merge["Grammaticality_b"])
]

In [12]:
# Write the grammaticality-only agreement rows as a tab-separated file.
only_grammaticality_agreement.to_csv(path_or_buf="OnlyGramAgreement.tsv", sep='\t')