# LIAR-New labeling first round

In [21]:
import pandas as pd

df = pd.read_csv('liar-new_eval_gpt-4-0125_manuallabels.tsv', sep='\t')
df_originally_correct = df[df.prediction_matches_label == 1]
df_originally_wrong = df[df.prediction_matches_label == 0]

In [22]:
df_originally_wrong.labeler_1.value_counts()

Unnamed: 0_level_0,count
labeler_1,Unnamed: 1_level_1
Predictive system is not wrong,64
Predictive system is wrong,23
Uncertain / open to interpretation,13


In [23]:
df_originally_wrong.labeler_2.value_counts()

Unnamed: 0_level_0,count
labeler_2,Unnamed: 1_level_1
Predictive system is not wrong,43
Predictive system is wrong,30
Uncertain / open to interpretation,27


In [24]:
len(df_originally_wrong[(df_originally_wrong.labeler_1 == "Predictive system is not wrong") & (df_originally_wrong.labeler_2 == "Predictive system is not wrong")])

38

In [25]:
len(df_originally_wrong[df_originally_wrong.labeler_1 == df_originally_wrong.labeler_2])

60

In [26]:
from sklearn.metrics import cohen_kappa_score

# Calculate Cohen's Kappa Score
kappa_score = cohen_kappa_score(df['labeler_1'], df['labeler_2'])

print(f"Cohen’s Kappa Score: {kappa_score}")


Cohen’s Kappa Score: 0.3590417148261007


# LIAR-New disagreement resolution

In [27]:
df_originally_wrong.resolution_label.value_counts()

Unnamed: 0_level_0,count
resolution_label,Unnamed: 1_level_1
resolution_not_conducted,60
Predictive system is not wrong,17
Predictive system is wrong,14
Uncertain,9


In [28]:
import numpy as np

def combine_labels(row):
    if row['resolution_label'] != 'resolution_not_conducted':
      return row['resolution_label']
    else: # agreed originally
      return row['labeler_1']

df_originally_wrong['final_label'] = df_originally_wrong.apply(lambda x: combine_labels(x), axis=1)

def sync_labels(row):
  if 'not wrong' in row['final_label'].lower():
    return 'NW'
  elif 'wrong' in row['final_label'].lower():
    return 'W'
  else:
    return 'U'

df_originally_wrong['final_label'] = df_originally_wrong.apply(lambda x: sync_labels(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_originally_wrong['final_label'] = df_originally_wrong.apply(lambda x: combine_labels(x), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_originally_wrong['final_label'] = df_originally_wrong.apply(lambda x: sync_labels(x), axis=1)


In [29]:
df_originally_wrong.final_label.value_counts()

Unnamed: 0_level_0,count
final_label,Unnamed: 1_level_1
NW,55
W,30
U,15


# LIAR-New examples where predicted label already matched dataset one

In [30]:
df_originally_correct.labeler_1.value_counts()

Unnamed: 0_level_0,count
labeler_1,Unnamed: 1_level_1
Predictive system is not wrong,83
Uncertain / open to interpretation,14
Predictive system is wrong,3


In [31]:
df_originally_correct.labeler_2.value_counts()

Unnamed: 0_level_0,count
labeler_2,Unnamed: 1_level_1
Predictive system is not wrong,87
Uncertain / open to interpretation,9
Predictive system is wrong,4


In [32]:
# number of cases where they agree
sum(df_originally_correct.labeler_1 == df_originally_correct.labeler_2)

79

In [39]:
# the labels on cases where they agree
df_originally_correct[df_originally_correct.labeler_1 == df_originally_correct.labeler_2].labeler_1.value_counts()

Unnamed: 0_level_0,count
labeler_1,Unnamed: 1_level_1
Predictive system is not wrong,76
Uncertain / open to interpretation,2
Predictive system is wrong,1


In [36]:
# number of cases where one labeler said it was wrong and one said not wrong
sum(((df_originally_correct.labeler_1 == 'Predictive system is not wrong') & (df_originally_correct.labeler_2 == 'Predictive system is wrong')) | ((df_originally_correct.labeler_2 == 'Predictive system is not wrong') & (df_originally_correct.labeler_1 == 'Predictive system is wrong')))

2

In [45]:
# what labeler_2 said, in cases where labeler_1 said uncertain and labeler_2 said something else
df_originally_correct[(df_originally_correct.labeler_1 != df_originally_correct.labeler_2) & (df_originally_correct.labeler_1 == 'Uncertain / open to interpretation')].labeler_2.value_counts()

Unnamed: 0_level_0,count
labeler_2,Unnamed: 1_level_1
Predictive system is not wrong,10
Predictive system is wrong,2


In [47]:
# converse of above, what labeler_1 said, in cases where labeler_2 said uncertain and labeler_1 said something else
df_originally_correct[(df_originally_correct.labeler_1 != df_originally_correct.labeler_2) & (df_originally_correct.labeler_2 == 'Uncertain / open to interpretation')].labeler_1.value_counts()

Unnamed: 0_level_0,count
labeler_1,Unnamed: 1_level_1
Predictive system is not wrong,6
Predictive system is wrong,1


# FEVER

# New manual eval of prediction quality


In [54]:
import pandas as pd

df = pd.read_csv('fever_eval_gpt-3.5-0125_manuallabels.tsv', sep='\t')

In [55]:
df

Unnamed: 0.1,Unnamed: 0,claim,label,evidence_wiki_url,prediction_matches_label,messages,labeler_1,labeler_2
0,10,The End of Time is the Tenth Doctor's final ad...,REFUTES,Tenth_Doctor,False,Your task is to analyze the factuality of the...,Predictive system is not wrong,Predictive system is not wrong
1,11,Gal Gadot was ranked behind Bar Refaeli for Is...,SUPPORTS,Gal_Gadot,False,Your task is to analyze the factuality of the...,Predictive system is not wrong,Predictive system is not wrong
2,12,Vietnam is a place.,SUPPORTS,Vietnam,False,Your task is to analyze the factuality of the...,Predictive system is not wrong,Predictive system is not wrong
3,13,Cleopatre premiered in an author's lifetime.,SUPPORTS,Cléopâtre,False,Your task is to analyze the factuality of the...,Uncertain / open to interpretation,Predictive system is wrong
4,14,Benjamin Franklin was born in 1790.,REFUTES,United_States_Postmaster_General,False,Your task is to analyze the factuality of the...,Predictive system is wrong,Predictive system is wrong
...,...,...,...,...,...,...,...,...
94,105,Jefferson Davis grew up in the United States.,SUPPORTS,Jefferson_Davis,False,Your task is to analyze the factuality of the...,Predictive system is not wrong,Predictive system is wrong
95,106,Goosebumps (film) is based on a story by Scott...,SUPPORTS,Goosebumps_-LRB-film-RRB-,False,Your task is to analyze the factuality of the...,Predictive system is wrong,Predictive system is not wrong
96,107,Robinson Crusoe on Mars was produced by Paramo...,SUPPORTS,Paramount_Pictures,False,Your task is to analyze the factuality of the...,Uncertain / open to interpretation,Uncertain / open to interpretation
97,108,"Diana, Princess of Wales never married.",REFUTES,"Diana,_Princess_of_Wales",False,Your task is to analyze the factuality of the...,Predictive system is wrong,Predictive system is wrong


In [56]:
# marked NOT wrong by both
len(df[(df.labeler_1 == "Predictive system is not wrong") & (df.labeler_2 == "Predictive system is not wrong")])

36

In [57]:
# marked wrong by both
len(df[(df.labeler_1 == "Predictive system is wrong") & (df.labeler_2 == "Predictive system is wrong")])

31

In [58]:
# marked NOT wrong by one only
len(df[((df.labeler_1 == "Predictive system is not wrong") | (df.labeler_2 == "Predictive system is not wrong")) & ((df.labeler_1 != "Predictive system is not wrong") | (df.labeler_2 != "Predictive system is not wrong")) ])

19

In [59]:
# percentage agreement
len(df[df.labeler_1 == df.labeler_2])

70