In [1]:
import json
import numpy as np
import pandas as pd
from tabulate import tabulate
from sklearn.metrics import cohen_kappa_score
from sectors.utils.dataset import stratified_split
from sectors.config import INDUSTRY_DATA_DIR, HATESPEECH_DATA_DIR

# Calculate Agreement


In [3]:
# Load the preprocessed test split (JSON Lines, one record per line).
TEST_PATH = INDUSTRY_DATA_DIR / "test_preprocessed.json"
test = pd.read_json(TEST_PATH, lines=True)

# Every column that is not one of the id / text / helper columns below is
# treated as a label column; the original column order is preserved.
labels = test.columns[
    ~test.columns.isin([
        "id", "legal_name", "description", "short_description", "tags", "len_des", "tags_string", "len_tags", "prompt"
    ])
].tolist()

# Draw the evaluation sample and persist it to evaluation.csv.
# NOTE(review): whether `stratified_split` is seeded cannot be seen from this
# notebook; if it is not, re-running this cell draws a different sample than
# the one the rater CSVs were annotated on -- confirm.
xcol = ["id", "legal_name", "description", "tags_string"]
ycol = labels
unused, evaluation = stratified_split(test, xcol, ycol, test_size=0.12)
evaluation.to_csv(INDUSTRY_DATA_DIR / "evaluation.csv", index=False)

# Gold labels for the evaluation sample, in the row order of `evaluation`,
# with the label columns cast to int.
true = evaluation[["id"]].merge(test[labels + ["id"]], how="left", on="id")
true = true.astype(dict.fromkeys(labels, int))

def get_rater_data(path):
    """Load one rater's annotation CSV and return it in the layout of `true`.

    Steps, as applied to the raw CSV:
      * drop the columns "Unnamed: 1", "Unnamed: 2" and "Unnamed: 3";
      * fill gaps in row 2 from row 1, then gaps in row 3 from row 2, and use
        row 3 as the header (its "Unnamed: 0" cell becomes "id");
      * discard rows 0-6 and treat the remaining rows as data;
      * replace missing cells with 0, keep only the columns of the
        module-level `true` frame (in that order) and cast the module-level
        `labels` columns to int.

    NOTE(review): rows 1-3 presumably hold a multi-level label header exported
    from a spreadsheet -- confirm against the annotation files.

    Parameters
    ----------
    path : str or path-like
        Location of the rater's CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per annotated item with the same columns as `true`.
    """
    raw = pd.read_csv(path)
    raw = raw.drop(columns=["Unnamed: 1", "Unnamed: 2", "Unnamed: 3"])

    # Propagate header text downwards so that row 3 has a name for every column.
    raw.loc[2] = raw.loc[2].fillna(raw.loc[1])
    raw.loc[3] = raw.loc[3].fillna(raw.loc[2])

    # Build the header from an explicit copy. The previous chained assignment
    # (`df.iloc[0]["Unnamed: 0"] = "id"`) only worked when `iloc[0]` returned a
    # view and silently does nothing under pandas Copy-on-Write.
    header = raw.loc[3].copy()
    header["Unnamed: 0"] = "id"

    annotations = raw.drop([0, 1, 2, 3, 4, 5, 6]).reset_index(drop=True)
    annotations.columns = header.to_list()

    # Empty cells are treated as "label not assigned".
    annotations = annotations.fillna(0)
    annotations = annotations[true.columns].copy()
    annotations[labels] = annotations[labels].astype(int)
    return annotations

# Load the three human annotation files into frames shaped like `true`.
rater1, rater2, rater3 = map(
    get_rater_data,
    (
        INDUSTRY_DATA_DIR / "annotations_rater1.csv",
        INDUSTRY_DATA_DIR / "annotations_rater2.csv",
        INDUSTRY_DATA_DIR / "annotations_rater3.csv",
    ),
)

def apply_threshold(a, t):
    """Binarise `a`: 1 where an entry is strictly greater than `t`, 0 elsewhere."""
    above = a > t
    return np.where(above, 1, 0)

# Results of the best prompt-tuned model.
# NOTE(review): the path is relative, so it only resolves from the directory
# this notebook is expected to run in -- confirm or move it into the config.
path = "../sectors/experiments/prompt_tuning/results/huggyllama/llama-7b/best_model/results.json"
# Use a context manager so the file handle is closed after parsing.
with open(path, "r") as results_file:
    model_results = json.load(results_file)

# Work on a copy: assigning into an alias of `test` would overwrite the gold
# label columns of `test` with model predictions.
test_predictions = test.copy()
# NOTE(review): assumes model_results["test_probs"] has one row per row of
# `test` and one column per entry of `labels`, in the same order -- confirm.
# 0.666 is the decision threshold applied to the predicted probabilities.
test_predictions[labels] = apply_threshold(np.array(model_results["test_probs"]), 0.666).astype(int)
# Keep only the evaluation sample, in the row order of `evaluation`.
test_predictions = pd.merge(evaluation["id"], test_predictions[labels + ["id"]], on="id", how="left")

# Reduce every source to a plain (items x labels) integer matrix.
# NOTE(review): the rater matrices keep the row order of their CSV files while
# `true` / `test_predictions` follow `evaluation["id"]`; nothing here aligns
# the raters on id, so the orders are presumed to match -- confirm.
rater1 = rater1[labels].to_numpy()
rater2 = rater2[labels].to_numpy()
rater3 = rater3[labels].to_numpy()
true = true[labels].to_numpy()
test_predictions = test_predictions[labels].to_numpy()

In [4]:
def careful_cohen_kappa(a:np.ndarray, b:np.ndarray):
    """Cohen's kappa between `a` and `b`, short-circuiting identical inputs.

    When the two arrays are element-wise equal the result is 1 and
    `cohen_kappa_score` is not called; otherwise its result is returned.
    NOTE(review): the guard presumably avoids an undefined score when both
    raters give the same constant label -- confirm.
    """
    identical = np.array_equal(a, b)
    return 1 if identical else cohen_kappa_score(a, b)


def macro_kappa(a:np.ndarray, b:np.ndarray):
    """Mean of the per-column kappa between two 2-D arrays.

    Column `k` of `a` is compared with column `k` of `b` via
    `careful_cohen_kappa`; the unweighted mean over the columns of `a` is
    returned.
    """
    per_column = []
    for col in range(a.shape[1]):
        per_column.append(careful_cohen_kappa(a[:, col], b[:, col]))
    return np.mean(per_column)


# Pairwise macro-averaged Cohen's kappa between every pair of annotation sources.
raters = {"Rater1": rater1, "Rater2": rater2, "Rater3": rater3, "Gold": true, "PTEC": test_predictions}
rater_names = list(raters)

# kappa_values[a][j] holds kappa(a, j-th source). Only pairs where `a` comes
# before the j-th source are computed; the diagonal and the mirrored half
# stay NaN and are rendered as blanks.
kappa_values = {name: [np.nan] * len(rater_names) for name in rater_names}
for i, rater_a_name in enumerate(rater_names):
    for j in range(i + 1, len(rater_names)):
        rater_b_name = rater_names[j]
        kappa = round(macro_kappa(raters[rater_a_name], raters[rater_b_name]), 3)
        kappa_values[rater_a_name][j] = kappa

kappa_df = pd.DataFrame(kappa_values, index=rater_names)
# Mean agreement of Gold and of PTEC with the three human raters
# (only the 'Gold' and 'PTEC' rows receive a value).
kappa_df['Average'] = kappa_df.loc[['Gold', 'PTEC'], ['Rater1', 'Rater2', 'Rater3']].mean(axis=1).round(3)
# Extra row: how much better Gold agrees with each source than PTEC does.
# A plain placeholder is used as the row label because `latex_booktabs`
# escapes LaTeX special characters; the raw math label is substituted after
# rendering. The label must be a raw string, otherwise "\t" in "\text"
# becomes a TAB character.
delta_row = "Delta"
delta_latex = r"$\Delta_{\text{Gold} - \text{PTEC}}$"
kappa_df.loc[delta_row] = (kappa_df.loc['Gold'] - kappa_df.loc['PTEC']).round(3)
kappa_df = kappa_df.replace({np.nan: ''})
# The first row (Rater1 as second rater) and the 'PTEC' column (PTEC as first
# rater) are never filled, so drop them before transposing for display.
kappa_df = kappa_df.drop(kappa_df.index[0])
kappa_df = kappa_df.drop(['PTEC'], axis=1)
kappa_df = kappa_df.transpose()
table = tabulate(kappa_df, headers="keys", tablefmt='latex_booktabs', showindex="always")
# Centre every column except the row labels.
table = table.replace('tabular}{llllrl}', 'tabular}{lccccc}')
table = table.replace(delta_row, delta_latex, 1)
print(table)


\begin{tabular}{lccccc}
\toprule
         & Rater2   & Rater3   & Gold   &   PTEC & \$\textbackslash{}Delta\_\{	ext\{Gold\} - 	ext\{PTEC\}\}\$       \\
\midrule
 Rater1  & 0.477    & 0.401    & 0.389  &  0.36  & 0.029 \\
 Rater2  &          & 0.444    & 0.551  &  0.422 & 0.129 \\
 Rater3  &          &          & 0.311  &  0.245 & 0.066 \\
 Gold    &          &          &        &  0.562 &       \\
 Average &          &          & 0.417  &  0.342 & 0.075 \\
\bottomrule
\end{tabular}
