Code to load human and llm scores for the new reddit stories and compare them.

Based on the code in `story_eval/analyze.py`.

In [7]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

In [14]:
DATA_DIR = Path("human_study/data")

human_paths = [p for p in (DATA_DIR / "preprocessed").iterdir() if p.name.startswith("cse_517")]
human_paths

[PosixPath('human_study/data/preprocessed/cse_517_harine.csv'),
 PosixPath('human_study/data/preprocessed/cse_517_deniz.csv'),
 PosixPath('human_study/data/preprocessed/cse_517_ethan.csv')]

In [47]:
LLM_PATTERN = re.compile("([^_]*)_(no_mop|mop)_new_human_stories_annotations.csv")

llm_paths = {
    g.groups(): p for p in (DATA_DIR / "processed").iterdir() if (g := LLM_PATTERN.fullmatch(p.name))
}
llm_paths

{('gpt-4o-2024-08-06',
  'no_mop'): PosixPath('human_study/data/processed/gpt-4o-2024-08-06_no_mop_new_human_stories_annotations.csv'),
 ('o1-preview',
  'mop'): PosixPath('human_study/data/processed/o1-preview_mop_new_human_stories_annotations.csv'),
 ('gpt-4o-2024-08-06',
  'mop'): PosixPath('human_study/data/processed/gpt-4o-2024-08-06_mop_new_human_stories_annotations.csv'),
 ('gpt-4o-mini-2024-07-18',
  'no_mop'): PosixPath('human_study/data/processed/gpt-4o-mini-2024-07-18_no_mop_new_human_stories_annotations.csv'),
 ('gpt-4o-mini-2024-07-18',
  'mop'): PosixPath('human_study/data/processed/gpt-4o-mini-2024-07-18_mop_new_human_stories_annotations.csv')}

In [48]:
human_results_list = []
for p_idx, p in enumerate(human_paths):
    df = pd.read_csv(p)
    df["participant_id"] = p_idx
    human_results_list.append(df)


human_results = pd.concat(human_results_list)
human_results.rename(columns={c: c.lower().replace(" ", "_") + "_score" for c in human_results.columns if "_id" not in c}, inplace=True)
human_results

Unnamed: 0,story_id,authenticity_score,emotion_provoking_score,empathy_score,engagement_score,narrative_complexity_score,participant_id
0,new_1a,4,4,4,5,2,0
1,new_1b,4,4,3,3,3,0
2,new_1c,4,3,2,3,2,0
3,new_2a,3,3,2,3,2,0
4,new_2b,3,4,4,5,3,0
5,new_2c,2,2,2,2,2,0
6,new_3a,3,3,3,4,2,0
7,new_3b,4,4,3,3,4,0
8,new_3c,4,4,4,3,5,0
9,new_4a,5,4,5,4,5,0


In [49]:
llm_results = {k: pd.read_csv(p) for k, p in llm_paths.items()}
llm_results[list(llm_results)[0]]

Unnamed: 0,authenticity_explanation,authenticity_score,emotion_provoking_explanation,emotion_provoking_score,empathy_explanation,empathy_score,engagement_explanation,engagement_score,narrative_complexity_explanation,narrative_complexity_score,...,premise,text,author_type,author_short,author_full,net_upvotes,Column1,_1,_2,_3
0,The story feels authentic in depicting a human...,3.0,The story effectively provokes emotions associ...,3.0,Readers can empathize with the protagonist's a...,3.0,The text maintains a certain level of engageme...,3.0,While the story introduces a potentially inter...,3.0,...,"You don't know it, but you're the child of a h...","�Son, we need to talk.�\n\nOf course, here it ...",Human,Human-Advanced,Human-Advanced,264,,,,
1,The story explores complex themes of existenti...,4.0,The story effectively depicts a range of emoti...,4.0,The reader is invited to empathize with the pr...,4.0,The narrative is engaging due to its intriguin...,4.0,The story presents a multifaceted narrative wi...,4.0,...,"You don't know it, but you're the child of a h...","The hills I had known were really mountains, a...",Human,Human-Intermediate,Human-Intermediate,74,,,,
2,The story deals with a highly fantastical scen...,2.0,The story has an underlying sense of tension a...,3.0,The situation is too far removed from real-wor...,2.0,The story presents an engaging mystery with it...,3.0,The narrative is based on a singular twist wit...,2.0,...,"You don't know it, but you're the child of a h...","""I've been so alone for so long, but finally, ...",Human,Human-Novice,Human-Novice,20,,,,
3,The story presents a somewhat exaggerated and ...,3.0,The writing effectively portrays intense emoti...,4.0,While the narrative is gripping and portrays s...,3.0,The story is engaging due to its dramatic cont...,4.0,The narrative doesn’t delve deeply into charac...,3.0,...,You wake up to fund your friend standing over ...,"""Don't say a word, and don't move an inch.""\n\...",Human,Human-Advanced,Human-Advanced,96,,,,
4,The writing feels somewhat authentic in terms ...,3.0,The writing does provoke a sense of tension an...,3.0,The story gives some insight into the characte...,3.0,"The opening creates immediate tension, invitin...",3.0,The narrative is straightforward with a twist ...,2.0,...,You wake up to fund your friend standing over ...,Shuffling sand filters through darkness and ba...,Human,Human-Intermediate,Human-Intermediate,23,,,,
5,"The scenario is humorous but implausible, as i...",2.0,The story uses elements of humor and suspense ...,2.0,The story doesn't delve into its characters' i...,2.0,The story's humor and suspense are engaging to...,3.0,The narrative is straightforward and relies on...,2.0,...,You wake up to fund your friend standing over ...,"""Uhmmm, Kendrick . . . why you standing over m...",Human,Human-Novice,Human-Novice,9,,,,
6,"The story creates a surreal environment, with ...",3.0,The writing explores the narrator's internal e...,3.0,While the repetition in the story depicts the ...,2.0,The looping phrases and surreal atmosphere eng...,3.0,The story presents a mysterious setting and co...,2.0,...,"one ordinary morning, you board the subway for...","I looked at the screen--No, I have been starin...",Human,Human-Advanced,Human-Advanced,19,,,,
7,"The story presents an unreal situation, a time...",2.0,The writing lacks depth in depicting emotional...,2.0,While the protagonist's frustration is somewha...,2.0,The premise of a time loop is inherently intri...,3.0,The story is more focused on the repetitive na...,2.0,...,"one ordinary morning, you board the subway for...","At first, I thought I was too sleepy, after al...",Human,Human-Intermediate,Human-Intermediate,8,,,,
8,The story captures real-life experiences such ...,4.0,The story makes use of sensory details and dep...,3.0,The protagonist's struggle and his moments of ...,3.0,The narrative maintains interest through the c...,3.0,The story offers a simple narrative primarily ...,2.0,...,"one ordinary morning, you board the subway for...",The subway wailed through the tunnels as they ...,Human,Human-Novice,Human-Novice,2,,,,
9,The story presents an interesting concept of a...,3.0,The story successfully illustrates a range of ...,4.0,Readers can empathize with the woman's grief a...,4.0,The narrative engages readers by presenting a ...,4.0,The story introduces a complex idea of a deity...,3.0,...,You find yourself scratching your head. Gods t...,"""Well, I was thinking even those who do not be...",Human,Human-Advanced,Human-Advanced,81,,,,


In [69]:
df_path = DATA_DIR / "cse_517_analyzed.csv"

In [None]:
# From story_eval/analyze.py around Line 430
from story_eval.analyze import AnnotationAnalyzer, MeanAggregator

if not df_path.exists()
    analyzer = AnnotationAnalyzer()


    COMPONENTS = ['authenticity_score', 'empathy_score', 'engagement_score', 'emotion_provoking_score', 'narrative_complexity_score']


    results = []

    for component in COMPONENTS:
        human_iaa = analyzer.regular_iaa(human_results, component, prefix="human")

        for (llm_name, llm_use_mop), llm_df in llm_results.items():
            print(llm_name, llm_use_mop, component)
            llm_iaa   = analyzer.regular_iaa(llm_df, component, prefix="llm")
            # print(f"Component: {component}")
            # print(f"human_iaa:\n{human_iaa}")
            # print(f"llm_iaa:\n{llm_iaa}")

            human_vs_llm_corr = analyzer.comparative_correlation(human_results, llm_df, component, MeanAggregator())
            # print(f"human_vs_llm:\n{human_vs_llm_corr}")
            results.append({
                "llm_name": llm_name,
                "llm_use_mop": llm_use_mop,
                "component": component,
                **human_iaa,
                **llm_iaa,
                **human_vs_llm_corr
            })

    results_df = pd.DataFrame.from_records(results)
    results_df.to_csv(df_path)

gpt-4o-2024-08-06 no_mop authenticity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


o1-preview mop authenticity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 mop authenticity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-mini-2024-07-18 no_mop authenticity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


gpt-4o-mini-2024-07-18 mop authenticity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  corr, p = spearmanr(pivot_table[rater_ids[i]], pivot_table[rater_ids[j]])


gpt-4o-2024-08-06 no_mop empathy_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


o1-preview mop empathy_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 mop empathy_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-mini-2024-07-18 no_mop empathy_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


gpt-4o-mini-2024-07-18 mop empathy_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 no_mop engagement_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


o1-preview mop engagement_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 mop engagement_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-mini-2024-07-18 no_mop engagement_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


gpt-4o-mini-2024-07-18 mop engagement_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 no_mop emotion_provoking_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


o1-preview mop emotion_provoking_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 mop emotion_provoking_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-mini-2024-07-18 no_mop emotion_provoking_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


gpt-4o-mini-2024-07-18 mop emotion_provoking_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 no_mop narrative_complexity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


o1-preview mop narrative_complexity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-2024-08-06 mop narrative_complexity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15
gpt-4o-mini-2024-07-18 no_mop narrative_complexity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  s2 = (pg - rpbar) / (r - 1)
  rdash = _answers_matrix.sum(axis=1).sum() / N
  po2 = (_answers_matrix * (r_star-1) / (rdash * (n - 1))[:,np.newaxis]).sum(axis=1).mean()
  pc = ((_answers_matrix.mean(axis=0) / rdash)**2).sum()
  ret = um.true_divide(
  p_j = answers_matrix.sum(axis=0) / (N * answers_matrix.sum(axis=1).mean())


gpt-4o-mini-2024-07-18 mop narrative_complexity_score
len(human_consensus_labels): 15
len(llm_consensus_labels): 15


In [71]:
results_df = pd.read_csv(df_path)
results_df

Unnamed: 0.1,Unnamed: 0,llm_name,llm_use_mop,component,human_unweighted_cohens_kappa,human_unweighted_krippendorffs_alpha,human_unweighted_fleiss_kappa,human_linear_weighted_cohens_kappa,human_linear_weighted_krippendorffs_alpha,human_linear_weighted_fleiss_kappa,...,llm_ordinal_weighted_krippendorffs_alpha,llm_ordinal_weighted_fleiss_kappa,human_vs_llm_spearman_corr,human_vs_llm_spearman_p_value,human_vs_llm_pearson_corr,human_vs_llm_pearson_p_value,human_vs_llm_permutation_test_stat,human_vs_llm_permutation_test_p_value,human_consensus_labels,llm_consensus_labels
0,0,gpt-4o-2024-08-06,no_mop,authenticity_score,0.048077,0.02681,0.004692,0.100642,0.587131,0.577748,...,,,0.23103,0.407418,0.295662,0.284662,0.23103,0.401,"[4.0, 3.33, 3.67, 3.33, 2.67, 1.67, 2.67, 3.33...","[3.0, 4.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 4.0, ..."
1,1,o1-preview,mop,authenticity_score,0.048077,0.02681,0.004692,0.100642,0.587131,0.577748,...,0.768563,0.765436,0.267892,0.334379,0.515511,0.049204,0.267892,0.3332,"[4.0, 3.33, 3.67, 3.33, 2.67, 1.67, 2.67, 3.33...","[3.4, 3.8, 2.8, 3.4, 3.6, 2.0, 4.0, 3.2, 3.2, ..."
2,2,gpt-4o-2024-08-06,mop,authenticity_score,0.048077,0.02681,0.004692,0.100642,0.587131,0.577748,...,0.856954,0.855021,0.078188,0.7818,0.235697,0.397738,0.078188,0.7966,"[4.0, 3.33, 3.67, 3.33, 2.67, 1.67, 2.67, 3.33...","[2.8, 4.0, 1.8, 2.9, 3.4, 2.0, 3.1, 2.6, 3.8, ..."
3,3,gpt-4o-mini-2024-07-18,no_mop,authenticity_score,0.048077,0.02681,0.004692,0.100642,0.587131,0.577748,...,,,0.276583,0.31832,0.246577,0.375653,0.276583,0.415,"[4.0, 3.33, 3.67, 3.33, 2.67, 1.67, 2.67, 3.33...","[5.0, 4.0, 4.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, ..."
4,4,gpt-4o-mini-2024-07-18,mop,authenticity_score,0.048077,0.02681,0.004692,0.100642,0.587131,0.577748,...,0.75099,0.747625,0.293695,0.288029,0.22446,0.421251,0.293695,0.2912,"[4.0, 3.33, 3.67, 3.33, 2.67, 1.67, 2.67, 3.33...","[4.2, 4.0, 3.2, 4.4, 4.0, 3.6, 4.0, 3.6, 4.0, ..."
5,5,gpt-4o-2024-08-06,no_mop,empathy_score,-0.014799,-0.017341,-0.040462,0.068047,0.666185,0.658598,...,,,0.569518,0.026682,0.562581,0.029021,0.569518,0.035,"[3.67, 2.67, 2.67, 2.33, 3.0, 1.67, 3.0, 2.33,...","[3.0, 4.0, 2.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, ..."
6,6,o1-preview,mop,empathy_score,-0.014799,-0.017341,-0.040462,0.068047,0.666185,0.658598,...,0.898693,0.897324,0.67555,0.005709,0.732326,0.001905,0.67555,0.008,"[3.67, 2.67, 2.67, 2.33, 3.0, 1.67, 3.0, 2.33,...","[3.4, 3.8, 2.6, 3.0, 3.0, 1.8, 3.8, 2.0, 3.6, ..."
7,7,gpt-4o-2024-08-06,mop,empathy_score,-0.014799,-0.017341,-0.040462,0.068047,0.666185,0.658598,...,0.796129,0.793374,0.507341,0.053555,0.502545,0.056237,0.507341,0.051,"[3.67, 2.67, 2.67, 2.33, 3.0, 1.67, 3.0, 2.33,...","[2.8, 4.0, 2.2, 3.3, 3.4, 2.0, 3.6, 2.6, 3.4, ..."
8,8,gpt-4o-mini-2024-07-18,no_mop,empathy_score,-0.014799,-0.017341,-0.040462,0.068047,0.666185,0.658598,...,,,0.488282,0.064792,0.525865,0.044071,0.488282,0.0744,"[3.67, 2.67, 2.67, 2.33, 3.0, 1.67, 3.0, 2.33,...","[5.0, 4.0, 3.0, 4.0, 3.0, 3.0, 4.0, 3.0, 3.0, ..."
9,9,gpt-4o-mini-2024-07-18,mop,empathy_score,-0.014799,-0.017341,-0.040462,0.068047,0.666185,0.658598,...,0.881994,0.880399,0.338229,0.217555,0.386449,0.154777,0.338229,0.2006,"[3.67, 2.67, 2.67, 2.33, 3.0, 1.67, 3.0, 2.33,...","[4.2, 3.8, 3.4, 4.6, 3.6, 2.4, 3.8, 2.8, 3.2, ..."


In [78]:
spearman_df = results_df[["llm_name", "llm_use_mop", "component"] + [c for c in results_df.columns if c.startswith("human_vs_llm") and "spearman" in c]]
spearman_df = spearman_df.pivot(index=["llm_name", "llm_use_mop"], columns="component", values=["human_vs_llm_spearman_corr", "human_vs_llm_spearman_p_value"])
spearman_df

Unnamed: 0_level_0,Unnamed: 1_level_0,human_vs_llm_spearman_corr,human_vs_llm_spearman_corr,human_vs_llm_spearman_corr,human_vs_llm_spearman_corr,human_vs_llm_spearman_corr,human_vs_llm_spearman_p_value,human_vs_llm_spearman_p_value,human_vs_llm_spearman_p_value,human_vs_llm_spearman_p_value,human_vs_llm_spearman_p_value
Unnamed: 0_level_1,component,authenticity_score,emotion_provoking_score,empathy_score,engagement_score,narrative_complexity_score,authenticity_score,emotion_provoking_score,empathy_score,engagement_score,narrative_complexity_score
llm_name,llm_use_mop,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
gpt-4o-2024-08-06,mop,0.078188,0.263656,0.507341,0.579999,0.286589,0.7818,0.342369,0.053555,0.023422,0.300393
gpt-4o-2024-08-06,no_mop,0.23103,0.578731,0.569518,0.270148,0.423556,0.407418,0.023799,0.026682,0.330167,0.115663
gpt-4o-mini-2024-07-18,mop,0.293695,0.388896,0.338229,0.7141,0.450001,0.288029,0.151958,0.217555,0.002785,0.092357
gpt-4o-mini-2024-07-18,no_mop,0.276583,0.480987,0.488282,0.249675,0.427084,0.31832,0.069514,0.064792,0.369489,0.112341
o1-preview,mop,0.267892,0.277406,0.67555,0.703293,0.263986,0.334379,0.316823,0.005709,0.003443,0.341742


In [82]:
print(spearman_df.to_latex())

\begin{tabular}{llrrrrrrrrrr}
\toprule
 &  & \multicolumn{5}{r}{human_vs_llm_spearman_corr} & \multicolumn{5}{r}{human_vs_llm_spearman_p_value} \\
 & component & authenticity_score & emotion_provoking_score & empathy_score & engagement_score & narrative_complexity_score & authenticity_score & emotion_provoking_score & empathy_score & engagement_score & narrative_complexity_score \\
llm_name & llm_use_mop &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{2}{*}{gpt-4o-2024-08-06} & mop & 0.078188 & 0.263656 & 0.507341 & 0.579999 & 0.286589 & 0.781800 & 0.342369 & 0.053555 & 0.023422 & 0.300393 \\
 & no_mop & 0.231030 & 0.578731 & 0.569518 & 0.270148 & 0.423556 & 0.407418 & 0.023799 & 0.026682 & 0.330167 & 0.115663 \\
\cline{1-12}
\multirow[t]{2}{*}{gpt-4o-mini-2024-07-18} & mop & 0.293695 & 0.388896 & 0.338229 & 0.714100 & 0.450001 & 0.288029 & 0.151958 & 0.217555 & 0.002785 & 0.092357 \\
 & no_mop & 0.276583 & 0.480987 & 0.488282 & 0.249675 & 0.427084 & 0.318320 & 0.069514 & 0.0647

In [81]:
ka_df = results_df[["llm_name", "llm_use_mop", "component", "human_unweighted_krippendorffs_alpha", "llm_unweighted_krippendorffs_alpha"]]
ka_df = ka_df.pivot(index=["llm_name", "llm_use_mop"], columns="component", values=["human_unweighted_krippendorffs_alpha", "llm_unweighted_krippendorffs_alpha"])
ka_df

Unnamed: 0_level_0,Unnamed: 1_level_0,human_unweighted_krippendorffs_alpha,human_unweighted_krippendorffs_alpha,human_unweighted_krippendorffs_alpha,human_unweighted_krippendorffs_alpha,human_unweighted_krippendorffs_alpha,llm_unweighted_krippendorffs_alpha,llm_unweighted_krippendorffs_alpha,llm_unweighted_krippendorffs_alpha,llm_unweighted_krippendorffs_alpha,llm_unweighted_krippendorffs_alpha
Unnamed: 0_level_1,component,authenticity_score,emotion_provoking_score,empathy_score,engagement_score,narrative_complexity_score,authenticity_score,emotion_provoking_score,empathy_score,engagement_score,narrative_complexity_score
llm_name,llm_use_mop,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
gpt-4o-2024-08-06,mop,0.02681,0.067031,-0.017341,0.001335,0.111709,0.318371,0.413841,0.403552,0.531798,0.410894
gpt-4o-2024-08-06,no_mop,0.02681,0.067031,-0.017341,0.001335,0.111709,,,,,
gpt-4o-mini-2024-07-18,mop,0.02681,0.067031,-0.017341,0.001335,0.111709,0.252969,0.470811,0.291963,0.326005,0.593832
gpt-4o-mini-2024-07-18,no_mop,0.02681,0.067031,-0.017341,0.001335,0.111709,,,,,
o1-preview,mop,0.02681,0.067031,-0.017341,0.001335,0.111709,0.30569,0.360909,0.431373,0.434755,0.474666


In [83]:
print(ka_df.to_latex())

\begin{tabular}{llrrrrrrrrrr}
\toprule
 &  & \multicolumn{5}{r}{human_unweighted_krippendorffs_alpha} & \multicolumn{5}{r}{llm_unweighted_krippendorffs_alpha} \\
 & component & authenticity_score & emotion_provoking_score & empathy_score & engagement_score & narrative_complexity_score & authenticity_score & emotion_provoking_score & empathy_score & engagement_score & narrative_complexity_score \\
llm_name & llm_use_mop &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{2}{*}{gpt-4o-2024-08-06} & mop & 0.026810 & 0.067031 & -0.017341 & 0.001335 & 0.111709 & 0.318371 & 0.413841 & 0.403552 & 0.531798 & 0.410894 \\
 & no_mop & 0.026810 & 0.067031 & -0.017341 & 0.001335 & 0.111709 & NaN & NaN & NaN & NaN & NaN \\
\cline{1-12}
\multirow[t]{2}{*}{gpt-4o-mini-2024-07-18} & mop & 0.026810 & 0.067031 & -0.017341 & 0.001335 & 0.111709 & 0.252969 & 0.470811 & 0.291963 & 0.326005 & 0.593832 \\
 & no_mop & 0.026810 & 0.067031 & -0.017341 & 0.001335 & 0.111709 & NaN & NaN & NaN & NaN & NaN \\
\cl