In [1]:
# @title Task A: Establish Human Expert Reliability (Inter-Rater Agreement)
import json
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
from IPython.display import display, HTML
from pathlib import Path
import os # Import os for path manipulation
import sys

sys.path.append('../')
from src.config import *

def _load_annotations(path):
    """Return the parsed JSON content of one expert annotation file."""
    # Explicit UTF-8: otherwise open() uses the platform's locale encoding,
    # which may not match how the annotation files were written.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _bin_ratings(ratings, bins):
    """Convert a list of raw ratings into ordinal bin codes with one pd.cut call.

    Codes run from 0 to len(bins) - 2. A rating outside the bin edges gets
    code -1 (the categorical code pandas uses for a missing value).
    """
    labels = range(len(bins) - 1)
    return pd.cut(ratings, bins=bins, labels=labels, include_lowest=True).codes.tolist()


def _mean_pairwise_kappa(rater_a, rater_b, rater_c):
    """Average Cohen's kappa over the three rater pairs (A-B, A-C, B-C)."""
    pairwise = [
        cohen_kappa_score(rater_a, rater_b),
        cohen_kappa_score(rater_a, rater_c),
        cohen_kappa_score(rater_b, rater_c),
    ]
    # Cast to a built-in float so the value is a plain number in the JSON output.
    return float(np.mean(pairwise))


def calculate_and_display_agreement():
    """
    Compute inter-rater agreement between the three human experts and report it.

    Reads the three expert annotation files from PATH_PROCESSED, keeps the
    unit_ids present in all of them, bins every rating with fixed bin edges,
    and computes, per rhetorical vector and over all vectors pooled together,
    the mean of the three pairwise Cohen's kappa values (A-B, A-C, B-C).

    Side effects:
        - writes the per-vector and overall kappa values to
          'human_expert_agreement_metrics.json' inside PATH_RESULTS;
        - prints progress messages and displays the per-vector table.

    Returns:
        None. Errors are reported on stdout instead of being raised.
    """
    try:
        # Step 1: Define file paths and load the JSON annotation files
        file_path_a = Path(PATH_PROCESSED) / 'human_annotations_expert_a.json'
        file_path_b = Path(PATH_PROCESSED) / 'human_annotations_expert_b.json'
        file_path_c = Path(PATH_PROCESSED) / 'human_annotations_expert_c.json'
        output_dir = PATH_RESULTS
        output_file = os.path.join(output_dir, 'human_expert_agreement_metrics.json')

        print(f"Reading data from:\n- {file_path_a}\n- {file_path_b}\n- {file_path_c}\n")

        data_a = _load_annotations(file_path_a)
        data_b = _load_annotations(file_path_b)
        data_c = _load_annotations(file_path_c)

        # Step 2: Index each expert's ratings by unit_id
        scores_a = {item['unit_id']: item['rhetorical_vector'] for item in data_a}
        scores_b = {item['unit_id']: item['rhetorical_vector'] for item in data_b}
        scores_c = {item['unit_id']: item['rhetorical_vector'] for item in data_c}

        # Only units rated by all three experts can be compared.
        unit_ids = sorted(set(scores_a) & set(scores_b) & set(scores_c))
        if not unit_ids:
            print("Error: No common unit_ids found between the three files.")
            return

        # Vector names come from the first unit of expert A; sorting keeps the
        # order stable between runs.
        vectors = sorted(data_a[0]['rhetorical_vector'].keys())

        # Step 3: Fixed bin edges used to turn raw ratings into four ordinal classes
        bins = [-1, 5, 25, 55, 101]

        # Step 4: Announce the calculation
        print("-" * 50)
        print("Calculating Inter-Rater Agreement for Three Raters (Avg. Pairwise Cohen's Kappa approx. Fleiss')")
        print("-" * 50)

        # Step 5a: Per-vector kappa (mean of the three pairwise Cohen's kappas)
        kappa_results_list = []
        pooled = ([], [], [])  # binned ratings of experts A, B, C across all vectors

        for vector in vectors:
            # A vector missing from a unit's ratings counts as a rating of 0.
            binned = [
                _bin_ratings([scores[uid].get(vector, 0) for uid in unit_ids], bins)
                for scores in (scores_a, scores_b, scores_c)
            ]
            for pool, codes in zip(pooled, binned):
                pool.extend(codes)

            kappa_results_list.append({'Vector': vector, 'kappa': _mean_pairwise_kappa(*binned)})

        # Step 5b: Overall kappa on the ratings of all vectors pooled together
        overall_kappa = _mean_pairwise_kappa(*pooled)

        # Step 6: Prepare data for JSON output
        json_output = {
            "per_vector_agreement": kappa_results_list,
            "overall_agreement": {"kappa": overall_kappa}
        }

        # Step 7: Save results to JSON file (create the directory if needed)
        os.makedirs(output_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            # Write the agreement metrics computed above.
            json.dump(json_output, f, indent=4)

        print(f"\nAgreement results saved to {output_file}")

        # Step 8: Create and format the final table for display
        df_results = pd.DataFrame(kappa_results_list)
        df_results['kappa'] = df_results['kappa'].round(2)
        df_results = df_results.rename(columns={'kappa': 'κ'})

        # NOTE(review): the headings below say "Fleiss' Kappa", but the reported
        # value is the mean pairwise Cohen's kappa computed above.
        display(HTML("<h3>Table: Inter-Rater Agreement (Fleiss' Kappa) by Vector (Three Raters)</h3>"))
        display(df_results)

        print("\n" + "=" * 50)
        print(f"Overall Fleiss' Kappa for All Units and Experts: {overall_kappa:.2f}")
        print("=" * 50)

    except FileNotFoundError as e:
        print(f"ERROR: {e}")
        print("Please make sure 'human_annotations_expert_a.json', 'human_annotations_expert_b.json', and 'human_annotations_expert_c.json' are uploaded to the stated directory.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()  # Print traceback for debugging

# Run the main function
calculate_and_display_agreement()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Reading data from:
- ..\data\processed\human_annotations_expert_a.json
- ..\data\processed\human_annotations_expert_b.json
- ..\data\processed\human_annotations_expert_c.json

--------------------------------------------------
Calculating Inter-Rater Agreement for Three Raters (Avg. Pairwise Cohen's Kappa approx. Fleiss')
--------------------------------------------------

Agreement results saved to ../data/results\human_expert_agreement_metrics.json


Unnamed: 0,Vector,κ
0,Cosmic Warfare & Deception,0.92
1,Judicial Wrath & Punitive Action,0.9
2,"Lament, Persecution & Endurance",0.73
3,Other/Neutral Content,-0.02
4,Prophetic Exhortation & Warning,0.83
5,Theophanic Awe & Terror,0.7
6,"Victory, Consolation & New-Creation Hope",0.73
7,Worship & Praise,0.86



Overall Fleiss' Kappa for All Units and Experts: 0.82


In [2]:
# @title table with justifications
import json
import pandas as pd
from IPython.display import display, HTML
from pathlib import Path
import os # Import os for path manipulation
import sys

sys.path.append('../')
from src.config import *

def display_expert_comments():
    """
    Display a table of each expert's comment/justification for every unit.

    Reads the three expert annotation files from PATH_PROCESSED, keeps the
    unit_ids present in all of them, and shows one row per unit with the
    text each expert supplied. For each expert the 'justification' field is
    used; if it is absent, the expert-specific 'expert_<x>_comment' field is
    used, and an empty string if neither exists.

    Returns:
        None. Errors are reported on stdout instead of being raised.
    """
    try:
        # Step 1: Define file paths and load the JSON annotation files
        file_path_a = Path(PATH_PROCESSED) / 'human_annotations_expert_a.json'
        file_path_b = Path(PATH_PROCESSED) / 'human_annotations_expert_b.json'
        file_path_c = Path(PATH_PROCESSED) / 'human_annotations_expert_c.json'

        print(f"Reading data from:\n- {file_path_a}\n- {file_path_b}\n- {file_path_c}\n")

        # Read as UTF-8 explicitly: the justifications contain non-ASCII text
        # (e.g. Greek words), which the platform default encoding can garble.
        with open(file_path_a, 'r', encoding='utf-8') as f:
            data_a = json.load(f)
        with open(file_path_b, 'r', encoding='utf-8') as f:
            data_b = json.load(f)
        with open(file_path_c, 'r', encoding='utf-8') as f:
            data_c = json.load(f)

        # Step 2: Index every annotation by unit_id for direct lookup
        data_a_dict = {item['unit_id']: item for item in data_a}
        data_b_dict = {item['unit_id']: item for item in data_b}
        data_c_dict = {item['unit_id']: item for item in data_c}

        # Only units annotated by all three experts are shown.
        unit_ids = sorted(set(data_a_dict) & set(data_b_dict) & set(data_c_dict))

        if not unit_ids:
            print("Error: No common unit_ids found between the three files.")
            return

        comments_data = [
            {
                'Unit ID': uid,
                'Expert A Comment/Justification': data_a_dict[uid].get(
                    'justification', data_a_dict[uid].get('expert_a_comment', '')),
                'Expert B Comment/Justification': data_b_dict[uid].get(
                    'justification', data_b_dict[uid].get('expert_b_comment', '')),
                'Expert C Comment/Justification': data_c_dict[uid].get(
                    'justification', data_c_dict[uid].get('expert_c_comment', '')),
            }
            for uid in unit_ids
        ]

        # Step 3: Create DataFrame
        df_comments = pd.DataFrame(comments_data)

        # Step 4: Format and Display Table
        display(HTML("<h3>Table: Expert Comments and Justifications by Unit</h3>"))
        display(df_comments)

    except FileNotFoundError as e:
        print(f"ERROR: {e}")
        print("Please make sure 'human_annotations_expert_a.json', 'human_annotations_expert_b.json', and 'human_annotations_expert_c.json' are uploaded to the stated directory.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()  # Print traceback for debugging

# Run the function
display_expert_comments()

Reading data from:
- ..\data\processed\human_annotations_expert_a.json
- ..\data\processed\human_annotations_expert_b.json
- ..\data\processed\human_annotations_expert_c.json



Unnamed: 0,Unit ID,Expert A Comment/Justification,Expert B Comment/Justification,Expert C Comment/Justification
0,unit_001,Prologue mixes exhortation ('hear') with awe a...,"Polyphonic prologue: doxology, exhortation, an...","Prologue blends praise for the eternal God, ex..."
1,unit_002,Vision of the glorified Christ evokes theophan...,Inaugural vision of Christ among lampstands ev...,Vision evokes awe and terror in Christ's majes...
2,unit_003,Letters to churches emphasize prophetic exhort...,"Letters focus on exhortation, warnings, and ca...",Letters exhort repentance and endurance amid p...
3,unit_004,Throne room scene dominated by worship and pra...,Heavenly worship around the throne; strong awe...,Throne scene filled with ceaseless praise; awe...
4,unit_005,Dominant hymns (ἄξιος) frame Lamb's wort...,Worthy Lamb and hymns dominate; subtle lament ...,Hymns exalt the Lamb's worthiness; subtle lame...
5,unit_006,Opening seals unleash judicial wrath (horsemen...,Seals bring wrath and cosmic terror; cries of ...,Seals unleash wrathful calamities; terror and ...
6,unit_007,Sealing offers consolation amid tribulation; m...,Multitude's victory praise after tribulation; ...,Multitude praises God for salvation; consolati...
7,unit_008,Silence in heaven builds theophanic awe and te...,Heavenly silence creates intense awe before th...,Silence builds solemn awe before further judgm...
8,unit_009,Angels with trumpets prepare cosmic judgments;...,Preparation for trumpets with prayers and fire...,Prayers with incense as worship; fire to earth...
9,unit_010,Trumpet plagues as punitive judgment; forensic...,Plagues as wrath; cosmic elements add warfare ...,Trumpets cause destructive wrath on creation; ...


In [5]:
# @title Task B: Quantify AI-to-Expert Alignment (Correlation)
import json
import pandas as pd
from scipy.stats import pearsonr
import os # Import os for path manipulation
from pathlib import Path
import sys

sys.path.append('../')
from src.config import *

def calculate_ai_expert_correlation():
    """
    Correlate Gemini's scores with the mean human expert score, per vector.

    Loads the three expert annotation files from PATH_PROCESSED and the Gemini
    result file from PATH_RESULTS, restricts the comparison to unit_ids found
    in all four, and for every rhetorical vector computes Pearson's r (and its
    p-value) between the three-expert mean and the Gemini score. A vector
    whose human or Gemini scores show no variation is stored with None values.

    Side effects:
        - writes the results to 'human_ai_validation_metrics.json' in
          PATH_RESULTS;
        - prints progress messages and the formatted results.

    Returns:
        None. Errors are reported on stdout instead of being raised.
    """
    try:
        # Locations of the inputs and of the metrics file to write
        processed_root = Path(PATH_PROCESSED)
        results_root = Path(PATH_RESULTS)
        path_expert_a = processed_root / 'human_annotations_expert_a.json'
        path_expert_b = processed_root / 'human_annotations_expert_b.json'
        path_expert_c = processed_root / 'human_annotations_expert_c.json'
        path_gemini = results_root / 'gemini_osborne_greek.json'
        metrics_path = os.path.join(results_root, 'human_ai_validation_metrics.json')

        print(f"Reading data from:\n- {path_expert_a}\n- {path_expert_b}\n- {path_expert_c}\n- {path_gemini}\n")

        # Parse the four JSON documents in a fixed order
        parsed = []
        for source in (path_expert_a, path_expert_b, path_expert_c, path_gemini):
            with open(source, 'r', encoding='utf-8') as handle:
                parsed.append(json.load(handle))
        expert_a_raw, expert_b_raw, expert_c_raw, gemini_raw = parsed

        # unit_id -> {vector name: score} for each human expert
        by_unit_a = {entry['unit_id']: entry['rhetorical_vector'] for entry in expert_a_raw}
        by_unit_b = {entry['unit_id']: entry['rhetorical_vector'] for entry in expert_b_raw}
        by_unit_c = {entry['unit_id']: entry['rhetorical_vector'] for entry in expert_c_raw}

        # Gemini output nests its units under 'narrative_units' and stores the
        # scores under 'final_rhetorical_vector'.
        gemini_has_units = isinstance(gemini_raw, dict) and 'narrative_units' in gemini_raw
        if not gemini_has_units:
            print("Error: 'narrative_units' key not found in gemini_osborne_greek.json or data format is unexpected.")
            return
        by_unit_gemini = {
            entry['unit_id']: entry['final_rhetorical_vector']
            for entry in gemini_raw['narrative_units']
        }

        # Units scored by every expert and by Gemini
        shared_ids = sorted(
            by_unit_a.keys() & by_unit_b.keys() & by_unit_c.keys() & by_unit_gemini.keys()
        )
        if len(shared_ids) == 0:
            print("Error: No common unit_ids found between the files.")
            return

        # The first unit of expert A defines the vector names; sorted for a stable order
        vector_names = sorted(expert_a_raw[0]['rhetorical_vector'].keys())

        print("-" * 50)
        print("Calculating AI-to-Expert Alignment (Correlation)")
        print("-" * 50)

        correlation_results = {}

        for name in vector_names:
            # A score missing for a unit/vector counts as 0
            human_means = [
                (
                    by_unit_a.get(uid, {}).get(name, 0)
                    + by_unit_b.get(uid, {}).get(name, 0)
                    + by_unit_c.get(uid, {}).get(name, 0)
                ) / 3
                for uid in shared_ids
            ]
            gemini_values = [by_unit_gemini.get(uid, {}).get(name, 0) for uid in shared_ids]

            # Pearson's r is undefined when either series is constant
            if len(set(human_means)) <= 1 or len(set(gemini_values)) <= 1:
                correlation_results[name] = {'pearson_r': None, 'p_value': None}
                print(f"Warning: Not enough variation in scores to calculate correlation for vector '{name}'.")
                continue

            coefficient, significance = pearsonr(human_means, gemini_values)
            correlation_results[name] = {'pearson_r': coefficient, 'p_value': significance}

        # Persist the results, creating the results directory when absent
        os.makedirs(results_root, exist_ok=True)
        with open(metrics_path, 'w', encoding='utf-8') as handle:
            json.dump(correlation_results, handle, indent=4)

        print(f"\nCorrelation results saved to {metrics_path}")
        print("\nCorrelation Results:")
        for name, stats in correlation_results.items():
            if stats['pearson_r'] is None:
                print(f"  {name}: Not enough variation to calculate correlation.")
            else:
                print(f"  {name}: Pearson's r = {stats['pearson_r']:.4f}, p-value = {stats['p_value']:.4f}")

    except FileNotFoundError as e:
        print(f"ERROR: {e}")
        print("Please make sure the required JSON files are uploaded to the stated directories.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()  # Print traceback for debugging

# Run the function
calculate_ai_expert_correlation()

Reading data from:
- ..\data\processed\human_annotations_expert_a.json
- ..\data\processed\human_annotations_expert_b.json
- ..\data\processed\human_annotations_expert_c.json
- ..\data\results\gemini_osborne_greek.json

--------------------------------------------------
Calculating AI-to-Expert Alignment (Correlation)
--------------------------------------------------

Correlation results saved to ..\data\results\human_ai_validation_metrics.json

Correlation Results:
  Cosmic Warfare & Deception: Pearson's r = 0.8326, p-value = 0.0000
  Judicial Wrath & Punitive Action: Pearson's r = 0.8983, p-value = 0.0000
  Lament, Persecution & Endurance: Pearson's r = 0.6635, p-value = 0.0001
  Other/Neutral Content: Pearson's r = 0.1694, p-value = 0.3709
  Theophanic Awe & Terror: Pearson's r = 0.7550, p-value = 0.0000
  Victory, Consolation & New-Creation Hope: Pearson's r = 0.8866, p-value = 0.0000
  Worship & Praise: Pearson's r = 0.9141, p-value = 0.0000
