In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# NOTE(review): this silences every warning category for the rest of the kernel
# session, so library deprecation/runtime notices are hidden too. Consider
# narrowing the filter to the specific warning that prompted it.
warnings.filterwarnings('ignore')

# Set publication-quality plot parameters (applied globally to all later figures)
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.family': 'Arial',
    'font.size': 10,
})

# Load data from Excel file
file_path = r"E:\test\sleep\Final_results.xlsx"

# Read all three sheets in a single call so the workbook is opened and parsed
# once rather than three times; a list-valued sheet_name returns a dict keyed
# by sheet name.
sheets = pd.read_excel(file_path, sheet_name=["human1", "human2", "Info"])
human1 = sheets["human1"]  # scores from the first rater
human2 = sheets["human2"]  # scores from the second rater
info = sheets["Info"]      # per-case metadata

# Later cells pair these frames row by row (boolean masks, element-wise sums),
# so a row-count mismatch would silently misalign or drop cases. Fail early.
if not (len(human1) == len(human2) == len(info)):
    raise ValueError(
        "Sheets must have the same number of rows to be compared row by row: "
        f"human1={len(human1)}, human2={len(human2)}, Info={len(info)}"
    )

print("Data loaded successfully!")
print(f"Human1 shape: {human1.shape}")
print(f"Human2 shape: {human2.shape}")
print(f"Info shape: {info.shape}")
print(f"\nColumns in human1: {list(human1.columns)}")

Data loaded successfully!
Human1 shape: (129, 14)
Human2 shape: (129, 12)
Info shape: (129, 14)

Columns in human1: ['Unnamed: 0', 'Age', 'Gender', 'Truth Disease diagnosis', 'Grok_results', 'Claude_results', 'Deepseek_results', 'GPT_results', 'Gemini_results', 'Grok', 'Claude', 'Deepseek', 'GPT', 'Gemini']


In [2]:
# Models whose ratings are compared between the two human raters.
model_names = ["Grok", "Claude", "Deepseek", "GPT", "Gemini"]

# Map each model to its (human1 column, human2 column) pair. The same column
# name is used for both sheets, so the pair is the model name repeated.
model_columns = {name: (name, name) for name in model_names}

print(f"Models to analyze: {model_names}")

Models to analyze: ['Grok', 'Claude', 'Deepseek', 'GPT', 'Gemini']


In [3]:
# Inter-rater reliability per model: quadratic-weighted Cohen's kappa between
# the human1 and human2 scores. Results are kept in `kappa_results` (model -> kappa).
#
# NOTE(review): the quadratic weights are spaced by the ordered set of labels
# found in the data, not by their numeric values. If some level of the rating
# scale is never used by either rater, its neighbours are treated as adjacent.
# Confirm every level occurs, or pass `labels=` with the full scale.
banner = "=" * 60
print(banner)
print("COHEN'S KAPPA (Inter-rater Reliability) - Per Model")
print(banner)

kappa_results = {}
for model, (col_h1, col_h2) in model_columns.items():
    scores_h1, scores_h2 = human1[col_h1], human2[col_h2]

    # Keep only the rows that both raters scored
    valid = ~(scores_h1.isna() | scores_h2.isna())
    n_valid = valid.sum()

    kappa = cohen_kappa_score(scores_h1[valid], scores_h2[valid], weights="quadratic")
    kappa_results[model] = kappa

    print(f"{model:12s}: κ = {kappa:.4f} (n = {n_valid})")

print(banner)

COHEN'S KAPPA (Inter-rater Reliability) - Per Model
Grok        : κ = 0.6490 (n = 129)
Claude      : κ = 0.6825 (n = 129)
Deepseek    : κ = 0.6126 (n = 129)
GPT         : κ = 0.5685 (n = 129)
Gemini      : κ = 0.6421 (n = 129)


In [4]:
# Overall agreement: pool the paired ratings of every model into two flat
# lists (one per rater) and compute a single quadratic-weighted Cohen's kappa.
all_h1, all_h2 = [], []

for model, (col_h1, col_h2) in model_columns.items():
    # Only rows scored by both raters contribute to the pooled lists
    valid = human1[col_h1].notna() & human2[col_h2].notna()
    all_h1 += list(human1.loc[valid, col_h1])
    all_h2 += list(human2.loc[valid, col_h2])

overall_kappa = cohen_kappa_score(all_h1, all_h2, weights="quadratic")

banner = "=" * 60
print("\n" + banner)
print("OVERALL COHEN'S KAPPA (All Models Combined)")
print(banner)
print(f"Overall κ = {overall_kappa:.4f} (n = {len(all_h1)} total ratings)")
print(banner)


OVERALL COHEN'S KAPPA (All Models Combined)
Overall κ = 0.6349 (n = 645 total ratings)


In [5]:
# Consensus scores: start from a copy of the Info sheet (metadata) and add one
# column per model holding the mean of the two raters' scores. A row where
# either rater's score is missing yields NaN for that model.
human_avg = info.assign(**{
    model: (human1[col_h1] + human2[col_h2]) / 2
    for model, (col_h1, col_h2) in model_columns.items()
})

banner = "=" * 60
print("\n" + banner)
print("AVERAGED SCORES COMPUTED")
print(banner)
print(f"human_avg shape: {human_avg.shape}")
print(f"Columns in human_avg: {list(human_avg.columns)}")

# Append the result to the source workbook as sheet 'human_avg', replacing any
# existing sheet of that name so the cell can be re-run.
with pd.ExcelWriter(
    file_path,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    human_avg.to_excel(writer, sheet_name='human_avg', index=False)

print("\n✓ 'human_avg' sheet successfully saved to Excel file!")
print(banner)


AVERAGED SCORES COMPUTED
human_avg shape: (129, 19)
Columns in human_avg: ['Publisher', 'Book', 'Publically avalibale', 'Status', 'Clinical specialty', 'Disease diagnosis', 'Differential Diagnosis', 'Patient Text', 'Truth Disease diagnosis', 'Grok_results', 'Claude_results', 'Deepseek_results', 'GPT_results', 'Gemini_results', 'Grok', 'Claude', 'Deepseek', 'GPT', 'Gemini']

✓ 'human_avg' sheet successfully saved to Excel file!
