In [None]:
import pandas as pd

In [None]:
# Load the two rating workbooks; they are joined on their shared `Id` column
# in the next cell.
# NOTE(review): both paths are the placeholder 'xxx' — fill in the real
# file locations before running.
df_jc, df_mg = [pd.read_excel(workbook_path) for workbook_path in ('xxx', 'xxx')]


In [None]:
# Columns model_1 .. model_5 exist in both workbooks, so the merge suffixes
# them. Keep the `_jc` copy under the unsuffixed name and discard the `_mg`
# copy.
columns_to_rename = [f"model_{i}_jc" for i in range(1, 6)]
columns_to_drop = [f"model_{i}_mg" for i in range(1, 6)]

merged_df = (
    df_jc
    .merge(df_mg, on='Id', suffixes=('_jc', '_mg'))
    .rename(columns={suffixed: suffixed.replace('_jc', '') for suffixed in columns_to_rename})
    .drop(columns=columns_to_drop)
)

In [None]:
# Display the merged frame to eyeball the join result.
merged_df


In [None]:
# Load the consensus workbook. The cells below read its `Id` column, the
# rating_model_*_jc / rating_model_*_mg columns and the consensus_model_* columns.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run elsewhere until it is replaced by a configurable / relative location.
consensus = pd.read_excel('/Users/mariusvach/Code/python/leitlinien_rag_2_0/data/fragen/differences_jc_mg.xlsx')

In [None]:
# --- Apply the consensus workbook to the merged ratings ---------------------
consensus_ids = consensus.Id.tolist()

# Rating columns in the same order as before: all *_jc columns, then all *_mg.
rating_columns = [
    f"rating_model_{i}_{rater}"
    for rater in ('jc', 'mg')
    for i in range(1, 6)
]

# For every Id listed in the consensus workbook, overwrite both raters'
# ratings in merged_df with the values stored in the consensus workbook.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for the rest of the kernel session.
for row_id in consensus_ids:
    # The row masks do not depend on the column, so compute them once per Id.
    merged_rows = merged_df.Id == row_id
    consensus_rows = consensus.Id == row_id
    for rating in rating_columns:
        # `.values[0]`: if an Id occurs more than once in the consensus
        # workbook, the first occurrence wins.
        merged_df.loc[merged_rows, rating] = consensus.loc[consensus_rows, rating].values[0]

# Attach the consensus_model_* columns; Ids that are absent from the consensus
# workbook get NaN here because of the left join.
consensus_columns = [f"consensus_model_{x}" for x in range(1, 6)]
merged = merged_df.merge(consensus[consensus_columns + ['Id']], on='Id', how='left')

# Where no consensus value exists, fall back to the corresponding
# rating_model_*_jc value.
for x in range(1, 6):
    merged[f"consensus_model_{x}"] = merged[f"consensus_model_{x}"].fillna(
        merged[f"rating_model_{x}_jc"]
    )

# Drop every row that still has a missing value in any column.
merged = merged.dropna()

In [None]:
# Write the merged ratings (including the consensus_model_* columns) to CSV
# without the index column.
# NOTE(review): 'xxx' is a placeholder path — set the real output location.
merged.to_csv('xxx', index=False)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib.patches import Patch

plt.rcParams.update({'font.size': 14})  # larger base font for all later figures

# Long format: one row per (consensus column, rating) pair.
columns_of_interest = [f"consensus_model_{i}" for i in range(1, 6)]
df_selected = merged[columns_of_interest]
df_melted = df_selected.melt(var_name='model', value_name='rating')

# Rows = consensus columns, columns = rating levels.
# df_counts holds absolute counts, df_percentages the row-wise shares in %.
df_counts = df_melted.groupby('model')['rating'].value_counts().unstack(fill_value=0)
df_percentages = df_counts.div(df_counts.sum(axis=1), axis=0) * 100
df_counts = df_counts.reset_index()
df_percentages = df_percentages.reset_index()

# Rating levels that actually occur in the data (every column except 'model').
rating_levels = [col for col in df_percentages.columns if col != 'model']

# Explicit colour list, used for both the bars and the legend patches so the
# two cannot drift apart.
palette = sns.color_palette("crest", n_colors=3)

with sns.axes_style("white"):
    fig, ax = plt.subplots(figsize=(12, 6))

    # Plot the percentages (not the raw counts): the y axis is labelled and
    # scaled as a percentage, and the bar annotations are percentages too.
    sns.barplot(
        x='model', y='value', hue='rating',
        data=df_percentages.melt(id_vars='model', var_name='rating', value_name='value'),
        palette=palette, ax=ax,
    )

    ax.set_xlabel('Model', fontsize=16)
    # Labels follow the row order of df_percentages (consensus_model_1 .. 5).
    ax.set_xticklabels(["GPT-4o w/o RAG", "GPT-4o w/ RAG", "Llama 3.1 405B Instruct Turbo", "Mixtral 8x22B Instruct", "Claude Sonnet 3.5"],
                       fontsize=14, rotation=20)
    ax.set_ylim(0, 100)
    ax.set_ylabel('Percentage (%)', fontsize=16)

    # Custom legend: readable names instead of the numeric rating codes.
    # NOTE(review): assumes the rating codes are 0 = Wrong, 1 = Inaccurate,
    # 2 = Correct — confirm against the rating scheme.
    legend_elements = [Patch(facecolor=palette[i], label=l)
                       for i, l in enumerate(['Wrong', 'Inaccurate', 'Correct'])]  # noqa: E741
    ax.legend(handles=legend_elements, title='Rating',
              title_fontsize=14, fontsize=12)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Annotate every bar with its percentage. The bars of one group share a
    # total width of 0.8 centred on the integer tick position, hence the
    # per-level offset. Values are looked up by rating column, not by
    # position, so a missing rating level cannot shift the labels.
    bar_width = 0.8 / len(rating_levels)
    center_offset = (len(rating_levels) - 1) / 2
    for i in range(len(df_percentages)):
        for j, level in enumerate(rating_levels):
            percentage = df_percentages[level].iloc[i]
            x_position = i + (j - center_offset) * bar_width
            ax.text(x_position, percentage, f'{percentage:.1f}%',
                    ha='center', va='bottom', fontsize=12)

    plt.tight_layout()
    plt.show()

In [None]:
from matplotlib.patches import Patch

with sns.axes_style("white"):
    fig, ax = plt.subplots(figsize=(12, 6))

    # Explicit colour list shared by the bars and the legend patches.
    palette = sns.color_palette("crest", n_colors=3)

    # Plot the row-wise percentages computed in the previous cell. The axis is
    # labelled and scaled as a percentage, so plotting the raw counts from
    # df_counts would mislabel the bars.
    sns.barplot(
        x='model', y='value', hue='rating',
        data=df_percentages.melt(id_vars='model', var_name='rating', value_name='value'),
        palette=palette, ax=ax,
    )

    # Custom legend: readable names instead of the numeric rating codes.
    # NOTE(review): assumes the rating codes are 0 = Wrong, 1 = Inaccurate,
    # 2 = Correct — confirm against the rating scheme.
    legend_elements = [Patch(facecolor=palette[i], label=l)
                       for i, l in enumerate(['Wrong', 'Inaccurate', 'Correct'])]  # noqa: E741
    ax.legend(handles=legend_elements, title='Rating',
              title_fontsize=14, fontsize=12)

    ax.set_xlabel('Model', fontsize=16)
    # Labels follow the row order of df_percentages (consensus_model_1 .. 5).
    ax.set_xticklabels(["GPT-4o w/o RAG", "GPT-4o w/ RAG", "Llama 3.1 405B Instruct Turbo",
                        "Mixtral 8x22B Instruct", "Claude Sonnet 3.5"],
                       fontsize=14, rotation=20)
    ax.set_ylim(0, 100)
    ax.set_ylabel('Percentage of Ratings', fontsize=16)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
import numpy as np
from scipy import stats
import pandas as pd
from statsmodels.stats.multitest import multipletests

# Number of rated answers per model.
# The percentages below are k/85 rounded to one decimal (e.g. 28/85 = 32.9 %,
# 40/85 = 47.1 %, 17/85 = 20.0 %). With 84 the rounded counts summed to 85 for
# some models and to 84 for others, so the contingency tables had unequal row
# totals; with 85 every model's counts add up to the sample size.
total_samples = 85

# Share of answers per rating category, in percent, for each model.
# NOTE(review): Mixtral's three values sum to 100.1 and 48/85 is 56.5 % —
# confirm the 'correct' entry (it does not change the derived count).
model_results = {
    'GPT4_wo_RAG': {'wrong': 32.9, 'inaccurate': 47.1, 'correct': 20.0},
    'GPT4_w_RAG': {'wrong': 15.3, 'inaccurate': 27.1, 'correct': 57.6},
    'Llama_3.1': {'wrong': 15.3, 'inaccurate': 20.0, 'correct': 64.7},
    'Mixtral': {'wrong': 17.6, 'inaccurate': 25.9, 'correct': 56.6},
    'Claude': {'wrong': 10.6, 'inaccurate': 18.8, 'correct': 70.6}
}

# Convert the percentages back to absolute answer counts; the chi-square
# tests further down need observed frequencies.
model_counts = {
    model: {
        category: round(percentage * total_samples / 100)
        for category, percentage in scores.items()
    }
    for model, scores in model_results.items()
}

# Guard against a sample size that is inconsistent with the percentages.
for model, counts in model_counts.items():
    assert sum(counts.values()) == total_samples, (
        f"{model}: counts sum to {sum(counts.values())}, expected {total_samples}"
    )

model_counts


In [None]:
def confidence_interval(p, n, confidence=0.95):
    """Normal-approximation confidence interval for a percentage.

    Args:
        p: Observed share on the 0-100 percent scale.
        n: Number of observations the share is based on.
        confidence: Two-sided confidence level (default 0.95).

    Returns:
        Tuple ``(lower, upper)`` in percent, clamped to the range [0, 100].
    """
    upper_tail_probability = (1 + confidence) / 2
    z = stats.norm.ppf(upper_tail_probability)
    # Standard error of a proportion, expressed on the percent scale.
    se = np.sqrt((p * (100 - p)) / n)
    half_width = z * se
    lower = np.maximum(0, p - half_width)
    upper = np.minimum(100, p + half_width)
    return lower, upper

def chi_square_test(model1_counts, model2_counts):
    """Chi-square test of independence between two models' rating counts.

    Args:
        model1_counts: Mapping with the keys 'wrong', 'inaccurate', 'correct'.
        model2_counts: Mapping with the same three keys.

    Returns:
        Tuple ``(chi2, p_value)`` taken from ``scipy.stats.chi2_contingency``
        on the 2 x 3 table (one row per model).
    """
    table_rows = []
    for counts in (model1_counts, model2_counts):
        table_rows.append([counts['wrong'], counts['inaccurate'], counts['correct']])

    result = stats.chi2_contingency(np.array(table_rows))
    # Only the statistic and the p-value are needed; the degrees of freedom
    # and the expected frequencies are discarded.
    return result[0], result[1]


In [None]:

# --- Per-model shares with their 95 % confidence intervals -------------------
print("Model Performance Statistics (with 95% CIs):")
for model in model_results:
    print(f"\n{model}:")
    for category, percentage in model_results[model].items():
        ci_lower, ci_upper = confidence_interval(percentage, total_samples)
        print(f"{category}: {percentage:.1f}% [{ci_lower:.1f}%, {ci_upper:.1f}%]")

# --- Chi-square test for every unordered pair of models ----------------------
print("\nPairwise Statistical Comparisons (Chi-square tests):")
model_names = list(model_counts.keys())

# All pairs (first, second) with first appearing before second in model_names.
model_pairs = [
    (model_names[i], model_names[j])
    for i in range(len(model_names))
    for j in range(i + 1, len(model_names))
]

comparisons = [f"{model1} vs {model2}" for model1, model2 in model_pairs]
pair_results = [
    chi_square_test(model_counts[model1], model_counts[model2])
    for model1, model2 in model_pairs
]
chi_squares = [statistic for statistic, _ in pair_results]
p_values = [p_value for _, p_value in pair_results]

# Bonferroni adjustment across all pairwise comparisons.
rejected, p_values_corrected, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

for comp, chi2, p, p_corr, rej in zip(comparisons, chi_squares, p_values, p_values_corrected, rejected):
    print(f"\n{comp}:")
    print(f"Chi-square: {chi2:.3f}")
    print(f"Uncorrected P-value: {p:.4f}")
    print(f"Bonferroni-corrected P-value: {p_corr:.4f}")
    print(f"Significant at α=0.05 (after Bonferroni correction): {'YES' if rej else 'NO'}")

# --- Tabular overview: one row per model, one column per rating category -----
counts_df = pd.DataFrame(model_counts).T
print("\nCounts DataFrame:")
print(counts_df)

percentages_df = pd.DataFrame(model_results).T
print("\nPercentages DataFrame:")
print(percentages_df)

In [None]:
from scipy.stats import kruskal 

# Kruskal-Wallis tests on the per-model percentages.
# NOTE(review): each call compares five percentages (one per model) against
# the five complementary percentages of the same models, not per-answer
# ratings — confirm that this is the intended comparison.

# 'wrong' share vs. the combined share of the other two categories.
wrong_values = [shares['wrong'] for shares in model_results.values()]
rest_values = [shares['inaccurate'] + shares['correct'] for shares in model_results.values()]

kruskal_wrong_vs_rest = kruskal(wrong_values, rest_values)
print("Kruskal-Wallis Test für 'wrong' vs. rest:")
print(f"H-statistic: {kruskal_wrong_vs_rest.statistic:.2f}, p-value: {kruskal_wrong_vs_rest.pvalue:.2f}")

# 'correct' share vs. the combined share of the other two categories.
correct_values = [shares['correct'] for shares in model_results.values()]
rest_values = [shares['wrong'] + shares['inaccurate'] for shares in model_results.values()]

kruskal_correct_vs_rest = kruskal(correct_values, rest_values)
print("Kruskal-Wallis Test für 'correct' vs. rest:")
print(f"H-statistic: {kruskal_correct_vs_rest.statistic:.2f}, p-value: {kruskal_correct_vs_rest.pvalue:.2f}")

In [None]:

# Pairwise 2 x 2 chi-square tests: 'wrong' answers vs. all other answers.
#
# chi2_contingency expects observed frequencies, so the tables are built from
# the absolute answer counts in `model_counts`. Feeding it the percentages
# from `model_results` would treat every model as if it had 100 answers.
model_names = list(model_results.keys())
wrong_counts = [model_counts[name]['wrong'] for name in model_names]
rest_counts = [
    model_counts[name]['inaccurate'] + model_counts[name]['correct']
    for name in model_names
]

p_values = []
comparisons = []
chi_squares = []

for i in range(len(model_names)):
    for j in range(i + 1, len(model_names)):
        model1, model2 = model_names[i], model_names[j]
        # Rows = the two models, columns = (wrong, not wrong).
        observed = np.array([
            [wrong_counts[i], rest_counts[i]],
            [wrong_counts[j], rest_counts[j]]
        ])
        # For 2 x 2 tables chi2_contingency applies Yates' continuity
        # correction by default (correction=True).
        chi2, p_value, _, _ = stats.chi2_contingency(observed)
        comparisons.append(f"{model1} vs {model2}")
        chi_squares.append(chi2)
        p_values.append(p_value)

# Bonferroni correction across all pairwise comparisons
rejected, p_values_corrected, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

# Print the results
for comp, chi2, p, p_corr, rej in zip(comparisons, chi_squares, p_values, p_values_corrected, rejected):
    print(f"\n{comp}:")
    print(f"Chi-square: {chi2:.3f}")
    print(f"Uncorrected P-value: {p:.4f}")
    print(f"Bonferroni-corrected P-value: {p_corr:.4f}")
    print(f"Significant at α=0.05 (after Bonferroni correction): {'YES' if rej else 'NO'}")

In [None]:
import numpy as np

from scipy import stats

# Share of answers per rating category, in percent, for each model.
models = {
    'GPT4_wo_RAG': {'wrong': 32.9, 'inaccurate': 47.1, 'correct': 20.0},
    'GPT4_w_RAG': {'wrong': 15.3, 'inaccurate': 27.1, 'correct': 57.6},
    'Llama_3.1': {'wrong': 15.3, 'inaccurate': 20.0, 'correct': 64.7},
    'Mixtral': {'wrong': 17.6, 'inaccurate': 25.9, 'correct': 56.6},
    'Claude': {'wrong': 10.6, 'inaccurate': 18.8, 'correct': 70.6}
}

# Number of rated answers per model.
# The percentages above are k/85 rounded to one decimal; with 84 the rounded
# counts gave group sizes of 85, 84, 84, 85 and 84. With 85 every group has
# exactly `total_samples` answers.
total_samples = 85

# Ordinal score assigned to each rating category.
ordinal_score = {'wrong': 0, 'inaccurate': 1, 'correct': 2}

# Rebuild one array of per-answer scores for every model from its percentages.
scores_by_model = {}
for model, data in models.items():
    category_counts = [
        round(data[category] * total_samples / 100) for category in ordinal_score
    ]
    assert sum(category_counts) == total_samples, (
        f"{model}: counts sum to {sum(category_counts)}, expected {total_samples}"
    )
    # np.repeat expands e.g. scores [0, 1, 2] with counts [28, 40, 17] into
    # 28 zeros, 40 ones and 17 twos.
    scores_by_model[model] = np.repeat(list(ordinal_score.values()), category_counts)

# Flat score / group-label lists, kept for any code that still reads them.
scores = [int(score) for model_scores in scores_by_model.values() for score in model_scores]
groups = [name for name, model_scores in scores_by_model.items() for _ in model_scores]

# Perform Kruskal-Wallis H-test across the models
h_statistic, p_value = stats.kruskal(*scores_by_model.values())

print(f"Kruskal-Wallis H-statistic: {h_statistic:.3f}")
print(f"p-value: {p_value:.4f}")
