In [None]:
import pandas as pd
import os, sys
from openai import OpenAI
import numpy as np
from metrics import Metrics
# Shared Metrics instance; its bound methods are passed around as scoring functions.
# (A `global` statement at module level has no effect, so none is needed here.)
metrics = Metrics()
# Extract the expert opinions using a structured format of 4 different models.




  from .autonotebook import tqdm as notebook_tqdm














In [None]:
models = ["chatgpt", "claude", "deepseek", "gemini"]


def generate_model_score(model):
    """Return the mean of the "Final Score" column for one model.

    The scores are read from ``../data/<model>_scores.csv``, resolved relative to
    the current working directory. Forward slashes are used so the path resolves
    on every platform (a backslash-separated path only works on Windows).

    Args:
        model: One of "chatgpt", "claude", "deepseek" or "gemini".

    Returns:
        The mean of the "Final Score" column, or ``None`` when ``model`` is not
        one of the recognised names.
    """
    if model not in ("chatgpt", "claude", "deepseek", "gemini"):
        # Unrecognised names yield None rather than raising.
        return None
    df_of_scores = pd.read_csv("../data/" + model + "_scores.csv")
    return df_of_scores["Final Score"].mean()

In [43]:
# Mean "Final Score" for each model, keyed by model name.
scores = {name: generate_model_score(name) for name in models}
print(scores)

{'chatgpt': 0.45246591544984077, 'claude': 0.41105391642572264, 'deepseek': 0.44169403399330187, 'gemini': 0.44601659659090737}


In [20]:
# Structured-output answer CSV for each model, keyed by the short model name.
files = dict([
    ("gpt", "../data/chatgpt_llm_answers_structured_output.csv"),
    ("claude", "../data/claude_llm_answers_structured_output.csv"),
    ("deepseek", "../data/deepseek_llm_answers_structured_output.csv"),
    ("gemini", "../data/gemini_llm_answers_structured_output.csv"),
])
# Source CSV column -> key used for that value inside each per-model dict.
column_key_map = {"Expert Key Factors in Consideration": "Key Factors"}

# Load every file once; keys follow the insertion order of `files`.
model_dfs = dict(zip(files.keys(), map(pd.read_csv, files.values())))

def build_expert_dict(row_idx):
    """Gather each model's mapped column values for one row label.

    Returns a dict ``{model: {output_key: value}}`` where ``output_key`` is taken
    from ``column_key_map`` and ``value`` is ``frame.loc[row_idx, source_column]``
    in that model's frame from ``model_dfs``.
    """
    def pick_fields(frame):
        # Rename each source column to its mapped key while reading the cell.
        return {
            out_key: frame.loc[row_idx, src_col]
            for src_col, out_key in column_key_map.items()
        }

    return {name: pick_fields(frame) for name, frame in model_dfs.items()}

# Row label -> per-model dict produced by build_expert_dict.
# The number of rows is taken from the "claude" frame, and build_expert_dict looks
# the same label up in every model frame.
# NOTE(review): assumes all model frames have the same rows/default index — confirm.
expert_lookup = {i: build_expert_dict(i) for i in range(len(model_dfs["claude"]))}

# Forward slashes keep the path portable (a backslash-separated path only resolves
# on Windows) and match the paths listed in `files`.
human_df = pd.read_csv("../data/human_answers_processed_wide.csv")
# Index labels with no entry in expert_lookup are mapped to NaN.
human_df["Expert Data Dictionary"] = human_df.index.map(expert_lookup)

In [23]:
def calculate_human_score(
    df: pd.DataFrame,
    metric_fn,
    metric_name: str,
    processed_cols=("processed_1", "processed_2", "processed_3", "processed_4"),
    section_key: str = "Key Factors",
    models=("gpt", "claude", "deepseek", "gemini"),
) -> pd.DataFrame:
    """Score every answer column against the per-model reference texts.

    For each column in ``processed_cols`` a column named
    ``"Score - <col> - <metric_name>"`` is added to ``df``. Each value is the mean
    of ``metric_fn(answer, reference)`` over one reference per entry in ``models``.

    Args:
        df: Frame holding the answer columns and an "Expert Data Dictionary"
            column whose cells map model name -> dict of sections.
        metric_fn: Callable taking ``(candidate, reference)`` and returning a number.
        metric_name: Label used in the generated column names.
        processed_cols: Answer columns to score.
        section_key: Key read from each model's section dict; a missing key or a
            NaN value is treated as an empty reference.
        models: Model names looked up in each row's expert dictionary.

    Returns:
        The same ``df`` object. It is modified in place, so repeated calls with
        different metrics accumulate score columns on one frame.

    Raises:
        KeyError: If a name in ``models`` is absent from a row's expert dictionary.
    """
    processed_cols = list(processed_cols)
    models = list(models)

    def references_for(expert_dict):
        # One reference text per model; absent/NaN sections become "".
        refs = []
        for m in models:
            ref = expert_dict[m].get(section_key, "")
            refs.append("" if pd.isna(ref) else ref)
        return refs

    # References depend only on the row, not on the answer column, so extract
    # them once and reuse them for every column.
    refs_per_row = [references_for(cell) for cell in df["Expert Data Dictionary"]]

    for col in processed_cols:
        row_means = []
        for answer, refs in zip(df[col], refs_per_row):
            cand = answer if pd.notna(answer) else ""
            scores = [metric_fn(cand, ref) for ref in refs]
            row_means.append(sum(scores) / len(scores) if scores else 0.0)
        # Positional assignment also works for an empty frame.
        df[f"Score - {col} - {metric_name}"] = row_means

    return df


In [None]:
# Compute the human baseline scores: one set of score columns per metric.
# calculate_human_score modifies and returns the frame it is given, so score_table
# refers to the same object as human_df, with every metric's columns attached.
met = [metrics.compute_cosine_similarity, metrics.damerau_levenshtein, metrics.USE_similarity, metrics.compute_bleu]
met_names = ["Tf-idf", "DL", "USE", "BLEU"]
score_table = human_df
for metric_fn, metric_label in zip(met, met_names):
    score_table = calculate_human_score(score_table, metric_fn, metric_label)


In [None]:
# Show the columns now present on score_table.
# NOTE(review): the saved output lists a `_refs` column that no visible cell creates —
# presumably stale output from an earlier version of the notebook; re-run to confirm.
print(score_table.columns)

Index(['dilemma_description', 'processed_1', 'processed_2', 'processed_3',
       'processed_4', 'Expert Data Dictionary', '_refs',
       'Score - processed_1 - Tf-idf', 'Score - processed_2 - Tf-idf',
       'Score - processed_3 - Tf-idf', 'Score - processed_4 - Tf-idf',
       'Score - processed_1 - DL', 'Score - processed_2 - DL',
       'Score - processed_3 - DL', 'Score - processed_4 - DL',
       'Score - processed_1 - USE', 'Score - processed_2 - USE',
       'Score - processed_3 - USE', 'Score - processed_4 - USE',
       'Score - processed_1 - BLEU', 'Score - processed_2 - BLEU',
       'Score - processed_3 - BLEU', 'Score - processed_4 - BLEU'],
      dtype='object')


In [28]:
# Weight of each metric in the combined score; the four weights sum to 1.0.
# Insertion order fixes the order in which the weighted terms are added.
METRIC_WEIGHTS = {"DL": 0.0768, "BLEU": 0.1547, "USE": 0.5386, "Tf-idf": 0.2299}
processed_cols = ["processed_1", "processed_2", "processed_3", "processed_4"]

total_cols = []
for col in processed_cols:
    total = None
    for metric_label, weight in METRIC_WEIGHTS.items():
        weighted = score_table[f"Score - {col} - {metric_label}"] * weight
        total = weighted if total is None else total + weighted
    total_col = f"Total Score {col}"
    score_table[total_col] = total
    total_cols.append(total_col)

# Row-wise mean of the per-answer totals.
score_table["Average Human Score"] = score_table[total_cols].mean(axis=1)

print(score_table.columns)

Index(['dilemma_description', 'processed_1', 'processed_2', 'processed_3',
       'processed_4', 'Expert Data Dictionary', '_refs',
       'Score - processed_1 - Tf-idf', 'Score - processed_2 - Tf-idf',
       'Score - processed_3 - Tf-idf', 'Score - processed_4 - Tf-idf',
       'Score - processed_1 - DL', 'Score - processed_2 - DL',
       'Score - processed_3 - DL', 'Score - processed_4 - DL',
       'Score - processed_1 - USE', 'Score - processed_2 - USE',
       'Score - processed_3 - USE', 'Score - processed_4 - USE',
       'Score - processed_1 - BLEU', 'Score - processed_2 - BLEU',
       'Score - processed_3 - BLEU', 'Score - processed_4 - BLEU',
       'Total Score processed_1', 'Total Score processed_2',
       'Total Score processed_3', 'Total Score processed_4',
       'Average Human Score'],
      dtype='object')


In [None]:
# Persist the scored table (the index is written as the first column).
# Forward slashes keep the path portable: with backslashes, POSIX systems treat the
# whole string as a single file name in the working directory.
score_table.to_csv("../data/human_scores.csv")