In [1]:
from metrics import Metrics
import pandas as pd




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shared Metrics instance used by the scoring cells below.
# A `global` statement has no effect at the top level of a cell/module, so it is omitted.
metrics = Metrics()













In [3]:
def all_scores_(reference_text, hypothesis_text):
    """Evaluate every metric exposed by the module-level ``metrics`` object on one text pair.

    Each method is called as ``method(reference_text, hypothesis_text)``; the two
    ``compute_cosine_similarity`` calls additionally pass a mode string
    (``"tf-idf"`` and ``"word_frequency"``) as third argument.

    Args:
        reference_text: The reference text.
        hypothesis_text: The text compared against the reference.

    Returns:
        list: 20 results in a fixed order -- BLEU, NIST, METEOR, ROUGE, exact match,
        Levenshtein distance, Damerau-Levenshtein, word error rate, gestalt pattern
        matching, Jaccard, overlap coefficient, Dice-Sorensen, cosine (tf-idf),
        cosine (word frequency), spaCy similarity, WMD, gensim cosine, SBERT cosine,
        USE similarity, soft cosine. Element types are whatever each method returns.
    """
    pair = (reference_text, hypothesis_text)

    # The list literal is evaluated left to right, so the metrics run in the
    # same order in which they are returned.
    return [
        metrics.compute_bleu(*pair),
        metrics.compute_nist(*pair),
        metrics.compute_meteor(*pair),
        metrics.compute_rouge(*pair),
        metrics.exact_match(*pair),
        metrics.levenshtein_distance(*pair),
        metrics.damerau_levenshtein(*pair),
        metrics.word_error_rate(*pair),
        metrics.gestalt_pattern_matching(*pair),
        metrics.jaccard(*pair),
        metrics.overlap_coeff(*pair),
        metrics.dice_sorensen_coefficient(*pair),
        metrics.compute_cosine_similarity(*pair, "tf-idf"),
        metrics.compute_cosine_similarity(*pair, "word_frequency"),
        metrics.semantic_similarity_spacy(*pair),
        metrics.wmd(*pair),
        metrics.gensim_cosine(*pair),
        metrics.sbert_cosine(*pair),
        metrics.USE_similarity(*pair),
        metrics.soft_cosine(*pair),
    ]


In [4]:
# Smoke test of all_scores_ on two similar short paragraphs; the resulting list
# is displayed as the cell output (see the all_scores_ return order for what each entry is).
reference_text = "The transition from winter to spring is one of the most anticipated times of the year. The cold weather begins to fade, and the warmth of the sun gradually returns. As the days grow longer, nature awakens, with flowers blooming and trees regrowing their leaves. People feel renewed, eager to spend more time outdoors and enjoy the fresh air. It is a season of hope, growth, and change."
hypothesis_text = "The transition from winter to spring is one of the most anticipated times of the year. The cold begins to disappear and the heat of the sun gradually returns. As the days get longer, nature awakens, flowers bloom and trees grow again. People feel refreshed and look forward to spending more time outdoors enjoying the fresh air. It is a time of hope, growth and change."
all_scores_(reference_text, hypothesis_text)

100%|██████████| 63/63 [00:05<00:00, 11.13it/s]


[0.8260465164578865,
 4.853602676071032,
 0.8494417728578494,
 {'r': 0.7358490566037735, 'p': 0.78, 'f': 0.7572815484023001},
 0,
 83,
 0.7844155844155845,
 0.7844155844155845,
 0.32625994694960214,
 0.9666666666666667,
 1.0,
 0.9830508474576272,
 0.8603579934292731,
 0.9225806451612905,
 0.9948442498071253,
 0.2749621074490797,
 0.99941224,
 0.9730768799781799,
 0.95047617,
 0.12815315]

In [3]:
# Model label -> CSV file holding that model's structured answers.
files = {
    "gpt": "../data/chatgpt_llm_answers_structured_output.csv",
    "claude": "../data/claude_llm_answers_structured_output.csv",
    "deepseek": "../data/deepseek_llm_answers_structured_output.csv",
    "gemini": "../data/gemini_llm_answers_structured_output.csv"
}
# CSV column name -> short section key used inside the per-row expert dictionary.
column_key_map = {
    "Expert Introduction": "Introduction",
    "Expert Key Factors in Consideration": "Key Factors",
    "Expert Historical & Theoretical Perspectives": "Historical & Theoretical Perspectives",
    "Expert Proposed Resolution Strategies": "Proposed Resolution Strategies",
    "Expert Key Takeaways": "Key Takeaways"
}

# One DataFrame per model, keyed by the model label.
model_dfs = {model: pd.read_csv(path) for model, path in files.items()}

# The claude frame supplies the non-expert columns of the combined table.
base_df = model_dfs["claude"].copy()
# Every column of the base frame that is not one of the five "Expert ..." section columns.
non_expert_columns = [col for col in base_df.columns if col not in column_key_map]

def build_expert_dict(row_idx):
    """Collect the five expert sections of one row from every model frame.

    Reads the module-level ``model_dfs`` and ``column_key_map``.

    Args:
        row_idx: Row label looked up with ``.loc`` in each model's DataFrame.
            NOTE(review): this assumes all model frames share the same index and
            row order -- confirm against the CSV files.

    Returns:
        dict: ``{model: {section_key: cell_value}}`` with one inner entry per
        column listed in ``column_key_map``.
    """
    return {
        model: {
            section_key: frame.loc[row_idx, source_col]
            for source_col, section_key in column_key_map.items()
        }
        for model, frame in model_dfs.items()
    }

# For every row label of base_df, store the nested {model: {section: text}} dictionary.
base_df["Expert Data Dictionary"] = base_df.index.map(build_expert_dict)

# Columns removed from the combined table (the CSV index column and the "Dilemma ..." fields).
dropped_columns = [
    "Unnamed: 0",
    "dilemma_output",
    "Dilemma Introduction",
    "Dilemma Key Factors in Consideration",
    "Dilemma Historical & Theoretical Perspectives",
    "Dilemma Proposed Resolution Strategies",
    "Dilemma Key Takeaways",
]
final_df = (
    base_df[non_expert_columns + ["Expert Data Dictionary"]]
    .drop(columns=dropped_columns)
)
final_df.head(5)

Unnamed: 0,Dilemma Description,Expert Opinion,expert_output,Expert Data Dictionary
0,I joined a lab during graduate school and was ...,We were surprised to discover that the literat...,Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma reflect...
1,Graduate students A and B are working on somew...,Disputes over assigning credit in science are ...,%Introduction:\nThis case presents a complex d...,{'gpt': {'Introduction': 'This dilemma highlig...
2,David is a new postdoc in Dr. Goliath’s lab. U...,A key source of ethical guidance in resolving ...,Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma centers...
3,"A PI moves his lab to a different university, ...","At first blush, this PI certainly seems to be ...",Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma highlig...
4,George Washington is one of two postdocs worki...,"Dr. Big might be a fine scientist, but he’s no...",Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma highlig...


In [4]:
# Inspect which section keys the first row's expert dictionary holds for one model.
first_row_experts = final_df["Expert Data Dictionary"][0]
first_row_experts["claude"].keys()

dict_keys(['Introduction', 'Key Factors', 'Historical & Theoretical Perspectives', 'Proposed Resolution Strategies', 'Key Takeaways'])

In [5]:
# Sanity check: (rows, columns) of the combined table.
final_df.shape

(55, 4)

In [6]:
# Show one concrete section text: claude's "Introduction" for the first row.
claude_sections_row0 = final_df["Expert Data Dictionary"][0]["claude"]
claude_sections_row0["Introduction"]

'The dilemma centers on a fundamental conflict in scientific research regarding credit allocation between a graduate student and post-doc, highlighting the challenges of recognizing individual contributions in collaborative work.'

In [26]:
# Columns kept from the gemini file: the join key ("Expert Opinion") plus the
# raw and per-section "Dilemma ..." outputs.
gemini_columns = [
    "Expert Opinion",
    "dilemma_output",
    "Dilemma Introduction",
    "Dilemma Key Factors in Consideration",
    "Dilemma Historical & Theoretical Perspectives",
    "Dilemma Proposed Resolution Strategies",
    "Dilemma Key Takeaways",
]
gemini = pd.read_csv("../data/gemini_llm_answers_structured_output.csv")[gemini_columns]
gemini.head(5)

Unnamed: 0,Expert Opinion,dilemma_output,Dilemma Introduction,Dilemma Key Factors in Consideration,Dilemma Historical & Theoretical Perspectives,Dilemma Proposed Resolution Strategies,Dilemma Key Takeaways
0,We were surprised to discover that the literat...,%Introduction:\r\nThis case presents a serious...,This case presents a serious ethical dilemma c...,Several key factors must be considered. First...,This situation highlights long-standing issues...,The lab should initiate a formal investigation...,This case underscores the importance of clear ...
1,Disputes over assigning credit in science are ...,%Introduction:\r\nThis case presents a classic...,This case presents a classic ethical dilemma i...,Key factors include the originality and novelt...,This situation highlights long-standing debate...,A mediated discussion facilitated by a neutral...,This case emphasizes the importance of clear c...
2,A key source of ethical guidance in resolving ...,%Introduction:\r\nThis case presents a complex...,This case presents a complex authorship disput...,Key factors include the intellectual contribut...,This situation reflects ongoing debates surrou...,A fair resolution would involve a thorough rea...,This case underscores the crucial need for cle...
3,"At first blush, this PI certainly seems to be ...",%Introduction:\r\nThis scenario presents a ser...,This scenario presents a serious ethical breac...,"Key factors include authorship contribution, i...",This action violates established norms of scie...,The affected postdocs and students should docu...,The PI's actions are unethical and potentially...
4,"Dr. Big might be a fine scientist, but he’s no...",%Introduction:\r\nThis case presents a clear e...,This case presents a clear ethical dilemma con...,Several key factors must be considered. First...,This case touches upon long-standing debates i...,George could consider several options. He coul...,This case underscores the importance of transp...


In [27]:
# Left join: keep every row of final_df and attach the gemini "Dilemma ..." columns
# that share the same "Expert Opinion" text.
# NOTE(review): the join key is free text; duplicate "Expert Opinion" values in `gemini`
# would duplicate rows here -- verify the row count against final_df.
merged = final_df.merge(gemini, on="Expert Opinion", how="left")
merged.head(5)

Unnamed: 0,Dilemma Description,Expert Opinion,expert_output,Expert Data Dictionary,dilemma_output,Dilemma Introduction,Dilemma Key Factors in Consideration,Dilemma Historical & Theoretical Perspectives,Dilemma Proposed Resolution Strategies,Dilemma Key Takeaways
0,I joined a lab during graduate school and was ...,We were surprised to discover that the literat...,Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma reflect...,%Introduction:\r\nThis case presents a serious...,This case presents a serious ethical dilemma c...,Several key factors must be considered. First...,This situation highlights long-standing issues...,The lab should initiate a formal investigation...,This case underscores the importance of clear ...
1,Graduate students A and B are working on somew...,Disputes over assigning credit in science are ...,%Introduction:\nThis case presents a complex d...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a classic...,This case presents a classic ethical dilemma i...,Key factors include the originality and novelt...,This situation highlights long-standing debate...,A mediated discussion facilitated by a neutral...,This case emphasizes the importance of clear c...
2,David is a new postdoc in Dr. Goliath’s lab. U...,A key source of ethical guidance in resolving ...,Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma centers...,%Introduction:\r\nThis case presents a complex...,This case presents a complex authorship disput...,Key factors include the intellectual contribut...,This situation reflects ongoing debates surrou...,A fair resolution would involve a thorough rea...,This case underscores the crucial need for cle...
3,"A PI moves his lab to a different university, ...","At first blush, this PI certainly seems to be ...",Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis scenario presents a ser...,This scenario presents a serious ethical breac...,"Key factors include authorship contribution, i...",This action violates established norms of scie...,The affected postdocs and students should docu...,The PI's actions are unethical and potentially...
4,George Washington is one of two postdocs worki...,"Dr. Big might be a fine scientist, but he’s no...",Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a clear e...,This case presents a clear ethical dilemma con...,Several key factors must be considered. First...,This case touches upon long-standing debates i...,George could consider several options. He coul...,This case underscores the importance of transp...


In [28]:
# Restrict scoring to the first 20 rows. Take an explicit copy: score columns are
# assigned onto this frame later, and assigning onto a slice of another frame can
# trigger pandas' SettingWithCopyWarning.
merged = merged.head(20).copy()

In [29]:
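# Candidate `Metrics` methods noted for the scoring cells below
# (presumably a shortlist; `semantic_similarity_spacy` is not used there):
#   compute_cosine_similarity
#   damerau_levenshtein
#   semantic_similarity_spacy
#   USE_similarity
#   compute_bleu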

In [30]:
def _text_or_empty(value):
    """Return ``""`` when ``value`` is missing according to ``pd.isna``; otherwise return it unchanged."""
    return "" if pd.isna(value) else value


def calculate_score(df, metric, metric_name, models=None, section_weights=None):
    """Score each dilemma section against every model's expert section and add the results to ``df``.

    For each of the five sections, ``metric(dilemma_text, expert_text)`` is evaluated once
    per model and the mean over the models is stored in
    ``"Score - <section> - <metric_name>"``. A weighted sum of the per-section columns is
    then stored in ``"Total Score <metric_name>"``.

    Args:
        df: DataFrame with the five ``"Dilemma ..."`` text columns and an
            ``"Expert Data Dictionary"`` column holding ``{model: {section: text}}`` dicts.
            New columns are added to it in place.
        metric: Callable invoked as ``metric(dilemma_text, expert_text)``; its results are
            summed and divided, so it must return a number.
        metric_name: Label used in the names of the created columns.
        models: Keys of the expert dictionary to average over. Defaults to
            ``["gpt", "claude", "deepseek", "gemini"]``.
        section_weights: Mapping ``section key -> weight`` used for the total score.
            Defaults to Introduction 0.1, Key Factors 0.2, Historical & Theoretical
            Perspectives 0.15, Proposed Resolution Strategies 0.33, Key Takeaways 0.22.

    Returns:
        The same ``df`` object, with the score columns added.

    Raises:
        ValueError: If ``models`` is empty (the mean would be undefined).
        KeyError: If a key of ``section_weights`` is not one of the five section keys.
    """
    # Section key inside the expert dictionary -> dilemma column holding the matching text.
    section_columns = {
        "Introduction": "Dilemma Introduction",
        "Key Factors": "Dilemma Key Factors in Consideration",
        "Historical & Theoretical Perspectives": "Dilemma Historical & Theoretical Perspectives",
        "Proposed Resolution Strategies": "Dilemma Proposed Resolution Strategies",
        "Key Takeaways": "Dilemma Key Takeaways",
    }

    models = ["gpt", "claude", "deepseek", "gemini"] if models is None else list(models)
    if not models:
        raise ValueError("models must contain at least one model key")

    if section_weights is None:
        section_weights = {
            "Introduction": 0.1,
            "Key Factors": 0.2,
            "Historical & Theoretical Perspectives": 0.15,
            "Proposed Resolution Strategies": 0.33,
            "Key Takeaways": 0.22,
        }

    def average_model_score(row, section_key, dil_col):
        """Mean of ``metric`` over all models for one row and one section."""
        dil_text = _text_or_empty(row[dil_col])
        expert_dict = row["Expert Data Dictionary"]
        # Missing texts are replaced by "" in a local value only: the dictionaries
        # stored in the frame are left untouched.
        model_scores = [
            metric(dil_text, _text_or_empty(expert_dict[model][section_key]))
            for model in models
        ]
        return sum(model_scores) / len(model_scores)

    for section_key, dil_col in section_columns.items():
        # The lambda is consumed immediately by apply(), so it sees this
        # iteration's section_key / dil_col.
        df[f"Score - {section_key} - {metric_name}"] = df.apply(
            lambda row: average_model_score(row, section_key, dil_col), axis=1
        )

    # Weighted sum of the per-section scores, accumulated in the mapping's order.
    df[f"Total Score {metric_name}"] = sum(
        df[f"Score - {section_key} - {metric_name}"] * weight
        for section_key, weight in section_weights.items()
    )
    return df


In [31]:
# Metric callables and the labels used in their score-column names, paired by position.
met = [
    metrics.compute_cosine_similarity,
    metrics.damerau_levenshtein,
    metrics.USE_similarity,
    metrics.compute_bleu,
]
met_names = ["Tf-idf", "DL", "USE", "BLEU"]

# calculate_score adds its columns to `merged` in place and returns that same frame,
# so after the loop `score_table` carries the columns of all four metrics.
for metric_fn, metric_label in zip(met, met_names):
    score_table = calculate_score(merged, metric_fn, metric_label)

score_table.head(5)

Unnamed: 0,Dilemma Description,Expert Opinion,expert_output,Expert Data Dictionary,dilemma_output,Dilemma Introduction,Dilemma Key Factors in Consideration,Dilemma Historical & Theoretical Perspectives,Dilemma Proposed Resolution Strategies,Dilemma Key Takeaways,...,Score - Historical & Theoretical Perspectives - USE,Score - Proposed Resolution Strategies - USE,Score - Key Takeaways - USE,Total Score USE,Score - Introduction - BLEU,Score - Key Factors - BLEU,Score - Historical & Theoretical Perspectives - BLEU,Score - Proposed Resolution Strategies - BLEU,Score - Key Takeaways - BLEU,Total Score BLEU
0,I joined a lab during graduate school and was ...,We were surprised to discover that the literat...,Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma reflect...,%Introduction:\r\nThis case presents a serious...,This case presents a serious ethical dilemma c...,Several key factors must be considered. First...,This situation highlights long-standing issues...,The lab should initiate a formal investigation...,This case underscores the importance of clear ...,...,0.630308,0.506001,0.618751,0.579223,0.458428,0.421139,0.405652,0.204736,0.446772,0.356771
1,Graduate students A and B are working on somew...,Disputes over assigning credit in science are ...,%Introduction:\nThis case presents a complex d...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a classic...,This case presents a classic ethical dilemma i...,Key factors include the originality and novelt...,This situation highlights long-standing debate...,A mediated discussion facilitated by a neutral...,This case emphasizes the importance of clear c...,...,0.480032,0.478335,0.634872,0.531709,0.409109,0.430088,0.362864,0.311062,0.359046,0.362999
2,David is a new postdoc in Dr. Goliath’s lab. U...,A key source of ethical guidance in resolving ...,Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma centers...,%Introduction:\r\nThis case presents a complex...,This case presents a complex authorship disput...,Key factors include the intellectual contribut...,This situation reflects ongoing debates surrou...,A fair resolution would involve a thorough rea...,This case underscores the crucial need for cle...,...,0.585979,0.413829,0.704563,0.581844,0.38173,0.417252,0.358079,0.294934,0.376798,0.355559
3,"A PI moves his lab to a different university, ...","At first blush, this PI certainly seems to be ...",Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis scenario presents a ser...,This scenario presents a serious ethical breac...,"Key factors include authorship contribution, i...",This action violates established norms of scie...,The affected postdocs and students should docu...,The PI's actions are unethical and potentially...,...,0.479097,0.440468,0.667901,0.538487,0.390102,0.433784,0.351784,0.238274,0.413934,0.34823
4,George Washington is one of two postdocs worki...,"Dr. Big might be a fine scientist, but he’s no...",Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a clear e...,This case presents a clear ethical dilemma con...,Several key factors must be considered. First...,This case touches upon long-standing debates i...,George could consider several options. He coul...,This case underscores the importance of transp...,...,0.550401,0.331159,0.570896,0.493935,0.361015,0.238945,0.234242,0.1798,0.325857,0.250049


In [32]:
# List all columns to confirm the per-section and total score columns of each metric exist.
score_table.columns

Index(['Dilemma Description', 'Expert Opinion', 'expert_output',
       'Expert Data Dictionary', 'dilemma_output', 'Dilemma Introduction',
       'Dilemma Key Factors in Consideration',
       'Dilemma Historical & Theoretical Perspectives',
       'Dilemma Proposed Resolution Strategies', 'Dilemma Key Takeaways',
       'Score - Introduction - Tf-idf', 'Score - Key Factors - Tf-idf',
       'Score - Historical & Theoretical Perspectives - Tf-idf',
       'Score - Proposed Resolution Strategies - Tf-idf',
       'Score - Key Takeaways - Tf-idf', 'Total Score Tf-idf',
       'Score - Introduction - DL', 'Score - Key Factors - DL',
       'Score - Historical & Theoretical Perspectives - DL',
       'Score - Proposed Resolution Strategies - DL',
       'Score - Key Takeaways - DL', 'Total Score DL',
       'Score - Introduction - USE', 'Score - Key Factors - USE',
       'Score - Historical & Theoretical Perspectives - USE',
       'Score - Proposed Resolution Strategies - USE',
       '

In [33]:
# Weight of each metric's total score in the combined "Final Score".
final_score_weights = {
    "Total Score DL": 0.0768,
    "Total Score BLEU": 0.1547,
    "Total Score USE": 0.5386,
    "Total Score Tf-idf": 0.2299,
}
# Terms are accumulated in the mapping's order (DL, BLEU, USE, Tf-idf).
score_table["Final Score"] = sum(
    score_table[total_col] * weight for total_col, weight in final_score_weights.items()
)
score_table.head(5)

Unnamed: 0,Dilemma Description,Expert Opinion,expert_output,Expert Data Dictionary,dilemma_output,Dilemma Introduction,Dilemma Key Factors in Consideration,Dilemma Historical & Theoretical Perspectives,Dilemma Proposed Resolution Strategies,Dilemma Key Takeaways,...,Score - Proposed Resolution Strategies - USE,Score - Key Takeaways - USE,Total Score USE,Score - Introduction - BLEU,Score - Key Factors - BLEU,Score - Historical & Theoretical Perspectives - BLEU,Score - Proposed Resolution Strategies - BLEU,Score - Key Takeaways - BLEU,Total Score BLEU,Final Score
0,I joined a lab during graduate school and was ...,We were surprised to discover that the literat...,Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma reflect...,%Introduction:\nThis case presents a serious e...,This case presents a serious ethical dilemma c...,Several key factors must be considered. First...,This situation highlights long-standing issues...,The lab should initiate a formal investigation...,This case underscores the importance of clear ...,...,0.506001,0.618751,0.579223,0.458428,0.421139,0.405652,0.204736,0.446772,0.356771,0.474948
1,Graduate students A and B are working on somew...,Disputes over assigning credit in science are ...,%Introduction:\nThis case presents a complex d...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\nThis case presents a classic e...,This case presents a classic ethical dilemma i...,Key factors include the originality and novelt...,This situation highlights long-standing debate...,A mediated discussion facilitated by a neutral...,This case emphasizes the importance of clear c...,...,0.478335,0.634872,0.531709,0.409109,0.430088,0.362864,0.311062,0.359046,0.362999,0.449494
2,David is a new postdoc in Dr. Goliath’s lab. U...,A key source of ethical guidance in resolving ...,Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma centers...,%Introduction:\nThis case presents a complex a...,This case presents a complex authorship disput...,Key factors include the intellectual contribut...,This situation reflects ongoing debates surrou...,A fair resolution would involve a thorough rea...,This case underscores the crucial need for cle...,...,0.413829,0.704563,0.581844,0.38173,0.417252,0.358079,0.294934,0.376798,0.355559,0.485071
3,"A PI moves his lab to a different university, ...","At first blush, this PI certainly seems to be ...",Here's the structured analysis with the reques...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\nThis scenario presents a serio...,This scenario presents a serious ethical breac...,"Key factors include authorship contribution, i...",This action violates established norms of scie...,The affected postdocs and students should docu...,The PI's actions are unethical and potentially...,...,0.440468,0.667901,0.538487,0.390102,0.433784,0.351784,0.238274,0.413934,0.34823,0.434723
4,George Washington is one of two postdocs worki...,"Dr. Big might be a fine scientist, but he’s no...",Here's the structured analysis:\n\n%Introducti...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\nThis case presents a clear eth...,This case presents a clear ethical dilemma con...,Several key factors must be considered. First...,This case touches upon long-standing debates i...,George could consider several options. He coul...,This case underscores the importance of transp...,...,0.331159,0.570896,0.493935,0.361015,0.238945,0.234242,0.1798,0.325857,0.250049,0.386539


In [34]:
# Persist the scored table in the current working directory. With pandas' default
# index=True the row index is written as an unnamed first column of the CSV.
score_table.to_csv("gemini_scores.csv")