In [1]:
from metrics import Metrics
import pandas as pd




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
global metrics
metrics = Metrics()













In [3]:
def all_scores_(reference_text, hypothesis_text):
    bleu_score = metrics.compute_bleu(reference_text, hypothesis_text)
    nist_score = metrics.compute_nist(reference_text, hypothesis_text)
    meteor_s = metrics.compute_meteor(reference_text, hypothesis_text)
    rouge_scores = metrics.compute_rouge(reference_text, hypothesis_text)
    em = metrics.exact_match(reference_text, hypothesis_text)
    levenshtein_dist = metrics.levenshtein_distance(reference_text, hypothesis_text)
    damerau_levenshtein_score = metrics.damerau_levenshtein(reference_text, hypothesis_text)
    word_error_rate_score = metrics.word_error_rate(reference_text, hypothesis_text)
    gestalt_score = metrics.gestalt_pattern_matching(reference_text, hypothesis_text)
    jaccard_score = metrics.jaccard(reference_text, hypothesis_text)
    overlap_coeff_score = metrics.overlap_coeff(reference_text, hypothesis_text)
    dice_sorensen_score = metrics.dice_sorensen_coefficient(reference_text, hypothesis_text)
    tf_idf_score = metrics.compute_cosine_similarity(reference_text, hypothesis_text, "tf-idf")
    word_freq_score = metrics.compute_cosine_similarity(reference_text, hypothesis_text, "word_frequency")
    spacy_similarity = metrics.semantic_similarity_spacy(reference_text, hypothesis_text)
    wmd_score = metrics.wmd(reference_text, hypothesis_text)
    gensim_cosine_score = metrics.gensim_cosine(reference_text, hypothesis_text)
    sbert_score = metrics.sbert_cosine(reference_text, hypothesis_text)
    use_score = metrics.USE_similarity(reference_text, hypothesis_text)
    soft_cosine_score = metrics.soft_cosine(reference_text, hypothesis_text)
    
    return [
        bleu_score, nist_score, meteor_s, rouge_scores, em, levenshtein_dist, 
        damerau_levenshtein_score, word_error_rate_score, gestalt_score, jaccard_score, 
        overlap_coeff_score, dice_sorensen_score, tf_idf_score, word_freq_score, 
        spacy_similarity, wmd_score, gensim_cosine_score, sbert_score, use_score, soft_cosine_score
    ]


In [4]:
reference_text = "The transition from winter to spring is one of the most anticipated times of the year. The cold weather begins to fade, and the warmth of the sun gradually returns. As the days grow longer, nature awakens, with flowers blooming and trees regrowing their leaves. People feel renewed, eager to spend more time outdoors and enjoy the fresh air. It is a season of hope, growth, and change."
hypothesis_text = "The transition from winter to spring is one of the most anticipated times of the year. The cold begins to disappear and the heat of the sun gradually returns. As the days get longer, nature awakens, flowers bloom and trees grow again. People feel refreshed and look forward to spending more time outdoors enjoying the fresh air. It is a time of hope, growth and change."
all_scores_(reference_text, hypothesis_text)

100%|██████████| 63/63 [00:04<00:00, 14.29it/s]


[0.8260465164578865,
 4.853602676071032,
 0.8494417728578494,
 {'r': 0.7358490566037735, 'p': 0.78, 'f': 0.7572815484023001},
 0,
 83,
 0.7844155844155845,
 0.7844155844155845,
 0.32625994694960214,
 0.9666666666666667,
 1.0,
 0.9830508474576272,
 0.8603579934292731,
 0.9225806451612905,
 0.9948442498071253,
 0.2749621074490797,
 0.99941224,
 0.9730768799781799,
 0.95047617,
 0.12815315]

In [5]:
files = {
    "gpt": "../data/chatgpt_llm_answers_structured_output.csv",
    "claude": "../data/claude_llm_answers_structured_output.csv",
    "deepseek": "../data/deepseek_llm_answers_structured_output.csv",
    "gemini": "../data/gemini_llm_answers_structured_output.csv"
}
column_key_map = {
    "Expert Introduction": "Introduction",
    "Expert Key Factors in Consideration": "Key Factors",
    "Expert Historical & Theoretical Perspectives": "Historical & Theoretical Perspectives",
    "Expert Proposed Resolution Strategies": "Proposed Resolution Strategies",
    "Expert Key Takeaways": "Key Takeaways"
}

model_dfs = {model: pd.read_csv(path) for model, path in files.items()}

base_df = model_dfs["gpt"].copy()
non_expert_columns = [col for col in base_df.columns if col not in column_key_map]

def build_expert_dict(row_idx):
    combined_dict = {}
    for model, df in model_dfs.items():
        combined_dict[model] = {
            column_key_map[col]: df.loc[row_idx, col] for col in column_key_map
        }
    return combined_dict

base_df["Expert Data Dictionary"] = base_df.index.map(build_expert_dict)

final_df = base_df[non_expert_columns + ["Expert Data Dictionary"]]
final_df = final_df.drop(["Unnamed: 0", "dilemma_output", "Dilemma Introduction", "Dilemma Key Factors in Consideration", "Dilemma Historical & Theoretical Perspectives", "Dilemma Proposed Resolution Strategies", "Dilemma Key Takeaways"],axis = 1)
final_df.head(5)

Unnamed: 0,Dilemma Description,Expert Opinion,expert_output,Expert Data Dictionary
0,I joined a lab during graduate school and was ...,We were surprised to discover that the literat...,%Introduction: \r\nThis dilemma reflects a co...,{'gpt': {'Introduction': 'This dilemma reflect...
1,Graduate students A and B are working on somew...,Disputes over assigning credit in science are ...,%Introduction: \r\nThis dilemma highlights a ...,{'gpt': {'Introduction': 'This dilemma highlig...
2,David is a new postdoc in Dr. Goliath’s lab. U...,A key source of ethical guidance in resolving ...,%Introduction: \r\nThis dilemma centers on au...,{'gpt': {'Introduction': 'This dilemma centers...
3,"A PI moves his lab to a different university, ...","At first blush, this PI certainly seems to be ...",%Introduction: \r\nThis dilemma highlights th...,{'gpt': {'Introduction': 'This dilemma highlig...
4,George Washington is one of two postdocs worki...,"Dr. Big might be a fine scientist, but he’s no...",%Introduction: \r\nThis dilemma highlights th...,{'gpt': {'Introduction': 'This dilemma highlig...


In [6]:
final_df["Expert Data Dictionary"][0]["gpt"].keys()

dict_keys(['Introduction', 'Key Factors', 'Historical & Theoretical Perspectives', 'Proposed Resolution Strategies', 'Key Takeaways'])

In [7]:
final_df.shape

(10, 4)

In [8]:
final_df["Expert Data Dictionary"][0]["gpt"]["Introduction"]

'This dilemma reflects a conflict over credit allocation in scientific research, where differing perceptions of contribution can lead to disputes over recognition. The situation highlights the complexities involved in determining who deserves credit for a discovery, especially in collaborative environments.'

In [9]:
gemini = pd.read_csv("../data/gemini_llm_answers_structured_output.csv")
gemini = gemini[["Expert Opinion", "dilemma_output", "Dilemma Introduction", "Dilemma Key Factors in Consideration", "Dilemma Historical & Theoretical Perspectives", "Dilemma Proposed Resolution Strategies", "Dilemma Key Takeaways"]]
gemini.head(5)

Unnamed: 0,Expert Opinion,dilemma_output,Dilemma Introduction,Dilemma Key Factors in Consideration,Dilemma Historical & Theoretical Perspectives,Dilemma Proposed Resolution Strategies,Dilemma Key Takeaways
0,We were surprised to discover that the literat...,%Introduction:\r\nThis case presents a serious...,This case presents a serious ethical dilemma c...,Several key factors must be considered. First...,This situation highlights long-standing issues...,The lab should initiate a formal investigation...,This case underscores the importance of clear ...
1,Disputes over assigning credit in science are ...,%Introduction:\r\nThis case presents a classic...,This case presents a classic ethical dilemma i...,Key factors include the originality and novelt...,This situation highlights long-standing debate...,A mediated discussion facilitated by a neutral...,This case emphasizes the importance of clear c...
2,A key source of ethical guidance in resolving ...,%Introduction:\r\nThis case presents a complex...,This case presents a complex authorship disput...,Key factors include the intellectual contribut...,This situation reflects ongoing debates surrou...,A fair resolution would involve a thorough rea...,This case underscores the crucial need for cle...
3,"At first blush, this PI certainly seems to be ...",%Introduction:\r\nThis scenario presents a ser...,This scenario presents a serious ethical breac...,"Key factors include authorship contribution, i...",This action violates established norms of scie...,The affected postdocs and students should docu...,The PI's actions are unethical and potentially...
4,"Dr. Big might be a fine scientist, but he’s no...",%Introduction:\r\nThis case presents a clear e...,This case presents a clear ethical dilemma con...,Several key factors must be considered. First...,This case touches upon long-standing debates i...,George could consider several options. He coul...,This case underscores the importance of transp...


In [10]:
merged = final_df.merge(gemini, how='left', on='Expert Opinion')
merged.head(5)

Unnamed: 0,Dilemma Description,Expert Opinion,expert_output,Expert Data Dictionary,dilemma_output,Dilemma Introduction,Dilemma Key Factors in Consideration,Dilemma Historical & Theoretical Perspectives,Dilemma Proposed Resolution Strategies,Dilemma Key Takeaways
0,I joined a lab during graduate school and was ...,We were surprised to discover that the literat...,%Introduction: \r\nThis dilemma reflects a co...,{'gpt': {'Introduction': 'This dilemma reflect...,%Introduction:\r\nThis case presents a serious...,This case presents a serious ethical dilemma c...,Several key factors must be considered. First...,This situation highlights long-standing issues...,The lab should initiate a formal investigation...,This case underscores the importance of clear ...
1,Graduate students A and B are working on somew...,Disputes over assigning credit in science are ...,%Introduction: \r\nThis dilemma highlights a ...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a classic...,This case presents a classic ethical dilemma i...,Key factors include the originality and novelt...,This situation highlights long-standing debate...,A mediated discussion facilitated by a neutral...,This case emphasizes the importance of clear c...
2,David is a new postdoc in Dr. Goliath’s lab. U...,A key source of ethical guidance in resolving ...,%Introduction: \r\nThis dilemma centers on au...,{'gpt': {'Introduction': 'This dilemma centers...,%Introduction:\r\nThis case presents a complex...,This case presents a complex authorship disput...,Key factors include the intellectual contribut...,This situation reflects ongoing debates surrou...,A fair resolution would involve a thorough rea...,This case underscores the crucial need for cle...
3,"A PI moves his lab to a different university, ...","At first blush, this PI certainly seems to be ...",%Introduction: \r\nThis dilemma highlights th...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis scenario presents a ser...,This scenario presents a serious ethical breac...,"Key factors include authorship contribution, i...",This action violates established norms of scie...,The affected postdocs and students should docu...,The PI's actions are unethical and potentially...
4,George Washington is one of two postdocs worki...,"Dr. Big might be a fine scientist, but he’s no...",%Introduction: \r\nThis dilemma highlights th...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a clear e...,This case presents a clear ethical dilemma con...,Several key factors must be considered. First...,This case touches upon long-standing debates i...,George could consider several options. He coul...,This case underscores the importance of transp...


In [11]:
def calculate_score(df):
    cols = {
        "Introduction": "Dilemma Introduction",
        "Key Factors": "Dilemma Key Factors in Consideration",
        "Historical & Theoretical Perspectives": "Dilemma Historical & Theoretical Perspectives",
        "Proposed Resolution Strategies": "Dilemma Proposed Resolution Strategies",
        "Key Takeaways": "Dilemma Key Takeaways"
    }

    models = ["gpt", "claude", "deepseek", "gemini"]

    for section_key, dil_col in cols.items():

        def average_model_score(row):
            dil_text = row[dil_col]
            expert_dict = row["Expert Data Dictionary"]
            model_score = []
            for m in models:
                model_score.append(metrics.USE_similarity(dil_text, expert_dict[m][section_key]))
            print(model_score)
            return sum(model_score) / len(model_score)

        df[f"Score - {section_key}"] = df.apply(average_model_score, axis=1)
    
    df["Total Score"] = df["Score - Introduction"] * 0.1 + df["Score - Key Factors"] * 0.2 + df["Score - Historical & Theoretical Perspectives"] * 0.15 + df["Score - Proposed Resolution Strategies"] * 0.33 + df["Score - Key Takeaways"] * 0.22
    return df
score_table = calculate_score(merged)

[0.486958, 0.61947036, 0.6315869, 0.77250147]
[0.6857171, 0.5881678, 0.6501129, 0.6355348]
[0.64345133, 0.53959596, 0.7252557, 0.7853695]
[0.5884261, 0.64697933, 0.4944509, 0.63869536]
[0.6866144, 0.5551945, 0.7382636, 0.7627608]
[0.7187883, 0.5630652, 0.7595765, 0.8590675]
[0.69859064, 0.78635985, 0.6495005, 0.68176174]
[0.69154674, 0.7298751, 0.70190704, 0.71920323]
[0.6553609, 0.67580205, 0.6059674, 0.8115598]
[0.7286385, 0.71382356, 0.7818788, 0.82594776]
[0.5246147, 0.6121112, 0.6268064, 0.61264074]
[0.4220849, 0.55417776, 0.4813902, 0.47731745]
[0.67974997, 0.66966796, 0.6319578, 0.7320186]
[0.5884148, 0.5916116, 0.5600053, 0.5581021]
[0.5913545, 0.3692686, 0.6365589, 0.5880171]
[0.63238215, 0.6555726, 0.64496577, 0.7183908]
[0.61333853, 0.42973852, 0.5542761, 0.74428236]
[0.15070674, 0.04286252, 0.1077607, 0.7448981]
[0.5258863, 0.61607206, 0.59452474, 0.64783734]
[0.6594641, 0.666486, 0.6916072, 0.7964308]
[0.6093919, 0.65624815, 0.60442555, 0.65116787]
[0.5338688, 0.4144101, 0

In [12]:
score_table.head(5)

Unnamed: 0,Dilemma Description,Expert Opinion,expert_output,Expert Data Dictionary,dilemma_output,Dilemma Introduction,Dilemma Key Factors in Consideration,Dilemma Historical & Theoretical Perspectives,Dilemma Proposed Resolution Strategies,Dilemma Key Takeaways,Score - Introduction,Score - Key Factors,Score - Historical & Theoretical Perspectives,Score - Proposed Resolution Strategies,Score - Key Takeaways,Total Score
0,I joined a lab during graduate school and was ...,We were surprised to discover that the literat...,%Introduction: \r\nThis dilemma reflects a co...,{'gpt': {'Introduction': 'This dilemma reflect...,%Introduction:\r\nThis case presents a serious...,This case presents a serious ethical dilemma c...,Several key factors must be considered. First...,This situation highlights long-standing issues...,The lab should initiate a formal investigation...,This case underscores the importance of clear ...,0.627629,0.594043,0.630308,0.506819,0.615707,0.578824
1,Graduate students A and B are working on somew...,Disputes over assigning credit in science are ...,%Introduction: \r\nThis dilemma highlights a ...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a classic...,This case presents a classic ethical dilemma i...,Key factors include the originality and novelt...,This situation highlights long-standing debate...,A mediated discussion facilitated by a neutral...,This case emphasizes the importance of clear c...,0.639883,0.483743,0.487318,0.446922,0.635488,0.521126
2,David is a new postdoc in Dr. Goliath’s lab. U...,A key source of ethical guidance in resolving ...,%Introduction: \r\nThis dilemma centers on au...,{'gpt': {'Introduction': 'This dilemma centers...,%Introduction:\r\nThis case presents a complex...,This case presents a complex authorship disput...,Key factors include the intellectual contribut...,This situation reflects ongoing debates surrou...,A fair resolution would involve a thorough rea...,This case underscores the crucial need for cle...,0.673418,0.678349,0.581425,0.466864,0.701097,0.598532
3,"A PI moves his lab to a different university, ...","At first blush, this PI certainly seems to be ...",%Introduction: \r\nThis dilemma highlights th...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis scenario presents a ser...,This scenario presents a serious ethical breac...,"Key factors include authorship contribution, i...",This action violates established norms of scie...,The affected postdocs and students should docu...,The PI's actions are unethical and potentially...,0.592138,0.574533,0.508466,0.455981,0.652659,0.544449
4,George Washington is one of two postdocs worki...,"Dr. Big might be a fine scientist, but he’s no...",%Introduction: \r\nThis dilemma highlights th...,{'gpt': {'Introduction': 'This dilemma highlig...,%Introduction:\r\nThis case presents a clear e...,This case presents a clear ethical dilemma con...,Several key factors must be considered. First...,This case touches upon long-standing debates i...,George could consider several options. He coul...,This case underscores the importance of transp...,0.685708,0.5463,0.568264,0.340379,0.584674,0.504024


In [13]:
sequence = score_table["Total Score"].sort_values(ascending=False).index.to_list()
sequence

[5, 9, 2, 0, 6, 3, 8, 1, 4, 7]

In [14]:
score_table["Total Score"].sort_values(ascending=False)

5    0.655112
9    0.639926
2    0.598532
0    0.578824
6    0.559823
3    0.544449
8    0.532348
1    0.521126
4    0.504024
7    0.386599
Name: Total Score, dtype: float64

In [15]:
ground_truth_ranks = [
    [5, 9],
    [2],             
    [0],
    [6],
    [1, 3],
    [8],
    [4],        
    [7]             
]
rank_map = {}
for i, group in enumerate(ground_truth_ranks):
    for num in group:
        rank_map[num] = i

def count_inversions(arr):
    def merge_sort(arr):
        if len(arr) <= 1:
            return arr, 0
        mid = len(arr) // 2
        left, inv_left = merge_sort(arr[:mid])
        right, inv_right = merge_sort(arr[mid:])
        merged, inv_split = merge(left, right)
        return merged, inv_left + inv_right + inv_split

    def merge(left, right):
        merged = []
        i = j = inv = 0
        while i < len(left) and j < len(right):
            if left[i] <= right[j]:
                merged.append(left[i])
                i += 1
            else:
                merged.append(right[j])
                inv += len(left) - i
                j += 1
        merged.extend(left[i:])
        merged.extend(right[j:])
        return merged, inv

    _, inv_count = merge_sort(arr)
    return inv_count

def compute_loss(sequence):
    """
    Computes the number of adjacent swaps (inversions) needed to sort the input 
    sequence based on group rank placeholders.

    Args:
        sequence (List[int]): Input sequence of numbers.

    Returns:
        int: Number of adjacent swaps needed to match group order.
    """
    group_ranks = [rank_map[num] for num in sequence]
    return count_inversions(group_ranks)
compute_loss(sequence)

1