In [90]:
import os
import sys

import pandas as pd

from from_root import from_root
from glob import glob

sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_excel_sheets

In [91]:
# Reviewed phrase list: only rows flagged keep_phrase == 1 take part in the merge further down.
phrases = (
    pd.read_excel('/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx')
    .loc[lambda reviewed: reviewed['keep_phrase'] == 1]
)

# Root folder holding the per-model result folders.
base_loc = '/Volumes/BCross/paraphrase examples slurm/Wiki-Test'

# Output workbooks: combined row-level results and the aggregated summary.
raw_save_loc = '/Volumes/BCross/paraphrase examples slurm/Wiki-Test/combined_token_level_results_raw_v3.xlsx'
agg_save_loc = '/Volumes/BCross/paraphrase examples slurm/Wiki-Test/combined_token_level_results_agg_v3.xlsx'

# (paraphrasing model, scoring model) pairs to combine, expanded into the
# list-of-dicts shape the loading loop iterates over.
model_pairs = [
    ("gpt5", "qwen"),
    ("gpt5", "gemma"),
    ("gpt5", "llama"),
    ("gpt5", "gpt2"),
    ("ModernBERT-base", "gpt2"),
    ("ModernBERT-large", "gpt2"),
]
results_dict = [
    {"paraphrasing_model": paraphraser, "scoring_model": scorer}
    for paraphraser, scorer in model_pairs
]

### Load Original Data and Check Model Combos

In [92]:
from pathlib import Path

# Start from the previously saved raw results when the workbook exists,
# otherwise from an empty frame that only carries the two model columns.
if not Path(raw_save_loc).exists():
    raw_results = pd.DataFrame(columns=['paraphrasing_model', 'scoring_model'])
    existing_combos = set()
else:
    raw_results = pd.read_excel(raw_save_loc)
    # Distinct (paraphrasing_model, scoring_model) pairs already on disk.
    existing_combos = set(
        zip(raw_results['paraphrasing_model'], raw_results['scoring_model'])
    )

existing_combos


set()

### Aggregate any New Model Data

In [93]:
# Build one merged table per (paraphrasing_model, scoring_model) combination that
# is not already present in the saved raw results, and collect them in all_model_data.
all_model_data = []

# Metadata columns copied onto every LLR row of a result file.
metadata_cols = [
    'paraphrasing_model', 'scoring_model', 'sample_id', 'problem', 'corpus', 'known_author',
    'unknown_author', 'unknown_doc_id', 'known_doc_id', 'target'
]

for result in results_dict:
    paraphrasing_model = result['paraphrasing_model']
    scoring_model = result['scoring_model']
    combo = (paraphrasing_model, scoring_model)

    # Skip combinations that were already loaded from the raw results workbook.
    if combo in existing_combos:
        print(f"Skipping existing combo: {combo}")
        continue

    print(f"Loading Paraphrase Model: {paraphrasing_model} - Scoring Model: {scoring_model}")

    # gpt5 results sit directly under base_loc; every other paraphrasing model
    # has its own sub-folder named after the model.
    if paraphrasing_model == "gpt5":
        adjusted_base_loc = base_loc
    else:
        adjusted_base_loc = f"{base_loc}/{paraphrasing_model}"

    print(adjusted_base_loc)
    filtered_results_loc = f"{adjusted_base_loc}/{scoring_model} results/filtered_inc_rank_v3"
    excel_files = sorted(glob(os.path.join(filtered_results_loc, "*.xlsx")))
    print(f"    Number of Files Found: {len(excel_files)}")
    all_merged = []

    for file in excel_files:
        data = read_excel_sheets(file, ['metadata', 'LLR'])

        llr = data['LLR']
        if len(llr) == 0:
            # pd.concat([...] * 0) would raise "No objects to concatenate";
            # a file without LLR rows contributes nothing, so skip it.
            print(f"    Skipping file with empty LLR sheet: {file}")
            continue

        metadata = data['metadata']
        metadata['paraphrasing_model'] = paraphrasing_model
        metadata['scoring_model'] = scoring_model

        metadata_info = metadata[metadata_cols].copy()

        # Repeat the metadata so it can be placed side by side with the LLR rows.
        # NOTE(review): this lines up row-for-row only if the metadata sheet has
        # exactly one row per file; confirm against the files' structure.
        metadata_repeated = pd.concat([metadata_info] * len(llr), ignore_index=True)
        llr_with_metadata = pd.concat([metadata_repeated, llr.reset_index(drop=True)], axis=1)

        # Keep only the phrases that survived the manual review.
        merged = llr_with_metadata.merge(
            phrases,
            left_on='original_phrase',
            right_on='phrase',
            how='inner'
        )

        all_merged.append(merged)

    if not all_merged:
        # Nothing to concatenate (no files found, or every file was empty):
        # report it instead of failing inside pd.concat.
        print(f"    No usable result files for combo {combo}; nothing added.")
        continue

    final_merged_table = pd.concat(all_merged, ignore_index=True)
    all_model_data.append(final_merged_table)

Loading Paraphrase Model: gpt5 - Scoring Model: qwen
/Volumes/BCross/paraphrase examples slurm/Wiki-Test
    Number of Files Found: 661
Loading Paraphrase Model: gpt5 - Scoring Model: gemma
/Volumes/BCross/paraphrase examples slurm/Wiki-Test
    Number of Files Found: 661
Loading Paraphrase Model: gpt5 - Scoring Model: llama
/Volumes/BCross/paraphrase examples slurm/Wiki-Test
    Number of Files Found: 661
Loading Paraphrase Model: gpt5 - Scoring Model: gpt2
/Volumes/BCross/paraphrase examples slurm/Wiki-Test
    Number of Files Found: 661
Loading Paraphrase Model: ModernBERT-base - Scoring Model: gpt2
/Volumes/BCross/paraphrase examples slurm/Wiki-Test/ModernBERT-base
    Number of Files Found: 672
Loading Paraphrase Model: ModernBERT-large - Scoring Model: gpt2
/Volumes/BCross/paraphrase examples slurm/Wiki-Test/ModernBERT-large
    Number of Files Found: 616


In [94]:
# NOTE(review): debugging peek at the per-file list left over from the last combo the loading
# loop processed; the name is undefined on a fresh kernel if every combo was skipped.
all_merged

[   paraphrasing_model scoring_model  sample_id  \
 0    ModernBERT-large          gpt2          1   
 1    ModernBERT-large          gpt2          1   
 2    ModernBERT-large          gpt2          1   
 3    ModernBERT-large          gpt2          1   
 4    ModernBERT-large          gpt2          1   
 5    ModernBERT-large          gpt2          1   
 6    ModernBERT-large          gpt2          1   
 7    ModernBERT-large          gpt2          1   
 8    ModernBERT-large          gpt2          1   
 9    ModernBERT-large          gpt2          1   
 10   ModernBERT-large          gpt2          1   
 11   ModernBERT-large          gpt2          1   
 12   ModernBERT-large          gpt2          1   
 13   ModernBERT-large          gpt2          1   
 
                                problem corpus     known_author  \
 0   Hodja_Nasreddin vs Hodja_Nasreddin   Wiki  Hodja_Nasreddin   
 1   Hodja_Nasreddin vs Hodja_Nasreddin   Wiki  Hodja_Nasreddin   
 2   Hodja_Nasreddin vs Hodja_Na

### Append any New Data

In [95]:
# Append the newly loaded model combinations to raw_results.
# The cell is safe to re-run: rows whose (paraphrasing_model, scoring_model) pair
# is already in raw_results are not appended a second time, so running it again
# does not duplicate data in the workbook that is saved afterwards.
if all_model_data:
    new_results = pd.concat(all_model_data, ignore_index=True)

    # Pairs currently in raw_results, whether read from disk or added by an
    # earlier run of this cell.
    present_combos = set(
        zip(raw_results['paraphrasing_model'], raw_results['scoring_model'])
    )
    is_new_row = [
        combo not in present_combos
        for combo in zip(new_results['paraphrasing_model'], new_results['scoring_model'])
    ]
    rows_to_add = new_results.loc[is_new_row]

    if rows_to_add.empty:
        print("New model combinations are already present in raw_results; nothing appended.")
    else:
        raw_results = pd.concat([raw_results, rows_to_add], ignore_index=True)
else:
    print("No new model combinations to add.")

In [96]:
# Spot check: ModernBERT rows for a single known/unknown document pair,
# ordered so the two paraphrasing models sit next to each other per phrase.
bert_doc_pair_mask = (
    raw_results['paraphrasing_model'].isin(['ModernBERT-base', 'ModernBERT-large'])
    & (raw_results['unknown_doc_id'] == 'hodja_nasreddin_text_3')
    & (raw_results['known_doc_id'] == 'hodja_nasreddin_text_1')
)

(
    raw_results[bert_doc_pair_mask]
    .sort_values(by=['problem', 'known_doc_id', 'phrase_num', 'paraphrasing_model'])
    .reset_index(drop=True)
)

Unnamed: 0,paraphrasing_model,scoring_model,sample_id,problem,corpus,known_author,unknown_author,unknown_doc_id,known_doc_id,target,...,pmf_unknown,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank,unknown_ref_vs_best_rest,phrase,tokens,num_tokens,keep_phrase
0,ModernBERT-base,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.233898,1.296925,0.420161,0.630974,-0.348991,-0.348991,", this is not","(',', 'Ġthis', 'Ġis', 'Ġnot')",4.0,1.0
1,ModernBERT-large,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.267325,0.975195,0.375648,0.57296,-0.348991,-0.348991,", this is not","(',', 'Ġthis', 'Ġis', 'Ġnot')",4.0,1.0
2,ModernBERT-base,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.003509,4.073694,2.803271,2.454808,-5.379062,-5.379062,", but this","(',', 'Ġbut', 'Ġthis')",3.0,1.0
3,ModernBERT-base,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,,4.073694,2.101961,,,,", but this","(',', 'Ġbut', 'Ġthis')",3.0,1.0
4,ModernBERT-large,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.072827,1.937721,1.197405,1.137707,-0.677753,-0.677753,", but this","(',', 'Ġbut', 'Ġthis')",3.0,1.0
5,ModernBERT-large,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,,1.937721,0.781913,,,,", but this","(',', 'Ġbut', 'Ġthis')",3.0,1.0
6,ModernBERT-base,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.005498,2.883499,2.132039,2.259801,-5.077235,-5.077235,", you are","(',', 'Ġyou', 'Ġare')",3.0,1.0
7,ModernBERT-large,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.388678,1.114093,0.419327,0.41041,0.0,0.584846,", you are","(',', 'Ġyou', 'Ġare')",3.0,1.0
8,ModernBERT-base,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.150811,3.248927,0.75008,0.821568,-1.643953,-1.643953,do not have,"('Ġdo', 'Ġnot', 'Ġhave')",3.0,1.0
9,ModernBERT-large,gpt2,1.0,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,...,0.818634,0.480852,0.290147,0.08691,0.0,2.508958,do not have,"('Ġdo', 'Ġnot', 'Ġhave')",3.0,1.0


In [97]:
# List the columns of the combined raw results before they are saved.
raw_results.columns

Index(['paraphrasing_model', 'scoring_model', 'sample_id', 'problem', 'corpus',
       'known_author', 'unknown_author', 'unknown_doc_id', 'known_doc_id',
       'target', 'phrase_num', 'phrase_occurence', 'original_phrase',
       'num_phrases', 'phrases_kept', 'no_context_log_prob', 'known_log_prob',
       'unknown_log_prob', 'known_vs_no_context_log_prob',
       'unknown_vs_no_context_log_prob', 'pmf_no_context', 'pmf_known',
       'pmf_unknown', 'llr_no_context', 'llr_known', 'llr_unknown',
       'unknown_ref_vs_top_rank', 'unknown_ref_vs_best_rest', 'phrase',
       'tokens', 'num_tokens', 'keep_phrase'],
      dtype='object')

In [98]:
# Write the combined row-level results to raw_save_loc without the index column.
# This is the workbook the existence check earlier in the notebook reads back.
raw_results.to_excel(raw_save_loc, index=False)

### Aggregate First by Phrase Occurrence

In [99]:
# Keys that identify one occurrence of a phrase within a problem for a model pair.
group_cols = [
    'paraphrasing_model',
    'scoring_model',
    'problem',
    'corpus',
    'known_author',
    'unknown_author',
    'target',
    'original_phrase',
    'num_tokens',
    'phrase_occurence',
]

# Score columns averaged over the rows that share those keys.
avg_cols = [
    'no_context_log_prob',
    'known_log_prob',
    'unknown_log_prob',
    'known_vs_no_context_log_prob',
    'unknown_vs_no_context_log_prob',
    'llr_no_context',
    'llr_known',
    'llr_unknown',
    'unknown_ref_vs_top_rank',
    'unknown_ref_vs_best_rest',
]

# One row per key combination, holding the mean of every score column.
occurrence_groups = raw_results.groupby(group_cols, as_index=False)
grouped_results = occurrence_groups[avg_cols].agg('mean')

In [100]:
# Preview the per-occurrence averages.
grouped_results.head()

Unnamed: 0,paraphrasing_model,scoring_model,problem,corpus,known_author,unknown_author,target,original_phrase,num_tokens,phrase_occurence,no_context_log_prob,known_log_prob,unknown_log_prob,known_vs_no_context_log_prob,unknown_vs_no_context_log_prob,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank,unknown_ref_vs_best_rest
0,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,because you,2.0,1.0,-10.144112,-6.040575,-4.394758,4.103536,5.749353,1.198378,1.080677,0.954926,-1.473892,-1.473892
1,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,think that,2.0,1.0,-10.566478,-5.100667,-7.734904,5.465811,2.831573,0.886214,0.655695,1.484332,-2.080505,-2.080505
2,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,this discussion,2.0,1.0,-14.530581,-8.944603,-6.505734,5.585978,8.024846,3.616166,3.192995,1.798613,-2.787564,-2.787564
3,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,this discussion,2.0,2.0,-14.530581,-0.265315,,14.265266,,2.118289,0.046771,,,
4,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,to a,2.0,1.0,-8.222481,-3.98182,-6.197324,4.240661,2.025156,1.257488,1.109304,0.758939,-1.290634,-1.290634


### Then Create Problem Level Scores

In [101]:
# Define grouping and aggregation columns
group_cols = [
    'paraphrasing_model', 'scoring_model', 'problem', 'corpus', 'known_author', 'unknown_author', 'target'
]

avg_cols = [
    'no_context_log_prob', 'known_log_prob', 'unknown_log_prob', 'known_vs_no_context_log_prob',
    'unknown_vs_no_context_log_prob', 'llr_no_context', 'llr_known', 'llr_unknown', 'unknown_ref_vs_top_rank',
    'unknown_ref_vs_best_rest'
]

# Get unique num_tokens thresholds (sorted ascending)
token_thresholds = sorted(grouped_results['num_tokens'].dropna().unique())

# Container for results
all_summaries = []

# Loop through each token threshold
for min_token_size in token_thresholds:
    # Filter rows where num_tokens >= threshold
    filtered = grouped_results[grouped_results['num_tokens'] >= min_token_size]

    # Group and sum
    grouped = (
        filtered
        .groupby(group_cols, as_index=False)[avg_cols]
        .sum()
    )

    # Add the min_token_size column
    grouped['min_token_size'] = min_token_size

    # Append to results
    all_summaries.append(grouped)

# Combine all grouped results
final_summary = pd.concat(all_summaries, ignore_index=True)

In [102]:
# Preview the problem-level summary.
final_summary.head()

Unnamed: 0,paraphrasing_model,scoring_model,problem,corpus,known_author,unknown_author,target,no_context_log_prob,known_log_prob,unknown_log_prob,known_vs_no_context_log_prob,unknown_vs_no_context_log_prob,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank,unknown_ref_vs_best_rest,min_token_size
0,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,-1086.173933,-1014.713952,-1028.211071,71.459981,33.580711,15.111625,9.556317,8.272844,-148.784267,-130.07932,2.0
1,ModernBERT-base,gpt2,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,-295.11516,-129.883982,-148.400089,106.595531,106.805026,44.436204,27.618698,20.92234,-31.003853,-7.468327,2.0
2,ModernBERT-base,gpt2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,-505.363825,-222.115703,-245.537363,206.582116,216.533137,81.934722,38.438466,35.314581,-49.359655,-21.578381,2.0
3,ModernBERT-base,gpt2,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,-270.561187,-124.918503,-110.991039,119.150737,80.758483,53.636379,24.896999,20.443878,-29.411836,-23.894137,2.0
4,ModernBERT-base,gpt2,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,-201.054109,-80.390139,-92.130298,55.681705,108.923811,33.299647,19.776218,18.887623,-33.522402,-12.995834,2.0


In [103]:
# Distinct (paraphrasing_model, scoring_model) pairs that made it into the summary.
summary_combos = set(
    zip(final_summary['paraphrasing_model'], final_summary['scoring_model'])
)
summary_combos

{('ModernBERT-base', 'gpt2'),
 ('ModernBERT-large', 'gpt2'),
 ('gpt5', 'gemma'),
 ('gpt5', 'gpt2'),
 ('gpt5', 'llama'),
 ('gpt5', 'qwen')}

In [104]:
# Show the ModernBERT rows of the summary, ordered so the two paraphrasing models
# appear next to each other for every problem and threshold.
is_bert_row = final_summary['paraphrasing_model'].isin(['ModernBERT-base', 'ModernBERT-large'])

(
    final_summary[is_bert_row]
    .sort_values(by=['problem', 'min_token_size', 'paraphrasing_model'])
    .reset_index(drop=True)
)

Unnamed: 0,paraphrasing_model,scoring_model,problem,corpus,known_author,unknown_author,target,no_context_log_prob,known_log_prob,unknown_log_prob,known_vs_no_context_log_prob,unknown_vs_no_context_log_prob,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank,unknown_ref_vs_best_rest,min_token_size
0,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,-1086.173933,-1014.713952,-1028.211071,71.459981,33.580711,15.111625,9.556317,8.272844,-148.784267,-130.079320,2.0
1,ModernBERT-large,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,-1086.173933,-1014.713952,-1028.211071,71.459981,33.580711,12.254373,7.818142,5.675133,-195.478371,-190.480142,2.0
2,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,-989.841666,-977.852312,-985.656273,11.989354,4.185393,1.001411,1.409055,0.252841,-135.802106,-120.671645,3.0
3,ModernBERT-large,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,-989.841666,-977.852312,-985.656273,11.989354,4.185393,1.029856,1.413231,0.268531,-188.273287,-186.849545,3.0
4,ModernBERT-base,gpt2,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,-978.309493,-968.849912,-978.309493,9.459582,0.000000,0.000000,0.000000,0.000000,-135.802106,-122.095387,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1698,ModernBERT-large,gpt2,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,True,-111.178142,-61.619699,-59.953099,49.558443,51.225043,12.217504,5.465984,4.085565,-5.073585,-4.210147,3.0
1699,ModernBERT-base,gpt2,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,True,-80.176260,-43.610159,-41.649855,36.566101,38.526405,10.513995,6.828050,6.046698,-11.839556,-11.457391,4.0
1700,ModernBERT-large,gpt2,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,True,-80.176260,-43.610159,-41.649855,36.566101,38.526405,8.984861,3.609476,2.776936,-3.906268,-3.042830,4.0
1701,ModernBERT-base,gpt2,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,True,-67.146282,-33.737844,-28.168412,33.408437,38.977869,7.359040,3.579265,2.170481,-2.971171,-2.589006,5.0


In [105]:
# Write the problem-level summary to agg_save_loc without the index column.
final_summary.to_excel(agg_save_loc, index=False)