# Create Aggregated Token Level Results

In this script we combine the scores for all of the corpuses and data types in the datasets.

In [167]:
import os
import sys

import pandas as pd

from from_root import from_root
from glob import glob
from pathlib import Path

sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_excel_sheets, read_rds

## Load Data

Here we load the data.

In [168]:
base_data_dir = "/Volumes/BCross/av_datasets_experiments/ngram_masking"
raw_save_loc = f"{base_data_dir}/raw_token_level_scores.xlsx"
agg_save_loc = f"{base_data_dir}/raw_agg_token_level_scores.xlsx"
token_level_problems_loc = f"{base_data_dir}/raw_problem_list.xlsx"

# Metadata for the data
data_types = ["test", "training"]
corpuses = ["ACL", "Enron", "Perverted Justice", "StackExchange",
            "The Telegraph", "TripAdvisor", "Wiki"]
paraphrasing_models = ["ModernBERT-base", "ModernBERT-large"]
scoring_models = ["gpt2"]

# Load phrases and keep phrases if exists
# phrases = pd.read_excel('/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx')
phrases=None

if phrases:
    phrases = phrases[phrases['keep_phrase'] == 1]

### Load Original Data and Check Model Combos

The columns to currently compare are paraphrasing_model, scoring_model, corpus, data_type, and filename. The code below gets the unique combinations of each from the raw results table.

In [169]:
combo_columns = ['paraphrasing_model', 'scoring_model', 'corpus', 'data_type', 'filename']

if Path(raw_save_loc).exists():
    raw_results = pd.read_excel(raw_save_loc)
    
    # Get the 
    existing_combos = set(
        raw_results[combo_columns]
        .itertuples(index=False, name=None)
    )
else:
    # file does not exist → no existing results
    raw_results = pd.DataFrame(
        columns=combo_columns
    )
    existing_combos = set()

### Aggregate any New Model Data

In [170]:
all_model_data = []

for dt in data_types:
    print(f"Working on {dt} data")
    for cp in corpuses:
        print(f"Working on the {cp} corpus")
        for pm in paraphrasing_models:
            print(f"Working on the {pm} paraphrasing model")
            for sm in scoring_models:
                print(f"Working on the {sm} scoring model")

                data_dir = f"{base_data_dir}/{dt}/{cp}/{pm}/{sm} results/raw_inc_rank"

                # Skip cleanly if folder doesn't exist
                if not os.path.isdir(data_dir):
                    print(f"Skipping (missing dir): {data_dir}")
                    continue

                excel_files = sorted(glob(os.path.join(data_dir, "*.xlsx")))
                print(f"Num Files in data_dir {len(excel_files)}")

                # If no files, nothing to do
                if not excel_files:
                    continue

                all_merged = []

                for file in excel_files:
                    base_name = os.path.basename(file)
                    combo = (pm, sm, cp, dt, base_name)

                    # Skip if already processed
                    if combo in existing_combos:
                        # optional: keep it quieter
                        # print(f"Skipping existing combo: {combo}")
                        continue

                    data = read_excel_sheets(file, ['metadata', 'LLR'])

                    metadata = data['metadata']
                    metadata['data_type'] = dt
                    metadata['paraphrasing_model'] = pm
                    metadata['scoring_model'] = sm
                    metadata['filename'] = base_name

                    llr = data['LLR']

                    metadata_info = metadata[[
                        'paraphrasing_model', 'scoring_model', 'corpus', 'data_type',
                        'sample_id', 'problem', 'filename', 'known_author',
                        'unknown_author', 'unknown_doc_id', 'known_doc_id', 'target'
                    ]].copy()

                    metadata_repeated = pd.concat([metadata_info] * len(llr), ignore_index=True)
                    llr_with_metadata = pd.concat([metadata_repeated, llr.reset_index(drop=True)], axis=1)

                    if phrases is not None and not phrases.empty:
                        llr_with_metadata = llr_with_metadata.merge(
                            phrases,
                            left_on='original_phrase',
                            right_on='phrase',
                            how='inner'
                        )

                    all_merged.append(llr_with_metadata)

                # ✅ Only concat if something survived skipping
                if not all_merged:
                    print(f"All files already processed for {dt}/{cp}/{pm}/{sm} — nothing new.")
                    continue

                final_merged_table = pd.concat(all_merged, ignore_index=True)
                all_model_data.append(final_merged_table)

Working on test data
Working on the ACL corpus
Working on the ModernBERT-base paraphrasing model
Working on the gpt2 scoring model
Num Files in data_dir 5
Working on the ModernBERT-large paraphrasing model
Working on the gpt2 scoring model
Num Files in data_dir 3
Working on the Enron corpus
Working on the ModernBERT-base paraphrasing model
Working on the gpt2 scoring model
Num Files in data_dir 200
All files already processed for test/Enron/ModernBERT-base/gpt2 — nothing new.
Working on the ModernBERT-large paraphrasing model
Working on the gpt2 scoring model
Num Files in data_dir 232
All files already processed for test/Enron/ModernBERT-large/gpt2 — nothing new.
Working on the Perverted Justice corpus
Working on the ModernBERT-base paraphrasing model
Working on the gpt2 scoring model
Num Files in data_dir 464
All files already processed for test/Perverted Justice/ModernBERT-base/gpt2 — nothing new.
Working on the ModernBERT-large paraphrasing model
Working on the gpt2 scoring model
Nu

ValueError: No objects to concatenate

### Append any New Data

In [171]:
# rows in raw BEFORE adding anything
raw_before = 0 if raw_results is None else len(raw_results)

if all_model_data:
    new_results = pd.concat(all_model_data, ignore_index=True)
    new_rows = len(new_results)

    raw_results = pd.concat([raw_results, new_results], ignore_index=True)
    raw_after = len(raw_results)

    added_rows = raw_after - raw_before  # should equal new_rows unless something odd happened

    print(f"Raw results rows (before): {raw_before:,}")
    print(f"New results rows:          {new_rows:,}")
    print(f"New entries added:         {added_rows:,}")
    print(f"Raw results rows (after):  {raw_after:,}")
    
    raw_results.to_excel(raw_save_loc, index=False)
else:
    print("No new model combinations to add.")
    print(f"Raw results rows (unchanged): {raw_before:,}")

Raw results rows (before): 33,327
New results rows:          281
New entries added:         281
Raw results rows (after):  33,608


In [172]:
raw_results.head()

Unnamed: 0,paraphrasing_model,scoring_model,corpus,data_type,sample_id,problem,filename,known_author,unknown_author,unknown_doc_id,...,known_vs_no_context_log_prob,unknown_vs_no_context_log_prob,pmf_no_context,pmf_known,pmf_unknown,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank,unknown_ref_vs_best_rest
0,ModernBERT-base,gpt2,Enron,test,13,Kimberly.watson vs Larry.campbell,kimberly_watson_mail_1 vs larry_campbell_mail_...,Kimberly.watson,Larry.campbell,larry_campbell_mail_1,...,1.760929,4.005409,0.013471,0.014447,0.295786,1.870607,1.840216,0.529023,-0.728053,-0.728053
1,ModernBERT-base,gpt2,Enron,test,13,Kimberly.watson vs Larry.campbell,kimberly_watson_mail_1 vs larry_campbell_mail_...,Kimberly.watson,Larry.campbell,larry_campbell_mail_1,...,5.0936,4.361169,0.06763,0.12706,0.063233,1.169858,0.895991,1.199054,-2.133572,-2.133572
2,ModernBERT-base,gpt2,Enron,test,13,Kimberly.watson vs Larry.campbell,kimberly_watson_mail_1 vs larry_campbell_mail_...,Kimberly.watson,Larry.campbell,larry_campbell_mail_1,...,,2.381106,0.06763,,0.020153,1.169858,,1.695661,-3.070706,-3.070706
3,ModernBERT-base,gpt2,Enron,test,13,Kimberly.watson vs Larry.campbell,kimberly_watson_mail_1 vs larry_campbell_mail_...,Kimberly.watson,Larry.campbell,larry_campbell_mail_1,...,5.85241,2.336323,0.001646,0.121286,0.093036,2.783573,0.916189,1.031351,-1.9805,-1.9805
4,ModernBERT-base,gpt2,Enron,test,13,Kimberly.watson vs Larry.campbell,kimberly_watson_mail_1 vs larry_campbell_mail_...,Kimberly.watson,Larry.campbell,larry_campbell_mail_1,...,-0.091553,0.370007,0.001646,0.010688,0.029069,2.783573,1.971102,1.536573,-3.015689,-3.015689


### Get the Complete Problems

Currently doing this by getting complete problems based on what we have paraphrased and scored in the raw data but the best way would be to load the known and unknown for each corpus and then get the number of documents for each.

In [173]:
# The problem and document level columns
distinct_problem_cols = [
    'paraphrasing_model', 'scoring_model', 'corpus', 'data_type', 'problem',
    'known_doc_id', 'unknown_doc_id', 'target'
]

# The problem level columns
distinct_grouping_cols = [
    'paraphrasing_model', 'scoring_model', 'corpus', 'data_type', 'problem', 'target'
]

# Here we get the number of different document comparisons in each problem
distinct_problems = (
    raw_results[distinct_problem_cols]
    .drop_duplicates()
    .groupby(distinct_grouping_cols, as_index=False)
    .size()
    .rename(columns={"size": "n_docs"})
    .sort_values("n_docs", ascending=True)
    .reset_index(drop=True)
)

# Here we get the maximum for each corpus and data type
corpus_problem_max = (
    distinct_problems
    .groupby(['corpus', 'data_type'], as_index=False)['n_docs']
    .agg(n_docs='max')
)

# Get the complete problem dataframe
complete_problems = (
    distinct_problems
    .merge(
        corpus_problem_max,
        on=['corpus', 'data_type', 'n_docs'],
        how='inner'
    )
    .reset_index(drop=True)
    .drop(columns=['n_docs'])
)

complete_problems.to_excel(token_level_problems_loc, index=False)

### Aggregate First by Phrase Occurence

In [174]:
# First we want to filter to just the problems in the problem list
# These are the complete problems
before_merge_rows = len(raw_results)

complete_raw_results = (
    raw_results
    .merge(
        complete_problems,
        on=['paraphrasing_model', 'scoring_model', 'corpus', 'data_type', 'problem', 'target'],
        how='inner'
    )
)

after_merge_rows = len(complete_raw_results)

print(f"Rows before merge: {before_merge_rows:,}")
print(f"Rows after merge:  {after_merge_rows:,}")
print(f"Rows kept:         {after_merge_rows / before_merge_rows:.2%}" if before_merge_rows else "Rows kept: N/A")
print(f"Rows dropped:      {before_merge_rows - after_merge_rows:,}")


Rows before merge: 33,608
Rows after merge:  21,783
Rows kept:         64.81%
Rows dropped:      11,825


In [159]:
group_cols = [
    'paraphrasing_model', 'scoring_model', 'corpus', 'data_type',
    'problem', 'known_author', 'unknown_author', 'target', 'original_phrase',
    'num_tokens', 'phrase_occurence'
]

avg_cols = [
    'no_context_log_prob', 'known_log_prob', 'unknown_log_prob', 'known_vs_no_context_log_prob',
    'unknown_vs_no_context_log_prob','llr_no_context', 'llr_known', 'llr_unknown', 'unknown_ref_vs_top_rank',
    'unknown_ref_vs_best_rest'
]

# Group and compute the mean
grouped_results = (
    complete_raw_results
    .groupby(group_cols, as_index=False)[avg_cols]
    .mean()
)

In [175]:
grouped_results.head()

Unnamed: 0,paraphrasing_model,scoring_model,corpus,data_type,problem,known_author,unknown_author,target,original_phrase,num_tokens,...,no_context_log_prob,known_log_prob,unknown_log_prob,known_vs_no_context_log_prob,unknown_vs_no_context_log_prob,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank,unknown_ref_vs_best_rest
0,ModernBERT-base,gpt2,Enron,test,Kimberly.watson vs Larry.campbell,Kimberly.watson,Larry.campbell,False,amount of,18,...,-9.663528,-7.902599,-5.658119,1.760929,4.005409,1.870607,1.840216,0.529023,-0.728053,-0.728053
1,ModernBERT-base,gpt2,Enron,test,Kimberly.watson vs Larry.campbell,Kimberly.watson,Larry.campbell,False,and i,14,...,-12.44396,-6.310178,-14.665708,6.133782,-2.221747,3.873449,1.817539,3.875098,-8.505962,-8.505962
2,ModernBERT-base,gpt2,Enron,test,Kimberly.watson vs Larry.campbell,Kimberly.watson,Larry.campbell,False,and i,14,...,-12.44396,-6.365383,-6.029979,6.078577,6.413981,3.873449,2.046698,1.840245,-3.946299,-3.946299
3,ModernBERT-base,gpt2,Enron,test,Kimberly.watson vs Larry.campbell,Kimberly.watson,Larry.campbell,False,and i,14,...,-12.44396,-6.760739,,5.683221,,4.27058,2.187395,,,
4,ModernBERT-base,gpt2,Enron,test,Kimberly.watson vs Larry.campbell,Kimberly.watson,Larry.campbell,False,and i,14,...,-12.44396,-4.820793,,7.623167,,4.27058,1.756902,,,


### Then Create Problem Level Scores

In [None]:
# Define grouping and aggregation columns
group_cols = [
    'paraphrasing_model', 'scoring_model', 'corpus', 'data_type',
    'problem', 'known_author', 'unknown_author', 'target'
]

avg_cols = [
    'no_context_log_prob', 'known_log_prob', 'unknown_log_prob', 'known_vs_no_context_log_prob',
    'unknown_vs_no_context_log_prob', 'llr_no_context', 'llr_known', 'llr_unknown', 'unknown_ref_vs_top_rank',
    'unknown_ref_vs_best_rest'
]

# Get unique num_tokens thresholds (sorted ascending)
token_thresholds = sorted(grouped_results['num_tokens'].dropna().unique())

# Container for results
all_summaries = []

# Loop through each token threshold
for min_token_size in token_thresholds:
    # Filter rows where num_tokens >= threshold
    filtered = grouped_results[grouped_results['num_tokens'] >= min_token_size]

    # Group and sum
    grouped = (
        filtered
        .groupby(group_cols, as_index=False)[avg_cols]
        .sum()
    )

    # Add the min_token_size column
    grouped['min_token_size'] = min_token_size

    # Append to results
    all_summaries.append(grouped)

# Combine all grouped results
problem_summary = pd.concat(all_summaries, ignore_index=True)



In [189]:
problem_summary.head()

Unnamed: 0,paraphrasing_model,scoring_model,corpus,data_type,problem,known_author,unknown_author,target,no_context_log_prob,known_log_prob,unknown_log_prob,known_vs_no_context_log_prob,unknown_vs_no_context_log_prob,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank,unknown_ref_vs_best_rest,min_token_size
0,ModernBERT-base,gpt2,Enron,test,Kimberly.watson vs Larry.campbell,Kimberly.watson,Larry.campbell,False,-1124.176939,-476.140733,-435.04535,442.046262,379.57157,264.962419,125.190571,102.402221,-169.519448,-129.767518,12
1,ModernBERT-base,gpt2,Enron,test,Larry.campbell vs Larry.campbell,Larry.campbell,Larry.campbell,True,-1069.574019,-462.28104,-473.039619,423.690896,400.981041,239.970598,116.689719,111.853246,-182.020652,-139.79426,12
2,ModernBERT-base,gpt2,Enron,test,Larry.campbell vs Lindy.donoho,Larry.campbell,Lindy.donoho,False,-1032.656678,-418.073481,-410.445017,422.340383,433.517064,240.054272,116.580416,84.3875,-136.335129,-61.439897,12
3,ModernBERT-base,gpt2,Enron,test,Lindy.donoho vs Lindy.donoho,Lindy.donoho,Lindy.donoho,True,-986.314673,-441.77048,-410.508914,406.42776,413.139095,221.974775,110.642685,85.310283,-130.141024,-76.749981,12
4,ModernBERT-base,gpt2,Enron,test,Lindy.donoho vs Liz.taylor,Lindy.donoho,Liz.taylor,False,-564.662253,-241.866188,-212.336756,214.464741,273.64157,116.686219,60.760377,41.26602,-61.97553,-4.397208,12


### Remove problems with not a result in each

This is a check which ensures that for each corpus, data_type, paraphrasing_model, scoring_model and min_token_size combo that we have at least a single result which is a TRUE problem and also a single FALSE problem. This is to ensure that the idiolect performance results can be carried out.

In [201]:
# Calculate the number of distinct problems for each combo
token_level_results_info = (
    problem_summary
    .groupby(['corpus', 'data_type', 'paraphrasing_model', 'scoring_model', 'min_token_size', 'target'])['problem']
    .nunique()
    .unstack(fill_value=0)
    .reset_index()
)

# remove the annoying "target" header (it's df.columns.name)
token_level_results_info.columns.name = None

# force a clean RangeIndex
token_level_results_info = token_level_results_info.reset_index(drop=True)

# Now filter for at least one in each and keep the combos
valid_rows = (
    token_level_results_info.loc[
        (token_level_results_info[True] >= 1) &
        (token_level_results_info[False] >= 1),
        ['corpus', 'data_type', 'paraphrasing_model', 'scoring_model', 'min_token_size']
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [202]:
valid_rows.head()

Unnamed: 0,corpus,data_type,paraphrasing_model,scoring_model,min_token_size
0,Enron,test,ModernBERT-base,gpt2,12
1,Enron,test,ModernBERT-base,gpt2,13
2,Enron,test,ModernBERT-base,gpt2,14
3,Enron,test,ModernBERT-base,gpt2,15
4,Enron,test,ModernBERT-base,gpt2,16


In [203]:
# Now complete the join and get all valid problems
min_token_level_summary = (
    problem_summary
    .merge(
        valid_rows,
        on=['corpus', 'data_type', 'paraphrasing_model', 'scoring_model', 'min_token_size'],
        how='inner'
    )
)

#### Check the output

In [204]:
group_cols = ['corpus', 'data_type', 'paraphrasing_model', 'scoring_model', 'min_token_size']

# total distinct problems per group
total = (
    min_token_level_summary
    .groupby(group_cols)['problem']
    .nunique()
    .rename('total_problems')
)

# distinct problems per group split by target True/False
by_target = (
    min_token_level_summary
    .groupby(group_cols + ['target'])['problem']
    .nunique()
    .unstack('target', fill_value=0)
    .reindex(columns=[True, False], fill_value=0)   # ensure both columns exist
    .rename(columns={True: 'true_problems', False: 'false_problems'})
)

# final table + sort by the same groupby cols
model_summary = (
    total.to_frame()
    .join(by_target)
    .reset_index()
    .sort_values(group_cols)
    .reset_index(drop=True)
)

In [205]:
summary_combos = set(
    min_token_level_summary [['paraphrasing_model', 'scoring_model', 'corpus', 'data_type']]
    .itertuples(index=False, name=None)
)
summary_combos

{('ModernBERT-base', 'gpt2', 'Enron', 'test'),
 ('ModernBERT-base', 'gpt2', 'Perverted Justice', 'test'),
 ('ModernBERT-large', 'gpt2', 'Enron', 'test'),
 ('ModernBERT-large', 'gpt2', 'Perverted Justice', 'test')}

In [206]:
min_token_level_summary.to_excel(agg_save_loc, index=False)