# Combine Test Results

This notebook is used to combine the results for the different models used to score the Author Verification method utilising common n-grams between texts.

In [297]:
import sys
import os

import pandas as pd

from functools import reduce
from from_root import from_root
from glob import glob

sys.path.insert(0, str(from_root("src")))

from performance import performance
from read_and_write_docs import read_excel_sheets

## Load data and set save locations

Load all of the results data into the notebook and set the save locations. I call each dataframe by the model used to score the data.

In [298]:
def insert_model_name(df, model, col_after='sample_id'):
    """Return a copy of ``df`` with a constant ``model`` column added.

    The new column is placed immediately after ``col_after``. The input
    frame is not modified, so calling this on the same frame more than once
    (for example when a cell is re-run) behaves the same way every time.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to label.
    model : str
        Value written to every row of the new ``model`` column.
    col_after : str, default 'sample_id'
        Existing column after which ``model`` is inserted.

    Returns
    -------
    pd.DataFrame
        A new frame containing the extra ``model`` column.

    Raises
    ------
    KeyError
        If ``col_after`` is not a column of ``df``.
    ValueError
        If ``df`` already has a column called ``model``.
    """
    labelled = df.copy()
    insert_loc = labelled.columns.get_loc(col_after) + 1
    labelled.insert(insert_loc, "model", model)
    return labelled

In [299]:

# Root folder that holds every input file and receives every output file.
# Set the PARAPHRASE_BASE_LOC environment variable to point somewhere else
# without editing this cell; the default is the original mounted volume.
base_loc = os.environ.get('PARAPHRASE_BASE_LOC', '/Volumes/BCross/paraphrase examples slurm')
# base_loc = '/Users/user/Library/CloudStorage/OneDrive-TheUniversityofManchester/paraphrase examples slurm'

# Reviewed phrase list, restricted to the phrases flagged to keep.
phrases = pd.read_excel(f"{base_loc}/wiki-phrase-list-reviewed.xlsx")
phrases = phrases[phrases['keep_phrase'] == 1]

# Output locations for the merged and aggregated result tables.
results_save_loc = f"{base_loc}/wiki-test-results.xlsx"
agg_results_save_loc = f"{base_loc}/wiki-test-results_agg.xlsx"

## ---- LambdaG Results ---- ##
# The LambdaG file gets a constant 'corpus' column and is then cut down to
# the columns used later in the notebook.
lambdag_metrics = pd.read_csv(f"{base_loc}/LambdaG_results.csv")
lambdag_metrics['corpus']='Wiki'
lambdag_metrics = lambdag_metrics[['problem', 'corpus', 'known_author', 'unknown_author', 'target', 'score', 'Magnitude']]

## ---- Qwen Results ---- ##
qwen_metrics = pd.read_excel(f"{base_loc}/qwen results/filtered_results.xlsx")
qwen_metrics = insert_model_name(qwen_metrics, "Qwen")

## ---- Google Gemma Results ---- ##
gemma_metrics = pd.read_excel(f'{base_loc}/gemma results/filtered_results.xlsx')
gemma_metrics = insert_model_name(gemma_metrics, "Gemma")

## ---- Meta Llama Results ---- ##
llama_metrics = pd.read_excel(f'{base_loc}/llama results/filtered_results.xlsx')
llama_metrics = insert_model_name(llama_metrics, "Llama")

## ---- GPT 2 Results ---- ##
gpt2_metrics = pd.read_excel(f'{base_loc}/gpt2 results/filtered_results.xlsx')
gpt2_metrics = insert_model_name(gpt2_metrics, "GPT2")

## ---- Print number of records in each table ---- ##
# These are row counts of each loaded table.
print(f"Number of results in LambdaG Table {len(lambdag_metrics)}")
print(f"Number of problems Qwen Table {len(qwen_metrics)}")
print(f"Number of problems Gemma Table {len(gemma_metrics)}")
print(f"Number of problems Llama Table {len(llama_metrics)}")
print(f"Number of problems GPT 2 Table {len(gpt2_metrics)}")

Number of results in LambdaG Table 224
Number of problems Qwen Table 661
Number of problems Gemma Table 661
Number of problems Llama Table 661
Number of problems GPT 2 Table 661


## Merge dataframes

The function below prepares each dataframe in a loop and then merges them together, keeping only the identifier columns and the metric columns the user specifies.

The dataframes are given as a dictionary with the key being the suffix in the new metric columns.

In [300]:
def merge_model_metrics(model_dfs, id_cols, metric_cols):
    """
    Combine per-model frames into one wide frame.

    model_dfs: dict mapping model name -> dataframe
               e.g. {"gemma": gemma_metrics, "llama": llama_metrics}
    id_cols: list of columns shared by every frame and used as join keys
    metric_cols: list of metric columns; each is renamed to
                 "<metric>_<model name>" so the models do not collide

    Returns a dataframe with id_cols plus one suffixed column per
    (metric, model) pair, built by outer-joining the frames on id_cols.
    """
    wanted = id_cols + metric_cols

    def _suffix_metrics(frame, suffix):
        # Work on a trimmed copy so the caller's frame is not modified.
        renames = {metric: f"{metric}_{suffix}" for metric in metric_cols}
        return frame[wanted].copy().rename(columns=renames)

    def _outer_join(left, right):
        return left.merge(right, on=id_cols, how="outer")

    per_model = [_suffix_metrics(frame, name) for name, frame in model_dfs.items()]

    # Fold the frames together left to right, in dict order.
    return reduce(_outer_join, per_model)

In [301]:
# Join keys: the columns that every per-model table shares.
id_cols = [
    "index",
    "sample_id",
    "problem",
    "corpus",
    "known_author",
    "unknown_author",
    "known_doc_id",
    "unknown_doc_id",
    "target",
]

# Score columns: kept once per model, suffixed with the model key below.
metric_cols = ["llr_unknown"]

# Dict order fixes the order of the suffixed columns in the merged table.
model_dfs = dict(
    gpt2=gpt2_metrics,
    qwen=qwen_metrics,
    gemma=gemma_metrics,
    llama=llama_metrics,
)

merged = merge_model_metrics(model_dfs, id_cols=id_cols, metric_cols=metric_cols)

In [302]:
# merged.to_excel(results_save_loc, index=False)

## Aggregate the dataframe

Here we aggregate the dataframe to gather the results together for each problem rather than for each known and unknown document pair in the problems. A sum is used. I also count the number of rows that have data for each problem; every problem should have 3 rows of data. Any that don't should be filtered out before the final results, or the code to get the paraphrases should be run again.

In [303]:
# Grouping keys: one output row per distinct combination of these columns.
agg_cols = ["problem", "corpus", "known_author", "unknown_author", "target"]

# Base names of the metrics to aggregate.
metric_cols_prefix = ["llr_unknown"]

# In the merged table each metric appears as "<metric>_<model>", so the base
# name plus an underscore is used as a column-name prefix.
metric_prefixes = [base + "_" for base in metric_cols_prefix]   # -> ["llr_unknown_"]

# ---- aggregation helper ----
def aggregate_by_prefix(df, group_cols, metric_prefixes):
    """Group ``df`` and compute a non-null count and a sum per metric column.

    A metric column is any column whose name starts with one of
    ``metric_prefixes``. For each such column ``c`` the result holds
    ``c_count`` and ``c_sum``; all count columns come first, followed by all
    sum columns. Rows whose group keys contain NaN are kept as their own
    groups (``dropna=False``). The group keys are returned as ordinary
    columns.
    """
    prefixes = tuple(metric_prefixes)
    matched = [name for name in df.columns if name.startswith(prefixes)]

    # Named aggregations, counts first and sums second to fix column order.
    named_aggs = {}
    for name in matched:
        named_aggs[f"{name}_count"] = (name, "count")
    for name in matched:
        named_aggs[f"{name}_sum"] = (name, "sum")

    grouped = df.groupby(group_cols, dropna=False)
    return grouped.agg(**named_aggs).reset_index()

In [304]:
# Pass the underscore-terminated prefixes ("llr_unknown_") so that only the
# per-model columns such as "llr_unknown_gpt2" are aggregated, and not a
# column named exactly "llr_unknown" or one that merely starts with that text.
aggregated = aggregate_by_prefix(
    merged,
    group_cols=agg_cols,
    metric_prefixes=metric_prefixes,
)

In [305]:
aggregated.head()

Unnamed: 0,problem,corpus,known_author,unknown_author,target,llr_unknown_gpt2_count,llr_unknown_qwen_count,llr_unknown_gemma_count,llr_unknown_llama_count,llr_unknown_gpt2_sum,llr_unknown_qwen_sum,llr_unknown_gemma_sum,llr_unknown_llama_sum
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,3,3,3,3,7.983477,5.949244,5.791008,40.844842
1,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,3,3,3,3,12.875691,9.703337,9.063104,8.835369
2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,3,3,3,3,21.018562,22.053801,21.359321,17.957647
3,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,3,3,3,3,9.453268,7.361303,8.951198,8.193341
4,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,3,3,3,3,6.771749,8.472696,9.87146,5.895896


In [306]:
# aggregated.to_excel(agg_results_save_loc, index=False)

## Score the non-aggregated dataset

First we score the non-aggregated dataset. I have an altered version of the performance function which also shows whether each prediction is a TP, TN, FP, or FN result.

In [307]:
def collect_preds_errors_with_summary(
    merged_df,
    models,
    id_cols,
    base_metric="llr_unknown",
    target_col="target",
    keep_cols=("corpus",),
    performance_fn=performance,
    include_score=True,   # keep the per-model score in detailed output
):
    """
    Score every model's column of ``merged_df`` and gather the results.

    For each entry of ``models`` the column ``f"{base_metric}_{model}"`` is
    scored with ``performance_fn`` using only the rows where that column is
    not null. Models with no non-null rows are skipped.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Wide table holding one score column per model.
    models : iterable of str
        Model suffixes used in the score column names.
    id_cols : list of str
        Columns identifying a row; used as join keys for the detailed output.
    base_metric : str
        Prefix of the score column names.
    target_col : str
        Name of the ground-truth column, forwarded to ``performance_fn``.
    keep_cols : iterable of str
        Forwarded to ``performance_fn`` as a list.
    performance_fn : callable
        Called with ``return_pred_rows=True``; expected to return a
        ``(summary, detailed)`` pair of dataframes.
    include_score : bool
        When True the model's score column is carried into the detailed output.

    Returns
    -------
    summary_all : pd.DataFrame
        Concatenation of the summary from each model (with a 'model' column).
    detailed_merged : pd.DataFrame
        id_cols + (score_col) + y_pred_<model>, error_<model> for each model (outer-merged).
    """
    summaries = []
    detailed_frames = []

    for model in models:
        score_col = f"{base_metric}_{model}"

        # Only rows that actually have a score for this model are scored.
        scored_rows = merged_df.loc[merged_df[score_col].notna()]
        if scored_rows.empty:
            continue

        model_summary, model_detailed = performance_fn(
            scored_rows,
            additional_metadata={"model": model},
            keep_cols=list(keep_cols),
            score_col=score_col,
            target_col=target_col,
            return_pred_rows=True,
            id_cols=id_cols,
        )

        # Make sure the summary says which model it belongs to.
        if "model" not in model_summary.columns:
            model_summary = model_summary.assign(model=model)
        summaries.append(model_summary)

        # Attach the score when the detailed rows do not already carry it.
        if include_score and score_col not in model_detailed.columns:
            scores = scored_rows[id_cols + [score_col]]
            scores = scores.drop_duplicates(subset=id_cols)  # one row per key
            model_detailed = model_detailed.merge(scores, on=id_cols, how="left")

        # Pick the output columns and tag the prediction columns with the model.
        if include_score:
            wanted = id_cols + [score_col, "y_pred", "error"]
        else:
            wanted = id_cols + ["y_pred", "error"]

        tagged_names = {"y_pred": f"y_pred_{model}", "error": f"error_{model}"}
        detailed_frames.append(model_detailed[wanted].rename(columns=tagged_names))

    # Stack the per-model summaries.
    if summaries:
        summary_all = pd.concat(summaries, ignore_index=True)
    else:
        summary_all = pd.DataFrame()

    # Outer-join the per-model detailed frames on the id columns.
    if not detailed_frames:
        detailed_merged = pd.DataFrame(columns=id_cols)
    else:
        detailed_merged = detailed_frames[0]
        for frame in detailed_frames[1:]:
            detailed_merged = detailed_merged.merge(frame, on=id_cols, how="outer")

    return summary_all, detailed_merged


In [308]:
# Keys identifying a single row of the merged (non-aggregated) table.
id_cols = [
    "index",
    "sample_id",
    "problem",
    "corpus",
    "known_author",
    "unknown_author",
    "known_doc_id",
    "unknown_doc_id",
    "target",
]

# The model suffixes are the keys of the dictionary used for the merge.
models = list(model_dfs)

raw_summary, raw_detailed = collect_preds_errors_with_summary(
    merged,
    models,
    id_cols,
    base_metric="llr_unknown",
    target_col="target",
    keep_cols=("corpus",),
)

In [309]:
raw_detailed.head()

Unnamed: 0,index,sample_id,problem,corpus,known_author,unknown_author,known_doc_id,unknown_doc_id,target,llr_unknown_gpt2,...,error_gpt2,llr_unknown_qwen,y_pred_qwen,error_qwen,llr_unknown_gemma,y_pred_gemma,error_gemma,llr_unknown_llama,y_pred_llama,error_llama
0,0,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_1,hodja_nasreddin_text_3,True,8.983871,...,FN,9.763942,True,TP,8.856316,False,FN,7.786461,False,FN
1,1,2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_10,hodja_nasreddin_text_3,True,6.480075,...,FN,6.377775,False,FN,7.284163,False,FN,6.932723,False,FN
2,2,3,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_11,hodja_nasreddin_text_3,True,5.554616,...,FN,5.912084,False,FN,5.218841,False,FN,3.238463,False,FN
3,3,4,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,hodja_nasreddin_text_1,honestopl_text_1,False,5.072673,...,TN,3.635344,False,TN,4.395038,False,TN,4.684318,False,TN
4,4,5,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,hodja_nasreddin_text_10,honestopl_text_1,False,2.046288,...,TN,1.965029,False,TN,2.188174,False,TN,1.711093,False,TN


In [310]:
raw_summary

Unnamed: 0,model,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,gpt2,Wiki,0.903748,0.903748,0.339339,0.139717,-0.121838,328,333,0.692824,0.656046,0.678445,0.585366,0.628478,192,91,136,242
1,qwen,Wiki,0.916797,0.916797,0.366366,0.120072,-0.105766,328,333,0.680144,0.645513,0.663194,0.582317,0.62013,191,97,137,236
2,gemma,Wiki,0.908034,0.908034,0.357357,0.136814,-0.116221,328,333,0.688255,0.647014,0.665505,0.582317,0.621138,191,96,137,237
3,llama,Wiki,0.922444,0.922444,0.36036,0.113556,-0.098572,328,333,0.675456,0.639461,0.65625,0.57622,0.613636,189,99,139,234


In [311]:
# raw_detailed.to_excel(f"{base_loc}/raw_results_detailed.xlsx", index=False)
# raw_summary.to_excel(f"{base_loc}/raw_results_summary.xlsx", index=False)

## Score the aggregated dataset

Next we score the aggregated dataset. We must ensure that we only include those problems which have the correct number of rows in the dataset.

In [312]:
# Score the LambdaG table as loaded: 'score' is the value scored and
# 'target' the ground truth.
lambdag_sum, lambdag_detailed = performance(
    lambdag_metrics,
    score_col='score',
    target_col='target',
    return_pred_rows=True,
    id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
    additional_metadata={"model": "LambdaG"},
)

# Tag the prediction columns with the model name ...
lambdag_detailed = lambdag_detailed.rename(
    columns={"y_pred": "y_pred_lambdag", "error": "error_lambdag"}
)
# ... and drop the columns that are not needed in the detailed table.
lambdag_detailed = lambdag_detailed.drop(columns=["pred_prob", "pred_llr", 'index'])

In [313]:
# Rename the LambdaG columns in place so they line up with the detailed
# output ('y_true') and carry a model-specific score name ('llr_lambdaG').
# NOTE(review): this mutates lambdag_metrics. Later cells read the renamed
# columns from it, while the earlier performance call uses 'score' and
# 'target', so that earlier cell cannot be re-run after this one without
# reloading the data.
lambdag_metrics.rename(columns={"target": "y_true", "score": "llr_lambdaG"}, inplace=True)

# Attach the LambdaG score to each detailed prediction row; 'Magnitude' is
# dropped from the merged result.
lambdag_merged = ( lambdag_detailed
    .merge(lambdag_metrics, on=['problem', 'corpus', 'known_author', 'unknown_author', 'y_true'], how='left')
    .drop(columns="Magnitude")
    )

In [314]:
aggregated

Unnamed: 0,problem,corpus,known_author,unknown_author,target,llr_unknown_gpt2_count,llr_unknown_qwen_count,llr_unknown_gemma_count,llr_unknown_llama_count,llr_unknown_gpt2_sum,llr_unknown_qwen_sum,llr_unknown_gemma_sum,llr_unknown_llama_sum
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,3,3,3,3,7.983477,5.949244,5.791008,40.844842
1,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,3,3,3,3,12.875691,9.703337,9.063104,8.835369
2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,3,3,3,3,21.018562,22.053801,21.359321,17.957647
3,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,3,3,3,3,9.453268,7.361303,8.951198,8.193341
4,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,3,3,3,3,6.771749,8.472696,9.871460,5.895896
...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,Xtv vs Yoenit,Wiki,Xtv,Yoenit,False,3,3,3,3,30.242804,28.710436,26.981082,24.958740
220,Yoenit vs Yoenit,Wiki,Yoenit,Yoenit,True,2,2,2,2,30.917279,23.723406,22.681577,22.114786
221,Yoenit vs ZjarriRrethues,Wiki,Yoenit,ZjarriRrethues,False,3,3,3,3,22.737351,19.217741,20.592835,16.853033
222,ZjarriRrethues vs 142.196.88.228,Wiki,ZjarriRrethues,142.196.88.228,False,3,3,3,3,13.619872,11.119647,13.298563,9.891038


In [315]:
lambdag_merged

Unnamed: 0,problem,corpus,known_author,unknown_author,y_true,y_pred_lambdag,error_lambdag,llr_lambdaG
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,True,TP,161.614000
1,Icarus3 vs Icarus3,Wiki,Icarus3,Icarus3,True,True,TP,153.800067
2,Rjecina vs Rjecina,Wiki,Rjecina,Rjecina,True,True,TP,118.796333
3,Lear_21 vs Lear_21,Wiki,Lear_21,Lear_21,True,True,TP,101.188267
4,Richard_Daft vs Richard_Daft,Wiki,Richard_Daft,Richard_Daft,True,True,TP,97.970333
...,...,...,...,...,...,...,...,...
219,N419BH vs Nableezy,Wiki,N419BH,Nableezy,False,False,TN,0.753467
220,O_Fenian vs Paul_Siebert,Wiki,O_Fenian,Paul_Siebert,False,False,TN,-0.663733
221,Nigel_Ish vs Nigel_Ish,Wiki,Nigel_Ish,Nigel_Ish,True,False,FN,0.502133
222,Kashmiri vs KBlott,Wiki,Kashmiri,KBlott,False,False,TN,0.376267


In [316]:
# Every problem is expected to contribute this many scored rows per model;
# problems with a different count are excluded from the aggregated scoring.
EXPECTED_ROWS_PER_PROBLEM = 3


def score_aggregated_model(aggregated_df, model, expected_rows=EXPECTED_ROWS_PER_PROBLEM):
    """Score one model's summed LLR on the problems that have complete data.

    Parameters
    ----------
    aggregated_df : pd.DataFrame
        Problem-level table holding ``llr_unknown_<model>_count`` and
        ``llr_unknown_<model>_sum`` columns.
    model : str
        Model suffix used in those column names and in the output column names.
    expected_rows : int
        Only problems whose count column equals this value are scored.

    Returns
    -------
    complete : pd.DataFrame
        Copy of the rows of ``aggregated_df`` that were scored.
    summary : pd.DataFrame
        Summary returned by ``performance``.
    detailed : pd.DataFrame
        Per-problem rows returned by ``performance`` with ``y_pred`` and
        ``error`` renamed to ``y_pred_<model>`` / ``error_<model>`` and the
        ``pred_prob``, ``pred_llr`` and ``index`` columns removed.
    """
    complete = aggregated_df[aggregated_df[f'llr_unknown_{model}_count'] == expected_rows].copy()

    summary, detailed = performance(
        complete,
        score_col=f'llr_unknown_{model}_sum',
        target_col='target',
        return_pred_rows=True,
        id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
        additional_metadata={"model": model}
    )

    detailed = (
        detailed
          .rename(columns={"y_pred": f"y_pred_{model}", "error": f"error_{model}"})
          .drop(columns=["pred_prob", "pred_llr", "index"])
    )
    return complete, summary, detailed


# Same models, same order and same variable names as before; only the
# repeated code has been folded into the helper above.
gpt2_agg, gpt2_agg_sum, gpt2_agg_detailed = score_aggregated_model(aggregated, "gpt2")
gemma_agg, gemma_agg_sum, gemma_agg_detailed = score_aggregated_model(aggregated, "gemma")
qwen_agg, qwen_agg_sum, qwen_agg_detailed = score_aggregated_model(aggregated, "qwen")
llama_agg, llama_agg_sum, llama_agg_detailed = score_aggregated_model(aggregated, "llama")

# One summary row per model, LambdaG first.
combined_df = pd.concat([lambdag_sum, gpt2_agg_sum, qwen_agg_sum, gemma_agg_sum, llama_agg_sum], axis=0, ignore_index=True)


In [317]:
combined_df

Unnamed: 0,model,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,LambdaG,0.573212,0.573212,0.169643,1.088023,-0.75436,112,112,0.900351,0.825893,0.828829,0.821429,0.825112,92,19,20,93
1,gpt2,0.85707,0.85673,0.357798,0.204198,-0.207509,104,109,0.735974,0.669857,0.684783,0.605769,0.642857,63,29,41,80
2,qwen,0.873891,0.873545,0.366972,0.180513,-0.184612,104,109,0.720625,0.646701,0.652632,0.596154,0.623116,62,33,42,76
3,gemma,0.858543,0.858205,0.348624,0.205317,-0.205123,104,109,0.735709,0.670078,0.680851,0.615385,0.646465,64,30,40,79
4,llama,0.8738,0.873459,0.357798,0.178813,-0.183374,104,109,0.720625,0.656096,0.663158,0.605769,0.633166,63,32,41,77


In [318]:
# combined_df.to_excel(f"{base_loc}/agg_results_summary.xlsx", index=False)

In [319]:
# Join keys shared by the per-model detailed tables.
_problem_keys = ['problem', 'corpus', 'known_author', 'unknown_author']
_problem_keys_with_truth = ['problem', 'corpus', 'known_author', 'unknown_author', 'y_true']

# 1) Outer-join the per-model predictions, in the order qwen, gemma, llama, gpt2.
model_detailed_results = qwen_agg_detailed
for _other in (gemma_agg_detailed, llama_agg_detailed, gpt2_agg_detailed):
    model_detailed_results = model_detailed_results.merge(
        _other, on=_problem_keys_with_truth, how='outer'
    )

# 2) Bring in the summed scores and drop the helper columns.
model_detailed_results = model_detailed_results.merge(aggregated, on=_problem_keys, how='left')
model_detailed_results = model_detailed_results.drop(
    columns=["target", "llr_unknown_qwen_count", "llr_unknown_gemma_count", "llr_unknown_llama_count", "llr_unknown_gpt2_count"]
)

# 3) The summed score becomes the model's score column.
model_detailed_results = model_detailed_results.rename(
    columns={
        "llr_unknown_qwen_sum": "llr_unknown_qwen",
        "llr_unknown_gemma_sum": "llr_unknown_gemma",
        "llr_unknown_llama_sum": "llr_unknown_llama",
        "llr_unknown_gpt2_sum": "llr_unknown_gpt2",
    }
)

# 4) Add the LambdaG rows; the outer join keeps problems found on either side.
model_detailed_results = model_detailed_results.merge(
    lambdag_merged, on=_problem_keys_with_truth, how='outer'
)

# 5) Final column order: keys, then score / prediction / error per model.
model_detailed_results = model_detailed_results[[
    'problem', 'corpus', 'known_author', 'unknown_author', 'y_true',
    'llr_lambdaG', 'y_pred_lambdag', 'error_lambdag',
    'llr_unknown_gpt2', 'y_pred_gpt2', 'error_gpt2',
    'llr_unknown_qwen', 'y_pred_qwen', 'error_qwen',
    'llr_unknown_gemma', 'y_pred_gemma', 'error_gemma',
    'llr_unknown_llama', 'y_pred_llama', 'error_llama'
]]

In [320]:
model_detailed_results

Unnamed: 0,problem,corpus,known_author,unknown_author,y_true,llr_lambdaG,y_pred_lambdag,error_lambdag,llr_unknown_gpt2,y_pred_gpt2,error_gpt2,llr_unknown_qwen,y_pred_qwen,error_qwen,llr_unknown_gemma,y_pred_gemma,error_gemma,llr_unknown_llama,y_pred_llama,error_llama
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,161.614000,True,TP,7.983477,False,FN,5.949244,False,FN,5.791008,False,FN,40.844842,True,TP
1,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,-42.095733,False,TN,12.875691,False,TN,9.703337,False,TN,9.063104,False,TN,8.835369,False,TN
2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,21.316867,True,TP,21.018562,False,FN,22.053801,False,FN,21.359321,False,FN,17.957647,False,FN
3,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,9.259067,True,FP,9.453268,False,TN,7.361303,False,TN,8.951198,False,TN,8.193341,False,TN
4,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,-13.158267,False,TN,6.771749,False,TN,8.472696,False,TN,9.871460,False,TN,5.895896,False,TN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,Xtv vs Yoenit,Wiki,Xtv,Yoenit,False,5.556133,True,FP,30.242804,False,TN,28.710436,True,FP,26.981082,False,TN,24.958740,True,FP
220,Yoenit vs Yoenit,Wiki,Yoenit,Yoenit,True,60.363067,True,TP,,,,,,,,,,,,
221,Yoenit vs ZjarriRrethues,Wiki,Yoenit,ZjarriRrethues,False,-22.850667,False,TN,22.737351,False,TN,19.217741,False,TN,20.592835,False,TN,16.853033,False,TN
222,ZjarriRrethues vs 142.196.88.228,Wiki,ZjarriRrethues,142.196.88.228,False,-33.414400,False,TN,13.619872,False,TN,11.119647,False,TN,13.298563,False,TN,9.891038,False,TN


In [321]:
# model_detailed_results.to_excel(f"{base_loc}/agg_results_detailed.xlsx", index=False)

## View by Token Sizes

In [322]:
def _read_token_level_results(folder, label):
    """Read one model's token-level results and tag each row with ``label``."""
    frame = pd.read_excel(f'{base_loc}/{folder} results/token_level_results.xlsx')
    frame['model'] = label
    return frame


gpt2_token_level_results = _read_token_level_results("gpt2", "GPT2")
llama_token_level_results = _read_token_level_results("llama", "Llama")
gemma_token_level_results = _read_token_level_results("gemma", "Gemma")
qwen_token_level_results = _read_token_level_results("qwen", "Qwen")

# Stack the four tables; each keeps its own row labels.
token_level_results = pd.concat([
    gpt2_token_level_results,
    llama_token_level_results,
    gemma_token_level_results,
    qwen_token_level_results,
])

### Find Valid Token Ranges

To get a result from **performance** a token size needs both True and False examples. The cell below counts the True and False examples for each model and token size and keeps only the combinations that have more than one of each.

In [323]:
# Count the True and False rows for every (model, min_token_size) pair.
token_level_results_info = (
    token_level_results
    .groupby(['model', 'min_token_size', 'target'])
    .size()
    .unstack(fill_value=0)
    # Make sure both class columns exist even if one class never occurs.
    .reindex(columns=[False, True], fill_value=0)
)

# Keep only the pairs with more than one True row AND more than one False row.
valid_rows = token_level_results_info[
    (token_level_results_info[True]  > 1) &
    (token_level_results_info[False] > 1)
]

# Without any valid pair there is nothing to filter on; fail with a clear
# message instead of comparing min_token_size against None further down.
if valid_rows.empty:
    raise ValueError(
        "No (model, min_token_size) combination has more than one True "
        "and more than one False example."
    )

# Get the level values for min_token_size from the multi-index
min_token_sizes = valid_rows.index.get_level_values('min_token_size')

# Find the highest min_token_size
highest_min_token_size = int(min_token_sizes.max())

print(f"The highest token range we can test on is {highest_min_token_size}")

# NOTE(review): the cut-off is the largest valid size of any model, so a
# smaller size that failed the check for some model is still kept here.
filtered_token_level_results = token_level_results[
    token_level_results['min_token_size'] <= highest_min_token_size
]

# Distinct (problem, corpus, min_token_size) combinations that survived.
agg_token_problems_to_keep = (
    filtered_token_level_results
    [['problem', 'corpus', 'min_token_size']]
    .drop_duplicates()
)

# Rows of the full table that belong to one of those combinations.
agg_token_dataset = (
    token_level_results
    .merge(agg_token_problems_to_keep, on = ['problem', 'corpus', 'min_token_size'], how='inner')
)

The highest token range we can test on is 5


In [324]:
# Keys identifying a single (non-aggregated) token-level row.
id_cols = [
    "model", "index", "sample_id", "problem", "corpus",
    "known_author", "unknown_author",
    "known_doc_id", "unknown_doc_id",
    "target", "min_token_size"
]

# Score the individual rows, one result per model and token size.
# The outputs get their own names because the grouped scoring further down
# assigns `token_summary` / `token_detailed` and would otherwise overwrite
# these results before they can be inspected.
raw_token_summary, raw_token_detailed = performance(
    filtered_token_level_results,
    # additional_metadata={"model": "gpt2"},
    keep_cols=["corpus"],
    score_col="llr_unknown",
    target_col="target",
    return_pred_rows=True,
    id_cols=id_cols,
    group_cols=["model", "min_token_size"]
)

In [325]:
# Sum by token size
aggregated_filtered_token_level_results = (
    agg_token_dataset
    .groupby(
        ["model", "problem", "corpus", "known_author", "unknown_author", "target", "min_token_size"],
        as_index=False
    )["llr_unknown"]
    .sum()
)

In [326]:
# Get the grouped scores
id_cols = [
    "problem", "corpus",
    "known_author", "unknown_author",
    "target", "min_token_size"
]

token_summary, token_detailed = performance(
    aggregated_filtered_token_level_results,
    keep_cols=["corpus"],
    score_col="llr_unknown",
    target_col="target",
    return_pred_rows=True,
    id_cols=id_cols,
    group_cols=["model", "min_token_size"]
)
token_summary = token_summary.sort_values(by=['min_token_size', 'model'])

In [327]:
token_summary

Unnamed: 0,model,min_token_size,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,GPT2,2,Wiki,0.865559,0.865559,0.348214,0.208998,-0.173308,112,112,0.728715,0.665179,0.67619,0.633929,0.654378,71,34,41,78
4,Gemma,2,Wiki,0.865899,0.865899,0.339286,0.211959,-0.172018,112,112,0.728476,0.665179,0.679612,0.625,0.651163,70,33,42,79
8,Llama,2,Wiki,0.881299,0.881299,0.357143,0.186027,-0.151607,112,112,0.713967,0.638393,0.64486,0.616071,0.630137,69,38,43,74
12,Qwen,2,Wiki,0.880486,0.880486,0.357143,0.188741,-0.15315,112,112,0.71317,0.629464,0.638095,0.598214,0.617512,67,38,45,74
1,GPT2,3,Wiki,0.75794,0.75794,0.309091,0.644993,-0.283621,111,110,0.803358,0.719902,0.775281,0.621622,0.69,69,20,42,90
5,Gemma,3,Wiki,0.748783,0.748783,0.254545,0.755199,-0.297221,111,110,0.807862,0.747011,0.802198,0.657658,0.722772,73,18,38,92
9,Llama,3,Wiki,0.765751,0.765751,0.272727,0.598792,-0.27431,111,110,0.796888,0.706183,0.739583,0.63964,0.68599,71,25,40,85
13,Qwen,3,Wiki,0.78435,0.78435,0.263636,0.58554,-0.247986,111,110,0.782965,0.71077,0.752688,0.630631,0.686275,70,23,41,87
2,GPT2,4,Wiki,0.922905,0.901566,0.3,0.474158,0.050562,72,50,0.63,0.5725,0.652174,0.625,0.638298,45,24,27,26
6,Gemma,4,Wiki,0.91083,0.889861,0.44,0.589082,0.039772,72,50,0.635833,0.559444,0.638889,0.638889,0.638889,46,26,26,24


In [328]:
token_ranges = agg_token_problems_to_keep['min_token_size'].drop_duplicates().tolist()

# Collect one summary per token size and concatenate once at the end,
# instead of growing a DataFrame inside the loop.
lambdag_token_summaries = []

for token in token_ranges:
    # Problems that are present at this token size.
    problems = agg_token_problems_to_keep.loc[
        agg_token_problems_to_keep['min_token_size'] == token,
        ['problem']
    ]

    # Restrict the LambdaG results to those problems.
    filtered_lambda_g = (
        lambdag_metrics
        .merge(problems, on=['problem'], how='inner')
    )

    # lambdag_metrics is expected to carry the renamed columns
    # 'llr_lambdaG' and 'y_true' at this point.
    filtered_lambdag_sum, filtered_lambdag_detailed = performance(
        filtered_lambda_g,
        score_col='llr_lambdaG',
        target_col='y_true',
        return_pred_rows=True,
        id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
        additional_metadata={"model": "LambdaG", "min_token_size": token, "corpus": "Wiki"}
    )

    lambdag_token_summaries.append(filtered_lambdag_sum)

# An empty list of token sizes gives an empty frame, as before.
all_filtered_lambdag_sum = (
    pd.concat(lambdag_token_summaries, ignore_index=True)
    if lambdag_token_summaries else pd.DataFrame()
)
    

In [329]:
# Put the LambdaG rows next to the LLM rows, then order by token size and model.
token_range_summary = pd.concat(
    [token_summary, all_filtered_lambdag_sum],
    ignore_index=True,
)
token_range_summary = token_range_summary.sort_values(by=['min_token_size', 'model'])

In [330]:
# The row labels are just the pre-sort positions, so leave them out of the
# file, matching the index=False used by the other save cells.
token_range_summary.to_excel(f'{base_loc}/token_results_summary.xlsx', index=False)

In [331]:
token_range_summary

Unnamed: 0,model,min_token_size,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,GPT2,2,Wiki,0.865559,0.865559,0.348214,0.208998,-0.173308,112,112,0.728715,0.665179,0.67619,0.633929,0.654378,71,34,41,78
1,Gemma,2,Wiki,0.865899,0.865899,0.339286,0.211959,-0.172018,112,112,0.728476,0.665179,0.679612,0.625,0.651163,70,33,42,79
18,LambdaG,2,Wiki,0.573212,0.573212,0.169643,1.088023,-0.75436,112,112,0.900351,0.825893,0.828829,0.821429,0.825112,92,19,20,93
2,Llama,2,Wiki,0.881299,0.881299,0.357143,0.186027,-0.151607,112,112,0.713967,0.638393,0.64486,0.616071,0.630137,69,38,43,74
3,Qwen,2,Wiki,0.880486,0.880486,0.357143,0.188741,-0.15315,112,112,0.71317,0.629464,0.638095,0.598214,0.617512,67,38,45,74
4,GPT2,3,Wiki,0.75794,0.75794,0.309091,0.644993,-0.283621,111,110,0.803358,0.719902,0.775281,0.621622,0.69,69,20,42,90
5,Gemma,3,Wiki,0.748783,0.748783,0.254545,0.755199,-0.297221,111,110,0.807862,0.747011,0.802198,0.657658,0.722772,73,18,38,92
17,LambdaG,3,Wiki,0.562312,0.562312,0.172727,1.139454,-0.780592,111,110,0.904177,0.828051,0.828829,0.828829,0.828829,92,19,19,91
6,Llama,3,Wiki,0.765751,0.765751,0.272727,0.598792,-0.27431,111,110,0.796888,0.706183,0.739583,0.63964,0.68599,71,25,40,85
7,Qwen,3,Wiki,0.78435,0.78435,0.263636,0.58554,-0.247986,111,110,0.782965,0.71077,0.752688,0.630631,0.686275,70,23,41,87


In [332]:
lambdag_metrics.head()

Unnamed: 0,problem,corpus,known_author,unknown_author,y_true,llr_lambdaG,Magnitude
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,161.614,161.614
1,Icarus3 vs Icarus3,Wiki,Icarus3,Icarus3,True,153.800067,153.800067
2,Rjecina vs Rjecina,Wiki,Rjecina,Rjecina,True,118.796333,118.796333
3,Lear_21 vs Lear_21,Wiki,Lear_21,Lear_21,True,101.188267,101.188267
4,Richard_Daft vs Richard_Daft,Wiki,Richard_Daft,Richard_Daft,True,97.970333,97.970333


In [333]:
token_detailed

Unnamed: 0,index,problem,corpus,known_author,unknown_author,target,min_token_size,y_true,pred_prob,pred_llr,y_pred,error,model
0,0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,2,True,0.148831,-0.757322,False,FN,GPT2
1,2,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,2,False,0.218624,-0.553163,False,TN,GPT2
2,4,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,2,True,0.332957,-0.301765,False,FN,GPT2
3,8,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,2,False,0.177761,-0.665160,False,TN,GPT2
4,11,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,2,False,0.150081,-0.753053,False,TN,GPT2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399,2347,Viewfinder vs Viewfinder,Wiki,Viewfinder,Viewfinder,True,5,True,0.679126,0.325615,True,TP,Qwen
2400,2358,Vr vs Vr,Wiki,Vr,Vr,True,5,True,0.897528,0.942443,True,TP,Qwen
2401,2368,VsevolodKrolikov vs WIKI-GUY-16,Wiki,VsevolodKrolikov,WIKI-GUY-16,False,5,False,0.627653,0.226772,True,FP,Qwen
2402,2387,Wobble vs Wobble,Wiki,Wobble,Wobble,True,5,True,0.549256,0.085845,True,TP,Qwen


# New Aggregations

In [334]:
models = ["gpt2", "gemma", "llama", "qwen"]

# Metadata columns copied onto every LLR row.
metadata_cols = [
    'model', 'sample_id', 'problem', 'corpus', 'known_author',
    'unknown_author', 'unknown_doc_id', 'known_doc_id', 'target'
]

all_model_data = []

for model in models:
    print(f"Working on model: {model}")
    filtered_results_loc = f"{base_loc}/{model} results/filtered"
    excel_files = sorted(glob(os.path.join(filtered_results_loc, "*.xlsx")))

    # Fail early with the folder name rather than with an opaque
    # "No objects to concatenate" error further down.
    if not excel_files:
        raise FileNotFoundError(f"No .xlsx result files found in {filtered_results_loc}")

    all_merged = []

    for file in excel_files:
        data = read_excel_sheets(file, ['metadata', 'LLR'])

        llr = data['LLR'].reset_index(drop=True)
        if llr.empty:
            # Nothing to attach metadata to; the inner join below would
            # contribute no rows for this file anyway.
            continue

        # Tag the metadata with the model on a copy (the sheet returned by
        # read_excel_sheets is left as read) and keep only the needed columns.
        metadata_info = data['metadata'].assign(model=model)[metadata_cols]

        # Repeat the metadata block once per LLR row, then place it beside the LLR rows.
        # NOTE(review): this lines up row-for-row only if the metadata sheet
        # has exactly one row per file - confirm against the file format.
        metadata_repeated = pd.concat([metadata_info] * len(llr), ignore_index=True)
        llr_with_metadata = pd.concat([metadata_repeated, llr], axis=1)

        # Keep only the rows whose phrase is in the reviewed keep-list.
        # Named `llr_with_phrases` so the model-level `merged` table built
        # earlier in the notebook is not overwritten.
        llr_with_phrases = llr_with_metadata.merge(
            phrases,
            left_on='original_phrase',
            right_on='phrase',
            how='inner'
        )

        all_merged.append(llr_with_phrases)

    if not all_merged:
        raise ValueError(f"Every LLR sheet in {filtered_results_loc} is empty")

    final_merged_table = pd.concat(all_merged, ignore_index=True)
    all_model_data.append(final_merged_table)

# Combine all model results
results = pd.concat(all_model_data, ignore_index=True)

Working on model: gpt2
Working on model: gemma
Working on model: llama
Working on model: qwen


In [389]:
results

Unnamed: 0,model,sample_id,problem,corpus,known_author,unknown_author,unknown_doc_id,known_doc_id,target,phrase_num,...,pmf_no_context,pmf_known,pmf_unknown,llr_no_context,llr_known,llr_unknown,phrase,tokens,num_tokens,keep_phrase
0,gpt2,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,phrase_01,...,1.786260e-01,0.538874,0.342302,0.748055,0.268512,0.465591,", this is not","(',', 'Ġthis', 'Ġis', 'Ġnot')",4,1.0
1,gpt2,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,phrase_02,...,1.456316e-02,0.025498,0.025505,1.836744,1.593489,1.593377,", but this","(',', 'Ġbut', 'Ġthis')",3,1.0
2,gpt2,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,phrase_02,...,1.456316e-02,0.037797,0.000000,1.836744,1.422542,0.000000,", but this","(',', 'Ġbut', 'Ġthis')",3,1.0
3,gpt2,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,phrase_03,...,3.041954e-01,0.798107,0.634982,0.516847,0.097939,0.197239,", you are","(',', 'Ġyou', 'Ġare')",3,1.0
4,gpt2,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,phrase_04,...,2.193018e-02,0.182487,0.337093,1.658958,0.738769,0.472250,do not have,"('Ġdo', 'Ġnot', 'Ġhave')",3,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79555,qwen,672,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,zjarrirrethues_text_2,zjarrirrethues_text_5,True,phrase_12,...,5.797459e-08,0.017291,0.269715,7.236762,1.762178,0.569095,regarded as,"('Ġregarded', 'Ġas')",2,1.0
79556,qwen,672,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,zjarrirrethues_text_2,zjarrirrethues_text_5,True,phrase_13,...,1.685062e-04,0.312985,0.172824,3.773384,0.504476,0.762397,the two,"('Ġthe', 'Ġtwo')",2,1.0
79557,qwen,672,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,zjarrirrethues_text_2,zjarrirrethues_text_5,True,phrase_13,...,1.685062e-04,0.516038,0.000000,3.773384,0.287318,0.000000,the two,"('Ġthe', 'Ġtwo')",2,1.0
79558,qwen,672,ZjarriRrethues vs ZjarriRrethues,Wiki,ZjarriRrethues,ZjarriRrethues,zjarrirrethues_text_2,zjarrirrethues_text_5,True,phrase_14,...,1.602955e-02,0.552751,0.957715,1.795079,0.257471,0.018764,with the,"('Ġwith', 'Ġthe')",2,1.0


In [335]:
# Keys shared by all rows that belong to the same phrase occurrence
# within one model / problem.
group_cols = [
    'model',
    'problem',
    'corpus',
    'known_author',
    'unknown_author',
    'target',
    'original_phrase',
    'num_tokens',
    'phrase_occurence',
]

# Score columns that are averaged within each group.
avg_cols = ['llr_no_context', 'llr_known', 'llr_unknown']

# One row per key combination holding the mean of each score column.
grouped_results = results.groupby(by=group_cols, as_index=False)[avg_cols].agg('mean')

In [336]:
# Problem-level keys and the score columns that get summed per problem.
group_cols = ['model', 'problem', 'corpus', 'known_author', 'unknown_author', 'target']
avg_cols = ['llr_no_context', 'llr_known', 'llr_unknown']

# Every distinct phrase length present in the data, smallest first; each one
# is used in turn as a lower bound on num_tokens.
token_thresholds = sorted(grouped_results['num_tokens'].dropna().unique())

all_summaries = []

for min_token_size in token_thresholds:
    # Keep phrases that are at least `min_token_size` tokens long.
    filtered = grouped_results.loc[grouped_results['num_tokens'].ge(min_token_size)]

    # Sum the scores per problem and tag the rows with the threshold used.
    grouped = (
        filtered
        .groupby(by=group_cols, as_index=False)[avg_cols]
        .agg('sum')
        .assign(min_token_size=min_token_size)
    )

    all_summaries.append(grouped)

# Stack the per-threshold summaries into a single table.
final_summary = pd.concat(all_summaries, ignore_index=True)


In [390]:
# Preview the per-problem score sums for every min_token_size threshold.
final_summary

Unnamed: 0,model,problem,corpus,known_author,unknown_author,target,llr_no_context,llr_known,llr_unknown,min_token_size
0,gemma,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,11.465530,6.466093,5.573082,2
1,gemma,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,20.322664,9.369514,8.975726,2
2,gemma,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,37.195349,18.902095,21.116380,2
3,gemma,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,20.874336,11.303276,6.707240,2
4,gemma,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,14.216544,5.793687,8.871717,2
...,...,...,...,...,...,...,...,...,...,...
4103,qwen,Icarus3 vs Icarus3,Wiki,Icarus3,Icarus3,True,0.000000,0.000000,0.000000,300
4104,gemma,Icarus3 vs Icarus3,Wiki,Icarus3,Icarus3,True,0.000000,0.000000,0.000000,320
4105,gpt2,Icarus3 vs Icarus3,Wiki,Icarus3,Icarus3,True,0.000000,0.000000,0.000000,320
4106,llama,Icarus3 vs Icarus3,Wiki,Icarus3,Icarus3,True,0.000000,0.000000,0.000000,320


In [396]:
# final_summary.to_excel('/Volumes/BCross/paraphrase examples slurm/score_by_token_size_avg.xlsx', index=False)

In [399]:
# Columns that identify one verification problem at one token threshold.
problem_key_cols = ['problem', 'corpus', 'min_token_size', 'target']

# Rows of final_summary at or below the chosen maximum threshold.
# NOTE(review): `highest_min_token_size` is not assigned anywhere in this
# section; it has to come from an earlier cell — confirm it survives a
# Restart & Run All.
filtered_token_level_results = final_summary.query(
    f"min_token_size <= {highest_min_token_size}"
)

# Distinct problem / threshold combinations that survive the filter.
agg_token_problems_to_keep = (
    filtered_token_level_results[problem_key_cols].drop_duplicates()
)

# Restrict final_summary to exactly those combinations.
agg_token_dataset = pd.merge(
    final_summary,
    agg_token_problems_to_keep,
    how='inner',
    on=problem_key_cols,
)

In [None]:
# agg_token_problems_to_keep.to_excel('/Volumes/BCross/paraphrase examples slurm/token_size_problems.xlsx', index=False)

In [339]:
# Columns handed to performance() as the row identifiers (also reused by the
# train/test evaluation further down).
id_cols = [
    "problem",
    "corpus",
    "known_author",
    "unknown_author",
    "target",
    "min_token_size",
]

# Score every (model, min_token_size) group on the `llr_unknown` column.
# `performance` is the project helper imported from src/performance; with
# return_pred_rows=True it hands back two objects, unpacked here as a summary
# and a detailed frame.
token_summary, token_detailed = performance(
    agg_token_dataset,
    group_cols=["model", "min_token_size"],
    id_cols=id_cols,
    keep_cols=["corpus"],
    target_col="target",
    score_col="llr_unknown",
    return_pred_rows=True,
)

# Order the summary by threshold first, then by model name.
token_summary = token_summary.sort_values(by=['min_token_size', 'model'])

In [340]:
# NOTE(review): within this section `filtered_lambda_g` is only assigned inside
# the LambdaG loop of a later cell. Unless an earlier cell also defines it,
# this cell raises NameError on a fresh Restart & Run All — confirm.
filtered_lambda_g

Unnamed: 0,problem,corpus,known_author,unknown_author,y_true,llr_lambdaG,Magnitude
0,Rjecina vs Rjecina,Wiki,Rjecina,Rjecina,True,118.796333,118.796333
1,Jimharlow99 vs Jimharlow99,Wiki,Jimharlow99,Jimharlow99,True,53.548867,53.548867
2,Vr vs Vr,Wiki,Vr,Vr,True,52.380867,52.380867
3,Sennen_goroshi vs Sennen_goroshi,Wiki,Sennen_goroshi,Sennen_goroshi,True,49.705867,49.705867
4,Paul_Siebert vs Paul_Siebert,Wiki,Paul_Siebert,Paul_Siebert,True,48.3214,48.3214
5,Iain99 vs Iain99,Wiki,Iain99,Iain99,True,46.865867,46.865867
6,Jeffrey_Vernon_Merkey vs Jeffrey_Vernon_Merkey,Wiki,Jeffrey_Vernon_Merkey,Jeffrey_Vernon_Merkey,True,39.656667,39.656667
7,Nirvana888 vs Nirvana888,Wiki,Nirvana888,Nirvana888,True,37.432733,37.432733
8,JerryFriedman vs JerryFriedman,Wiki,JerryFriedman,JerryFriedman,True,33.485,33.485
9,Mystichumwipe vs Mystichumwipe,Wiki,Mystichumwipe,Mystichumwipe,True,32.7086,32.7086


In [345]:
# Score LambdaG on exactly the problems that are available at each token
# threshold, so its metrics are comparable with the LLM-based rows.

# One summary frame per threshold; concatenated once after the loop instead of
# growing a DataFrame inside it (avoids repeated copying and the pandas
# warning about concatenating an empty frame).
lambdag_summaries = []

token_ranges = agg_token_problems_to_keep['min_token_size'].drop_duplicates().tolist()

for token in token_ranges:
    # Problems kept at this threshold. De-duplicated so the merge below acts
    # as a pure filter and can never multiply LambdaG rows.
    is_token = agg_token_problems_to_keep['min_token_size'] == token
    problems = (
        agg_token_problems_to_keep
        .loc[is_token, ['problem']]
        .drop_duplicates()
    )

    filtered_lambda_g = (
        lambdag_metrics
        .merge(problems, on=['problem'], how='inner')
    )

    filtered_lambdag_sum, filtered_lambdag_detailed = performance(
        filtered_lambda_g,
        score_col='llr_lambdaG',
        target_col='y_true',
        return_pred_rows=True,
        id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
        additional_metadata={"model": "LambdaG", "min_token_size": token, "corpus": "Wiki"}
    )

    lambdag_summaries.append(filtered_lambdag_sum)

# Same result type as before: an empty DataFrame when there were no thresholds.
all_filtered_lambdag_sum = (
    pd.concat(lambdag_summaries, ignore_index=True)
    if lambdag_summaries
    else pd.DataFrame()
)

# LLM summaries and LambdaG summaries side by side, ordered by threshold and model.
token_range_summary = (
    pd.concat([token_summary, all_filtered_lambdag_sum], ignore_index=True)
    .sort_values(by=['min_token_size', 'model'])
)

In [347]:
# Show the combined LLM + LambdaG metrics, sorted by min_token_size and model.
token_range_summary

Unnamed: 0,model,min_token_size,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
16,LambdaG,2,Wiki,0.573212,0.573212,0.169643,1.088023,-0.75436,112,112,0.900351,0.825893,0.828829,0.821429,0.825112,92,19,20,93
0,gemma,2,Wiki,0.820367,0.820367,0.303571,0.310633,-0.230369,112,112,0.765226,0.6875,0.698113,0.660714,0.678899,74,32,38,80
1,gpt2,2,Wiki,0.819109,0.819109,0.330357,0.30045,-0.233368,112,112,0.764509,0.691964,0.700935,0.669643,0.684932,75,32,37,80
2,llama,2,Wiki,0.839782,0.839782,0.330357,0.270608,-0.204705,112,112,0.749761,0.678571,0.688679,0.651786,0.669725,73,33,39,79
3,qwen,2,Wiki,0.840464,0.840464,0.366071,0.267257,-0.20406,112,112,0.74721,0.669643,0.675926,0.651786,0.663636,73,35,39,77
17,LambdaG,3,Wiki,0.562312,0.562312,0.172727,1.139454,-0.780592,111,110,0.904177,0.828051,0.828829,0.828829,0.828829,92,19,19,91
4,gemma,3,Wiki,0.70766,0.70766,0.236364,1.011163,-0.354872,111,110,0.836446,0.74697,0.795699,0.666667,0.72549,74,19,37,91
5,gpt2,3,Wiki,0.718715,0.718715,0.290909,0.83639,-0.335827,111,110,0.830057,0.728911,0.78022,0.63964,0.70297,71,20,40,90
6,llama,3,Wiki,0.722694,0.722694,0.263636,0.880921,-0.330686,111,110,0.826699,0.728829,0.768421,0.657658,0.708738,73,22,38,88
7,qwen,3,Wiki,0.748168,0.748168,0.290909,0.747834,-0.296374,111,110,0.810074,0.719902,0.775281,0.621622,0.69,69,20,42,90


In [348]:
# Save the per-threshold summary next to the other result files. No `index`
# argument is passed, so pandas also writes the row index as the first column.
token_summary_path = f"{base_loc}/token_results_summary_avg.xlsx"
token_range_summary.to_excel(token_summary_path)

## Train/test split evaluation for a minimum token size of 3

In [370]:
# Rows scored with min_token_size == 3, copied so they are independent of
# agg_token_dataset.
is_min_3 = agg_token_dataset['min_token_size'].eq(3)
min_3_tokens_dataset = agg_token_dataset.loc[is_min_3].copy()

# One row per distinct (problem, target) pair; this is what gets split below.
min_3_tokens_problems = (
    min_3_tokens_dataset.loc[:, ['problem', 'target']].drop_duplicates()
)

In [373]:
import pandas as pd
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.4  # 40% of the problems are held out as the test split
SPLIT_SEED = 42      # fixed seed so the split is reproducible

# Split at problem level: `y` is the same-author label, `X` is every other
# column of min_3_tokens_problems.
y = min_3_tokens_problems['target']
X = min_3_tokens_problems.drop(columns=['target'])

# Stratifying on `y` keeps the proportion of True/False targets the same in
# the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SPLIT_SEED,
    stratify=y,
    test_size=TEST_FRACTION,
)

In [376]:
# Pull the score rows for the problems assigned to each split
# (inner join on the problem name).
train_df = pd.merge(min_3_tokens_dataset, X_train, how='inner', on='problem')
test_df = pd.merge(min_3_tokens_dataset, X_test, how='inner', on='problem')

In [379]:
# Evaluate the LLM scores with separate train and test frames.
# NOTE(review): presumably performance() fits its calibration on df_train and
# reports metrics on df_test — confirm against src/performance.
token_summary_3, token_detailed_3 = performance(
    df_train=train_df,
    df_test=test_df,
    group_cols=["model", "min_token_size"],
    id_cols=id_cols,
    keep_cols=["corpus"],
    target_col="target",
    score_col="llr_unknown",
    return_pred_rows=True,
)

# Order the summary by threshold first, then by model name.
token_summary_3 = token_summary_3.sort_values(by=['min_token_size', 'model'])

In [384]:
# Restrict the LambdaG scores to the same train / test problems as the LLM models.
min_token_3_lambdag_train = pd.merge(lambdag_metrics, X_train, how='inner', on='problem')
min_token_3_lambdag_test = pd.merge(lambdag_metrics, X_test, how='inner', on='problem')

# Labels passed to performance() as additional_metadata for the LambdaG run.
lambdag_metadata = {"model": "LambdaG", "min_token_size": 3, "corpus": "Wiki"}

min_token_filtered_lambdag_sum, min_token_filtered_lambdag_detailed = performance(
    df_train=min_token_3_lambdag_train,
    df_test=min_token_3_lambdag_test,
    target_col='y_true',
    score_col='llr_lambdaG',
    id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
    return_pred_rows=True,
    additional_metadata=lambdag_metadata,
)

In [386]:
# LambdaG first, then the LLM-based models, with a fresh 0..n-1 index.
summary_parts = [min_token_filtered_lambdag_sum, token_summary_3]
calibrated_summary = pd.concat(summary_parts, ignore_index=True)

In [387]:
# Show the train/test-split metrics for LambdaG and the LLM-based models.
calibrated_summary

Unnamed: 0,model,min_token_size,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,LambdaG,3,Wiki,0.487462,0.487271,0.113636,1.275918,-0.753135,45,44,0.938384,0.876768,0.904762,0.844444,0.873563,38,4,7,40
1,gemma,3,Wiki,0.685389,0.683557,0.272727,0.709607,-0.380573,45,44,0.851515,0.754545,0.870968,0.6,0.710526,27,4,18,40
2,gpt2,3,Wiki,0.680496,0.679472,0.295455,0.68392,-0.373504,45,44,0.852525,0.720455,0.794118,0.6,0.683544,27,7,18,37
3,llama,3,Wiki,0.661979,0.66192,0.204545,0.827357,-0.355007,45,44,0.864646,0.765404,0.852941,0.644444,0.734177,29,5,16,39
4,qwen,3,Wiki,0.723914,0.721469,0.318182,0.512276,-0.33319,45,44,0.829798,0.709343,0.787879,0.577778,0.666667,26,7,19,37


In [388]:
# Save the train/test-split summary. No `index` argument is passed, so pandas
# also writes the row index as the first column.
calibrated_summary_path = f"{base_loc}/calibrated_token_results_summary_avg.xlsx"
calibrated_summary.to_excel(calibrated_summary_path)