# Combine Test Results

This notebook is used to combine the results for the different models used to score the Author Verification method utilising common n-grams between texts.

In [175]:
import sys

import pandas as pd

from functools import reduce
from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from performance import performance

## Load data and set save locations

Load all of the results data into the notebook and set the save locations. I call each dataframe by the model used to score the data.

In [176]:

base_loc = '/Volumes/BCross/paraphrase examples slurm'
results_save_loc = f"{base_loc}/wiki-test-results.xlsx"
agg_results_save_loc = f"{base_loc}/wiki-test-results_agg.xlsx"

## ---- LambdaG Results ---- ##
lambdag_metrics = pd.read_csv(f"{base_loc}/LambdaG_results.csv")
lambdag_metrics['corpus']='Wiki'
lambdag_metrics = lambdag_metrics[['problem', 'corpus', 'known_author', 'unknown_author', 'target', 'score', 'Magnitude']]

## ---- Qwen Results ---- ##
qwen_metrics = pd.read_excel(f"{base_loc}/qwen results/wiki-test-qwen-filtered-results.xlsx")

## ---- Google Gemma Results ---- ##
gemma_metrics = pd.read_excel(f'{base_loc}/gemma results/wiki-test-gemma-filtered-results.xlsx')

## ---- Meta Llama Results ---- ##
llama_metrics = pd.read_excel(f'{base_loc}/llama results/wiki-test-llama-filtered-results.xlsx')

## ---- Print number of records in each table ---- ##
print(f"Number of results in LambdaG Table {len(lambdag_metrics)}")
print(f"Number of problems Qwen Table {len(qwen_metrics)}")
print(f"Number of problems Gemma Table {len(gemma_metrics)}")
print(f"Number of problems Llama Table {len(llama_metrics)}")

Number of results in LambdaG Table 224
Number of problems Qwen Table 661
Number of problems Gemma Table 661
Number of problems Llama Table 256


## Merge dataframes

The function below allows us to merge the dataframes together in a loop keeping the columns the user wishes and also what columns are used for metrics.

The dataframes are given as a dictionary with the key being the suffix in the new metric columns.

In [177]:
def merge_model_metrics(model_dfs, id_cols, metric_cols):
    """
    model_dfs: dict mapping model name -> dataframe
               e.g. {"gemma": gemma_metrics, "llama": llama_metrics}
    id_cols: list of columns to join on
    metric_cols: list of metric columns to rename per model
    """
    columns_to_keep = id_cols + metric_cols
    prepared = []

    for model_name, df in model_dfs.items():
        tmp = df[columns_to_keep].copy()
        tmp = tmp.rename(
            columns={col: f"{col}_{model_name}" for col in metric_cols}
        )
        prepared.append(tmp)

    # Outer-join them all on id_cols
    merged = reduce(
        lambda left, right: left.merge(right, on=id_cols, how="outer"),
        prepared
    )
    return merged

In [178]:
# Columns that identify a row (join keys)
id_cols = [
    "index", "sample_id", "problem", "corpus",
    "known_author", "unknown_author", "known_doc_id", "unknown_doc_id",
    "target",
]

# Metric columns you don’t want to join on
metric_cols = ["llr_unknown"]

model_dfs = {
    "qwen": qwen_metrics,
    "gemma": gemma_metrics,
    "llama": llama_metrics,
}

merged = merge_model_metrics(
    model_dfs,
    id_cols=id_cols,
    metric_cols=metric_cols
)

In [179]:
merged.head()

Unnamed: 0,index,sample_id,problem,corpus,known_author,unknown_author,known_doc_id,unknown_doc_id,target,llr_unknown_qwen,llr_unknown_gemma,llr_unknown_llama
0,0,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_1,hodja_nasreddin_text_3,True,9.763942,8.856316,7.786461
1,1,2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_10,hodja_nasreddin_text_3,True,6.377775,7.284163,6.932723
2,2,3,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_11,hodja_nasreddin_text_3,True,5.912084,5.218841,3.238463
3,3,4,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,hodja_nasreddin_text_1,honestopl_text_1,False,3.635344,4.395038,4.684318
4,4,5,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,hodja_nasreddin_text_10,honestopl_text_1,False,1.965029,2.188174,1.711093


In [180]:
merged.to_excel(results_save_loc, index=False)

## Aggregate the dataframe

Here we aggregate the dataframe to gather the results together for each problem rather than for each known and unknown document pair in the the problems. Here a sum is used. I also count the number of rows for which there is data for each problem, the problems should all have 3 rows of data. Any that don't should be filtered out before final results, or the code to get the paraphrases should be run again.

In [181]:
# Columns that identify a row (join keys)
agg_cols = [
    "problem", "corpus", "known_author", "unknown_author", "target",
]

# Metric columns you don’t want to join on
metric_cols_prefix = ["llr_unknown"]

In [182]:
# Columns that identify a group
agg_cols = [
    "problem", "corpus", "known_author", "unknown_author", "target",
]

# Metric columns you don’t want to join on
metric_cols_prefix = ["llr_unknown"]

# we treat these as prefixes in the merged df
metric_prefixes = [f"{m}_" for m in metric_cols_prefix]   # -> ["llr_unknown_"]

# ---- aggregation helper ----
def aggregate_by_prefix(df, group_cols, metric_prefixes):
    # Find all columns whose names start with any of the prefixes
    metric_cols = [
        col for col in df.columns
        if any(col.startswith(pref) for pref in metric_prefixes)
    ]

    # Build named aggregations: one SUM and one COUNT (non-null) per metric column
    agg_spec = {
        **{f"{col}_count": (col, "count") for col in metric_cols},
        **{f"{col}_sum":   (col, "sum")   for col in metric_cols},
    }

    return (
        df
        .groupby(group_cols, dropna=False)
        .agg(**agg_spec)
        .reset_index()
    )

In [183]:
aggregated = aggregate_by_prefix(
    merged,
    group_cols=agg_cols,
    metric_prefixes=metric_cols_prefix,
)

In [184]:
aggregated.head()

Unnamed: 0,problem,corpus,known_author,unknown_author,target,llr_unknown_qwen_count,llr_unknown_gemma_count,llr_unknown_llama_count,llr_unknown_qwen_sum,llr_unknown_gemma_sum,llr_unknown_llama_sum
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,3,3,3,5.949244,5.791008,40.844842
1,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,3,3,3,9.703337,9.063104,8.835369
2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,3,3,3,22.053801,21.359321,17.957647
3,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,3,3,3,7.361303,8.951198,8.193341
4,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,3,3,3,8.472696,9.87146,5.895896


In [185]:
aggregated.to_excel(agg_results_save_loc, index=False)

## Score the non-aggregated dataset

Now first we score the non-aggregated dataset, I have an altered version of the performance function which also shows the predictions of whether the result is a TP, TN, FP, or FN result.

In [186]:
def collect_preds_errors_with_summary(
    merged_df,
    models,
    id_cols,
    base_metric="llr_unknown",
    target_col="target",
    keep_cols=("corpus",),
    performance_fn=performance,
    include_score=True,   # keep the per-model score in detailed output
):
    """
    Returns
    -------
    summary_all : pd.DataFrame
        Concatenation of raw_summary from each model (with a 'model' column).
    detailed_merged : pd.DataFrame
        id_cols + (score_col) + y_pred_<model>, error_<model> for each model (outer-merged).
    """
    per_model_detailed = []
    per_model_summaries = []

    for model in models:
        score_col = f"{base_metric}_{model}"
        df_non_null = merged_df[merged_df[score_col].notna()]
        if df_non_null.empty:
            continue

        raw_summary, raw_detailed = performance_fn(
            df_non_null,
            additional_metadata={"model": model},
            keep_cols=list(keep_cols),
            score_col=score_col,
            target_col=target_col,
            return_pred_rows=True,
            id_cols=id_cols,
        )

        # --- collect summary ---
        if "model" not in raw_summary.columns:
            raw_summary = raw_summary.copy()
            raw_summary["model"] = model
        per_model_summaries.append(raw_summary)

        # --- bring the score into raw_detailed (it's not included by performance) ---
        if include_score and score_col not in raw_detailed.columns:
            score_frame = (
                df_non_null[id_cols + [score_col]]
                .drop_duplicates(subset=id_cols)  # avoid dup join keys
            )
            raw_detailed = raw_detailed.merge(score_frame, on=id_cols, how="left")

        # --- collect detailed ---
        keep = id_cols + ["y_pred", "error"]
        if include_score:
            keep = id_cols + [score_col, "y_pred", "error"]

        df_small = raw_detailed[keep].rename(
            columns={
                "y_pred": f"y_pred_{model}",
                "error": f"error_{model}",
                # score_col already contains the model suffix; keep as-is
            }
        )

        # If duplicates per key can occur, you can dedupe here:
        # df_small = df_small.drop_duplicates(subset=id_cols, keep="last")

        per_model_detailed.append(df_small)

    # Build final summary
    summary_all = (
        pd.concat(per_model_summaries, ignore_index=True)
        if per_model_summaries else pd.DataFrame()
    )

    # Build final detailed
    if per_model_detailed:
        detailed_merged = reduce(
            lambda left, right: left.merge(right, on=id_cols, how="outer"),
            per_model_detailed
        )
    else:
        detailed_merged = pd.DataFrame(columns=id_cols)

    return summary_all, detailed_merged


In [187]:
id_cols = [
    "index", "sample_id", "problem", "corpus",
    "known_author", "unknown_author",
    "known_doc_id", "unknown_doc_id",
    "target",
]

# Reuse the model dictionary used earlier
models = list(model_dfs.keys())

raw_summary, raw_detailed = collect_preds_errors_with_summary(
    merged_df=merged,
    models=models,
    id_cols=id_cols,
    base_metric="llr_unknown",
    target_col="target",
    keep_cols=("corpus",),
)

In [188]:
raw_detailed.head()

Unnamed: 0,index,sample_id,problem,corpus,known_author,unknown_author,known_doc_id,unknown_doc_id,target,llr_unknown_qwen,y_pred_qwen,error_qwen,llr_unknown_gemma,y_pred_gemma,error_gemma,llr_unknown_llama,y_pred_llama,error_llama
0,0,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_1,hodja_nasreddin_text_3,True,9.763942,True,TP,8.856316,False,FN,7.786461,True,TP
1,1,2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_10,hodja_nasreddin_text_3,True,6.377775,False,FN,7.284163,False,FN,6.932723,False,FN
2,2,3,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_11,hodja_nasreddin_text_3,True,5.912084,False,FN,5.218841,False,FN,3.238463,False,FN
3,3,4,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,hodja_nasreddin_text_1,honestopl_text_1,False,3.635344,False,TN,4.395038,False,TN,4.684318,False,TN
4,4,5,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,hodja_nasreddin_text_10,honestopl_text_1,False,1.965029,False,TN,2.188174,False,TN,1.711093,False,TN


In [189]:
raw_summary

Unnamed: 0,model,corpus,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,qwen,Wiki,0.916797,0.916797,0.366366,0.120072,-0.105766,328,333,0.680144,0.645513,0.663194,0.582317,0.62013,191,97,137,236
1,gemma,Wiki,0.908034,0.908034,0.357357,0.136814,-0.116221,328,333,0.688255,0.647014,0.665505,0.582317,0.621138,191,96,137,237
2,llama,Wiki,0.951534,0.950519,0.383459,0.049599,-0.097802,123,133,0.643682,0.61324,0.623762,0.512195,0.5625,63,38,60,95


In [190]:
raw_detailed.to_excel(f"{base_loc}/raw_results_detailed.xlsx", index=False)
raw_summary.to_excel(f"{base_loc}/raw_results_summary.xlsx", index=False)

## Score the aggregated dataset

Next we score the aggregated dataset, we must ensure to only include those problems which have the correct number of rows in the dataset.

In [240]:
lambdag_sum, lambdag_detailed = performance(
    lambdag_metrics,
    score_col='score',
    target_col='target',
    return_pred_rows=True,
    id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
    additional_metadata={"model": "LambdaG"}
)

lambdag_detailed = (
    lambdag_detailed
      .rename(columns={"y_pred": "y_pred_lambdag", "error": "error_lambdag"})
      .drop(columns=["pred_prob", "pred_llr", 'index'])
)

In [256]:
lambdag_metrics.rename(columns={"target": "y_true", "score": "llr_lambdaG"}, inplace=True)

lambdag_merged = ( lambdag_detailed
    .merge(lambdag_metrics, on=['problem', 'corpus', 'known_author', 'unknown_author', 'y_true'], how='left')
    .drop(columns="Magnitude")
    )

In [257]:
lambdag_merged

Unnamed: 0,problem,corpus,known_author,unknown_author,y_true,y_pred_lambdag,error_lambdag,llr_lambdaG
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,True,TP,161.614000
1,Icarus3 vs Icarus3,Wiki,Icarus3,Icarus3,True,True,TP,153.800067
2,Rjecina vs Rjecina,Wiki,Rjecina,Rjecina,True,True,TP,118.796333
3,Lear_21 vs Lear_21,Wiki,Lear_21,Lear_21,True,True,TP,101.188267
4,Richard_Daft vs Richard_Daft,Wiki,Richard_Daft,Richard_Daft,True,True,TP,97.970333
...,...,...,...,...,...,...,...,...
219,N419BH vs Nableezy,Wiki,N419BH,Nableezy,False,False,TN,0.753467
220,O_Fenian vs Paul_Siebert,Wiki,O_Fenian,Paul_Siebert,False,False,TN,-0.663733
221,Nigel_Ish vs Nigel_Ish,Wiki,Nigel_Ish,Nigel_Ish,True,False,FN,0.502133
222,Kashmiri vs KBlott,Wiki,Kashmiri,KBlott,False,False,TN,0.376267


In [245]:
gemma_agg = aggregated[aggregated['llr_unknown_gemma_count'] == 3].copy()

gemma_agg_sum, gemma_agg_detailed = performance(
    gemma_agg,
    score_col='llr_unknown_gemma_sum',
    target_col='target',
    return_pred_rows=True,
    id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
    additional_metadata={"model": "gemma"}
)

gemma_agg_detailed = (
    gemma_agg_detailed
      .rename(columns={"y_pred": "y_pred_gemma", "error": "error_gemma"})
      .drop(columns=["pred_prob", "pred_llr", "index"])
)

qwen_agg = aggregated[aggregated['llr_unknown_qwen_count'] == 3].copy()

qwen_agg_sum, qwen_agg_detailed = performance(
    qwen_agg,
    score_col='llr_unknown_qwen_sum',
    target_col='target',
    return_pred_rows=True,
    id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
    additional_metadata={"model": "qwen"}
)

qwen_agg_detailed = (
    qwen_agg_detailed
      .rename(columns={"y_pred": "y_pred_qwen", "error": "error_qwen"})
      .drop(columns=["pred_prob", "pred_llr", "index"])
)

llama_agg = aggregated[aggregated['llr_unknown_llama_count'] == 3].copy()

llama_agg_sum, llama_agg_detailed = performance(
    llama_agg,
    score_col='llr_unknown_llama_sum',
    target_col='target',
    return_pred_rows=True,
    id_cols=['problem', 'corpus', 'known_author', 'unknown_author'],
    additional_metadata={"model": "llama"}
)

llama_agg_detailed = (
    llama_agg_detailed
      .rename(columns={"y_pred": "y_pred_llama", "error": "error_llama"})
      .drop(columns=["pred_prob", "pred_llr", "index"])
)

combined_df = pd.concat([lambdag_sum, qwen_agg_sum, gemma_agg_sum, llama_agg_sum], axis=0, ignore_index=True)


In [246]:
combined_df

Unnamed: 0,model,Cllr,Cllr_min,EER,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN
0,LambdaG,0.573212,0.573212,0.169643,1.088023,-0.75436,112,112,0.900351,0.825893,0.828829,0.821429,0.825112,92,19,20,93
1,qwen,0.873891,0.873545,0.366972,0.180513,-0.184612,104,109,0.720625,0.646701,0.652632,0.596154,0.623116,62,33,42,76
2,gemma,0.858543,0.858205,0.348624,0.205317,-0.205123,104,109,0.735709,0.670078,0.680851,0.615385,0.646465,64,30,40,79
3,llama,0.955484,0.946213,0.372093,-0.000193,-0.195766,34,43,0.656635,0.595759,0.571429,0.470588,0.516129,16,12,18,31


In [277]:
combined_df.to_excel(f"{base_loc}/agg_results_summary.xlsx", index=False)

In [273]:
model_detailed_results = (
    qwen_agg_detailed
    .merge(gemma_agg_detailed, on=['problem', 'corpus', 'known_author', 'unknown_author', 'y_true'], how='outer')
    .merge(llama_agg_detailed, on=['problem', 'corpus', 'known_author', 'unknown_author', 'y_true'], how='outer')
    .merge(aggregated, on=['problem', 'corpus', 'known_author', 'unknown_author'], how='left')
    .drop(columns=["target", "llr_unknown_qwen_count", "llr_unknown_gemma_count", "llr_unknown_llama_count"])
    .rename(columns={"llr_unknown_qwen_sum": "llr_unknown_qwen", "llr_unknown_gemma_sum": "llr_unknown_gemma", "llr_unknown_llama_sum": "llr_unknown_llama"})
    .merge(lambdag_merged, on=['problem', 'corpus', 'known_author', 'unknown_author', 'y_true'], how='outer')
    .loc[:, [
        'problem', 'corpus', 'known_author', 'unknown_author', 'y_true',
        'llr_lambdaG', 'y_pred_lambdag', 'error_lambdag',
        'llr_unknown_qwen', 'y_pred_qwen', 'error_qwen',
        'llr_unknown_gemma', 'y_pred_gemma', 'error_gemma',
        'llr_unknown_llama', 'y_pred_llama', 'error_llama'
    ]]
)

In [275]:
model_detailed_results

Unnamed: 0,problem,corpus,known_author,unknown_author,y_true,llr_lambdaG,y_pred_lambdag,error_lambdag,llr_unknown_qwen,y_pred_qwen,error_qwen,llr_unknown_gemma,y_pred_gemma,error_gemma,llr_unknown_llama,y_pred_llama,error_llama
0,HOOTmag vs HOOTmag,Wiki,HOOTmag,HOOTmag,True,161.614000,True,TP,5.949244,False,FN,5.791008,False,FN,40.844842,True,TP
1,HOOTmag vs Iain99,Wiki,HOOTmag,Iain99,False,-42.095733,False,TN,9.703337,False,TN,9.063104,False,TN,8.835369,False,TN
2,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,True,21.316867,True,TP,22.053801,False,FN,21.359321,False,FN,17.957647,False,FN
3,Hodja_Nasreddin vs HonestopL,Wiki,Hodja_Nasreddin,HonestopL,False,9.259067,True,FP,7.361303,False,TN,8.951198,False,TN,8.193341,False,TN
4,HonestopL vs HOOTmag,Wiki,HonestopL,HOOTmag,False,-13.158267,False,TN,8.472696,False,TN,9.871460,False,TN,5.895896,False,TN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,Xtv vs Yoenit,Wiki,Xtv,Yoenit,False,5.556133,True,FP,28.710436,True,FP,26.981082,False,TN,0.000000,,
220,Yoenit vs Yoenit,Wiki,Yoenit,Yoenit,True,60.363067,True,TP,,,,,,,,,
221,Yoenit vs ZjarriRrethues,Wiki,Yoenit,ZjarriRrethues,False,-22.850667,False,TN,19.217741,False,TN,20.592835,False,TN,0.000000,,
222,ZjarriRrethues vs 142.196.88.228,Wiki,ZjarriRrethues,142.196.88.228,False,-33.414400,False,TN,11.119647,False,TN,13.298563,False,TN,0.000000,,


In [276]:
model_detailed_results.to_excel(f"{base_loc}/agg_results_detailed.xlsx", index=False)