In [1]:
import sys
import os

import pandas as pd

from glob import glob
from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_excel_sheets

In [2]:
# Input directory, reviewed phrase list, and output workbook for the summary step,
# bound together so the three locations are visible as one group.
results_dir, phrase_location, save_location = (
    '/Volumes/BCross/paraphrase examples slurm/llama results/filtered',
    '/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx',
    '/Volumes/BCross/paraphrase examples slurm/llama results/token_level_results.xlsx',
)

In [3]:
def summarise_llr_by_token_level(document_location: str, phrase_document_location: str) -> pd.DataFrame:
    """
    Generate a token-level LLR summary from provided LLR and phrase data.

    For every distinct, non-missing ``num_tokens`` value in the merged data, the
    rows whose ``num_tokens`` is at least that value are aggregated into one
    summary row, and the first row of the metadata sheet is attached to it.

    Parameters:
    ----------
    document_location : str
        Path to the Excel file containing 'metadata' and 'LLR' sheets.
    phrase_document_location : str
        Path to the Excel file containing n-gram phrase data with 'phrase',
        'keep_phrase' and 'tokens' columns. Only rows with ``keep_phrase == 1``
        are used.

    Returns:
    -------
    pd.DataFrame
        One row per token threshold, in order of first appearance in the merged
        data, combining the metadata columns with the LLR sums and the sums
        divided by ``phrases_kept``. An empty DataFrame (no columns) is returned
        when there is no usable threshold, e.g. no LLR row matches a kept phrase.
    """
    # Read in both sheets from the Excel document
    workbook = read_excel_sheets(document_location, sheet_names=['metadata', 'LLR'])
    llr = workbook['LLR']
    metadata = workbook['metadata']

    # Extract only the key metadata columns to attach later
    metadata_base = metadata[[
        'index', 'sample_id', 'problem', 'corpus',
        'known_author', 'unknown_author',
        'known_doc_id', 'unknown_doc_id', 'target'
    ]]

    # Load and filter ngram phrases, renaming for merge compatibility
    ngram_phrases = (
        pd.read_excel(phrase_document_location)
        .query("keep_phrase == 1")
        .drop(columns=['keep_phrase', 'tokens'])
        .rename(columns={"phrase": "original_phrase"})
    )

    # Merge LLR data with filtered phrase data
    llr_merged = llr.merge(ngram_phrases, on='original_phrase', how='inner')

    # Unique thresholds in order of first appearance. Missing values are dropped:
    # a NaN can never satisfy ">=", so it cannot act as a threshold.
    token_levels = llr_merged['num_tokens'].dropna().drop_duplicates().to_list()
    if not token_levels:
        return pd.DataFrame()

    # Collect one record per threshold and build the frame once, instead of
    # growing a DataFrame with repeated concatenation.
    records = []
    for token_level in token_levels:
        # Boolean mask rather than a formatted query string, so the threshold
        # value is compared directly and never parsed as an expression.
        llr_filtered = llr_merged[llr_merged['num_tokens'] >= token_level]

        records.append({
            'min_token_size': token_level,
            'num_phrases': llr_filtered['phrase_num'].nunique(),
            'phrases_kept': llr_filtered.loc[llr_filtered['phrases_kept'] > 0, 'phrase_num'].nunique(),
            'llr_no_context': llr_filtered['llr_no_context'].sum(skipna=True),
            'llr_known': llr_filtered['llr_known'].sum(skipna=True),
            'llr_unknown': llr_filtered['llr_unknown'].sum(skipna=True),
            'unknown_ref_vs_top_rank': llr_filtered['unknown_ref_vs_top_rank'].sum(skipna=True),
        })

    # Normalize the LLR values; a threshold with phrases_kept == 0 yields
    # inf/NaN here rather than raising.
    llr_summary = pd.DataFrame(records).assign(
        normalised_llr_no_context=lambda d: d['llr_no_context'] / d['phrases_kept'],
        normalised_llr_known=lambda d: d['llr_known'] / d['phrases_kept'],
        normalised_llr_unknown=lambda d: d['llr_unknown'] / d['phrases_kept']
    )

    # Attach the first metadata row to every summary row. With an empty metadata
    # sheet the column-wise concat below leaves the metadata columns as NaN.
    first_metadata = metadata_base.iloc[:1]
    if len(first_metadata):
        first_metadata = first_metadata.iloc[[0] * len(llr_summary)]

    return pd.concat([first_metadata.reset_index(drop=True), llr_summary], axis=1)


In [4]:
def batch_summarize_llr(
    directory: str,
    phrase_document_location: str,
    save_location: str
):
    """
    Process all .xlsx files in a directory using summarise_llr_by_token_level,
    appending results and saving the final summary table to the specified location.

    Files that raise during processing are reported and skipped. If no file
    produces a summary (empty directory, or every file failed), a message is
    printed and nothing is written.

    Parameters:
    ----------
    directory : str
        Path to the directory containing Excel (.xlsx) files with 'metadata' and 'LLR' sheets.
    phrase_document_location : str
        Path to the Excel file with filtered phrase data.
    save_location : str
        Path (including filename) where the final summary Excel file will be saved.

    Returns:
    -------
    None
    """

    # Find all .xlsx files in the directory. The "*" pattern already ignores
    # dot-files; Excel's "~$name.xlsx" lock files are excluded explicitly
    # because they match the pattern but are not readable workbooks.
    excel_files = sorted(
        path
        for path in glob(os.path.join(directory, "*.xlsx"))
        if not os.path.basename(path).startswith("~$")
    )
    all_summaries = []
    skipped_files = []

    total_files = len(excel_files)

    for i, file_path in enumerate(excel_files, start=1):
        file_name = os.path.basename(file_path)
        print(f"Processing file {i} of {total_files}: {file_name}")

        try:
            summary = summarise_llr_by_token_level(file_path, phrase_document_location)

            # Add a column to track the source file
            summary['source_file'] = file_name

            all_summaries.append(summary)

        except Exception as e:
            # Best-effort batch: one unreadable workbook must not abort the run.
            skipped_files.append(file_path)
            print(f"⚠️ Skipping {file_path} due to error: {e}")

    if skipped_files:
        print(f"Skipped {len(skipped_files)} of {total_files} files.")

    # pd.concat raises on an empty list, so stop here when nothing was produced.
    if not all_summaries:
        print(f"⚠️ No summaries were produced from {directory}; nothing saved.")
        return

    # Combine all results
    final_summary = pd.concat(all_summaries, ignore_index=True)

    # Save to Excel
    final_summary.to_excel(save_location, index=False)
    print(f"\n✅ Final summary saved to: {save_location}")


In [5]:
# model_list = ['gpt2', 'gemma', 'llama', 'qwen']

# for model in model_list:
#     print(f"Completing the scoring for {model}")
    
#     results_dir = f'/Volumes/BCross/paraphrase examples slurm/{model} results/filtered_inc_rank'
#     phrase_location = '/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx'
#     save_location = f'/Volumes/BCross/paraphrase examples slurm/{model} results/token_level_results_inc_rank.xlsx'
    
#     batch_summarize_llr(
#         directory=results_dir,
#         phrase_document_location=phrase_location,
#         save_location=save_location
#     )

In [6]:
# Locations for this run: the reviewed phrase list, the folder of per-comparison
# result workbooks, and the workbook the combined summary is written to.
phrase_location = '/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx'
results_dir = '/Volumes/BCross/paraphrase examples slurm/Wiki-Test/ModernBERT-base/gpt2 results/filtered_inc_rank'
save_location = '/Volumes/BCross/paraphrase examples slurm/Wiki-Test/ModernBERT-base/gpt2 results/token_level_results_inc_rank.xlsx'

# Arguments are positional: directory, phrase_document_location, save_location.
batch_summarize_llr(results_dir, phrase_location, save_location)

Processing file 1 of 672: hodja_nasreddin_text_1 vs hodja_nasreddin_text_3.xlsx
Processing file 2 of 672: hodja_nasreddin_text_1 vs honestopl_text_1.xlsx
Processing file 3 of 672: hodja_nasreddin_text_10 vs hodja_nasreddin_text_3.xlsx
Processing file 4 of 672: hodja_nasreddin_text_10 vs honestopl_text_1.xlsx
Processing file 5 of 672: hodja_nasreddin_text_11 vs hodja_nasreddin_text_3.xlsx
Processing file 6 of 672: hodja_nasreddin_text_11 vs honestopl_text_1.xlsx
Processing file 7 of 672: honestopl_text_3 vs honestopl_text_1.xlsx
Processing file 8 of 672: honestopl_text_3 vs hootmag_text_13.xlsx
Processing file 9 of 672: honestopl_text_4 vs honestopl_text_1.xlsx
Processing file 10 of 672: honestopl_text_4 vs hootmag_text_13.xlsx
Processing file 11 of 672: honestopl_text_5 vs honestopl_text_1.xlsx
Processing file 12 of 672: honestopl_text_5 vs hootmag_text_13.xlsx
Processing file 13 of 672: hootmag_text_1 vs hootmag_text_13.xlsx
Processing file 14 of 672: hootmag_text_1 vs iain99_text_5.x