In [60]:
import ast
import os
import glob

from pathlib import Path

import numpy as np
import pandas as pd

In [61]:
def create_results_doc_pipeline(doc_loc, write_excel=True, save_dir=None, phrase_loc=None):
    """Build the per-phrase LLR table and summary metadata for one results workbook.

    The workbook at ``doc_loc`` is read sheet by sheet (``docs``, ``known``,
    ``unknown``, ``no context``, ``metadata``). For every phrase the function
    computes, per scoring condition, the summed log-probability of the
    reference phrase, the reference's share of the group's total ``raw_prob``
    (``pmf_*``) and ``-log10`` of that share (``llr_*``). Two rank-based gaps
    are computed on the ``unknown`` sheet, and a ``rank_including_ref`` column
    is added to it.

    Parameters
    ----------
    doc_loc : str or path-like
        Location of the workbook to process.
    write_excel : bool, default True
        When true, write all sheets plus the new ``LLR`` sheet and the
        extended ``metadata`` sheet to ``save_dir/<basename of doc_loc>``.
        An existing file is opened in append mode and its sheets are replaced.
    save_dir : str or path-like, optional
        Output directory; required when ``write_excel`` is true. It is created
        if it does not exist.
    phrase_loc : str or path-like, optional
        Workbook with ``phrase`` and ``keep_phrase`` columns. When given, only
        phrase numbers whose reference phrase in the ``no context`` sheet is
        flagged ``keep_phrase == 1`` are kept in ``no context``, ``known`` and
        ``unknown``.

    Returns
    -------
    pandas.DataFrame
        The ``metadata`` sheet with the document-level LLR summary columns
        appended (same-named columns already in the sheet are replaced).

    Raises
    ------
    ValueError
        If ``write_excel`` is true but no ``save_dir`` was supplied.
    """
    # Fail before any reading/computation instead of crashing on the path
    # concatenation after all the work has been done.
    if write_excel and not save_dir:
        raise ValueError("save_dir must be provided when write_excel=True")

    doc_name = os.path.basename(doc_loc)

    print(f"Processing Document: {doc_name}")
    # Read the sheets as dataframes
    docs = pd.read_excel(doc_loc, sheet_name="docs")
    known = pd.read_excel(doc_loc, sheet_name="known")
    unknown = pd.read_excel(doc_loc, sheet_name="unknown")
    no_context = pd.read_excel(doc_loc, sheet_name="no context")
    metadata = pd.read_excel(doc_loc, sheet_name="metadata")

    # Optionally restrict every sheet to the reviewed phrases
    if phrase_loc:
        phrase_list = pd.read_excel(phrase_loc)
        # Only the phrase text is used as the join key; de-duplicate so a
        # phrase listed twice cannot multiply rows in the merges below.
        phrases_to_keep = (
            phrase_list.loc[phrase_list['keep_phrase'] == 1, ['phrase']]
            .drop_duplicates()
        )

        reference_phrases = no_context[no_context['phrase_type'] == 'reference']
        kept_phrase_nums = (
            reference_phrases
            .merge(phrases_to_keep, on='phrase', how='inner')[['phrase_num']]
            .drop_duplicates()
        )

        no_context = no_context.merge(kept_phrase_nums, on='phrase_num', how='inner')
        known = known.merge(kept_phrase_nums, on='phrase_num', how='inner')
        unknown = unknown.merge(kept_phrase_nums, on='phrase_num', how='inner')

    # Base LLR table: one row per (phrase_num, phrase_occurence, original_phrase)
    occurrence_keys = ['phrase_num', 'phrase_occurence']
    cols = ['phrase_num', 'phrase_occurence', 'original_phrase', 'tokens']
    key_cols = ['phrase_num', 'phrase_occurence', 'original_phrase']
    llr_base = (
        pd.concat([known[cols], unknown[cols]], ignore_index=True)
        .drop_duplicates(subset=key_cols, keep='first')
        .sort_values(cols)
        .reset_index(drop=True)
    )
    llr_base['num_tokens'] = llr_base['tokens'].map(_count_tokens)

    # Phrase statistics for the three scoring conditions
    no_context_phrase_stats = _reference_phrase_stats(
        no_context, 'sum_log_probs', ['phrase_num'], 'no_context', with_counts=True
    )
    known_phrase_stats = _reference_phrase_stats(
        known, 'sum_log_probs_phrase', occurrence_keys, 'known'
    )
    unknown_phrase_stats = _reference_phrase_stats(
        unknown, 'sum_log_probs_phrase', occurrence_keys, 'unknown'
    )

    # Rank every candidate (reference included) within its phrase occurrence;
    # this column is kept on `unknown` and therefore written to the output sheet.
    unknown['rank_including_ref'] = (
        unknown
        .groupby(occurrence_keys)['sum_log_probs_phrase']
        .rank(ascending=False, method='first')
    )

    # Reference log-prob minus the log-prob of the top-ranked candidate
    ref_vs_top_rank = _log_prob_gap(
        unknown[unknown['phrase_type'] == 'reference'],
        unknown[unknown['rank_including_ref'] == 1],
        occurrence_keys,
        'unknown_ref_vs_top_rank',
        dropna=True,
    )

    # Row with rank 0 minus row with rank 1, using the sheet's own `rank` column
    ref_vs_best_rest = _log_prob_gap(
        unknown[unknown['rank'] == 0],
        unknown[unknown['rank'] == 1],
        occurrence_keys,
        'unknown_ref_vs_best_rest',
        dropna=False,
    )

    # Final LLR table. Keys are cast to the same dtypes on BOTH sides of every
    # merge so numeric and text phrase numbers join identically.
    LLR = (
        _with_merge_key_dtypes(llr_base)
        .merge(_with_merge_key_dtypes(no_context_phrase_stats), on='phrase_num', how='left')
        .merge(_with_merge_key_dtypes(known_phrase_stats), on=occurrence_keys, how='left')
        .merge(_with_merge_key_dtypes(unknown_phrase_stats), on=occurrence_keys, how='left')
        .merge(_with_merge_key_dtypes(ref_vs_top_rank), on=occurrence_keys, how='left')
        .merge(_with_merge_key_dtypes(ref_vs_best_rest), on=occurrence_keys, how='left')
    )
    LLR['known_vs_no_context_log_prob'] = LLR['known_log_prob'] - LLR['no_context_log_prob']
    LLR['unknown_vs_no_context_log_prob'] = LLR['unknown_log_prob'] - LLR['no_context_log_prob']

    LLR = LLR[['phrase_num', 'phrase_occurence', 'original_phrase', 'tokens', 'num_tokens',
               'num_phrases', 'phrases_kept', 'no_context_log_prob',
               'known_log_prob', 'unknown_log_prob', 'known_vs_no_context_log_prob', 'unknown_vs_no_context_log_prob',
               'pmf_no_context', 'pmf_known', 'pmf_unknown', 'llr_no_context', 'llr_known',
               'llr_unknown', 'unknown_ref_vs_top_rank', 'unknown_ref_vs_best_rest']]

    # Summarise the LLR table for the metadata (one row per document)
    LLR_summary = pd.DataFrame([{
        'num_phrases': LLR['phrase_num'].nunique(),
        'phrases_kept': LLR.loc[LLR['phrases_kept'] > 0, 'phrase_num'].nunique(),
        'known_log_prob': LLR['known_log_prob'].sum(skipna=True),
        'unknown_log_prob': LLR['unknown_log_prob'].sum(skipna=True),
        'llr_no_context': LLR['llr_no_context'].sum(skipna=True),
        'llr_known': LLR['llr_known'].sum(skipna=True),
        'llr_unknown': LLR['llr_unknown'].sum(skipna=True),
        'llr_unknown_vs_top_rank': LLR['unknown_ref_vs_top_rank'].sum(skipna=True),
        'unknown_ref_vs_best_rest': LLR['unknown_ref_vs_best_rest'].sum(skipna=True)
    }])

    LLR_summary = LLR_summary.assign(
        normalised_llr_no_context=lambda d: d['llr_no_context'] / d['phrases_kept'],
        normalised_llr_known=lambda d: d['llr_known'] / d['phrases_kept'],
        normalised_llr_unknown=lambda d: d['llr_unknown'] / d['phrases_kept']
    )

    # Final metadata: freshly computed summary columns replace same-named ones
    overlapping_cols = LLR_summary.columns.intersection(metadata.columns)
    metadata_final = metadata.drop(columns=overlapping_cols, errors='ignore')
    metadata_final = pd.concat([metadata_final, LLR_summary], axis=1)

    if write_excel:
        print("Writing file")
        path = Path(save_dir) / doc_name
        path.parent.mkdir(parents=True, exist_ok=True)

        # `if_sheet_exists` is only valid in append mode
        if path.exists():
            writer_kwargs = {"engine": "openpyxl", "mode": "a", "if_sheet_exists": "replace"}
        else:
            writer_kwargs = {"engine": "openpyxl", "mode": "w"}

        sheets = (
            ("docs", docs),
            ("known", known),
            ("unknown", unknown),
            ("no context", no_context),
            ("LLR", LLR),
            ("metadata", metadata_final),
        )
        with pd.ExcelWriter(path, **writer_kwargs) as writer:
            for sheet_name, frame in sheets:
                frame.to_excel(writer, index=False, sheet_name=sheet_name)

    return metadata_final


def _count_tokens(tokens):
    """Return the number of tokens held in one cell of a ``tokens`` column.

    A string cell is parsed as a Python literal and, when it is a
    list/tuple/set, its element count is returned (cells read back from Excel
    are presumably stringified sequences -- TODO confirm against the writer).
    Anything that cannot be parsed falls back to plain ``len``.
    """
    if isinstance(tokens, str):
        try:
            parsed = ast.literal_eval(tokens)
        except (ValueError, SyntaxError, TypeError):
            return len(tokens)
        if isinstance(parsed, (list, tuple, set)):
            return len(parsed)
        return len(tokens)
    return len(tokens)


def _with_merge_key_dtypes(frame):
    """Return a copy of ``frame`` whose merge keys use common dtypes.

    ``phrase_num`` becomes pandas ``string`` and, when present,
    ``phrase_occurence`` becomes nullable ``Int64``.
    """
    keyed = frame.assign(phrase_num=frame['phrase_num'].astype('string'))
    if 'phrase_occurence' in keyed.columns:
        keyed['phrase_occurence'] = (
            pd.to_numeric(keyed['phrase_occurence'], errors='coerce').astype('Int64')
        )
    return keyed


def _reference_phrase_stats(scores, log_prob_col, group_keys, label, with_counts=False):
    """Aggregate reference-phrase statistics per group of candidate phrases.

    Returns one row per ``group_keys`` combination with ``<label>_log_prob``
    (sum of ``log_prob_col`` over reference rows), ``pmf_<label>`` (largest
    reference ``raw_prob`` divided by the group's total ``raw_prob``) and
    ``llr_<label>`` (``-log10`` of the pmf, or 0.0 when the pmf is not
    positive, which includes groups without a reference row). With
    ``with_counts`` the group size is also reported as ``num_phrases`` and
    ``phrases_kept``.
    """
    is_reference = scores['phrase_type'].eq('reference')
    grouped = (
        scores
        .assign(
            ref_log_prob=scores[log_prob_col].where(is_reference),
            ref_raw_prob=scores['raw_prob'].where(is_reference),
        )
        .groupby(group_keys, dropna=False)
        .agg(
            ref_log_prob=('ref_log_prob', 'sum'),
            group_size=('raw_prob', 'size'),
            sum_raw_prob=('raw_prob', 'sum'),
            reference_prob=('ref_raw_prob', 'max'),
        )
    )

    pmf = grouped['reference_prob'].div(grouped['sum_raw_prob'])
    # log10 is evaluated for every row before np.where selects; silence the
    # warnings raised for the zero/NaN entries that are replaced by 0.0 anyway.
    with np.errstate(divide='ignore', invalid='ignore'):
        llr = np.where(pmf > 0, -np.log10(pmf), 0.0)

    stats = (
        grouped
        .assign(**{f'pmf_{label}': pmf, f'llr_{label}': llr})
        .rename(columns={'ref_log_prob': f'{label}_log_prob'})
        .drop(columns=['sum_raw_prob', 'reference_prob'])
    )
    if with_counts:
        stats = stats.assign(
            num_phrases=stats['group_size'],
            phrases_kept=stats['group_size'],
        )
    return stats.drop(columns='group_size').reset_index()


def _log_prob_gap(reference_rows, rival_rows, keys, out_col, dropna):
    """Return, per ``keys`` group, first reference log-prob minus first rival log-prob.

    The result has the key columns plus ``out_col``; a group missing from
    either input gets NaN.
    """
    def first_log_prob(rows, name):
        return (
            rows.groupby(keys, dropna=dropna)['sum_log_probs_phrase']
            .first()
            .rename(name)
        )

    paired = pd.concat(
        [first_log_prob(reference_rows, 'reference'), first_log_prob(rival_rows, 'rival')],
        axis=1,
    )
    return (paired['reference'] - paired['rival']).rename(out_col).reset_index()

In [62]:
def process_directory(
    read_dir,
    save_dir,
    result_save_loc=None,   # optional now
    phrase_loc="/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx"
):
    """Run ``create_results_doc_pipeline`` on every ``*.xlsx`` file in a directory.

    Parameters
    ----------
    read_dir : str
        Directory containing the workbooks to process.
    save_dir : str
        Directory the processed workbooks are written to (created if missing).
    result_save_loc : str, optional
        Path for a combined metadata workbook. If the path already exists the
        function exits immediately without processing anything. If omitted, no
        combined file is written and input files whose processed copy already
        exists in ``save_dir`` are skipped.
    phrase_loc : str
        Forwarded to ``create_results_doc_pipeline`` as the reviewed phrase list.

    Returns
    -------
    pandas.DataFrame or None
        Metadata of all files processed in this call, sorted by ``index`` when
        that column exists; ``None`` only for the early exit described above.
    """
    print(f"Reading from: {read_dir}")
    print(f"Saving processed files to: {save_dir}")

    if result_save_loc:
        print(f"Final combined output: {result_save_loc}\n")
    else:
        print("No result_save_loc provided → combined file will NOT be saved.\n")

    os.makedirs(save_dir, exist_ok=True)

    # If result_save_loc exists, do not overwrite
    if result_save_loc and os.path.exists(result_save_loc):
        print(f"Output already exists: {result_save_loc}. Exiting.")
        return

    # Sorted so the processing order (and the i/N progress output) is
    # reproducible; "~$" files are the lock files Excel leaves next to an open
    # workbook and are not readable workbooks.
    xlsx_files = sorted(
        file_path
        for file_path in glob.glob(os.path.join(read_dir, "*.xlsx"))
        if not os.path.basename(file_path).startswith("~$")
    )
    total = len(xlsx_files)
    print(f"Found {total} files\n")

    all_metadata = []

    for i, file_path in enumerate(xlsx_files, start=1):

        base_name = os.path.basename(file_path)

        # If NO combined output is being written, skip files whose processed output already exists
        if not result_save_loc:
            processed_out_path = os.path.join(save_dir, base_name)  # adjust if your pipeline uses a different naming scheme
            if os.path.exists(processed_out_path):
                print(f"Skipping file {i}/{total} (already processed): {base_name}")
                continue
        print(f"Processing file {i}/{total}: {base_name}")

        try:
            metadata = create_results_doc_pipeline(
                file_path,
                write_excel=True,
                save_dir=save_dir,
                phrase_loc=phrase_loc
            )
            all_metadata.append(metadata)

        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"❌ Failed to process {file_path}\nError: {e}\n")
            continue

    # Combine metadata
    if all_metadata:
        full_metadata = pd.concat(all_metadata, ignore_index=True)
    else:
        full_metadata = pd.DataFrame()

    # Sort if index exists
    if "index" in full_metadata.columns:
        full_metadata = full_metadata.sort_values(by="index").reset_index(drop=True)

    print("\nAll files complete ✓")

    # Save if a path was provided
    if result_save_loc:
        full_metadata.to_excel(result_save_loc, index=False)
        print(f"\nCombined results saved to: {result_save_loc}")
    else:
        print("\nSkipping save because no result_save_loc was provided.")

    # Returned on both branches (previously only when nothing was saved).
    return full_metadata


In [63]:
# base_loc = '/Volumes/BCross/paraphrase examples slurm'
# data_loc = f"{base_loc}/Wiki-Test"

# # For Qwen
# # read_dir = f'{data_loc}/raw'
# # For everything else
# read_dir = f'{data_loc}/ModernBERT-large/gpt2 results/raw'
# save_dir = f'{data_loc}/ModernBERT-large/gpt2 results/filtered_inc_rank_v3'

# result_save_loc = f"{data_loc}/ModernBERT-base/results.xlsx"
# phrase_loc = f"{base_loc}/wiki-phrase-list-reviewed.xlsx"

# process_directory(
#     read_dir=read_dir,
#     save_dir=save_dir,
#     # result_save_loc=result_save_loc,
#     phrase_loc=phrase_loc)

In [64]:
# Batch driver: call process_directory for every combination of data split,
# corpus, paraphrasing model and scoring model whose "raw" results folder exists.
base_data_dir = "/Volumes/BCross/av_datasets_experiments/ngram_masking"

# Each list is one level of the directory layout below base_data_dir.
data_types = ["test", "training"]
corpuses = ["ACL", "Enron", "Perverted Justice", "StackExchange",
            "The Telegraph", "TripAdvisor", "Wiki"]
paraphrasing_models = ["ModernBERT-base", "ModernBERT-large"]
scoring_models = ["gpt2"]

for dt in data_types:
    print(f"Working on {dt} data")
    for cp in corpuses:
        print(f"Working on the {cp} corpus")
        for pm in paraphrasing_models:
            print(f"Working on the {pm} paraphrasing model")
            for sm in scoring_models:
                print(f"Working on the {sm} scoring model")
                
                # Input workbooks are read from ".../raw"; processed copies go to ".../raw_inc_rank".
                data_dir = f"{base_data_dir}/{dt}/{cp}/{pm}/{sm} results/raw"
                save_dir = f"{base_data_dir}/{dt}/{cp}/{pm}/{sm} results/raw_inc_rank"
                
                # Combinations without a raw folder are reported and skipped.
                if not os.path.isdir(data_dir):
                    print(f"Skipping (missing): {data_dir}")
                    continue
                
                # No result_save_loc is passed, so no combined workbook is written and
                # files already present in save_dir are skipped by process_directory.
                # NOTE(review): phrase_loc is not passed either, so process_directory's
                # default phrase-list path is used for every corpus -- confirm this is intended.
                process_directory(
                    read_dir=data_dir,
                    save_dir=save_dir
                )

Working on test data
Working on the ACL corpus
Working on the ModernBERT-base paraphrasing model
Working on the gpt2 scoring model
Reading from: /Volumes/BCross/av_datasets_experiments/ngram_masking/test/ACL/ModernBERT-base/gpt2 results/raw
Saving processed files to: /Volumes/BCross/av_datasets_experiments/ngram_masking/test/ACL/ModernBERT-base/gpt2 results/raw_inc_rank
No result_save_loc provided → combined file will NOT be saved.

Found 5 files

Skipping file 1/5 (already processed): levow_gina_anne_w12_1811_acl_2012 vs liberman_mark_w09_0102_acl_2009.xlsx
Skipping file 2/5 (already processed): liberman_mark_w18_3801_acl_2018 vs liberman_mark_w09_0102_acl_2009.xlsx
Skipping file 3/5 (already processed): kordoni_valia_p18_5005_acl_2018 vs kordoni_valia_e03_1044_acl_2003.xlsx
Skipping file 4/5 (already processed): nishiguchi_sumiyo_w13_5414_acl_2013 vs nivre_joakim_w07_2220_acl_2007.xlsx
Skipping file 5/5 (already processed): kokkinakis_dimitrios_s01_1022_acl_2001 vs kordoni_valia_e03_