# Combine Token Score Files

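This notebook combines the per-run token-level score files (one file per data type, corpus and scoring model, already grouped into token-size-level scores) into a single dataframe. It also writes a problem-level aggregation of those scores and the combined run metadata next to it.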

In [6]:
import pandas as pd

from pathlib import Path
from typing import Literal

In [None]:
# Scoring models to collect results for (the innermost directory level of each input path).
models = [
    "gemma-3-270m",
    "gpt2",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B-Instruct",
]

# Corpora to collect results for (the middle directory level of each input path).
corpuses = [
    "Wiki",
    "Perverted Justice",
    "Enron",
    "ACL",
    "StackExchange",
    "TripAdvisor",
]

# Data splits to collect results for (the outermost directory level under base_loc).
data_types = [
    "training",
    "test",
]

# Root directory that holds the per-run inputs and receives the combined outputs.
# NOTE(review): "logrpobs" looks like a misspelling of "logprobs", but the value has to
# match the directory name on disk -- confirm before renaming anything.
base_loc = "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"

# Token-level score workbook; the same name is used for each per-run input
# and for the combined output written directly under base_loc.
file_name = "token_level_raw_scores.xlsx"

# Output workbook for the problem-level aggregated scores.
agg_file_name = "raw_aggregated_scores.xlsx"

# Per-run metadata workbook; also the name of the combined metadata output.
metadata_file_name = "raw_problem_completed_metadata.xlsx"

In [8]:
def create_agg_problem_scores(
    score_df: pd.DataFrame,
    group_cols: list[str] | None = None,
    sum_cols: list[str] | None = None,
) -> pd.DataFrame:
    """
    Aggregate per-row scores up to the problem level by summing selected columns.

    Parameters
    ----------
    score_df : pd.DataFrame
        Input table with grouping columns + score columns.
    group_cols : list[str], optional
        Columns to group by. Defaults to the common set you described.
    sum_cols : list[str], optional
        Numeric columns to sum. Defaults to the three you listed.

    Returns
    -------
    pd.DataFrame
        Grouped/aggregated DataFrame with summed score columns.
    """
    if group_cols is None:
        group_cols = ["data_type", "corpus", "scoring_model", "problem", "target", "min_token_size"]

    if sum_cols is None:
        sum_cols = ["no_context_sum_log_probs", "known_sum_log_probs", "unknown_sum_log_probs"]

    missing_g = [c for c in group_cols if c not in score_df.columns]
    missing_s = [c for c in sum_cols if c not in score_df.columns]
    if missing_g or missing_s:
        raise KeyError(
            f"Missing columns. group_cols missing={missing_g}; sum_cols missing={missing_s}"
        )

    # Ensure sum columns are numeric-ish (coerce bad strings to NaN, then treat NaN as 0 for sums)
    df = score_df.copy()
    for c in sum_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    out = (
        df.groupby(group_cols, dropna=False)[sum_cols]
          .sum(min_count=1)          # if a group is all-NaN for a col, keep NaN (not 0)
          .reset_index()
    )

    return out


In [9]:
def combine_token_level_files(
    base_loc: str | Path,
    file_name: str,
    metadata_file_name: str,
    aggregated_scores_file_name: str,
    data_types: list[str],
    corpuses: list[str],
    models: list[str],
    return_df: Literal=False
) -> pd.DataFrame:
    """
    Read token-level excel files from:
        {base_loc}/{data_type}/{corpus}/{model}/{file_name}
    Combine them, then save to:
        {base_loc}/{file_name}

    Skips missing input files.
    Always overwrites the output file.
    Returns the combined dataframe.
    """
    base_loc = Path(base_loc)
    out_path = base_loc / file_name
    metadata_out_path = base_loc / metadata_file_name
    agg_scores_out_path = base_loc / aggregated_scores_file_name

    combined = []
    combined_metadata = []
    
    for data_type in data_types:
        for corpus in corpuses:
            for model in models:
                file_loc = base_loc / data_type / corpus / model / file_name
                metadata_file_loc = base_loc / data_type / corpus / model / metadata_file_name
                
                # Normal score file first
                if not file_loc.exists():
                    print(f"SKIP (missing): {file_loc}")
                    continue

                try:
                    df = pd.read_excel(file_loc)
                    combined.append(df)
                    print(f"READ: {file_loc}  (rows={len(df)})")
                except Exception as e:
                    print(f"WARN: failed reading {file_loc}: {e}")

                # Now metadata score file
                if not metadata_file_loc.exists():
                    print(f"SKIP (missing): {file_loc}")
                    continue

                try:
                    meta_df = pd.read_excel(metadata_file_loc)
                    combined_metadata.append(meta_df)
                    print(f"READ: {metadata_file_loc}  (rows={len(df)})")
                except Exception as e:
                    print(f"WARN: failed reading {metadata_file_loc}: {e}")                   
                
    if not combined:
        raise RuntimeError("No readable input files found; nothing to combine.")

    results = (
        pd.concat(combined, ignore_index=True)
        .sort_values(
            ["data_type", "corpus", "scoring_model", "sample_id", "min_token_size"],
            ascending=[True, True, True, True, True],
            kind="mergesort"
        )
        .reset_index(drop=True)
    )

    agg_results = create_agg_problem_scores(results)
    
    meta = (
        pd.concat(combined_metadata, ignore_index=True)
        .sort_values(
            ["data_type", "corpus", "scoring_model"],
            ascending=[True, True, True],
            kind="mergesort"
        )
        .reset_index(drop=True)
    )
    
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if out_path.exists():
        print(f"OVERWRITE: {out_path}")

    results.to_excel(out_path, index=False)
    print(f"SAVED: {out_path}  (rows={len(results)})")

    agg_results.to_excel(agg_scores_out_path, index=False)
    print(f"SAVED: {agg_scores_out_path}  (rows={len(agg_results)})")
    
    meta.to_excel(metadata_out_path, index=False)
    print(f"SAVED: {metadata_out_path}  (rows={len(meta)})")
    
    if return_df:
        return results


In [10]:
# Run the combine step over every configured data type / corpus / model.
# return_df is left at its default, so nothing is returned for display here.
combine_kwargs = dict(
    base_loc=base_loc,
    file_name=file_name,
    metadata_file_name=metadata_file_name,
    aggregated_scores_file_name=agg_file_name,
    data_types=data_types,
    corpuses=corpuses,
    models=models,
)
combine_token_level_files(**combine_kwargs)

SKIP (missing): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gemma-3-270m/token_level_raw_scores.xlsx
SKIP (missing): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/token_level_raw_scores.xlsx
SKIP (missing): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/Qwen2.5-0.5B-Instruct/token_level_raw_scores.xlsx
SKIP (missing): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/Qwen2.5-1.5B-Instruct/token_level_raw_scores.xlsx
SKIP (missing): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Perverted Justice/gemma-3-270m/token_level_raw_scores.xlsx
SKIP (missing): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Perverted Justice/gpt2/token_level_raw_scores.xlsx
SKIP (missing): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Perverted Justice/Qwen2.5-0.5B-Instruct/token_level_raw_scores.xlsx
SKIP (missin