In [1]:
import pandas as pd

In [2]:
# --- Run configuration -------------------------------------------------------
# Scoring models whose outputs are aggregated (one sub-directory per model).
models = ["gpt2"]

# Corpus names; each one is used verbatim as a directory name under base_loc.
corpora = [
    "Wiki",
    "Enron",
    "Perverted Justice",
    "StackExchange",
    "ACL",
    "TripAdvisor",
    "The Apricity",
    "Koppel's Blogs",
    "The Telegraph",
    "Reddit",
]

# Dataset splits; each one is used verbatim as a directory name under base_loc.
data_types = ["training", "test"]

# Root of the experiment outputs (absolute path to a mounted volume).
# NOTE(review): "logrpobs" looks like a typo of "logprobs", but the recorded run
# output shows this exact path in use -- confirm the on-disk name before changing.
base_loc = "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"

# Workbook read from each <base_loc>/<data_type>/<corpus>/<model> directory.
file_name = "raw_token_level_scores_complete.xlsx"

# Workbook written next to the input with the aggregated scores.
save_name = "raw_agg_scores.xlsx"

In [3]:
def create_agg_problem_scores(
    score_df: pd.DataFrame,
    group_cols: list[str] | None = None,
    sum_cols: list[str] | None = None,
) -> pd.DataFrame:
    """
    Aggregate per-row scores up to the problem level by summing selected columns.

    Parameters
    ----------
    score_df : pd.DataFrame
        Input table with grouping columns + score columns.
    group_cols : list[str], optional
        Columns to group by. Defaults to the common set you described.
    sum_cols : list[str], optional
        Numeric columns to sum. Defaults to the three you listed.

    Returns
    -------
    pd.DataFrame
        Grouped/aggregated DataFrame with summed score columns.
    """
    if group_cols is None:
        group_cols = [
            "data_type", "corpus", "scoring_model", "max_context_tokens",
            "min_token_size", "problem", "target"
        ]

    if sum_cols is None:
        sum_cols = ["no_context_sum_log_probs", "known_sum_log_probs", "unknown_sum_log_probs"]

    missing_g = [c for c in group_cols if c not in score_df.columns]
    missing_s = [c for c in sum_cols if c not in score_df.columns]
    if missing_g or missing_s:
        raise KeyError(
            f"Missing columns. group_cols missing={missing_g}; sum_cols missing={missing_s}"
        )

    # Ensure sum columns are numeric-ish (coerce bad strings to NaN, then treat NaN as 0 for sums)
    df = score_df.copy()
    for c in sum_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    out = (
        df.groupby(group_cols, dropna=False)[sum_cols]
          .sum(min_count=1)          # if a group is all-NaN for a col, keep NaN (not 0)
          .reset_index()
    )

    return out


In [4]:
# Aggregate the token-level workbook of every (model, split, corpus) run and
# write the result next to it. A failing run is reported and skipped so the
# remaining runs are still processed.
for model_name in models:
    for data_type in data_types:
        for corpus in corpora:
            # Input and output share the same per-run directory.
            run_dir = f"{base_loc}/{data_type}/{corpus}/{model_name}"
            file_loc = f"{run_dir}/{file_name}"
            save_loc = f"{run_dir}/{save_name}"
            run_label = f"model={model_name}, data_type={data_type}, corpus={corpus}"

            try:
                df = pd.read_excel(file_loc)

                agg_df = create_agg_problem_scores(df)

                # NOTE(review): with default arguments to_excel also writes the
                # row index as an unnamed first column. Left unchanged because
                # downstream readers may rely on the current layout -- confirm
                # before switching to index=False.
                agg_df.to_excel(save_loc)

            except FileNotFoundError as e:
                # Expected case: this run has no input workbook.
                print(f"Missing/failed for {run_label}: {e}")
            except Exception as e:
                # Unexpected case (e.g. schema KeyError, missing Excel engine,
                # write error): include the exception type so it cannot be
                # mistaken for a merely missing file.
                print(f"Missing/failed for {run_label}: {type(e).__name__}: {e}")

Missing/failed for model=gpt2, data_type=training, corpus=Reddit: [Errno 2] No such file or directory: '/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Reddit/gpt2/raw_token_level_scores_complete.xlsx'
