In [1]:
import pandas as pd

from typing import Iterable, Optional
from pathlib import Path

In [2]:
# Scoring models to process; used as the innermost directory of each run.
models = [
    "gpt2",
]

# Corpus names; each one is a directory name beneath the data split.
corpora = [
    "Wiki",
    "Enron",
    "Perverted Justice",
    "StackExchange",
    "ACL",
    "TripAdvisor",
    "The Apricity",
    "Koppel's Blogs",
]

# Data splits; each one is a directory directly beneath base_loc.
data_types = [
    "training",
    "test",
]

# Root directory of the experiment outputs.
# NOTE(review): the "logrpobs" spelling appears to match the directory on
# disk -- confirm before "correcting" it.
base_loc = "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"

In [3]:
def get_complete_problems(
    score_df: pd.DataFrame,
    metadata: pd.DataFrame,
    join_cols: Optional[Iterable[str]] = None,
    *,
    completed_col: str = "problem_completed",
    save_loc: str | Path | None = None,
    overwrite: bool = False,
    engine: str = "openpyxl",
    return_df: bool = True,
) -> pd.DataFrame | None:
    """
    Filter metadata to completed problems, inner-join to score_df on join_cols,
    optionally save to Excel, and optionally return the joined DataFrame.

    - Drops duplicate completed problems on join_cols to avoid multiplying rows.
    - Expects metadata[completed_col] to already be boolean.
    """

    if join_cols is None:
        join_cols = ["data_type", "corpus", "scoring_model", "max_context_tokens", "problem"]
    join_cols = list(join_cols)

    # Validate required columns
    missing_meta = [c for c in ([completed_col] + join_cols) if c not in metadata.columns]
    if missing_meta:
        raise KeyError(f"metadata is missing required columns: {missing_meta}")

    missing_score = [c for c in join_cols if c not in score_df.columns]
    if missing_score:
        raise KeyError(f"score_df is missing required columns: {missing_score}")

    if metadata[completed_col].dtype != bool:
        raise TypeError(f"Expected metadata['{completed_col}'] to be boolean.")

    completed_probs = (
        metadata.loc[metadata[completed_col], join_cols]
        .drop_duplicates(join_cols)
        .copy()
    )

    joined_df = score_df.merge(completed_probs, how="inner", on=join_cols)

    if save_loc is not None:
        save_loc = Path(save_loc)
        if save_loc.exists() and not overwrite:
            raise FileExistsError(f"File already exists and overwrite=False: {save_loc}")
        save_loc.parent.mkdir(parents=True, exist_ok=True)
        joined_df.to_excel(save_loc, index=False, engine=engine)

    return joined_df if return_df else None


In [4]:
# Process every (model, split, corpus) combination, iterating in the same
# order as nested loops over models -> data_types -> corpora would.
for model_name, data_type, corpus in (
    (m, d, c) for m in models for d in data_types for c in corpora
):
    # All three files for one combination live in the same directory.
    run_dir = f"{base_loc}/{data_type}/{corpus}/{model_name}"
    try:
        metadata = pd.read_excel(f"{run_dir}/raw_problem_completed_metadata_combined.xlsx")
        score_df = pd.read_excel(f"{run_dir}/raw_token_level_scores_combined.xlsx")
        save_loc = f"{run_dir}/raw_token_level_scores_complete.xlsx"

        # Return value is intentionally discarded; only the saved file matters here.
        get_complete_problems(
            score_df,
            metadata,
            join_cols=["data_type", "corpus", "scoring_model", "max_context_tokens", "problem"],
            save_loc=save_loc,
            overwrite=True,
        )
    except Exception as e:
        # Best-effort: report the failing combination and carry on with the rest.
        print(f"Missing/failed for model={model_name}, data_type={data_type}, corpus={corpus}: {e}")

Missing/failed for model=gpt2, data_type=test, corpus=Koppel's Blogs: [Errno 2] No such file or directory: "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Koppel's Blogs/gpt2/raw_problem_completed_metadata_combined.xlsx"
