In [1]:
import sys
import re

import pandas as pd

from pathlib import Path
from typing import Union, Sequence, Optional
from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_excel_sheets, read_rds
from utils import list_xlsx_files

In [2]:
# --- Run configuration -------------------------------------------------------

# Scoring models whose raw outputs are compiled.
models = ["gpt2"]

# Corpus names; each one is used as a directory level under the data type.
corpora = [
    "Wiki",
    "Enron",
    "Perverted Justice",
    "StackExchange",
    "ACL",
    "TripAdvisor",
    "The Apricity",
    "Koppel's Blogs",
    "The Telegraph",
    "Reddit",
]

data_types = ["training", "test"]

# "raw" plus one sub-directory per suffix from 100 to 1000 in steps of 100
# ("raw_100", "raw_200", ..., "raw_1000").
raw_subdirs = ("raw", *(f"raw_{limit}" for limit in range(100, 1001, 100)))

# NOTE: "logrpobs" is how the directory is spelled in the paths printed by the
# final cell's output; keep the string as-is unless the directory itself is renamed.
base_loc = "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"

metadata_base_loc = "/Volumes/BCross/datasets/author_verification"

In [3]:
def compare_complete_to_metadata(metadata_base_loc, data_type, corpus, model, excel_files, max_context_tokens=None):
    """
    Build a per-document completion table for one corpus / model / context setting.

    Reads ``{metadata_base_loc}/{data_type}/doc_level_metadata.rds``, keeps the rows
    whose ``corpus`` column equals ``corpus`` and flags which of them already have
    a result file among ``excel_files``.

    Parameters
    ----------
    metadata_base_loc : str or Path
        Root directory holding one sub-directory per data type.
    data_type : str
        Sub-directory of ``metadata_base_loc`` to read the metadata from.
    corpus : str
        Value of the ``corpus`` column to keep.
    model : str
        Stored verbatim in the ``scoring_model`` column.
    excel_files : iterable
        Path-like objects exposing ``.name``; a metadata row counts as completed
        when its ``filename`` equals one of these names.
    max_context_tokens : optional
        Stored verbatim in the ``max_context_tokens`` column.

    Returns
    -------
    pd.DataFrame
        The filtered metadata (fresh 0..n-1 index) with the extra columns
        ``scoring_model``, ``max_context_tokens``, ``completed`` (bool) and
        ``scored`` (always False here; callers flip it later).
    """
    metadata_loc = f"{metadata_base_loc}/{data_type}/doc_level_metadata.rds"

    metadata = read_rds(metadata_loc)

    # Take an explicit copy of the filtered rows: assigning new columns onto a
    # boolean-mask slice triggers pandas' SettingWithCopy warning.
    metadata_df = metadata[metadata["corpus"] == corpus].copy().reset_index(drop=True)
    metadata_df["scoring_model"] = model
    metadata_df["max_context_tokens"] = max_context_tokens

    # Membership test instead of a left merge: a file name that occurs more than
    # once in `excel_files` must not duplicate metadata rows, and the result is a
    # clean bool column without the fillna/astype round trip.
    completed_names = {ef.name for ef in excel_files}
    metadata_df["completed"] = metadata_df["filename"].isin(completed_names)
    metadata_df["scored"] = False

    return metadata_df

In [4]:
def create_problem_complete_metadata(metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise file-level completion flags per problem.

    Produces one row per distinct combination of
    (data_type, corpus, scoring_model, max_context_tokens, problem); missing key
    values form their own group instead of being dropped.

    Output columns (after the five key columns):
      - num_files: number of rows in the group
      - files_completed: number of rows with a truthy ``completed`` flag
      - files_scored: number of rows with a truthy ``scored`` flag
      - problem_completed: True when num_files equals files_scored
    """
    def _count_true(flags: pd.Series) -> int:
        # Missing flags are counted as False.
        return int(flags.fillna(False).astype(bool).sum())

    keys = ["data_type", "corpus", "scoring_model", "max_context_tokens", "problem"]
    grouped = metadata.groupby(keys, dropna=False)

    summary = grouped.agg(
        num_files=pd.NamedAgg(column="filename", aggfunc="size"),
        files_completed=pd.NamedAgg(column="completed", aggfunc=_count_true),
        files_scored=pd.NamedAgg(column="scored", aggfunc=_count_true),
    ).reset_index()

    summary["problem_completed"] = summary["num_files"].eq(summary["files_scored"])
    return summary

In [5]:
def parse_max_context_tokens(raw_dir_name: Optional[str]) -> Union[int, str, None]:
    """
    Translate a raw sub-directory name into its context-token setting.

    The name is converted with ``str()`` and stripped of surrounding whitespace
    before matching.

    Returns:
      - the string "None" for exactly "raw"
      - the numeric suffix as an int for "raw_<digits>" (e.g. "raw_200" -> 200)
      - Python None when the input is None or matches neither form
    """
    if raw_dir_name is not None:
        name = str(raw_dir_name).strip()

        if name == "raw":
            return "None"

        match = re.fullmatch(r"raw_(\d+)", name)
        if match is not None:
            return int(match.group(1))

    return None

In [6]:
def _read_for_combine(path: Path, *, engine: str | None = None) -> pd.DataFrame:
    # keep_default_na=False stops pandas treating "None" as NaN
    df = pd.read_excel(path, engine=engine, keep_default_na=False)

    # Normalise max_context_length so blanks become the string "None"
    if "max_context_length" in df.columns:
        def _norm(v):
            # "" happens if Excel cell is blank; NaN can still appear in some cases
            if v == "" or v is None or (isinstance(v, float) and pd.isna(v)):
                return "None"
            # make 200.0 -> 200 (optional nicety)
            if isinstance(v, float) and v.is_integer():
                return int(v)
            return v

        df["max_context_length"] = df["max_context_length"].map(_norm)

    return df

In [7]:
def build_and_save_token_level_raw_scores(
    base_loc: str | Path,
    metadata_base_loc: str | Path,
    data_type: str,
    corpus: str,
    model: str,
    *,
    raw_subdirs: Sequence[str] = ("raw",),
    save_dirname: str | None = None,   # NEW: e.g. "compiled" or "aggregates"
    sheet_name: str = "metadata",
    output_name: str = "token_level_raw_scores.xlsx",
    recursive: bool = False,
    engine: str | None = None,
    overwrite: bool = False,
    combine_files: bool = False,
    combined_file_prefix: str = None
) -> None:
    """
    For each row in completed_df, read all .xlsx files in:
        {base_loc}/{data_type}/{corpus}/{model}/{raw_subdir}

    Save outputs either:
      - alongside raw_subdir parent (default, original behaviour), OR
      - under {base_loc}/{data_type}/{corpus}/{model}/{save_dirname}/ (if provided)

    If multiple raw_subdirs are provided and output_name does NOT include "{raw_subdir}",
    output files are auto-suffixed to avoid collisions.
    """
    
    base_loc = Path(base_loc)
    metadata_base_loc = Path(metadata_base_loc)
    
    model_dir = base_loc / data_type / corpus / model
    save_base = (model_dir / save_dirname) if save_dirname else model_dir
    
    def _render_name(name: str, raw_subdir: str, *, disambiguate: bool) -> str:
        if "{raw_subdir}" in name:
            return name.format(raw_subdir=raw_subdir)
        if disambiguate:
            p = Path(name)
            return f"{p.stem}_{raw_subdir}{p.suffix}"
        return name
    
    def _combined_name(name: str, combined_file_prefix: str) -> str:
        # if the user used a template name, swap raw_subdir for "combined"
        if "{raw_subdir}" in name:
            return name.format(raw_subdir="combined")
        p = Path(name)
        return f"{combined_file_prefix}{p.stem}_combined{p.suffix}"

    if isinstance(raw_subdirs, str):
        raw_subdirs = (raw_subdirs,)

    disambiguate_names = len(tuple(raw_subdirs)) > 1
    
    def _paths_for(raw_subdir: str):
        out_name = _render_name(output_name, raw_subdir, disambiguate=disambiguate_names)
        meta_name = _render_name("problem_metadata.xlsx", raw_subdir, disambiguate=disambiguate_names)
        summary_meta_name = _render_name(
            "problem_completed_metadata.xlsx", raw_subdir, disambiguate=disambiguate_names
        )
        return (
            model_dir / raw_subdir,                 # raw_dir
            save_base / out_name,                   # out_path
            save_base / meta_name,                  # metadata_out_path
            save_base / summary_meta_name,          # summary_metadata_out_path
        )
        
    for raw_subdir in raw_subdirs:
        
        raw_dir = model_dir / raw_subdir

        out_name = _render_name(output_name, raw_subdir, disambiguate=disambiguate_names)
        meta_name = _render_name("problem_metadata.xlsx", raw_subdir, disambiguate=disambiguate_names)
        summary_meta_name = _render_name(
            "problem_completed_metadata.xlsx", raw_subdir, disambiguate=disambiguate_names
        )

        out_path = save_base / out_name
        metadata_out_path = save_base / meta_name
        summary_metadata_out_path = save_base / summary_meta_name

        max_context_tokens = parse_max_context_tokens(raw_subdir)
        
        # skip if output already exists
        if not overwrite and out_path.exists():
            print(f"SKIP (exists): {out_path}")
            continue

        # if raw dir missing / empty, skip
        if not raw_dir.exists():
            print(f"SKIP (no dir): {raw_dir}")
            continue

        excel_files = list_xlsx_files(raw_dir, recursive=recursive)
        if not excel_files:
            print(f"SKIP (no files): {raw_dir}")
            continue

        base_metadata = compare_complete_to_metadata(
            metadata_base_loc, data_type, corpus, model, excel_files, max_context_tokens=max_context_tokens
        )
        
        combined_metadata: list[pd.DataFrame] = []

        for ef in excel_files:
            f_name = ef.name
            try:
                data = read_excel_sheets(ef, [sheet_name])
                combined_metadata.append(data[sheet_name])

                # âœ… mark as scored if read succeeded
                base_metadata.loc[base_metadata["filename"] == f_name, "scored"] = True
            except Exception as e:
                print(f"  WARN: failed reading {sheet_name} from {ef}: {e}")

        if not combined_metadata:
            print(f"SKIP (no readable sheets): {raw_dir}")
            continue
        
        results = (
            pd.concat(combined_metadata, ignore_index=True)
            .sort_values(["sample_id", "min_token_size"], ascending=[True, True], kind="mergesort")
            .reset_index(drop=True)
        )

        # insert data_type before corpus, scoring_model after corpus
        if "corpus" in results.columns:
            corpus_idx = results.columns.get_loc("corpus")

            if "data_type" in results.columns:
                results.drop(columns=["data_type"], inplace=True)
            results.insert(corpus_idx, "data_type", data_type)

            corpus_idx = results.columns.get_loc("corpus")  # re-fetch
            if "scoring_model" in results.columns:
                results.drop(columns=["scoring_model"], inplace=True)
            results.insert(corpus_idx + 1, "scoring_model", model)
            
        # move problem before known_author
        if "problem" in results.columns and "known_author" in results.columns:
            problem_idx = results.columns.get_loc("problem")
            known_author_idx = results.columns.get_loc("known_author")

            problem_col = results.pop("problem")
            if problem_idx < known_author_idx:
                known_author_idx -= 1

            results.insert(known_author_idx, "problem", problem_col)

        # ensure save dir exists, then save
        save_base.mkdir(parents=True, exist_ok=True)

        results.to_excel(out_path, index=False, engine=engine)
        print(f"SAVED: {out_path}  (rows={len(results)})")

        base_metadata.to_excel(metadata_out_path, index=False, engine=engine)

        summary_metadata = create_problem_complete_metadata(base_metadata)
        summary_metadata.to_excel(summary_metadata_out_path, index=False, engine=engine)
        
# -------------------------
    # Combined outputs (optional)
    # -------------------------
    if not combine_files:
        return

    # Determine combined output paths
    combined_results_path = model_dir / _combined_name(output_name, combined_file_prefix)
    combined_summary_path = model_dir / _combined_name("problem_completed_metadata.xlsx", combined_file_prefix)

    if not overwrite and combined_results_path.exists() and combined_summary_path.exists():
        print(f"SKIP (combined exists): {combined_results_path}")
        print(f"SKIP (combined exists): {combined_summary_path}")
        return

    results_dfs: list[pd.DataFrame] = []
    summary_dfs: list[pd.DataFrame] = []

    for raw_subdir in raw_subdirs:
        _, out_path, _, summary_metadata_out_path = _paths_for(raw_subdir)

        # Read existing per-raw_subdir files (even if we skipped creating them above)
        if out_path.exists():
            df = _read_for_combine(out_path, engine=engine)
            if "raw_subdir" not in df.columns:
                df.insert(len(df.columns), "raw_subdir", raw_subdir)
            results_dfs.append(df)
        else:
            print(f"  WARN: missing per-subdir results file (skip in combine): {out_path}")

        if summary_metadata_out_path.exists():
            sdf = _read_for_combine(summary_metadata_out_path, engine=engine)
            if "raw_subdir" not in sdf.columns:
                sdf.insert(len(sdf.columns), "raw_subdir", raw_subdir)
            summary_dfs.append(sdf)
        else:
            print(f"  WARN: missing per-subdir summary file (skip in combine): {summary_metadata_out_path}")

    if results_dfs:
        combined_results = pd.concat(results_dfs, ignore_index=True)

        # nice deterministic ordering if columns exist
        sort_cols = [c for c in ["data_type", "corpus", "scoring_model", "max_context_length", "sample_id", "min_token_size"]
                     if c in combined_results.columns]
        if sort_cols:
            combined_results = combined_results.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

        save_base.mkdir(parents=True, exist_ok=True)
        combined_results.to_excel(combined_results_path, index=False, engine=engine)
        print(f"SAVED (combined): {combined_results_path}  (rows={len(combined_results)})")
    else:
        print("SKIP (combined results): no per-subdir result files found to combine")

    if summary_dfs:
        combined_summary = pd.concat(summary_dfs, ignore_index=True)

        sort_cols = [c for c in ["data_type", "corpus", "scoring_model", "max_context_length", "problem"]
                     if c in combined_summary.columns]
        if sort_cols:
            combined_summary = combined_summary.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

        save_base.mkdir(parents=True, exist_ok=True)
        combined_summary.to_excel(combined_summary_path, index=False, engine=engine)
        print(f"SAVED (combined): {combined_summary_path}  (rows={len(combined_summary)})")
    else:
        print("SKIP (combined summary): no per-subdir summary files found to combine")

In [8]:
# Compile results for every (model, data type, corpus) combination, in the same
# order as three nested loops over models -> data_types -> corpora.
for model_name, data_type, corpus in (
    (m, d, c) for m in models for d in data_types for c in corpora
):
    try:
        build_and_save_token_level_raw_scores(
            base_loc=base_loc,
            metadata_base_loc=metadata_base_loc,
            data_type=data_type,
            corpus=corpus,
            model=model_name,
            raw_subdirs=raw_subdirs,
            save_dirname="raw_results",
            sheet_name="metadata",
            output_name="token_level_scores.xlsx",
            recursive=False,
            engine="openpyxl",
            overwrite=False,
            combine_files=True,
            combined_file_prefix="raw_",
        )
    except Exception as e:
        # Best effort: report the failing combination and carry on with the next one.
        print(f"Missing/failed for model={model_name}, data_type={data_type}, corpus={corpus}: {e}")

SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/token_level_scores_raw.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/token_level_scores_raw_100.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/token_level_scores_raw_200.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/token_level_scores_raw_300.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/token_level_scores_raw_400.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/token_level_scores_raw_500.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/token_level_scores_raw_600.xlsx
SKIP (exists): /V



SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw.xlsx  (rows=203)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_100.xlsx  (rows=10561)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_200.xlsx  (rows=10561)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_300.xlsx  (rows=10561)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_400.xlsx  (rows=10561)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_500.xlsx  (rows=10561)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_600.xlsx  (rows=10561)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_700.xlsx  (rows=10561)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_800.xlsx  (rows=10556)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_900.xlsx  (rows=10544)




SAVED: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_results/token_level_scores_raw_1000.xlsx  (rows=10533)
SAVED (combined): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_token_level_scores_combined.xlsx  (rows=105763)
SAVED (combined): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Reddit/gpt2/raw_problem_completed_metadata_combined.xlsx  (rows=13200)
