In [1]:
import sys
import re

import pandas as pd

from pathlib import Path
from typing import Union, Sequence, Optional
from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_excel_sheets, read_rds
from utils import list_xlsx_files

In [2]:
# Values passed as the `model` argument in the driver loop below.
models = ["gpt2"]

# Corpus names; each one is used as a directory component under a data type.
corpora = [
    "Wiki",
    "Enron",
    "Perverted Justice",
    "StackExchange",
    "ACL",
    "TripAdvisor",
    "The Apricity",
    "Koppel's Blogs",
    "The Telegraph",
    "Reddit",
]

data_types = ["training", "test"]

# "raw" followed by "raw_100", "raw_200", ..., "raw_1000" (steps of 100).
raw_subdirs = ("raw",) + tuple(f"raw_{limit}" for limit in range(100, 1001, 100))

# Root of the per-file result workbooks.
base_loc = "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"

# Root of the doc-level metadata (.rds) files.
metadata_base_loc = "/Volumes/BCross/datasets/author_verification"

In [3]:
def compare_complete_to_metadata(metadata_base_loc, data_type, corpus, model, excel_files, max_context_tokens=None):
    """
    Flag which documents of one corpus already have a result workbook.

    Reads `{metadata_base_loc}/{data_type}/doc_level_metadata.rds` with
    `read_rds` (assumed to return a pandas DataFrame with at least `corpus`
    and `filename` columns - TODO confirm), keeps the rows whose `corpus`
    equals `corpus`, and adds:

      - scoring_model:      the `model` argument
      - max_context_tokens: the `max_context_tokens` argument
      - completed:          True when the row's `filename` matches the base
                            name of an entry in `excel_files`
      - scored:             always False

    Parameters
    ----------
    metadata_base_loc : str or Path
        Directory that contains one sub-directory per data type.
    data_type : str
        Sub-directory to read the metadata from.
    corpus : str
        Value of the `corpus` column to keep.
    model : str
        Stored in the `scoring_model` column.
    excel_files : iterable of str or Path
        Result workbooks found on disk; only their base names are used.
    max_context_tokens : optional
        Stored in the `max_context_tokens` column.

    Returns
    -------
    pd.DataFrame
        One row per metadata row of the corpus, with a fresh 0..n-1 index.
    """
    metadata_loc = f"{metadata_base_loc}/{data_type}/doc_level_metadata.rds"
    metadata = read_rds(metadata_loc)

    # Path() accepts both str and Path entries; a set makes the membership
    # test insensitive to the same file name being listed more than once
    # (a left merge would duplicate the metadata row in that case).
    completed_names = {Path(ef).name for ef in excel_files}

    corpus_rows = metadata[metadata["corpus"] == corpus]

    # assign() builds a new frame, so the frame returned by read_rds is not
    # modified and no chained-assignment warning is triggered.
    metadata_df = corpus_rows.assign(
        scoring_model=model,
        max_context_tokens=max_context_tokens,
        completed=corpus_rows["filename"].isin(completed_names),
        scored=False,
    ).reset_index(drop=True)

    return metadata_df

In [4]:
def create_problem_complete_metadata(metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Roll per-file flags up to one row per problem.

    Rows are grouped by (data_type, corpus, scoring_model,
    max_context_tokens, problem); missing values in those keys form their
    own groups. The result has one row per group with:
      - num_files: number of rows in the group
      - files_completed: rows whose `completed` flag is truthy
      - files_scored: rows whose `scored` flag is truthy
      - problem_completed: True when num_files == files_scored
    """
    def _count_true(flags: pd.Series) -> int:
        # A missing flag counts as False.
        return int(flags.fillna(False).astype(bool).sum())

    keys = ["data_type", "corpus", "scoring_model", "max_context_tokens", "problem"]

    per_problem = metadata.groupby(keys, dropna=False)
    summary = per_problem.agg(
        num_files=("filename", "size"),
        files_completed=("completed", _count_true),
        files_scored=("scored", _count_true),
    )
    summary = summary.reset_index()

    summary["problem_completed"] = summary["num_files"].eq(summary["files_scored"])
    return summary

In [5]:
def parse_max_context_tokens(raw_dir_name: Optional[str]) -> Union[int, str, None]:
    """
    Translate a raw sub-directory name into its context-token setting.

    raw_dir_name: "raw", "raw_<digits>" (e.g. "raw_200"), or None.
    Surrounding whitespace is ignored.

    Returns:
      - None (Python) when raw_dir_name is None
      - "None" (the string) when the name is exactly "raw"
      - int for "raw_<digits>" (e.g. "raw_200" -> 200)
      - None (Python) for any other name
    """
    if raw_dir_name is None:
        return None

    name = str(raw_dir_name).strip()
    if name == "raw":
        return "None"

    suffix_match = re.fullmatch(r"raw_(\d+)", name)
    if suffix_match is None:
        return None
    return int(suffix_match.group(1))

In [6]:
def _read_for_combine(path: Path, *, engine: str | None = None) -> pd.DataFrame:
    """
    Read one workbook for the combine step.

    The file is read with `keep_default_na=False` so that the text "None" is
    kept as a string instead of being turned into NaN. When a
    `max_context_length` column is present, blank/None/NaN cells in it are
    replaced by the string "None" and whole-number floats become ints
    (200.0 -> 200); every other value is left as read.
    """
    def _as_context_label(cell):
        # "" is what a blank Excel cell becomes here; NaN can still appear.
        if cell == "" or cell is None or (isinstance(cell, float) and pd.isna(cell)):
            return "None"
        if isinstance(cell, float) and cell.is_integer():
            return int(cell)
        return cell

    frame = pd.read_excel(path, engine=engine, keep_default_na=False)

    column = "max_context_length"
    if column not in frame.columns:
        return frame

    frame[column] = frame[column].map(_as_context_label)
    return frame

In [7]:
def build_and_save_phrase_scores_with_metadata_prefix(
    base_loc: str | Path,
    data_type: str,
    corpus: str,
    model: str,
    *,
    raw_subdirs: Sequence[str] = ("raw",),
    save_dirname: str | None = None,
    meta_sheet: str = "metadata",
    phrase_sheet: str = "phrase score",
    output_name: str = "phrase_scores_with_meta.xlsx",
    recursive: bool = False,
    engine: str | None = None,
    overwrite: bool = False,
    combine_files: bool = False,
    combined_file_prefix: str | None = None,
) -> None:
    """
    For each .xlsx in {base_loc}/{data_type}/{corpus}/{model}/{raw_subdir}:
      - read `meta_sheet` and `phrase_sheet`
      - take metadata columns from start up to and including `compute_type`
      - take FIRST ROW of that metadata subset
      - repeat it to match number of rows in phrase score
      - prepend those columns to the phrase score dataframe
    Then concat all files in the directory and save to `output_name`.

    Optionally: `combine_files=True` will combine the per-raw_subdir outputs into one file.
    Assumes you already have:
      - list_xlsx_files(dir, recursive=...)
      - read_excel_sheets(path, sheet_names=[...]) -> dict[str, pd.DataFrame]
      - _read_for_combine(path, engine=...) -> pd.DataFrame  (only needed if combine_files=True)
    """
    base_loc = Path(base_loc)

    if isinstance(raw_subdirs, str):
        raw_subdirs = (raw_subdirs,)

    model_dir = base_loc / data_type / corpus / model
    save_base = (model_dir / save_dirname) if save_dirname else model_dir
    save_base.mkdir(parents=True, exist_ok=True)

    disambiguate_names = len(tuple(raw_subdirs)) > 1

    def _render_name(name: str, raw_subdir: str, *, disambiguate: bool) -> str:
        if "{raw_subdir}" in name:
            return name.format(raw_subdir=raw_subdir)
        if disambiguate:
            p = Path(name)
            return f"{p.stem}_{raw_subdir}{p.suffix}"
        return name

    def _combined_name(name: str, prefix: str | None) -> str:
        prefix = prefix or ""
        if "{raw_subdir}" in name:
            return name.format(raw_subdir="combined")
        p = Path(name)
        return f"{prefix}{p.stem}_combined{p.suffix}"

    def _prefix_metadata(meta: pd.DataFrame) -> list[str]:
        if meta.empty:
            return []
        if "compute_type" in meta.columns:
            end = meta.columns.get_loc("compute_type")
            return list(meta.columns[: end + 1])  # inclusive
        # fallback: if compute_type missing, just use all columns
        return list(meta.columns)

    def _process_one_file(xlsx_path: Path) -> pd.DataFrame | None:
        try:
            data = read_excel_sheets(xlsx_path, [meta_sheet, phrase_sheet])
        except Exception as e:
            print(f"  WARN: failed reading sheets from {xlsx_path.name}: {e}")
            return None

        if meta_sheet not in data or phrase_sheet not in data:
            print(f"  WARN: missing sheet(s) in {xlsx_path.name} (need '{meta_sheet}' + '{phrase_sheet}')")
            return None

        meta = data[meta_sheet]
        phrase = data[phrase_sheet]

        if meta is None or meta.empty:
            print(f"  WARN: empty '{meta_sheet}' in {xlsx_path.name}")
            return None
        if phrase is None or phrase.empty:
            # nothing to contribute
            return None

        prefix_cols = _prefix_metadata(meta)
        if not prefix_cols:
            return None

        if "compute_type" not in meta.columns:
            print(f"  WARN: 'compute_type' not found in {xlsx_path.name} metadata; using all metadata columns")

        meta_row = meta.loc[0, prefix_cols]

        # repeat first metadata row to match phrase score length
        meta_rep = pd.DataFrame([meta_row.to_dict()] * len(phrase))

        # avoid duplicate column names: metadata wins
        overlap = [c for c in phrase.columns if c in meta_rep.columns]
        if overlap:
            phrase = phrase.drop(columns=overlap)

        out = pd.concat(
            [meta_rep.reset_index(drop=True), phrase.reset_index(drop=True)],
            axis=1,
        )
        return out

    def _paths_for(raw_subdir: str):
        raw_dir = model_dir / raw_subdir
        out_name = _render_name(output_name, raw_subdir, disambiguate=disambiguate_names)
        out_path = save_base / out_name
        return raw_dir, out_path

    # -------------------------
    # Per-raw_subdir builds
    # -------------------------
    for raw_subdir in raw_subdirs:
        raw_dir, out_path = _paths_for(raw_subdir)

        if not overwrite and out_path.exists():
            print(f"SKIP (exists): {out_path}")
            continue

        if not raw_dir.exists():
            print(f"SKIP (no dir): {raw_dir}")
            continue

        excel_files = list_xlsx_files(raw_dir, recursive=recursive)
        if not excel_files:
            print(f"SKIP (no files): {raw_dir}")
            continue

        file_dfs: list[pd.DataFrame] = []
        for ef in excel_files:
            df = _process_one_file(Path(ef))
            if df is not None and not df.empty:
                file_dfs.append(df)

        if not file_dfs:
            print(f"SKIP (no readable data): {raw_dir}")
            continue

        results = pd.concat(file_dfs, ignore_index=True)

        # deterministic ordering if columns exist (optional but usually helpful)
        sort_cols = [c for c in ["sample_id", "min_token_size", "phrase_num", "phrase_occurrence"] if c in results.columns]
        if sort_cols:
            results = results.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

        results.to_excel(out_path, index=False, engine=engine)
        print(f"SAVED: {out_path}  (rows={len(results)})")

    # -------------------------
    # Combined outputs across raw_subdirs (optional)
    # -------------------------
    if not combine_files:
        return

    combined_results_path = model_dir / _combined_name(output_name, combined_file_prefix)

    if not overwrite and combined_results_path.exists():
        print(f"SKIP (combined exists): {combined_results_path}")
        return

    combined_parts: list[pd.DataFrame] = []
    for raw_subdir in raw_subdirs:
        _, out_path = _paths_for(raw_subdir)
        if out_path.exists():
            df = _read_for_combine(out_path, engine=engine)
            if "raw_subdir" not in df.columns:
                df.insert(len(df.columns), "raw_subdir", raw_subdir)
            combined_parts.append(df)
        else:
            print(f"  WARN: missing per-subdir file (skip in combine): {out_path}")

    if not combined_parts:
        print("SKIP (combined): no per-subdir result files found to combine")
        return

    combined = pd.concat(combined_parts, ignore_index=True)
    sort_cols = [c for c in ["data_type", "corpus", "scoring_model", "max_context_length", "sample_id", "min_token_size"] if c in combined.columns]
    if sort_cols:
        combined = combined.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

    combined.to_excel(combined_results_path, index=False, engine=engine)
    print(f"SAVED (combined): {combined_results_path}  (rows={len(combined)})")


In [8]:
# Keyword arguments that are the same for every (model, data_type, corpus) run.
phrase_score_build_options = dict(
    raw_subdirs=raw_subdirs,
    save_dirname="raw_results",
    meta_sheet="metadata",
    phrase_sheet="phrase score",
    output_name="phrase_scores.xlsx",
    recursive=False,
    engine="openpyxl",
    overwrite=False,
    combine_files=False,
    combined_file_prefix=None,
)

# Same visiting order as nested loops: models, then data types, then corpora.
phrase_score_jobs = [
    (m, d, c)
    for m in models
    for d in data_types
    for c in corpora
]

for model_name, data_type, corpus in phrase_score_jobs:
    try:
        build_and_save_phrase_scores_with_metadata_prefix(
            base_loc=base_loc,
            data_type=data_type,
            corpus=corpus,
            model=model_name,
            **phrase_score_build_options,
        )
    except Exception as e:
        # Best-effort: report the failing combination and carry on with the rest.
        print(f"Missing/failed for model={model_name}, data_type={data_type}, corpus={corpus}: {e}")

SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/phrase_scores_raw.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/phrase_scores_raw_100.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/phrase_scores_raw_200.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/phrase_scores_raw_300.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/phrase_scores_raw_400.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/phrase_scores_raw_500.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/raw_results/phrase_scores_raw_600.xlsx
SKIP (exists): /Volumes/BCross/av_datasets_experimen