# Create 100-row test subsets (always CSV outputs)

Reads:
- `data/task-a-title.csv` (may be one-headline-per-line, sometimes not valid comma-separated values)
- `data/task-a-two-words.csv`
- `data/task-b1.tsv`
- `data/task-b2.tsv`

For each file, samples **100 random rows** (or fewer if file has fewer rows) and writes **CSV** outputs to:
`data/generated_data/test_subsets/`

Also writes a manifest with sampled indices: `test_subsets_manifest.json`


In [65]:
from __future__ import annotations

from pathlib import Path
import json
import pandas as pd
import numpy as np
from pandas.errors import ParserError

SEED = 42
N_SAMPLES = 100

INPUTS = [
    ("task-a-title.csv", "tab"),        # tab-separated values, despite .csv extension
    ("task-a-two-words.csv", "tab"),    # tab-separated values, despite .csv extension
    ("task-b1.tsv", "tab"),
    ("task-b2.tsv", "tab"),
]



In [66]:
def find_project_root(start: Path | None = None) -> Path:
    start = (start or Path.cwd()).resolve()
    here = start
    for _ in range(12):
        data_dir = here / "data"
        if data_dir.is_dir():
            ok = True
            for fname, _kind in INPUTS:
                if not (data_dir / fname).exists():
                    ok = False
                    break
            if ok:
                return here
        if here.parent == here:
            break
        here = here.parent
    return start

ROOT = find_project_root()
DATA_DIR = ROOT / "data"
OUT_DIR = DATA_DIR / "test_subsets"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Project root:", ROOT)
print("Data dir:", DATA_DIR)
print("Output dir:", OUT_DIR)


Project root: /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition
Data dir: /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/data
Output dir: /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/data/test_subsets


In [67]:
rng = np.random.default_rng(SEED)

import re
from pandas.errors import ParserError

_ID_LINE_RE = re.compile(r"^\s*(\d+)\s+(.*\S)\s*$")  # id + whitespace + headline

def read_title_csv_loose(path: Path) -> pd.DataFrame:
    """
    Robust reader for task-a-title.csv.

    Preferred:
    - If file is a proper CSV with columns id/headline -> use it.

    Fallback:
    - Treat as "one example per line" where each line starts with a numeric id:
        <id><whitespace><headline...>
      This is what you want: number becomes id, remainder becomes headline.
    """

    # 1) Try proper CSV first (if it really is CSV)
    try:
        df = pd.read_csv(path, keep_default_na=False, engine="python")
        cols = {c.strip().lower(): c for c in df.columns}

        if "id" in cols and "headline" in cols:
            out = df[[cols["id"], cols["headline"]]].copy()
            out.columns = ["id", "headline"]
            out["id"] = out["id"].astype(str)
            out["headline"] = out["headline"].astype(str)
            return out
    except ParserError:
        pass
    except Exception:
        pass

    # 2) Fallback: raw lines with "<id> <headline>"
    lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    rows = []
    for ln in lines:
        ln = ln.strip()
        if not ln:
            continue
        low = ln.lower()
        if low in {"headline", "id\theadline", "id,headline"}:
            continue

        m = _ID_LINE_RE.match(ln)
        if m:
            rows.append((m.group(1), m.group(2)))
        else:
            # If a line has no numeric prefix, keep it but use its line index as id
            # (this should be rare; remove this block if you prefer to skip such lines)
            rows.append((str(len(rows)), ln))

    return pd.DataFrame(rows, columns=["id", "headline"])


import pandas as pd

def read_table(path: Path, kind: str) -> pd.DataFrame:
    if kind == "tab":
        df = pd.read_csv(path, sep="\t", keep_default_na=False, engine="python")

        # Many of these files have an extra first column (index) with an empty header.
        # Drop it if it exists.
        first_col = df.columns[0]
        if first_col == "" or str(first_col).startswith("Unnamed"):
            df = df.drop(columns=[first_col])

        return df

    raise ValueError(f"Unknown kind: {kind}")


def sample_indices(n_rows: int, n_samples: int) -> list[int]:
    k = min(n_rows, n_samples)
    idx = rng.choice(n_rows, size=k, replace=False)
    idx = np.sort(idx)
    return [int(i) for i in idx]

def ensure_id_column(df: pd.DataFrame, source_name: str) -> pd.DataFrame:
    """
    Ensure there is an 'id' column. If missing, create a stable id from row position.
    """
    if "id" in df.columns:
        df["id"] = df["id"].astype(str)
        return df

    # Create stable IDs (use source_name prefix to avoid collisions across files)
    df = df.copy()
    df.insert(0, "id", [f"{source_name}_{i:06d}" for i in range(len(df))])
    return df


def make_subset_csv(input_path: Path, kind: str, out_dir: Path, n_samples: int) -> dict:
    df = read_table(input_path, kind)

    # Ensure id column exists BEFORE sampling
    df = ensure_id_column(df, source_name=input_path.stem)

    idx = sample_indices(len(df), n_samples)
    subset = df.iloc[idx].copy()

    if input_path.name == "task-a-title.csv":
        subset = subset[["id", "headline"]]
        
    if input_path.name == "task-a-two-words.csv":
        subset = subset[["id", "word1", "word2", "headline"]]

    # Always write CSV outputs
    out_path = out_dir / (input_path.stem + f".test{len(idx)}.csv")
    subset.to_csv(out_path, sep="\t", index=False)

    return {
        "input": str(input_path),
        "output": str(out_path),
        "rows_in": int(len(df)),
        "rows_out": int(len(subset)),
        "indices": idx,
        "seed": int(SEED),
    }



In [68]:
results = []
for fname, kind in INPUTS:
    inp = DATA_DIR / fname
    if not inp.exists():
        raise FileNotFoundError(f"Missing input file: {inp}")
    meta = make_subset_csv(inp, kind, OUT_DIR, N_SAMPLES)
    results.append(meta)
    print(f"✅ {fname}: {meta['rows_in']} -> {meta['rows_out']} rows")
    print("   ->", meta["output"])

manifest = OUT_DIR / "test_subsets_manifest.json"
manifest.write_text(json.dumps(
    {
        "seed": SEED,
        "n_samples": N_SAMPLES,
        "created_utc": __import__("datetime").datetime.utcnow().isoformat(timespec="seconds") + "Z",
        "subsets": results,
    },
    indent=2
), encoding="utf-8")

print("\nSaved manifest:", manifest)


✅ task-a-title.csv: 1100 -> 100 rows
   -> /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/data/test_subsets/task-a-title.test100.csv
✅ task-a-two-words.csv: 100 -> 100 rows
   -> /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/data/test_subsets/task-a-two-words.test100.csv
✅ task-b1.tsv: 1100 -> 100 rows
   -> /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/data/test_subsets/task-b1.test100.csv
✅ task-b2.tsv: 500 -> 100 rows
   -> /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/data/test_subsets/task-b2.test100.csv

Saved manifest: /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/data/test_subsets/test_subsets_manifest.json


  "created_utc": __import__("datetime").datetime.utcnow().isoformat(timespec="seconds") + "Z",


## Outputs

You should now have **4 CSV files** in `data/generated_data/test_subsets/`:

- `task-a-title.test100.csv`
- `task-a-two-words.test100.csv`
- `task-b1.test100.csv`
- `task-b2.test100.csv`

(If any source has fewer than 100 rows, the output will be `.test<N>.csv` accordingly.)
