In [6]:
# Create data/pairs.csv with 450 random (map_id, prompt_id) rows.
# - map_id values: subfolder names under data/samples/pairs/
# - prompt_id values: from data/prompts.csv (column 'prompt_id' or 'id')
# - Each prompt_id is used exactly once; map_id is sampled with replacement.

import os
import numpy as np
import pandas as pd

# --- paths (relative to the notebook) ---
ROOT_DIR = "../data/samples/pairs"      # where map subfolders live
PROMPTS_CSV = "../data/prompts.csv"     # your prompts table (must exist)
OUTPUT_DIR = "../data"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "pairs.csv")

N_ROWS = 450        # total pairs
SEED = 42           # set None for non-deterministic

rng = np.random.default_rng(SEED) if SEED is not None else np.random.default_rng()

# --- collect map_ids from folder names ---
if not os.path.isdir(ROOT_DIR):
    raise FileNotFoundError(f"Folder not found: {ROOT_DIR}")

map_ids = sorted([d for d in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, d))])
if not map_ids:
    raise RuntimeError(f"No subfolders found under {ROOT_DIR} (expected map_id folders).")

# --- read prompt_ids from prompts.csv ---
if not os.path.isfile(PROMPTS_CSV):
    raise FileNotFoundError(f"prompts.csv not found at: {PROMPTS_CSV}")

df_prompts = pd.read_csv(PROMPTS_CSV)
if "prompt_id" in df_prompts.columns:
    prompt_ids = df_prompts["prompt_id"].astype(str).str.strip().tolist()
elif "id" in df_prompts.columns:
    prompt_ids = df_prompts["id"].astype(str).str.strip().tolist()
else:
    raise ValueError("prompts.csv must have a 'prompt_id' or 'id' column.")

# drop empties/dupes, then sample exactly N_ROWS unique prompts
prompt_ids = [p for p in prompt_ids if p]
if len(set(prompt_ids)) < N_ROWS:
    raise ValueError(f"Need {N_ROWS} unique prompts; found only {len(set(prompt_ids))} unique in prompts.csv.")
prompt_ids = rng.choice(list(set(prompt_ids)), size=N_ROWS, replace=False)

# --- assign a random map_id to each prompt_id (with replacement) ---
chosen_maps = rng.choice(map_ids, size=N_ROWS, replace=True)

pairs = pd.DataFrame({"map_id": chosen_maps, "prompt_id": prompt_ids})
os.makedirs(OUTPUT_DIR, exist_ok=True)
pairs.to_csv(OUTPUT_FILE, index=False)

print(f"[OK] Saved {len(pairs)} rows to {OUTPUT_FILE}")
print(f"Unique maps used: {pairs['map_id'].nunique()} / {len(map_ids)} available")
print(f"Unique prompts used: {pairs['prompt_id'].nunique()} (should be {N_ROWS})")

pairs.head(10)

[OK] Saved 450 rows to ../data/pairs.csv
Unique maps used: 242 / 300 available
Unique prompts used: 450 (should be 450)


Unnamed: 0,map_id,prompt_id
0,1203,p467
1,80,p053
2,171,p380
3,523,p415
4,1579,p173
5,948,p360
6,1344,p320
7,867,p140
8,469,p280
9,804,p198
