In [1]:
# ===================== 01_generate_embeddings — CELL 0: Bootstrap =====================

import os
import sys
from pathlib import Path

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "src/imgofup" folder.
p = Path.cwd().resolve()
REPO_ROOT = None
for candidate in (p, *p.parents):
    if candidate.joinpath("src", "imgofup").is_dir():
        REPO_ROOT = candidate
        break
else:
    # The loop ran to completion without a match, so REPO_ROOT is still None.
    raise RuntimeError("Could not find repo root (no 'src/imgofup' found).")

# Put the source tree first on the import path; the membership test keeps
# re-runs of this cell from prepending duplicate entries.
SRC_DIR = REPO_ROOT.joinpath("src")
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Export the root through the environment (presumably read by imgofup.config
# when it resolves project paths -- confirm against that module).
os.environ["PROJ_ROOT"] = str(REPO_ROOT)

print("ðŸ“¦ Repo root:", REPO_ROOT)
print("ðŸ“¦ Using src from:", SRC_DIR)
print("ðŸ”§ PROJ_ROOT env set to:", os.environ["PROJ_ROOT"])

# Base data directory used by the experiment registry in the next cell.
DATA_DIR = REPO_ROOT.joinpath("data")


ðŸ“¦ Repo root: /Users/amirdonyadide/Documents/GitHub/IMGOFUP
ðŸ“¦ Using src from: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/src
ðŸ”§ PROJ_ROOT env set to: /Users/amirdonyadide/Documents/GitHub/IMGOFUP


In [2]:
# ===================== 01_generate_embeddings — CELL 1: Experiment registry =====================

from pathlib import Path

# Registry of experiments, keyed by experiment name. Each entry holds:
#   train_out           - output directory for the experiment (prompt artifacts
#                         and feature matrices are written under it later on)
#   model_out           - second output directory (presumably for trained
#                         models -- nothing in this notebook section writes to it)
#   feature_mode        - "prompt_only", "map_only" or "prompt_plus_map"
#   prompt_encoder_kind - prompt encoder identifier; absent for map_only
EXPERIMENTS = {
    "openai_prompt_only": {
        "train_out": DATA_DIR / "output" / "train_out_openai_prompt_only",
        "model_out": DATA_DIR / "output" / "models" / "exp_openai_prompt_only",
        "feature_mode": "prompt_only",
        "prompt_encoder_kind": "openai-small",
    },
    "use_prompt_only": {
        "train_out": DATA_DIR / "output" / "train_out_use_prompt_only",
        "model_out": DATA_DIR / "output" / "models" / "exp_use_prompt_only",
        "feature_mode": "prompt_only",
        "prompt_encoder_kind": "dan",
    },
    "map_only": {
        "train_out": DATA_DIR / "output" / "train_out_map_only",
        "model_out": DATA_DIR / "output" / "models" / "exp_map_only",
        "feature_mode": "map_only",
    },
    "use_map": {
        "train_out": DATA_DIR / "output" / "train_out_use_map",
        "model_out": DATA_DIR / "output" / "models" / "exp_use_map",
        "feature_mode": "prompt_plus_map",
        "prompt_encoder_kind": "dan",
    },
    "openai_map": {
        "train_out": DATA_DIR / "output" / "train_out_openai_map",
        "model_out": DATA_DIR / "output" / "models" / "exp_openai_map",
        "feature_mode": "prompt_plus_map",
        "prompt_encoder_kind": "openai-small",
    },
}

# Coerce both output locations to Path (so entries written as plain strings
# also work) and create the directories up front.
for exp_cfg in EXPERIMENTS.values():
    exp_cfg["train_out"] = Path(exp_cfg["train_out"])
    exp_cfg["model_out"] = Path(exp_cfg["model_out"])
    exp_cfg["train_out"].mkdir(parents=True, exist_ok=True)
    exp_cfg["model_out"].mkdir(parents=True, exist_ok=True)

# Column widths are derived from the registry: a fixed width of 14 is one
# character too short for "prompt_plus_map" and pushed those rows out of line.
# The previous widths (18 / 14) are kept as minimums; the list form also copes
# with an empty registry.
exp_name_width = max([18, *(len(name) for name in EXPERIMENTS)])
feature_mode_width = max([14, *(len(c["feature_mode"]) for c in EXPERIMENTS.values())])

print("ðŸ§ª Experiments:")
for exp_name, cfg in EXPERIMENTS.items():
    pe = cfg.get("prompt_encoder_kind", "-")  # "-" marks experiments without a prompt encoder
    print(
        f" - {exp_name:{exp_name_width}s} | "
        f"mode={cfg['feature_mode']:{feature_mode_width}s} | prompt={pe:14s}"
    )


ðŸ§ª Experiments:
 - openai_prompt_only | mode=prompt_only    | prompt=openai-small  
 - use_prompt_only    | mode=prompt_only    | prompt=dan           
 - map_only           | mode=map_only       | prompt=-             
 - use_map            | mode=prompt_plus_map | prompt=dan           
 - openai_map         | mode=prompt_plus_map | prompt=openai-small  


In [3]:
# ===================== 01_generate_embeddings — CELL 2: Prompt embeddings (experiment-scoped) =====================

from pathlib import Path
from dataclasses import replace

from imgofup.config import paths
from imgofup.config.constants import (
    PROMPT_EMBEDDINGS_NPZ_NAME,
    PROMPTS_PARQUET_NAME,
    PROMPT_EMBED_VERBOSITY_DEFAULT,
    PROMPT_EMBED_L2_NORMALIZE_DEFAULT,
    PROMPT_EMBED_SAVE_CSV_DEFAULT,
)
from imgofup.pipelines.run_prompt_embeddings import run_prompt_embeddings_from_config

# Build (or reuse) prompt embeddings for every experiment whose feature_mode is
# not "map_only". Artifacts are stored per experiment under
# <train_out>/prompt_out, and one metadata dict per processed experiment is
# collected in prompt_meta_by_experiment.
print("\n=== Running prompt embeddings for experiments that require prompts ===")

# Result metadata keyed by experiment name; map_only experiments are skipped
# in the loop below and therefore get no entry.
prompt_meta_by_experiment = {}

# IMPORTANT: because prompt_id is now read from Excel, old artifacts may be stale.
# NOTE(review): the reuse check further down only tests that the two artifact
# files exist; it cannot detect stale content, so set this flag to True after
# the Excel input changes.
FORCE_REBUILD_PROMPTS = False  # set True to recompute even if artifacts exist

# Validate Excel input path early
input_xlsx = Path(paths.PATHS.USER_STUDY_XLSX).expanduser().resolve()
print(f"\nðŸ“Œ USER_STUDY_XLSX = {input_xlsx}")
print("   is_file:", input_xlsx.is_file())

# Fail before the loop so no experiment is started with a missing input file.
if not input_xlsx.is_file():
    raise FileNotFoundError(
        f"USER_STUDY_XLSX is not a file: {input_xlsx}\n"
        "Fix: set PROJ_ROOT correctly in Cell 0 OR set USER_STUDY_XLSX env var."
    )

for exp_name, exp_cfg in EXPERIMENTS.items():
    feature_mode = exp_cfg["feature_mode"]

    # map_only experiments use no prompt embeddings: report and move on.
    if feature_mode == "map_only":
        print(f"\nðŸ§ª Experiment: {exp_name}")
        print("   (skip) feature_mode=map_only â†’ no prompt embeddings required.")
        continue

    # Use the experiment's encoder, falling back to the one in the shared config.
    # dataclasses.replace returns a modified copy, so paths.CFG itself is not
    # changed between iterations.
    prompt_encoder_kind = exp_cfg.get("prompt_encoder_kind", paths.CFG.PROMPT_ENCODER)
    CFG_EXP = replace(paths.CFG, PROMPT_ENCODER=str(prompt_encoder_kind))

    # Store prompt artifacts inside each experiment's train_out
    prompt_out_dir = Path(exp_cfg["train_out"]) / "prompt_out"
    prompt_out_dir.mkdir(parents=True, exist_ok=True)

    # The two files whose presence counts as "already computed".
    emb_npz = prompt_out_dir / PROMPT_EMBEDDINGS_NPZ_NAME
    prm_pq  = prompt_out_dir / PROMPTS_PARQUET_NAME

    print(f"\nðŸ§ª Experiment: {exp_name}")
    print(f"   feature_mode   : {feature_mode}")
    print(f"   PROMPT_ENCODER : {CFG_EXP.PROMPT_ENCODER}")
    print(f"   Output dir     : {prompt_out_dir}")

    # Reuse existing artifacts unless a rebuild is forced.
    if (not FORCE_REBUILD_PROMPTS) and emb_npz.is_file() and prm_pq.is_file():
        print("   âœ… Prompt embeddings already exist â€” skipping recomputation.")
        # Hand-built stand-in for the pipeline's return value.
        # NOTE(review): the keys returned by run_prompt_embeddings_from_config
        # are not visible here -- confirm consumers accept both shapes.
        meta = {
            "out_dir": str(prompt_out_dir),
            "embeddings_path": str(emb_npz),
            "prompts_parquet_path": str(prm_pq),
            "skipped": True,
        }
    else:
        meta = run_prompt_embeddings_from_config(
            input_path=input_xlsx,
            out_dir=prompt_out_dir,
            cfg=CFG_EXP,
            paths=paths.PATHS,
            verbosity=PROMPT_EMBED_VERBOSITY_DEFAULT,
            l2_normalize=PROMPT_EMBED_L2_NORMALIZE_DEFAULT,
            also_save_embeddings_csv=PROMPT_EMBED_SAVE_CSV_DEFAULT,
        )
        print("   âœ… Prompt embeddings completed.")

    prompt_meta_by_experiment[exp_name] = meta

print("\nâœ… Prompt embedding step finished.")



=== Running prompt embeddings for experiments that require prompts ===

ðŸ“Œ USER_STUDY_XLSX = /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/userstudy/UserStudy.xlsx
   is_file: True

ðŸ§ª Experiment: openai_prompt_only
   feature_mode   : prompt_only
   PROMPT_ENCODER : openai-small
   Output dir     : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_openai_prompt_only/prompt_out
   âœ… Prompt embeddings already exist â€” skipping recomputation.

ðŸ§ª Experiment: use_prompt_only
   feature_mode   : prompt_only
   PROMPT_ENCODER : dan
   Output dir     : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_use_prompt_only/prompt_out
   âœ… Prompt embeddings already exist â€” skipping recomputation.

ðŸ§ª Experiment: map_only
   (skip) feature_mode=map_only â†’ no prompt embeddings required.

ðŸ§ª Experiment: use_map
   feature_mode   : prompt_plus_map
   PROMPT_ENCODER : dan
   Output dir     : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/o

In [4]:
# ===================== 01_generate_embeddings — CELL 3: Map embeddings (shared) =====================

from pathlib import Path

from imgofup.config import paths
from imgofup.config.constants import MAP_EMBEDDINGS_NPZ_NAME, MAPS_PARQUET_NAME
from imgofup.pipelines.run_map_embeddings import run_map_embeddings_from_config

# Build (or reuse) one shared set of map embeddings under
# <MAP_OUT>/shared_extent. Later cells read MAP_EMB_DIR to find these artifacts.
print("\n=== Map embeddings (shared across all experiments) ===")

# Resolve both inputs to absolute paths so the diagnostics below are unambiguous.
maps_root = Path(paths.PATHS.MAPS_ROOT).expanduser().resolve()
xlsx_path = Path(paths.PATHS.USER_STUDY_XLSX).expanduser().resolve()

print("MAPS_ROOT      :", maps_root)
print("  is_dir       :", maps_root.is_dir())
print("USER_STUDY_XLSX:", xlsx_path)
print("  is_file      :", xlsx_path.is_file())

# Fail early on missing inputs, before any output directory is created.
if not maps_root.is_dir():
    raise NotADirectoryError(f"MAPS_ROOT is not a directory: {maps_root}")
if not xlsx_path.is_file():
    raise FileNotFoundError(f"USER_STUDY_XLSX is not a file: {xlsx_path}")

# Compute once and reuse (map embeddings do not depend on prompt backend)
MAP_EMB_DIR = Path(paths.PATHS.MAP_OUT) / "shared_extent"
MAP_EMB_DIR.mkdir(parents=True, exist_ok=True)

# The two files whose presence counts as "already computed".
maps_npz = MAP_EMB_DIR / MAP_EMBEDDINGS_NPZ_NAME
maps_pq  = MAP_EMB_DIR / MAPS_PARQUET_NAME

# Set to True to recompute even when both artifacts exist; the reuse check
# below only tests file existence, not whether the content is current.
FORCE_REBUILD_MAPS = False

print("Target dir:", MAP_EMB_DIR)
print("Artifacts :", maps_npz.name, "|", maps_pq.name)

if (not FORCE_REBUILD_MAPS) and maps_npz.is_file() and maps_pq.is_file():
    print("âœ… Map embeddings already exist â€” skipping recomputation.")
    # Minimal stand-in for the pipeline's return value.
    map_meta = {"out_dir": str(MAP_EMB_DIR), "skipped": True}
else:
    map_meta = run_map_embeddings_from_config(
        maps_root=maps_root,
        input_pattern=paths.PATHS.INPUT_MAPS_PATTERN,
        user_study_xlsx=xlsx_path,
        responses_sheet=paths.PATHS.RESPONSES_SHEET,
        tile_id_col=paths.PATHS.TILE_ID_COL,
        complete_col=paths.PATHS.COMPLETE_COL,
        remove_col=paths.PATHS.REMOVE_COL,
        # shared embeddings: no filtering here (keep stable dataset)
        only_complete=False,
        exclude_removed=False,
        out_dir=MAP_EMB_DIR,
        verbosity=1,
        # NOTE(review): "extent" presumably selects the normalisation scheme that
        # the "shared_extent" directory name refers to -- confirm in the pipeline.
        norm="extent",
    )
    print("âœ… Map embeddings completed.")

# Post-condition for both branches: the artifacts must exist on disk now.
if not maps_npz.is_file():
    raise FileNotFoundError(f"Missing {MAP_EMBEDDINGS_NPZ_NAME} at: {maps_npz}")
if not maps_pq.is_file():
    raise FileNotFoundError(f"Missing {MAPS_PARQUET_NAME} at: {maps_pq}")

print("âœ… Map embedding artifacts ready:")
print(" -", maps_npz)
print(" -", maps_pq)



=== Map embeddings (shared across all experiments) ===
MAPS_ROOT      : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/input/samples/pairs
  is_dir       : True
USER_STUDY_XLSX: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/userstudy/UserStudy.xlsx
  is_file      : True
Target dir: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent
Artifacts : maps_embeddings.npz | maps.parquet
âœ… Map embeddings already exist â€” skipping recomputation.
âœ… Map embedding artifacts ready:
 - /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent/maps_embeddings.npz
 - /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent/maps.parquet


In [5]:
# ===================== 01_generate_embeddings — CELL 4: Infer embedding dimensions =====================

from pathlib import Path
import numpy as np

from imgofup.config.constants import MAP_EMBEDDINGS_NPZ_NAME, PROMPT_EMBEDDINGS_NPZ_NAME

def _infer_dim_from_npz(npz_path: Path) -> int:
    """Return the embedding dimension (column count) of array ``E`` in an ``.npz`` file.

    Args:
        npz_path: Location of the ``.npz`` archive. ``~`` is expanded and the
            path is resolved before use.

    Returns:
        ``E.shape[1]`` as a plain ``int``.

    Raises:
        FileNotFoundError: If the resolved path is not an existing file.
        ValueError: If the archive has no ``E`` entry, if ``E`` is not a 2-D
            array with at least one column, or if ``E`` is an object-dtype
            array (loading one would require unpickling, which is disabled).
    """
    npz_path = Path(npz_path).expanduser().resolve()
    if not npz_path.is_file():
        raise FileNotFoundError(f"Missing embeddings file: {npz_path}")
    # Only the numeric matrix "E" is read, so pickle support stays off:
    # unpickling can execute arbitrary code if the archive was tampered with.
    with np.load(npz_path, allow_pickle=False) as z:
        if "E" not in z:
            raise ValueError(f"{npz_path} missing array 'E'")
        E = z["E"]
    if E.ndim != 2 or E.shape[1] <= 0:
        raise ValueError(f"Invalid embedding matrix in {npz_path}: shape={E.shape}")
    return int(E.shape[1])

# Map dim (shared): a single value for all experiments, read from the shared
# map embeddings in MAP_EMB_DIR.
maps_npz = Path(MAP_EMB_DIR) / MAP_EMBEDDINGS_NPZ_NAME
MAP_DIM_INF = _infer_dim_from_npz(maps_npz)
print("âœ… Inferred MAP_DIM from shared maps:", MAP_DIM_INF)

# Feature modes whose feature vector includes a prompt embedding.
PROMPT_BASED_MODES = {"prompt_only", "prompt_plus_map"}
dims_by_experiment = {}

for exp_name, exp_cfg in EXPERIMENTS.items():
    feature_mode = str(exp_cfg["feature_mode"]).strip().lower()

    # The prompt dim is read from the experiment's own prompt artifacts (it
    # differs per encoder); it stays 0 for modes that use no prompt embedding.
    PROMPT_DIM_INF = 0
    if feature_mode in PROMPT_BASED_MODES:
        prm_npz = Path(exp_cfg["train_out"]) / "prompt_out" / PROMPT_EMBEDDINGS_NPZ_NAME
        PROMPT_DIM_INF = _infer_dim_from_npz(prm_npz)

    # fused_dim is the width of the final feature vector: the sum of whichever
    # parts the feature mode uses.
    if feature_mode == "prompt_only":
        map_dim, prompt_dim, fused_dim = 0, PROMPT_DIM_INF, PROMPT_DIM_INF
    elif feature_mode == "map_only":
        map_dim, prompt_dim, fused_dim = MAP_DIM_INF, 0, MAP_DIM_INF
    elif feature_mode == "prompt_plus_map":
        map_dim, prompt_dim, fused_dim = MAP_DIM_INF, PROMPT_DIM_INF, MAP_DIM_INF + PROMPT_DIM_INF
    else:
        raise ValueError(f"Unknown feature_mode for {exp_name}: {feature_mode}")

    # Record the dims on the registry entry itself; plain assignment keeps a
    # re-run of this cell idempotent.
    exp_cfg["map_dim"] = int(map_dim)
    exp_cfg["prompt_dim"] = int(prompt_dim)
    exp_cfg["fused_dim"] = int(fused_dim)

    dims_by_experiment[exp_name] = {
        "feature_mode": feature_mode,
        "MAP_DIM": int(map_dim),
        "PROMPT_DIM": int(prompt_dim),
        "FUSED_DIM": int(fused_dim),
    }

# Column widths are derived from the rows: a fixed width of 14 is one character
# too short for "prompt_plus_map" and pushed those rows out of line. The
# previous widths (18 / 14) are kept as minimums; the list form also copes with
# an empty registry.
exp_name_width = max([18, *(len(name) for name in dims_by_experiment)])
feature_mode_width = max([14, *(len(row["feature_mode"]) for row in dims_by_experiment.values())])

print("\nâœ… Inferred dims per experiment:")
for exp_name, d in dims_by_experiment.items():
    print(
        f" - {exp_name:{exp_name_width}s} | mode={d['feature_mode']:{feature_mode_width}s} | "
        f"MAP_DIM={d['MAP_DIM']:4d} | PROMPT_DIM={d['PROMPT_DIM']:4d} | FUSED_DIM={d['FUSED_DIM']:4d}"
    )


âœ… Inferred MAP_DIM from shared maps: 165

âœ… Inferred dims per experiment:
 - openai_prompt_only | mode=prompt_only    | MAP_DIM=   0 | PROMPT_DIM=1536 | FUSED_DIM=1536
 - use_prompt_only    | mode=prompt_only    | MAP_DIM=   0 | PROMPT_DIM= 512 | FUSED_DIM= 512
 - map_only           | mode=map_only       | MAP_DIM= 165 | PROMPT_DIM=   0 | FUSED_DIM= 165
 - use_map            | mode=prompt_plus_map | MAP_DIM= 165 | PROMPT_DIM= 512 | FUSED_DIM= 677
 - openai_map         | mode=prompt_plus_map | MAP_DIM= 165 | PROMPT_DIM=1536 | FUSED_DIM=1701


In [6]:
# ===================== 01_generate_embeddings — CELL 5: Feature construction (multi-experiment) =====================

from pathlib import Path

from imgofup.pipelines.run_concat_features import run_concat_features_from_dirs
from imgofup.config.constants import PROMPTS_PARQUET_NAME

# Build the feature matrix for every experiment by calling
# run_concat_features_from_dirs with the experiment's prompt_out directory and
# the shared map embedding directory. One metadata dict per experiment is
# collected in concat_meta_by_experiment.
print("\n=== Building feature matrices for all experiments ===")

# Result metadata keyed by experiment name.
concat_meta_by_experiment = {}

# Because prompt_id changed, you should rebuild features at least once.
# While this is True the "already exist" shortcut below is never taken.
FORCE_REBUILD_FEATURES = True  # set False later when stable

# Choose a canonical prompts.parquet source for map_only (pairs table)
PAIRS_SOURCE_EXP = "use_prompt_only"
PAIRS_PARQUET_CANON = Path(EXPERIMENTS[PAIRS_SOURCE_EXP]["train_out"]) / "prompt_out" / PROMPTS_PARQUET_NAME

# Checked once up front, regardless of whether any experiment is map_only.
if not PAIRS_PARQUET_CANON.is_file():
    raise FileNotFoundError(
        f"Expected prompts parquet for map_only at:\n  {PAIRS_PARQUET_CANON}\n"
        f"Run CELL 2 (prompt embeddings) for '{PAIRS_SOURCE_EXP}' first."
    )

for exp_name, exp_cfg in EXPERIMENTS.items():
    feature_mode = exp_cfg["feature_mode"]

    train_out_dir = Path(exp_cfg["train_out"])
    map_out_dir = Path(MAP_EMB_DIR)  # same shared directory for every experiment
    prompt_out_dir = train_out_dir / "prompt_out"

    # For map_only we still need prompts table, but not prompt embeddings
    pairs_parquet = PAIRS_PARQUET_CANON if feature_mode == "map_only" else None

    print(f"\nðŸ§ª Experiment: {exp_name}")
    print(f"   Feature mode : {feature_mode}")
    print(f"   Prompt out   : {prompt_out_dir}")
    print(f"   Map out      : {map_out_dir}")
    print(f"   Train out    : {train_out_dir}")
    if pairs_parquet is not None:
        print(f"   Pairs parquet: {pairs_parquet} (shared)")

    # Files whose presence counts as "already built".
    # NOTE(review): assumes run_concat_features_from_dirs writes exactly these
    # file names into out_dir -- TODO confirm against the pipeline.
    X_expected = train_out_dir / f"X_{exp_name}.npy"
    pairs_expected = train_out_dir / f"train_pairs_{exp_name}.parquet"

    if (not FORCE_REBUILD_FEATURES) and X_expected.is_file() and pairs_expected.is_file():
        print("   âœ… Features already exist â€” skipping recomputation.")
        # Hand-built stand-in for the pipeline's return value.
        meta = {"skipped": True, "X_path": str(X_expected), "pairs_path": str(pairs_expected)}
    else:
        meta = run_concat_features_from_dirs(
            prompt_out_dir=prompt_out_dir,
            map_out_dir=map_out_dir,
            out_dir=train_out_dir,
            exp_name=exp_name,
            feature_mode=feature_mode,
            verbosity=1,
            # NOTE(review): presumably the zero-pad width used when formatting
            # prompt ids -- confirm in run_concat_features_from_dirs.
            prompt_id_width=4,
            pairs_parquet=pairs_parquet,
        )
        print("   âœ… Feature construction completed.")

    concat_meta_by_experiment[exp_name] = meta

print("\nâœ… All feature construction finished.")



=== Building feature matrices for all experiments ===

ðŸ§ª Experiment: openai_prompt_only
   Feature mode : prompt_only
   Prompt out   : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_openai_prompt_only/prompt_out
   Map out      : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent
   Train out    : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_openai_prompt_only
   âœ… Feature construction completed.

ðŸ§ª Experiment: use_prompt_only
   Feature mode : prompt_only
   Prompt out   : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_use_prompt_only/prompt_out
   Map out      : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent
   Train out    : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_use_prompt_only
   âœ… Feature construction completed.

ðŸ§ª Experiment: map_only
   Feature mode : map_only
   Prompt out   : /Users/amirdonyadide/Documents/G