# MapVec end-to-end pipeline ðŸ“’

This notebook runs the **entire pipeline**:
1. Prompt embeddings (Universal Sentence Encoder)
2. Map embeddings (handcrafted polygon features)
3. Concatenation into a training matrix
4. Helper cells to inspect vectors by `prompt_id` or `map_id`

**Edit the Parameters** in the next cell to match your project layout.


In [47]:
# ===================== PARAMETERS (EDIT ME) =====================
from pathlib import Path

# Project root that contains `src/` and `data/`
PROJ_ROOT = Path('../')  # e.g., Path('/Users/you/Documents/Semester_5/Thesis/CODES')

# Data locations
DATA_DIR    = PROJ_ROOT / 'data'
PROMPTS_CSV = DATA_DIR / 'input' / 'prompts.csv'           # CSV with columns prompt_id,text (or id,text)
PAIRS_CSV   = DATA_DIR / 'input' / 'pairs.csv'             # CSV with map_id,prompt_id
MAPS_ROOT   = DATA_DIR / 'input' / 'samples' / 'pairs'     # Folder with *_input.geojson files
INPUT_MAPS_PATTERN = '*_input.geojson'
OUTPUT_MAPS_PATTERN = '*_generalized.geojson'

# Output directories
PROMPT_OUT = DATA_DIR / 'output' / 'prompt_out'
MAP_OUT    = DATA_DIR / 'output' / 'map_out'
TRAIN_OUT  = DATA_DIR / 'output' / 'train_out'
PAIR_MAP_OUT    = DATA_DIR / 'output' / 'pair_map_out'
SPLIT_OUT   = DATA_DIR / "train_out" / "splits"
MODEL_OUT   = DATA_DIR / "models"
MODEL_OUT.mkdir(parents=True, exist_ok=True)


# USE model: 'dan' or 'transformer'
USE_MODEL = 'dan'

# Expected dims (do not change unless you know what you're doing)
MAP_DIM = 249
PROMPT_DIM = 512
BATCH_SIZE = 512

PROJ_ROOT, DATA_DIR

(PosixPath('..'), PosixPath('../data'))

In [33]:
# ===================== CLEAN PREVIOUS OUTPUTS =====================
import shutil

for d in [PROMPT_OUT, MAP_OUT, TRAIN_OUT]:
    if d.exists():
        print(f"ðŸ§¹ Removing old directory: {d}")
        shutil.rmtree(d)
    d.mkdir(parents=True, exist_ok=True)

print("âœ… All output folders cleaned and recreated fresh.")

ðŸ§¹ Removing old directory: ../data/output/prompt_out
ðŸ§¹ Removing old directory: ../data/output/map_out
ðŸ§¹ Removing old directory: ../data/output/train_out
âœ… All output folders cleaned and recreated fresh.


In [34]:
# Make sure Python can import your local modules (src/)
import sys
sys.path.insert(0, str(PROJ_ROOT))
print('PYTHONPATH updated with:', PROJ_ROOT)

# Create output folders
PROMPT_OUT.mkdir(parents=True, exist_ok=True)
MAP_OUT.mkdir(parents=True, exist_ok=True)
TRAIN_OUT.mkdir(parents=True, exist_ok=True)
PROMPT_OUT, MAP_OUT, TRAIN_OUT

PYTHONPATH updated with: ..


(PosixPath('../data/output/prompt_out'),
 PosixPath('../data/output/map_out'),
 PosixPath('../data/output/train_out'))

## 0) Dependency check (Parquet engine)
We ensure `pyarrow` or `fastparquet` is available for `pandas.to_parquet`.

In [35]:
import importlib
ok = importlib.util.find_spec('pyarrow') or importlib.util.find_spec('fastparquet')
if not ok:
    raise SystemExit('Missing parquet engine (pyarrow/fastparquet). Install with: conda install pyarrow -y')
print('Parquet engine: OK')

Parquet engine: OK


## 1) Prompt embeddings
Runs `src/mapvec/prompts/prompt_embeddings.py` using your chosen USE model and saves artifacts to `PROMPT_OUT`.

In [36]:
import subprocess, shlex
cmd = (
    f"python {shlex.quote(str(PROJ_ROOT / 'src' / 'mapvec' / 'prompts' / 'prompt_embeddings.py'))} "
    f"--input {shlex.quote(str(PROMPTS_CSV))} --model {shlex.quote(str(USE_MODEL))} --l2 --out_dir {shlex.quote(str(PROMPT_OUT))} -v"
)
print(cmd)
res = subprocess.run(cmd, shell=True)
if res.returncode != 0:
    raise SystemExit('Prompt embedding step failed.')
print('Prompt embeddings completed.')

python ../src/mapvec/prompts/prompt_embeddings.py --input ../data/input/prompts.csv --model dan --l2 --out_dir ../data/output/prompt_out -v


19:34:28 | DEBUG | FILE_DIR=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/src/mapvec/prompts
19:34:28 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES
19:34:28 | DEBUG | DEFAULT_DATA_DIR=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data
19:34:28 | INFO | DATA_DIR=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data
19:34:28 | INFO | INPUT=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/input/prompts.csv
19:34:28 | INFO | OUT_DIR=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/output/prompt_out
19:34:28 | INFO | Reading CSV: /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/input/prompts.csv
19:34:28 | INFO | Loaded 500 prompts (id_col=prompt_id). Sample IDs: p001, p002, p003â€¦
19:34:28 | INFO | Using local USE-dan at /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/input/model_dan
19:34:28 | INFO | Loading USE-dan from local path: /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/d

Prompt embeddings completed.


## 2) Map embeddings
Runs the map embedding module on the GeoJSON inputs. Skips problematic features, logs warnings, and writes `embeddings.npz` to `PAIR_MAP_OUT`.

In [37]:
# notebook snippet
import sys, subprocess, pathlib
cmd = [
    sys.executable, "-m", "src.mapvec.maps.pair_map_embeddings",
    "--root", str(MAPS_ROOT),
    "--input_pattern", str(INPUT_MAPS_PATTERN),
    "--gen_pattern", str(OUTPUT_MAPS_PATTERN),
    "--out_dir", str(PAIR_MAP_OUT),
    "-v"
]
print("CMD:", " ".join(cmd))
res = subprocess.run(cmd, cwd=str(PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Pair map embedding step failed.")
print("Pair map embeddings completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.maps.pair_map_embeddings --root data/input/samples/pairs --input_pattern *_input.geojson --gen_pattern *_generalized.geojson --out_dir data/output/pair_map_out -v


19:34:37 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES
19:34:37 | DEBUG | DATA_DIR=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data
19:34:37 | INFO | Scanning /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/input/samples/pairs (in=*_input.geojson, gen=*_generalized.geojson)â€¦
19:34:41 | INFO | OK  map_id=0073  -> pair_vec[996] (per_map_dim=249)
19:34:42 | INFO | OK  map_id=0080  -> pair_vec[996] (per_map_dim=249)
19:34:44 | INFO | OK  map_id=0093  -> pair_vec[996] (per_map_dim=249)
19:34:48 | INFO | OK  map_id=0122  -> pair_vec[996] (per_map_dim=249)
19:34:50 | INFO | OK  map_id=0123  -> pair_vec[996] (per_map_dim=249)
19:34:51 | INFO | OK  map_id=0127  -> pair_vec[996] (per_map_dim=249)
19:34:52 | INFO | OK  map_id=0158  -> pair_vec[996] (per_map_dim=249)
19:34:57 | INFO | OK  map_id=0159  -> pair_vec[996] (per_map_dim=249)
19:34:58 | INFO | OK  map_id=0160  -> pair_vec[996] (per_map_dim=249)
19:35:01 | INFO | OK  map_id=0165  ->

Pair map embeddings completed.


19:47:37 | INFO | OK  map_id=1757  -> pair_vec[996] (per_map_dim=249)
19:47:37 | INFO | Saved 300 pair vectors (failed=0) to /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/output/pair_map_out


## 3) Concatenate
Joins map & prompt vectors using `pairs.csv` and writes `X_concat.npy` and `train_pairs.parquet` to `TRAIN_OUT`.

In [41]:
import sys, subprocess

cmd = [
    sys.executable, "-m", "src.mapvec.concat.concat_embeddings",
    "--pairs",      str(PAIRS_CSV),
    "--map_npz",    str(PAIR_MAP_OUT / "embeddings.npz"),   # from pair_map_out
    "--prompt_npz", str(PROMPT_OUT / "embeddings.npz"),
    "--out_dir",    str(TRAIN_OUT),
    "--drop_dupes",                                   # optional: drop duplicate (map_id,prompt_id)
    # "--fail_on_missing",                            # optional: stop instead of skipping missing IDs
]
print("CMD:", " ".join(cmd))

# Run from the project root so src/ is importable
res = subprocess.run(cmd, cwd=str(PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Concatenation step failed.")
print("Concatenation completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.concat.concat_embeddings --pairs ../data/input/pairs.csv --map_npz ../data/output/pair_map_out/embeddings.npz --prompt_npz ../data/output/prompt_out/embeddings.npz --out_dir ../data/output/train_out --drop_dupes
Concatenation completed.


20:14:52 | INFO | Map  embeddings: (300, 996) from /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/output/pair_map_out/embeddings.npz
20:14:52 | INFO | Prompt embeddings: (500, 512) from /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/output/prompt_out/embeddings.npz
20:14:52 | INFO | X shape = (450, 1508)  (map_dim=996, prompt_dim=512)
20:14:52 | INFO | Saved to /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/output/train_out in 0.03s


## 4) Split dataset  
Splits the concatenated feature matrix `X_concat.npy` and its metadata `train_pairs.parquet` into separate **training**, **validation**, and **test** subsets.  
Each split preserves row alignment between features and metadata, and the resulting files are saved under `TRAIN_OUT/splits/` as:  

- `X_train.npy`, `pairs_train.parquet`  
- `X_val.npy`, `pairs_val.parquet`  
- `X_test.npy`, `pairs_test.parquet`  

In [45]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# ==== CONFIG ====
TRAIN_OUT = Path(TRAIN_OUT)
VAL_RATIO  = 0.15     # 15% validation
TEST_RATIO = 0.15     # 15% test (remaining 70% train)
SEED       = 42       # reproducibility
# =================

# Load data
X = np.load(TRAIN_OUT / "X_concat.npy")
pairs_df = pd.read_parquet(TRAIN_OUT / "train_pairs.parquet")

print(f"Loaded X: {X.shape}, pairs: {pairs_df.shape}")

# --- Step 1: Train/Test split
X_train, X_temp, df_train, df_temp = train_test_split(
    X, pairs_df, test_size=VAL_RATIO + TEST_RATIO, random_state=SEED, shuffle=True
)

# --- Step 2: Split temp into Val/Test
relative_test_ratio = TEST_RATIO / (VAL_RATIO + TEST_RATIO)
X_val, X_test, df_val, df_test = train_test_split(
    X_temp, df_temp, test_size=relative_test_ratio, random_state=SEED, shuffle=True
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# --- Save splits
split_dir = TRAIN_OUT / "splits"
split_dir.mkdir(exist_ok=True)

np.save(split_dir / "X_train.npy", X_train)
np.save(split_dir / "X_val.npy",   X_val)
np.save(split_dir / "X_test.npy",  X_test)

df_train.to_parquet(split_dir / "pairs_train.parquet", index=False)
df_val.to_parquet(split_dir / "pairs_val.parquet", index=False)
df_test.to_parquet(split_dir / "pairs_test.parquet", index=False)

print(f"Saved splits to {split_dir}")

Loaded X: (450, 1508), pairs: (450, 2)
Train: (315, 1508), Val: (67, 1508), Test: (68, 1508)
Saved splits to ../data/output/train_out/splits


In [53]:
# 5) Self-supervised baseline (no labels.csv needed)
# - Positive = observed pairs
# - Negatives = shuffled/mismatched pairs
# - Train a classifier to predict compatibility

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

NEG_PER_POS = 3  # number of negatives per positive; tune as needed
RNG_SEED    = 42

# 1) Load splits
X_tr = np.load(split_dir / "X_train.npy")
X_va = np.load(split_dir / "X_val.npy")
X_te = np.load(split_dir / "X_test.npy")

J_tr = pd.read_parquet(split_dir / "pairs_train.parquet")
J_va = pd.read_parquet(split_dir / "pairs_val.parquet")
J_te = pd.read_parquet(split_dir / "pairs_test.parquet")

rng = np.random.default_rng(RNG_SEED)

def make_negatives(join_df: pd.DataFrame, neg_per_pos: int) -> pd.DataFrame:
    """
    Create negatives by shuffling prompt_id within the split.
    Ensures (map_id, prompt_id) is not an observed positive.
    Returns a table with columns: map_id, prompt_id, y (1 for pos, 0 for neg).
    """
    pos = join_df[["map_id", "prompt_id"]].astype(str).copy()
    pos["y"] = 1

    negl = []
    prompts = join_df["prompt_id"].astype(str).to_numpy()
    maps    = join_df["map_id"].astype(str).to_numpy()

    for _ in range(neg_per_pos):
        shuffled = prompts.copy()
        rng.shuffle(shuffled)
        neg = pd.DataFrame({"map_id": maps, "prompt_id": shuffled})
        # drop collisions with positives
        neg = neg.merge(pos[["map_id","prompt_id"]], on=["map_id","prompt_id"],
                        how="left", indicator=True)
        neg = neg[neg["_merge"] == "left_only"].drop(columns="_merge")
        negl.append(neg)

    neg_all = pd.concat(negl, ignore_index=True)
    neg_all["y"] = 0

    out = pd.concat([pos, neg_all], ignore_index=True)
    out = out.sample(frac=1.0, random_state=RNG_SEED).reset_index(drop=True)
    return out

# Build train/val/test label tables (self-supervised)
Y_tr_tbl = make_negatives(J_tr, NEG_PER_POS)
Y_va_tbl = make_negatives(J_va, NEG_PER_POS)
Y_te_tbl = make_negatives(J_te, NEG_PER_POS)

# 2) Align X rows to these tables

def make_index(df: pd.DataFrame) -> dict[tuple[str, str], int]:
    # Map (map_id, prompt_id) -> row index within the split
    return {
        (m, p): i
        for i, (m, p) in enumerate(
            df[["map_id", "prompt_id"]].astype(str).itertuples(index=False, name=None)
        )
    }

idx_tr = make_index(J_tr)
idx_va = make_index(J_va)
idx_te = make_index(J_te)

def rows_from_table(X_split: np.ndarray,
                    index_lookup: dict[tuple[str, str], int],
                    tbl: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    inds = []
    ys   = []
    for m, p, y in tbl[["map_id", "prompt_id", "y"]].astype(str).itertuples(index=False, name=None):
        key = (m, p)
        i = index_lookup.get(key)
        if i is None:
            # This can happen if a shuffled negative coincides with some other split's positive.
            # We simply skip such cases.
            continue
        inds.append(i)
        ys.append(int(y))
    inds = np.asarray(inds, dtype=int)
    ys   = np.asarray(ys, dtype=int)
    return X_split[inds], ys

X_tr_b, y_tr = rows_from_table(X_tr, idx_tr, Y_tr_tbl)
X_va_b, y_va = rows_from_table(X_va, idx_va, Y_va_tbl)
X_te_b, y_te = rows_from_table(X_te, idx_te, Y_te_tbl)

print("Balanced sizes ->",
      "train:", X_tr_b.shape, "val:", X_va_b.shape, "test:", X_te_b.shape)

# 3) Scale (fit on train only)
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr_b)
X_va_s = scaler.transform(X_va_b)
X_te_s = scaler.transform(X_te_b)

# 4) Train logistic regression baseline (compatibility classifier)
clf = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=None)
clf.fit(X_tr_s, y_tr)

def eval_split(Xs, ys):
    p  = clf.predict(Xs)
    pr = clf.predict_proba(Xs)[:, 1]
    return {
        "accuracy": float(accuracy_score(ys, p)),
        "f1": float(f1_score(ys, p)),
        "auroc": float(roc_auc_score(ys, pr)),
    }

metrics = {
    "val":  eval_split(X_va_s, y_va),
    "test": eval_split(X_te_s, y_te),
}
metrics

Balanced sizes -> train: (315, 1508) val: (67, 1508) test: (68, 1508)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)