# MapVec end-to-end pipeline 📒

This notebook runs the **entire pipeline**:
1. Prompt embeddings (Universal Sentence Encoder)
2. Map embeddings (handcrafted polygon features)
3. Concatenation into a training matrix
4. Helper cells to inspect vectors by `prompt_id` or `map_id`

**Edit the Parameters** in the next cell to match your project layout.


In [1]:
# ===================== PARAMETERS (EDIT ME) =====================
# Single place for every path and constant the rest of the notebook reads.
from pathlib import Path

# Project root that contains `src/` and `data/`
PROJ_ROOT = Path('../')  # e.g., Path('/Users/you/Documents/Semester_5/Thesis/CODES')

# Data locations
DATA_DIR    = PROJ_ROOT / 'data'
PROMPTS_CSV = DATA_DIR / 'input' / 'prompts.csv'           # CSV with columns prompt_id,text (or id,text)
PAIRS_CSV   = DATA_DIR / 'input' / 'pairs.csv'             # CSV with map_id,prompt_id
MAPS_ROOT   = DATA_DIR / 'input' / 'samples' / 'pairs'     # Folder with *_input.geojson files
INPUT_MAPS_PATTERN = '*_input.geojson'
OUTPUT_MAPS_PATTERN = '*_generalized.geojson'

# Output directories
PROMPT_OUT = DATA_DIR / 'output' / 'prompt_out'
MAP_OUT    = DATA_DIR / 'output' / 'map_out'
TRAIN_OUT  = DATA_DIR / 'output' / 'train_out'
PAIR_MAP_OUT    = DATA_DIR / 'output' / 'pair_map_out'
# The split step writes its files to TRAIN_OUT / "splits"; derive SPLIT_OUT from
# TRAIN_OUT so the two locations cannot drift apart.
SPLIT_OUT   = TRAIN_OUT / "splits"
MODEL_OUT   = DATA_DIR / "models"
MODEL_OUT.mkdir(parents=True, exist_ok=True)


# USE model: 'dan' or 'transformer'
USE_MODEL = 'dan'

# Expected dims (do not change unless you know what you're doing)
# NOTE(review): MAP_DIM is the per-map width; the pair-map step logs 996-wide pair
# vectors, and a later cell rebinds MAP_DIM to the pair-vector width.
MAP_DIM = 249
PROMPT_DIM = 512
BATCH_SIZE = 512

PROJ_ROOT, DATA_DIR

(PosixPath('..'), PosixPath('../data'))

In [2]:
# ===================== CLEAN PREVIOUS OUTPUTS =====================
import shutil

# Delete and recreate each regenerated output folder so the run starts empty.
# PAIR_MAP_OUT is not in this list, so existing pair-map embeddings are kept.
for out_dir in (PROMPT_OUT, MAP_OUT, TRAIN_OUT):
    if out_dir.exists():
        print(f"ðŸ§¹ Removing old directory: {out_dir}")
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

print("âœ… All output folders cleaned and recreated fresh.")

ðŸ§¹ Removing old directory: ../data/output/prompt_out
ðŸ§¹ Removing old directory: ../data/output/map_out
ðŸ§¹ Removing old directory: ../data/output/train_out
âœ… All output folders cleaned and recreated fresh.


In [3]:
# Make sure Python can import your local modules (src/)
import sys

# Register the project root exactly once: re-running this cell must not stack
# duplicate entries. The absolute form keeps imports working even if the kernel's
# working directory changes later.
_proj_root_abs = str(PROJ_ROOT.resolve())
if _proj_root_abs not in sys.path:
    sys.path.insert(0, _proj_root_abs)
print('PYTHONPATH updated with:', PROJ_ROOT)

# Create output folders
for _out_dir in (PROMPT_OUT, MAP_OUT, TRAIN_OUT):
    _out_dir.mkdir(parents=True, exist_ok=True)
PROMPT_OUT, MAP_OUT, TRAIN_OUT

PYTHONPATH updated with: ..


(PosixPath('../data/output/prompt_out'),
 PosixPath('../data/output/map_out'),
 PosixPath('../data/output/train_out'))

## 0) Dependency check (Parquet engine)
We ensure `pyarrow` or `fastparquet` is available for `pandas.to_parquet`.

In [4]:
# `import importlib` alone does not guarantee that the `importlib.util` submodule
# is loaded; import it explicitly so `find_spec` is always reachable.
import importlib.util

# find_spec returns None when the package is not installed; either engine is enough.
ok = importlib.util.find_spec('pyarrow') or importlib.util.find_spec('fastparquet')
if not ok:
    raise SystemExit('Missing parquet engine (pyarrow/fastparquet). Install with: conda install pyarrow -y')
print('Parquet engine: OK')

Parquet engine: OK


## 1) Prompt embeddings
Runs `src/mapvec/prompts/prompt_embeddings.py` using your chosen USE model and saves artifacts to `PROMPT_OUT`.

In [5]:
import subprocess, shlex
import sys

# Launch the prompt-embedding script as an argv list (no shell) with the kernel's
# own interpreter, matching how the other pipeline steps are started. A bare
# `python` resolved through PATH may belong to a different environment.
cmd = [
    sys.executable,
    str(PROJ_ROOT / 'src' / 'mapvec' / 'prompts' / 'prompt_embeddings.py'),
    "--input", str(PROMPTS_CSV),
    "--model", str(USE_MODEL),
    "--l2",
    "--out_dir", str(PROMPT_OUT),
    "-v",
]
# Quote only for display so the printed command can be pasted into a shell.
print(" ".join(shlex.quote(part) for part in cmd))
res = subprocess.run(cmd)
if res.returncode != 0:
    raise SystemExit('Prompt embedding step failed.')
print('Prompt embeddings completed.')

python ../src/mapvec/prompts/prompt_embeddings.py --input ../data/input/prompts.csv --model dan --l2 --out_dir ../data/output/prompt_out -v


20:46:31 | DEBUG | FILE_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/src/mapvec/prompts
20:46:31 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/GitHub/Thesis
20:46:31 | DEBUG | DEFAULT_DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
20:46:31 | INFO | DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
20:46:31 | INFO | INPUT=/Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
20:46:31 | INFO | OUT_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
20:46:31 | INFO | Reading CSV: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
20:46:31 | INFO | Loaded 500 prompts (id_col=prompt_id). Sample IDs: p001, p002, p003â€¦
20:46:31 | INFO | Using local USE-dan at /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/model_dan
20:46:31 | INFO | Loading USE-dan from local path: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/model_dan â€¦
20:46:34 | INFO | Fingerprint not found. Saved model loading will 

Prompt embeddings completed.


## 2) Map embeddings
Runs the map embedding module on the GeoJSON inputs. Skips problematic features, logs warnings, and writes `embeddings.npz` to `PAIR_MAP_OUT`.

In [37]:
# notebook snippet
import sys, subprocess, pathlib

# The child process runs with cwd=PROJ_ROOT so that `-m src...` is importable.
# The configured paths are relative to the notebook's directory, so they are
# passed as absolute paths to stay unambiguous after the cwd change.
cmd = [
    sys.executable, "-m", "src.mapvec.maps.pair_map_embeddings",
    "--root", str(MAPS_ROOT.resolve()),
    "--input_pattern", str(INPUT_MAPS_PATTERN),
    "--gen_pattern", str(OUTPUT_MAPS_PATTERN),
    "--out_dir", str(PAIR_MAP_OUT.resolve()),
    "-v"
]
print("CMD:", " ".join(cmd))
res = subprocess.run(cmd, cwd=str(PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Pair map embedding step failed.")
print("Pair map embeddings completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.maps.pair_map_embeddings --root data/input/samples/pairs --input_pattern *_input.geojson --gen_pattern *_generalized.geojson --out_dir data/output/pair_map_out -v


19:34:37 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES
19:34:37 | DEBUG | DATA_DIR=/Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data
19:34:37 | INFO | Scanning /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/input/samples/pairs (in=*_input.geojson, gen=*_generalized.geojson)â€¦
19:34:41 | INFO | OK  map_id=0073  -> pair_vec[996] (per_map_dim=249)
19:34:42 | INFO | OK  map_id=0080  -> pair_vec[996] (per_map_dim=249)
19:34:44 | INFO | OK  map_id=0093  -> pair_vec[996] (per_map_dim=249)
19:34:48 | INFO | OK  map_id=0122  -> pair_vec[996] (per_map_dim=249)
19:34:50 | INFO | OK  map_id=0123  -> pair_vec[996] (per_map_dim=249)
19:34:51 | INFO | OK  map_id=0127  -> pair_vec[996] (per_map_dim=249)
19:34:52 | INFO | OK  map_id=0158  -> pair_vec[996] (per_map_dim=249)
19:34:57 | INFO | OK  map_id=0159  -> pair_vec[996] (per_map_dim=249)
19:34:58 | INFO | OK  map_id=0160  -> pair_vec[996] (per_map_dim=249)
19:35:01 | INFO | OK  map_id=0165  ->

Pair map embeddings completed.


19:47:37 | INFO | OK  map_id=1757  -> pair_vec[996] (per_map_dim=249)
19:47:37 | INFO | Saved 300 pair vectors (failed=0) to /Users/amirdonyadide/Documents/Semester_5/Thesis/CODES/data/output/pair_map_out


## 3) Concatenate
Joins map & prompt vectors using `pairs.csv` and writes `X_concat.npy` and `train_pairs.parquet` to `TRAIN_OUT`.

In [6]:
import sys, subprocess

# The child runs from PROJ_ROOT (see below), so every path argument is resolved
# to an absolute path first; the configured paths are relative to the notebook's
# directory and would otherwise be read against a different base.
cmd = [
    sys.executable, "-m", "src.mapvec.concat.concat_embeddings",
    "--pairs",      str(PAIRS_CSV.resolve()),
    "--map_npz",    str((PAIR_MAP_OUT / "embeddings.npz").resolve()),   # from pair_map_out
    "--prompt_npz", str((PROMPT_OUT / "embeddings.npz").resolve()),
    "--out_dir",    str(TRAIN_OUT.resolve()),
    "--drop_dupes",                                   # optional: drop duplicate (map_id,prompt_id)
    # "--fail_on_missing",                            # optional: stop instead of skipping missing IDs
]
print("CMD:", " ".join(cmd))

# Run from the project root so src/ is importable
res = subprocess.run(cmd, cwd=str(PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Concatenation step failed.")
print("Concatenation completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.concat.concat_embeddings --pairs ../data/input/pairs.csv --map_npz ../data/output/pair_map_out/embeddings.npz --prompt_npz ../data/output/prompt_out/embeddings.npz --out_dir ../data/output/train_out --drop_dupes
Concatenation completed.


20:46:50 | INFO | Map  embeddings: (300, 996) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/pair_map_out/embeddings.npz
20:46:50 | INFO | Prompt embeddings: (500, 512) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out/embeddings.npz
20:46:50 | INFO | X shape = (450, 1508)  (map_dim=996, prompt_dim=512)
20:46:50 | INFO | Saved to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out in 0.03s


## 4) Split dataset  
Splits the concatenated feature matrix `X_concat.npy` and its metadata `train_pairs.parquet` into separate **training**, **validation**, and **test** subsets.  
Each split preserves row alignment between features and metadata, and the resulting files are saved under `TRAIN_OUT/splits/` as:  

- `X_train.npy`, `pairs_train.parquet`  
- `X_val.npy`, `pairs_val.parquet`  
- `X_test.npy`, `pairs_test.parquet`  

In [7]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# ==== CONFIG ====
TRAIN_OUT = Path(TRAIN_OUT)
VAL_RATIO  = 0.15     # 15% validation
TEST_RATIO = 0.15     # 15% test (remaining 70% train)
SEED       = 42       # reproducibility
# =================

# Read the concatenated feature matrix and the pair table saved next to it.
X = np.load(TRAIN_OUT / "X_concat.npy")
pairs_df = pd.read_parquet(TRAIN_OUT / "train_pairs.parquet")

print(f"Loaded X: {X.shape}, pairs: {pairs_df.shape}")

# --- Stage 1: carve off the combined validation+test portion.
# Passing X and pairs_df together keeps their rows aligned in every subset.
holdout_ratio = VAL_RATIO + TEST_RATIO
X_train, X_temp, df_train, df_temp = train_test_split(
    X, pairs_df, test_size=holdout_ratio, random_state=SEED, shuffle=True
)

# --- Stage 2: divide the held-out portion into validation and test.
relative_test_ratio = TEST_RATIO / holdout_ratio
X_val, X_test, df_val, df_test = train_test_split(
    X_temp, df_temp, test_size=relative_test_ratio, random_state=SEED, shuffle=True
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# --- Persist: feature arrays first, then the matching pair tables.
split_dir = TRAIN_OUT / "splits"
split_dir.mkdir(exist_ok=True)

for npy_name, features in (
    ("X_train.npy", X_train),
    ("X_val.npy", X_val),
    ("X_test.npy", X_test),
):
    np.save(split_dir / npy_name, features)

for parquet_name, frame in (
    ("pairs_train.parquet", df_train),
    ("pairs_val.parquet", df_val),
    ("pairs_test.parquet", df_test),
):
    frame.to_parquet(split_dir / parquet_name, index=False)

print(f"Saved splits to {split_dir}")

Loaded X: (450, 1508), pairs: (450, 2)
Train: (315, 1508), Val: (67, 1508), Test: (68, 1508)
Saved splits to ../data/output/train_out/splits


In [9]:
# 5) Self-supervised baseline (no labels.csv needed)
# Build features for positives and for shuffled negatives directly from embeddings.

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

NEG_PER_POS = 3     # shuffled-negative rounds generated per split
RNG_SEED    = 42    # seeds both the negative-sampling generator and the final row shuffle

# --- paths to the original embeddings used for concatenation
# Built from the configured output folders, as the concatenation cell does, so
# editing the parameters cell redirects both places at once.
MAP_NPZ = PAIR_MAP_OUT / "embeddings.npz"
PRM_NPZ = PROMPT_OUT / "embeddings.npz"

def load_npz(npz_path: Path):
    """Load an embedding archive and return ``(E, ids)``.

    Args:
        npz_path: Path to a ``.npz`` file holding an ``"E"`` array and an
            ``"ids"`` array.

    Returns:
        ``E`` exactly as stored, and ``ids`` as a list of ``str`` (each stored
        id is converted with ``str``).

    Raises:
        KeyError: if ``"E"`` or ``"ids"`` is missing from the archive.
        ValueError: if the number of ids differs from the number of rows in
            ``E``; positions derived from ``ids`` would then point at the
            wrong rows.
    """
    # NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
    # execute arbitrary code; only load archives produced by this pipeline.
    # The context manager closes the underlying file once both arrays are read.
    with np.load(npz_path, allow_pickle=True) as z:
        E = z["E"]
        ids = [str(x) for x in z["ids"].tolist()]
    if E.ndim < 1 or E.shape[0] != len(ids):
        raise ValueError(
            f"{npz_path}: E has shape {E.shape} but there are {len(ids)} ids"
        )
    return E, ids

# Embedding matrices plus id -> row lookups for both modalities.
E_map, map_ids = load_npz(MAP_NPZ)   # shape (M, Dm)
E_prm, prm_ids = load_npz(PRM_NPZ)   # shape (P, Dp)
idx_map = dict(zip(map_ids, range(len(map_ids))))
idx_prm = dict(zip(prm_ids, range(len(prm_ids))))

# --- load split ID tables (created in the previous cell)
# `split_dir` is the folder the split cell wrote to; that cell must have run first.
J_tr, J_va, J_te = [
    pd.read_parquet(split_dir / table_name)
    for table_name in ("pairs_train.parquet", "pairs_val.parquet", "pairs_test.parquet")
]

# One shared generator drives every negative-sampling shuffle below.
rng = np.random.default_rng(RNG_SEED)

def make_pos_neg_tbl(join_df: pd.DataFrame, neg_per_pos: int) -> pd.DataFrame:
    """Build a shuffled table of labelled ``(map_id, prompt_id)`` pairs.

    Every row of ``join_df`` becomes a positive (``y = 1``). Negatives
    (``y = 0``) are produced in ``neg_per_pos`` rounds: each round permutes the
    ``prompt_id`` column against the fixed ``map_id`` column using the
    module-level ``rng``, then discards rows that reproduce a positive pair of
    this same ``join_df``. The result can therefore hold fewer than
    ``neg_per_pos * len(join_df)`` negatives, and negatives are not
    de-duplicated across rounds.

    Args:
        join_df: Table with at least ``map_id`` and ``prompt_id`` columns; both
            are cast to ``str``.
        neg_per_pos: Number of shuffle rounds. Zero or a negative value yields
            a positives-only table.

    Returns:
        DataFrame with columns ``map_id``, ``prompt_id``, ``y``, rows shuffled
        with ``random_state=RNG_SEED`` and a fresh 0..n-1 index.
    """
    join_df = join_df[["map_id","prompt_id"]].astype(str).copy()
    pos = join_df.copy()
    pos["y"] = 1

    negl = []
    maps = join_df["map_id"].to_numpy()
    prms = join_df["prompt_id"].to_numpy()
    for _ in range(neg_per_pos):
        shuf = prms.copy()
        rng.shuffle(shuf)
        neg = pd.DataFrame({"map_id": maps, "prompt_id": shuf})
        # drop any accidental positives
        neg = neg.merge(pos[["map_id","prompt_id"]], on=["map_id","prompt_id"],
                        how="left", indicator=True)
        neg = neg[neg["_merge"]=="left_only"].drop(columns="_merge")
        negl.append(neg)

    if negl:
        neg_all = pd.concat(negl, ignore_index=True)
        neg_all["y"] = 0
        all_tbl = pd.concat([pos, neg_all], ignore_index=True)
    else:
        # pd.concat([]) raises ValueError, so with no shuffle rounds the table
        # is just the positives.
        all_tbl = pos.reset_index(drop=True)

    all_tbl = all_tbl.sample(frac=1.0, random_state=RNG_SEED).reset_index(drop=True)
    return all_tbl

def table_to_features(tbl: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Turn labelled id pairs into a feature matrix and a label vector.

    Each ``(map_id, prompt_id)`` row is looked up in the module-level
    ``idx_map`` / ``idx_prm`` tables. Rows with an unknown id on either side
    are skipped. For the remaining rows the map embedding and the prompt
    embedding are placed side by side (map first) and cast to float32.

    Returns:
        ``(X, y)`` where ``y`` holds the ``y`` column of the kept rows as ints.

    Raises:
        SystemExit: if no row could be mapped to embeddings.
    """
    id_pairs = zip(tbl["map_id"].astype(str), tbl["prompt_id"].astype(str))
    # (row position, map row, prompt row); -1 marks an id with no embedding.
    located = [
        (pos, idx_map.get(map_id, -1), idx_prm.get(prompt_id, -1))
        for pos, (map_id, prompt_id) in enumerate(id_pairs)
    ]
    usable = [entry for entry in located if entry[1] >= 0 and entry[2] >= 0]
    if not usable:
        raise SystemExit("No pairs could be mapped to embeddings. Check IDs.")

    keep, m_idx, p_idx = (np.asarray(column, dtype=int) for column in zip(*usable))
    X = np.hstack([E_map[m_idx], E_prm[p_idx]]).astype(np.float32, copy=False)
    y = tbl.iloc[keep]["y"].to_numpy(dtype=int)
    return X, y

# Build features/labels for each split
# Tables are generated in train -> val -> test order, which is also the order in
# which the shared `rng` is consumed.
Y_tr_tbl, Y_va_tbl, Y_te_tbl = [
    make_pos_neg_tbl(split_pairs, NEG_PER_POS) for split_pairs in (J_tr, J_va, J_te)
]

(X_tr_b, y_tr), (X_va_b, y_va), (X_te_b, y_te) = [
    table_to_features(labelled) for labelled in (Y_tr_tbl, Y_va_tbl, Y_te_tbl)
]

# NOTE: despite the "Balanced" label, the splits hold up to NEG_PER_POS negatives
# per positive; the bincount printed at the end shows the actual train class counts.
print("Balanced sizes ->",
      "train:", X_tr_b.shape, "val:", X_va_b.shape, "test:", X_te_b.shape,
      "| class balance (train):", np.bincount(y_tr))

# --- scale (fit on train only)
scaler = StandardScaler().fit(X_tr_b)
X_tr_s, X_va_s, X_te_s = [scaler.transform(block) for block in (X_tr_b, X_va_b, X_te_b)]

# --- train + eval
clf = LogisticRegression(max_iter=2000, class_weight="balanced").fit(X_tr_s, y_tr)

def eval_split(Xs, ys):
    """Score the module-level ``clf`` on one split.

    Returns a dict with ``accuracy`` and ``f1`` computed from the predicted
    labels, and ``auroc`` computed from column 1 of ``predict_proba``. All
    values are plain floats.
    """
    labels = clf.predict(Xs)
    positive_scores = clf.predict_proba(Xs)[:, 1]
    report = {}
    report["accuracy"] = float(accuracy_score(ys, labels))
    report["f1"] = float(f1_score(ys, labels))
    report["auroc"] = float(roc_auc_score(ys, positive_scores))
    return report

# Validation and test scores of the logistic baseline, keyed by split name.
metrics = {
    split_name: eval_split(features, labels)
    for split_name, (features, labels) in (
        ("val", (X_va_s, y_va)),
        ("test", (X_te_s, y_te)),
    )
}
metrics

Balanced sizes -> train: (1254, 1508) val: (266, 1508) test: (270, 1508) | class balance (train): [939 315]


{'val': {'accuracy': 0.4849624060150376,
  'f1': 0.33170731707317075,
  'auroc': 0.493662341558539},
 'test': {'accuracy': 0.5185185185185185,
  'f1': 0.3434343434343434,
  'auroc': 0.49919918462434487}}

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np

# ---- width of the map block inside the concatenated features ----
# Taken from the loaded pair-map embeddings instead of a hard-coded number, so it
# follows the data (996 in the recorded run: 996 + 512 = 1508).
# NOTE: this rebinds the MAP_DIM set in the parameters cell (per-map width, 249).
MAP_DIM = int(E_map.shape[1])  # map | prompt -> MAP_DIM + prompt width
# -----------------------------------------------

def split_blocks(X, map_dim=None):
    """Split a concatenated feature matrix into its map and prompt column blocks.

    Args:
        X: 2-D array whose first ``map_dim`` columns are map features and whose
            remaining columns are prompt features.
        map_dim: Width of the map block. Defaults to the module-level
            ``MAP_DIM`` so existing one-argument calls behave as before.

    Returns:
        ``(Xm, Xp)``: the map columns and the prompt columns of ``X``.

    Raises:
        ValueError: if ``map_dim`` would leave either block empty; slicing
            would otherwise silently hand an empty block to the scaler.
    """
    if map_dim is None:
        map_dim = MAP_DIM
    if not 0 < map_dim < X.shape[1]:
        raise ValueError(
            f"map_dim={map_dim} does not split a matrix with {X.shape[1]} columns "
            "into two non-empty blocks"
        )
    Xm = X[:, :map_dim]
    Xp = X[:, map_dim:]
    return Xm, Xp

def block_scale_fit(X):
    """Fit one StandardScaler per block and return the scaled training matrix.

    The map block and the prompt block (as produced by ``split_blocks``) are
    each given their own scaler, fitted on that block only.

    Returns:
        ``(sm, sp, Xs)``: the fitted map scaler, the fitted prompt scaler, and
        the two scaled blocks stacked side by side, map columns first.
    """
    map_block, prompt_block = split_blocks(X)
    sm = StandardScaler()
    sp = StandardScaler()
    scaled_map = sm.fit_transform(map_block)
    scaled_prompt = sp.fit_transform(prompt_block)
    return sm, sp, np.hstack([scaled_map, scaled_prompt])

def block_scale_transform(X, sm, sp):
    """Scale ``X`` with already-fitted per-block scalers.

    ``sm`` transforms the map block, ``sp`` the prompt block; the scaled blocks
    are stacked side by side, map columns first.
    """
    blocks = split_blocks(X)
    scaled = [scaler.transform(block) for scaler, block in zip((sm, sp), blocks)]
    return np.hstack(scaled)

# We already have: X_tr_b, y_tr, X_va_b, y_va, X_te_b, y_te
# Scalers are fitted on the training matrix only and reused for val/test.
sm, sp, X_tr_s = block_scale_fit(X_tr_b)
X_va_s, X_te_s = [block_scale_transform(part, sm, sp) for part in (X_va_b, X_te_b)]

# ---- Rebalance train to 1:1 (downsample the majority class) ----
pos_mask = (y_tr == 1)
neg_mask = ~pos_mask
X_pos, y_pos = X_tr_s[pos_mask], y_tr[pos_mask]
X_neg, y_neg = X_tr_s[neg_mask], y_tr[neg_mask]

if len(X_pos) == 0 or len(X_neg) == 0:
    # fallback if something went wrong: a class is missing, keep the data as is
    X_bal, y_bal = X_tr_s, y_tr
elif len(X_neg) > len(X_pos):
    # more negatives than positives: draw as many negatives as there are positives
    X_neg_b, y_neg_b = resample(X_neg, y_neg, replace=False, n_samples=len(X_pos), random_state=42)
    X_bal = np.vstack([X_pos, X_neg_b])
    y_bal = np.hstack([y_pos, y_neg_b])
else:
    # otherwise draw positives down to the number of negatives
    X_pos_b, y_pos_b = resample(X_pos, y_pos, replace=False, n_samples=len(X_neg), random_state=42)
    X_bal = np.vstack([X_pos_b, X_neg])
    y_bal = np.hstack([y_pos_b, y_neg])

# ---- Try a stronger baseline (Gradient Boosting) ----
gb = GradientBoostingClassifier(random_state=42).fit(X_bal, y_bal)

def eval_model(model, Xs, ys):
    """Score a fitted classifier on one split.

    Args:
        model: Fitted estimator with ``predict`` and, optionally,
            ``predict_proba``.
        Xs: Feature matrix of the split.
        ys: True labels of the split.

    Returns:
        Dict with ``accuracy`` and ``f1``. ``auroc`` is added only when the
        model exposes ``predict_proba`` and ``ys`` contains exactly two
        distinct labels.
    """
    pred = model.predict(Xs)
    out = {
        "accuracy": float(accuracy_score(ys, pred)),
        "f1": float(f1_score(ys, pred)),
    }
    # Check the AUROC preconditions up front instead of catching every exception,
    # so a genuine failure (e.g. mismatched shapes) surfaces instead of silently
    # dropping the metric.
    if hasattr(model, "predict_proba") and len(np.unique(ys)) == 2:
        proba = model.predict_proba(Xs)[:, 1]
        out["auroc"] = float(roc_auc_score(ys, proba))
    return out

# Validation and test scores of the gradient-boosting model.
metrics_gb = {
    split_name: eval_model(gb, features, labels)
    for split_name, (features, labels) in (
        ("val", (X_va_s, y_va)),
        ("test", (X_te_s, y_te)),
    )
}
print("GradientBoosting:", metrics_gb)

# ---- Also try a tuned logistic as a comparison ----
# Regularisation strength is picked by 5-fold CV on the rebalanced training set.
logcv = LogisticRegressionCV(
    Cs=10,
    cv=5,
    max_iter=3000,
    class_weight="balanced",
    scoring="roc_auc",
    n_jobs=None,
)
logcv.fit(X_bal, y_bal)

metrics_lr = {
    split_name: eval_model(logcv, features, labels)
    for split_name, (features, labels) in (
        ("val", (X_va_s, y_va)),
        ("test", (X_te_s, y_te)),
    )
}
print("LogisticCV:", metrics_lr)

GradientBoosting: {'val': {'accuracy': 0.40601503759398494, 'f1': 0.3875968992248062, 'auroc': 0.538438460961524}, 'test': {'accuracy': 0.4666666666666667, 'f1': 0.3898305084745763, 'auroc': 0.5446272568433315}}
LogisticCV: {'val': {'accuracy': 0.45112781954887216, 'f1': 0.3706896551724138, 'auroc': 0.5031125778144454}, 'test': {'accuracy': 0.4962962962962963, 'f1': 0.3644859813084112, 'auroc': 0.505824111822947}}
