# Phase 2: Data Processing & Embeddings

**Goal:** Create a vector database of nuPlan scenarios for retrieval-augmented generation (RAG)

**Team:** Karina Shah, Dhruvina Gujarati, Nilay Kumar, Nishanth Krishna Churchmal

**Course:** CSE 475 - Fall 2025


## Pipeline Overview

nuPlan DB → Scenarios → Text Descriptions → Embeddings → FAISS Index → Retrieval


In [1]:
%pip install "sentence-transformers>=2.2.0,<4.0.0" --upgrade


Collecting sentence-transformers<4.0.0,>=2.2.0
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers<4.0.0,>=2.2.0)
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers<4.0.0,>=2.2.0)
  Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Using cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
Installing collected packages: tokenizers, transformers, sentence-transformers
[2K  Attempting uninstall: tokenizers
[2K    Found existing installation: tokenizers 0.15.2
[2K    Uninstalling tokenizers-0.15.2:
[2K      Successfully uninstalled tokenizers-0.15.2
[2K  Attempting u

In [8]:
import sys
!{sys.executable} -m pip install pyarrow


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyarrow
  Using cached pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Using cached pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl (42.7 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-21.0.0


In [12]:
!pip install fastparquet


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading fastparquet-2024.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m17.3 MB/s[0m  [33m0:00:00[0m
[?25hDownloading cramjam-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.9 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [fastparquet]
[1A[2KSuccessfully installed cramjam-2.11.0 fastparquet-2024.11.0


In [2]:
import sys
!{sys.executable} -m pip install sentence-transformers --upgrade

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Using cached sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.4.1
    Uninstalling sentence-transformers-3.4.1:
      Successfully uninstalled sentence-transformers-3.4.1
Successfully installed sentence-transformers-5.1.2


In [3]:
import os
import sqlite3
from pathlib import Path
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from tqdm.auto import tqdm

NUPLAN_DATA_ROOT = Path(os.environ["NUPLAN_DATA_ROOT"])
NUPLAN_EXP_ROOT = Path(os.environ["NUPLAN_EXP_ROOT"])

DB_DIR = Path(os.environ["NUPLAN_DATA_ROOT"]) / "nuplan-v1.1" / "splits" / "mini"
print("DB dir:", DB_DIR)

db_files = sorted(DB_DIR.glob("*.db"))
print(f"Found {len(db_files)} DB files")
db_file = db_files[0]
print("Using DB:", db_file)

INDEX_DIR = NUPLAN_EXP_ROOT / "rag_index"
INDEX_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = INDEX_DIR / "faiss_index.bin"
METADATA_PATH = INDEX_DIR / "metadata.parquet"
INDEX_DIR

  from .autonotebook import tqdm as notebook_tqdm


DB dir: /home/nilayjkumar/nuplan/dataset/nuplan-v1.1/splits/mini
Found 64 DB files
Using DB: /home/nilayjkumar/nuplan/dataset/nuplan-v1.1/splits/mini/2021.05.12.22.00.38_veh-35_01008_01518.db


PosixPath('/home/nilayjkumar/nuplan/exp/rag_index')

In [4]:
conn = sqlite3.connect(str(db_file))
query = """
SELECT 
    token               AS scenario_id,
    type                AS scenario_type,
    lidar_pc_token      AS lidar_pc_token
FROM scenario_tag
"""
df = pd.read_sql_query(query, conn)
conn.close()
print("Num scenarios:", len(df))
df.head()

Num scenarios: 13812


Unnamed: 0,scenario_id,scenario_type,lidar_pc_token
0,"b'\x18,\nF\x81\x1d[='",low_magnitude_speed,"b'&*Kz\x1f,Z\xb6'"
1,b'\x06]\x86_\xf3C^X',low_magnitude_speed,b'\\YL\x7fU>_\xb9'
2,b'\xc8gu\xa2uKPv',low_magnitude_speed,b'\x92\xb7\xf6\x9d\x95DXc'
3,b'{\xa9\x16;\x95yW\xbf',low_magnitude_speed,b'\x90\x0f\x14\xe9\xcf\xfdY\xdf'
4,b'g\xc5pY\xcdXTs',low_magnitude_speed,b'.y\x9d\xee\x12\xda]<'


In [5]:
def build_text(row):
    return (
        f"Scenario type: {row['scenario_type']} | "
        f"Lidar token: {row['lidar_pc_token']} | "
        f"Scenario id: {row['scenario_id']}"
    )
df["text"] = df.apply(build_text, axis=1)
df[["scenario_id", "scenario_type", "text"]].head()

Unnamed: 0,scenario_id,scenario_type,text
0,"b'\x18,\nF\x81\x1d[='",low_magnitude_speed,Scenario type: low_magnitude_speed | Lidar tok...
1,b'\x06]\x86_\xf3C^X',low_magnitude_speed,Scenario type: low_magnitude_speed | Lidar tok...
2,b'\xc8gu\xa2uKPv',low_magnitude_speed,Scenario type: low_magnitude_speed | Lidar tok...
3,b'{\xa9\x16;\x95yW\xbf',low_magnitude_speed,Scenario type: low_magnitude_speed | Lidar tok...
4,b'g\xc5pY\xcdXTs',low_magnitude_speed,Scenario type: low_magnitude_speed | Lidar tok...


In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = df["text"].tolist()
embeddings = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
embeddings = embeddings.astype("float32")
embeddings.shape

Batches: 100%|████████████████████████████████████████████████████████████████████████| 216/216 [02:10<00:00,  1.66it/s]


(13812, 384)

In [14]:
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embeddings)
print("Vectors in index:", index.ntotal)

# Save FAISS index
faiss.write_index(index, str(INDEX_PATH))

# Save metadata
metadata = df[["scenario_id", "scenario_type", "lidar_pc_token", "text"]].copy()

# Convert the binary ID fields to hex strings so they’re parquet-friendly
for col in ["scenario_id", "lidar_pc_token"]:
    metadata[col] = metadata[col].apply(
        lambda x: x.hex() if isinstance(x, (bytes, bytearray)) else str(x)
    )

# Write using fastparquet engine
metadata.to_parquet(METADATA_PATH, index=False, engine="fastparquet")

INDEX_PATH, METADATA_PATH


Vectors in index: 13812


(PosixPath('/home/nilayjkumar/nuplan/exp/rag_index/faiss_index.bin'),
 PosixPath('/home/nilayjkumar/nuplan/exp/rag_index/metadata.parquet'))

In [16]:
index = faiss.read_index(str(INDEX_PATH))
metadata = pd.read_parquet(METADATA_PATH, engine="fastparquet")


def embed_query(query: str):
    q_emb = model.encode(
        [query],
        normalize_embeddings=True,
        convert_to_numpy=True,
    ).astype("float32")
    return q_emb

def search(query: str, k: int = 5):
    q_emb = embed_query(query)
    scores, idxs = index.search(q_emb, k)
    idxs = idxs[0]
    scores = scores[0]
    results = metadata.iloc[idxs].copy()
    results["score"] = scores
    return results

search("hard braking scenario", k=5) # Sanity check

Unnamed: 0,scenario_id,scenario_type,lidar_pc_token,text,score
12945,2a6d704894b6565e,stationary_in_traffic,d5883187cabe585c,Scenario type: stationary_in_traffic | Lidar t...,0.260185
12217,a93cfb1ba7ab5b6c,stationary_in_traffic,941adc13b541550b,Scenario type: stationary_in_traffic | Lidar t...,0.238049
13547,7fa49b3c49fd5e6d,stationary_in_traffic,e1c741a273ff59b1,Scenario type: stationary_in_traffic | Lidar t...,0.23331
2120,aef45d4895d858c4,near_high_speed_vehicle,39a2b7adc11b5e66,Scenario type: near_high_speed_vehicle | Lidar...,0.232866
2239,b091210baf135931,near_high_speed_vehicle,71d503f349005c36,Scenario type: near_high_speed_vehicle | Lidar...,0.232294
