# ShopTalk – LLM Integration & End-to-End RAG Pipeline

**Project:** ShopTalk – AI-Powered Shopping Assistant  
**Dataset:** [Amazon Berkeley Objects (ABO)](https://amazon-berkeley-objects.s3.amazonaws.com/index.html)  
**Author:** Balaji Gurusala  
**Notebook Scope:** T000d+ from `.spec/tasks.md` – LLM Integration, RAG Generation, Model Comparison  
**Prerequisite:** `03-rag-prototype.ipynb` must have been run (produces search indexes + config in `/kaggle/working/`)  
**Environment:** Local MacBook (Apple Silicon / MPS) or Kaggle (CUDA) or CPU

---

### Purpose

Complete the **Generation** half of the RAG pipeline by integrating Large Language Models.  
NB03 built the **Retrieval** engine (hybrid search, P@5 = 0.775). This notebook adds:

| Component | Details |
|-----------|--------|
| **LLM Integration** | Ollama Llama 3.x (local default) + OpenAI GPT-4o-mini (proprietary benchmark) + Groq Llama-3.3-70B (cloud fallback on Kaggle) |
| **Prompt Engineering** | System prompt, context formatting, few-shot examples |
| **RAG Pipeline** | query → hybrid_search → context assembly → LLM generation → response |
| **Model Comparison** | Side-by-side GPT-4o-mini vs Llama-3.3 on quality, latency, cost |
| **Evaluation** | RAGAS-style metrics: Faithfulness, Answer Relevance, Latency |
| **Synthetic Prices** | Category-based price generation for filter demo |
| **Session Context** | Multi-turn conversation support for follow-up queries |

### Inputs (from NB03)

| Artifact | Description |
|----------|-------------|
| `rag_products.pkl` | Full product catalogue (9,190 rows × 51 cols) |
| `rag_text_index.npy` | Pre-normalised SentenceTransformer embeddings (9190 × 384) |
| `rag_image_index.npy` | Pre-normalised CLIP image embeddings (9190 × 512) |
| `rag_config.json` | Hyperparameters, model IDs, benchmark results |

### Outputs

| Artifact | Description |
|----------|-------------|
| `llm_config.json` | LLM settings, prompt templates, evaluation results |
| `llm_evaluation.csv` | Detailed evaluation results for 50+ queries |
| `products_with_prices.pkl` | Products with synthetic prices for UI demo |
| `streamlit_app/` | Production-ready Streamlit app code |

---

## Step 0 – Environment Setup & Hardware Detection

In [None]:
# ============================================================
# Step 0: Environment Setup
# ============================================================
import sys, os, json, time, re, warnings, hashlib
from pathlib import Path
from typing import Optional, List, Dict, Tuple, Any
from datetime import datetime

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

print(f"Python {sys.version}")

# --- Hardware Detection (per constitution.md) ---
import torch
import numpy as np
import pandas as pd

# Accelerator preference: CUDA first, then Apple-Silicon MPS, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

# Only CUDA reports a device name here; for MPS/CPU the backend label is reused.
if DEVICE == "cuda":
    GPU_NAME = torch.cuda.get_device_name(0)
else:
    GPU_NAME = DEVICE
print(f"PyTorch {torch.__version__}")
print(f"Device: {DEVICE} ({GPU_NAME})")

# --- Platform Detection ---
# The existence of /kaggle/working is used as the marker for a Kaggle session.
ON_KAGGLE = Path("/kaggle/working").exists()
print(f"Platform: {'Kaggle' if ON_KAGGLE else 'Local'}")

# --- Install extra deps if needed ---
def _install(*pkgs):
    """Install any of *pkgs* that are not already available, using pip.

    Each entry is a pip requirement string, optionally pinned with "=="
    (e.g. "foo==1.2"). A package counts as present when its distribution
    metadata is found or, failing that, when a module named like the
    distribution (hyphens replaced by underscores) can be located. Anything
    still missing is installed quietly with the running interpreter's pip.

    Raises:
        subprocess.CalledProcessError: if a pip invocation exits non-zero.
    """
    import importlib.metadata
    import importlib.util
    import subprocess

    for pkg in pkgs:
        dist_name = pkg.split("==")[0].strip()

        # Probe by *distribution* name: it can differ from the import name
        # (e.g. "python-dotenv" is imported as "dotenv"), and a metadata lookup
        # avoids importing heavy packages just to see whether they exist.
        try:
            importlib.metadata.distribution(dist_name)
            continue
        except (importlib.metadata.PackageNotFoundError, ValueError):
            pass

        # Fallback for modules that ship without distribution metadata.
        try:
            if importlib.util.find_spec(dist_name.replace("-", "_")) is not None:
                continue
        except (ImportError, ValueError):
            pass

        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

# Third-party packages used by the LLM / RAG cells below. _install() only
# invokes pip for the ones it cannot already find.
_EXTRA_PACKAGES = (
    "langchain", "langchain-core", "langchain-openai", "langchain-groq",
    "langchain-ollama", "openai", "groq", "python-dotenv", "tiktoken", "httpx",
    "chromadb",
)
_install(*_EXTRA_PACKAGES)

print("\n\u2713 Environment ready")

---

## Step 1 – Load NB03 Artifacts & Prepare Data

In [None]:
# ============================================================
# Step 1: Load NB03 artifacts
# ============================================================

# --- Resolve data directory ---
# On Kaggle: NB03 output is attached as a dataset version
# Locally: look in ../data/ or current directory
# Locations searched, in order, for the NB03 artifacts.
_candidates = [
    Path("/kaggle/input/shoptalk-rag-prototype"),   # Kaggle dataset
    Path("/kaggle/working"),                         # Same-session Kaggle
    Path("../data"),                                 # Local dev
    Path("."),                                       # Fallback
]

# The first candidate that actually contains the product catalogue wins.
DATA_DIR = next(
    (d for d in _candidates if (d / "rag_products.pkl").exists()),
    None,
)
if DATA_DIR is None:
    # Raised explicitly rather than asserted: asserts are stripped under
    # `python -O`, which would defer the failure to a confusing error later.
    raise FileNotFoundError(
        f"Cannot find rag_products.pkl in any of: {[str(d) for d in _candidates]}. "
        "Run 03-rag-prototype.ipynb first."
    )

# Outputs go to the Kaggle working directory on Kaggle, else to ../data.
EXPORT_DIR = Path("/kaggle/working") if ON_KAGGLE else Path("../data")
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory:   {DATA_DIR}")
print(f"Export directory:  {EXPORT_DIR}")

# --- Load products ---
# NOTE: unpickling can execute arbitrary code; only load artifacts that were
# produced by your own NB03 run.
df = pd.read_pickle(DATA_DIR / "rag_products.pkl")
print(f"\nProducts loaded: {len(df):,} rows x {len(df.columns)} cols")

# --- Load embedding indexes ---
TEXT_INDEX = np.load(DATA_DIR / "rag_text_index.npy")
IMAGE_INDEX = np.load(DATA_DIR / "rag_image_index.npy")
print(f"Text index:   {TEXT_INDEX.shape} (dtype={TEXT_INDEX.dtype})")
print(f"Image index:  {IMAGE_INDEX.shape} (dtype={IMAGE_INDEX.dtype})")

# --- Load RAG config ---
# Explicit encoding so the result does not depend on the platform default.
with open(DATA_DIR / "rag_config.json", encoding="utf-8") as f:
    RAG_CONFIG = json.load(f)
print(f"RAG config:   {list(RAG_CONFIG.keys())}")

# --- Verify alignment ---
# Row i of the DataFrame must correspond to row i of both indexes. This is a
# real runtime check, so it raises instead of using assert (stripped under -O).
if not (len(df) == TEXT_INDEX.shape[0] == IMAGE_INDEX.shape[0]):
    raise ValueError(
        "Index/DataFrame size mismatch! "
        f"products={len(df)}, text_index={TEXT_INDEX.shape[0]}, "
        f"image_index={IMAGE_INDEX.shape[0]}"
    )
print(f"\n\u2713 All artifacts loaded and aligned ({len(df):,} products)")

In [None]:
# ============================================================
# Step 1b: Generate Synthetic Prices (ABO has no price field)
# ============================================================
# Per data-model.md: "ABO has no price field; synthetic prices
# assigned during ingestion if price filtering is needed."

# Seed NumPy's global RNG so the synthetic prices drawn below are reproducible
# from run to run.
np.random.seed(42)

# Price ranges by product category (realistic for demo)
# Maps an upper-case category key to a (low, high) price range.
CATEGORY_PRICE_RANGES = {
    "SHOES":                  (29.99, 149.99),
    "SHIRT":                  (14.99, 59.99),
    "T_SHIRT":                (9.99,  39.99),
    "CELLULAR_PHONE_CASE":    (7.99,  39.99),
    "HOME":                   (12.99, 89.99),
    "HOME_BED_AND_BATH":      (14.99, 79.99),
    "HARDWARE":               (4.99,  49.99),
    "HARDWARE_HANDLE":        (5.99,  29.99),
    "THERMOPLASTIC_FILAMENT": (15.99, 45.99),
    "FURNITURE":              (49.99, 399.99),
    "OTTOMAN":                (39.99, 199.99),
    "CHAIR":                  (59.99, 299.99),
    "TABLE":                  (49.99, 349.99),
    "LIGHTING":               (14.99, 89.99),
    "LAMP":                   (19.99, 99.99),
    "WATCH":                  (24.99, 199.99),
    "LUGGAGE":                (29.99, 149.99),
    "FINERING":               (9.99,  89.99),
}
# Range used for any category that has no entry in the table above.
DEFAULT_RANGE = (9.99, 99.99)

def generate_price(category: str) -> float:
    """Draw a synthetic price for *category*, rounded to two decimals.

    The category is upper-cased and looked up in CATEGORY_PRICE_RANGES;
    categories without an entry use DEFAULT_RANGE. The value is sampled
    uniformly from the (low, high) range using NumPy's global random state.
    """
    bounds = CATEGORY_PRICE_RANGES.get(str(category).upper(), DEFAULT_RANGE)
    return round(np.random.uniform(bounds[0], bounds[1]), 2)

def _price_for(product_type):
    """Price a product from the first "/"-separated segment of its type (empty if null)."""
    top_level = str(product_type).split("/")[0] if pd.notna(product_type) else ""
    return generate_price(top_level)


# Prices are generated only when the column is absent or entirely null.
_has_prices = "price" in df.columns and not df["price"].isna().all()
if _has_prices:
    print(f"\u2713 Prices already present")
else:
    df["price"] = df["product_type_flat"].apply(_price_for)
    print(f"\u2713 Synthetic prices generated for {len(df):,} products")
    print(f"  Price range: ${df['price'].min():.2f} - ${df['price'].max():.2f}")
    print(f"  Mean: ${df['price'].mean():.2f} | Median: ${df['price'].median():.2f}")

# Quick distribution by top categories
print("\nPrice by top categories:")
_top_categories = df["product_type_flat"].value_counts().head(8).index
for cat in _top_categories:
    subset = df.loc[df["product_type_flat"] == cat, "price"]
    print(f"  {cat:30s}  ${subset.mean():6.2f} avg  (n={len(subset)})")

---

## Step 2 – Load Search Models (SentenceTransformer + CLIP)

In [None]:
# ============================================================
# Step 2: Load Search Models (loaded once per constitution.md)
# ============================================================

from sentence_transformers import SentenceTransformer
from transformers import CLIPModel, CLIPProcessor

# Model identifiers come from NB03's exported config, with defaults used when
# a key is absent.
ST_MODEL_ID = RAG_CONFIG.get("text_model_id", "all-MiniLM-L6-v2")
CLIP_MODEL_ID = RAG_CONFIG.get("image_model_id", "openai/clip-vit-base-patch32")

# Text encoder, placed on the device detected in Step 0.
print(f"Loading SentenceTransformer: {ST_MODEL_ID}")
t0 = time.time()
st_model = SentenceTransformer(ST_MODEL_ID, device=DEVICE)
print(f"  \u2713 Loaded in {time.time()-t0:.1f}s")

# CLIP model (moved to DEVICE, switched to eval mode) plus its processor.
print(f"Loading CLIP: {CLIP_MODEL_ID}")
t0 = time.time()
clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID).to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
print(f"  \u2713 Loaded in {time.time()-t0:.1f}s")

print(f"\n\u2713 All search models loaded on {DEVICE}")

---

## Step 3 – Load Search Pipeline + ChromaDB

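Rather than maintaining a second hand-written copy of NB03's search code, this cell:
1. **Imports** the shared `src/search.py` module when it is available (local runs; single source of truth for search logic)
2. **Defines an inline fallback implementation** of the same functions when the module cannot be imported (Kaggle/Colab), so the notebook stays self-contained
3. **Loads ChromaDB** from NB03's persisted directory (if available) for Stage-1 retrieval
4. **Falls back** to in-memory NumPy if ChromaDB isn't found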

Only the **query encoding** models (loaded in Step 2) and the **shared search module** are needed.  
The product embeddings are already indexed — either in ChromaDB or the `.npy` files loaded in Step 1.

In [None]:
# ============================================================
# Step 3: Search Pipeline + ChromaDB
# ============================================================
# LOCAL: imports from src/search.py (single source of truth)
# KAGGLE: defines everything inline (self-contained, zero setup)

import re

# ---------- try the shared module first ----------
# Make the project root (the parent of the notebook directory) importable so
# that `src.search` can be resolved on local checkouts.
project_root = Path("..").resolve()
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

try:
    from src.search import (
        hybrid_search, l2_normalize, apply_rerank,
        compute_dynamic_alpha, retrieve_inmemory,
    )
except ImportError:
    # Not fatal: the inline fallback below is defined when this flag is False.
    _SEARCH_IMPORTED = False
else:
    _SEARCH_IMPORTED = True
    print("\u2713 Imported search pipeline from src/search.py (local)")

# ---------- inline fallback (Kaggle / Colab) ----------
if not _SEARCH_IMPORTED:
    print("\u2139 src/search.py not found — defining search inline (Kaggle mode)")

    # --- constants ---
    # Default weight of the text score in the hybrid blend
    # (alpha * text + (1 - alpha) * image); taken from NB03's config when present.
    ALPHA_DEFAULT_CFG = RAG_CONFIG.get("alpha_default", 0.6)
    # Additive bonuses applied by apply_rerank() for token overlap with the
    # combined product text, the title, and the product type respectively.
    LEXICAL_WEIGHT  = 0.16
    TITLE_WEIGHT    = 0.10
    TYPE_WEIGHT     = 0.06
    # Multiplicative penalties: a penalised row's score is scaled by (1 - penalty).
    HEAD_NOUN_MISS_PENALTY = 0.50
    GENDER_MISS_PENALTY    = 0.40
    # Rows scoring at least this much are preferred when enough of them exist.
    LOW_CONFIDENCE_CUTOFF  = 0.30

    # Tokens dropped by _tok() before any overlap / cue matching.
    QUERY_STOPWORDS = {
        "for","with","and","the","a","an","in","on","to","of","by","from",
        "best","good","new","comfortable","great","nice","high","quality",
        "s","under","below","above","less","than","more","about",
        "show","me","find","get","want","need","looking","search",
    }
    # Tokens used to infer the gender intent of a query and of a product text.
    FEMALE_TOKENS = {"women","woman","female","ladies","lady","girls","girl","womens"}
    MALE_TOKENS   = {"men","man","male","mens","boys","boy"}
    # Query token -> set of product-type names that the token implies.
    ABO_CATEGORY_HINTS = {
        "shoe":{"SHOES"},"shoes":{"SHOES"},"sneaker":{"SHOES"},"sneakers":{"SHOES"},
        "boot":{"SHOES"},"boots":{"SHOES"},"sandal":{"SHOES"},"sandals":{"SHOES"},
        "filament":{"THERMOPLASTIC_FILAMENT","MECHANICAL_COMPONENTS"},
        "pla":{"THERMOPLASTIC_FILAMENT"},"abs":{"THERMOPLASTIC_FILAMENT"},
        "3d":{"THERMOPLASTIC_FILAMENT"},"printer":{"THERMOPLASTIC_FILAMENT"},
        "phone":{"CELLULAR_PHONE_CASE"},"case":{"CELLULAR_PHONE_CASE"},
        "cover":{"CELLULAR_PHONE_CASE"},
        "drawer":{"HARDWARE"},"slides":{"HARDWARE"},"slide":{"HARDWARE"},
        "handle":{"HARDWARE_HANDLE"},"hardware":{"HARDWARE","HARDWARE_HANDLE"},
        "shirt":{"SHIRT","T_SHIRT"},"tshirt":{"SHIRT","T_SHIRT"},
        "t-shirt":{"SHIRT","T_SHIRT"},"polo":{"SHIRT"},
        "pillow":{"HOME_BED_AND_BATH","HOME"},"sheet":{"HOME_BED_AND_BATH"},
        "curtain":{"HOME_BED_AND_BATH"},"towel":{"HOME_BED_AND_BATH"},
        "ottoman":{"OTTOMAN","FURNITURE"},"chair":{"CHAIR","FURNITURE"},
        "table":{"TABLE","FURNITURE"},"lamp":{"LIGHTING","LAMP"},
        "watch":{"WATCH"},"backpack":{"LUGGAGE"},
    }
    # Query tokens that lower alpha (more image weight) in compute_dynamic_alpha().
    VISUAL_CUES    = {"colorful","patterned","floral","striped","printed","design",
                      "aesthetic","stylish","cute","pretty","beautiful"}
    # Query tokens that raise alpha (more text weight) in compute_dynamic_alpha().
    TECHNICAL_CUES = {"inch","mm","kg","watt","volt","capacity","specs",
                      "compatible","mount","gauge","thread","count"}

    # --- helpers ---
    def _tok(t):
        """Tokenise *t* for lexical matching.

        The text is lower-cased, every character other than letters, digits,
        hyphens and whitespace becomes a space, and the whitespace-split words
        are returned in order with QUERY_STOPWORDS removed.
        """
        cleaned = re.sub(r"[^a-zA-Z0-9\-\s]"," ",str(t).lower())
        words = cleaned.split()
        return [word for word in words if word not in QUERY_STOPWORDS]

    def _overlap(q, t):
        """Return the fraction of distinct tokens of *q* that also occur in *t*.

        Yields 0.0 when *q* has no tokens left after tokenisation.
        """
        query_tokens = set(_tok(q))
        if not query_tokens:
            return 0.0
        shared = query_tokens.intersection(_tok(t))
        return len(shared) / len(query_tokens)

    def _expand(tokens):
        """Return *tokens* as a set, plus known category words found inside hyphenated tokens.

        A hyphenated token contributes each of its non-empty parts that is a
        key of ABO_CATEGORY_HINTS.
        """
        expanded = set(tokens)
        for token in tokens:
            if "-" not in token:
                continue
            expanded.update(
                part for part in token.split("-")
                if part and part in ABO_CATEGORY_HINTS
            )
        return expanded

    def l2_normalize(x):
        """Scale each row of the 2-D array *x* to unit L2 norm.

        Rows whose norm is zero are divided by 1 and therefore stay all-zero.
        """
        norms = np.linalg.norm(x, axis=1, keepdims=True)
        norms[norms == 0] = 1  # avoid dividing zero rows by zero
        return x / norms

    def compute_dynamic_alpha(query):
        """Choose the text/image blend weight for *query*.

        Starts from ALPHA_DEFAULT_CFG, subtracts 0.15 per visual cue token and
        adds 0.10 per technical signal (technical cue tokens, plus one if the
        raw query contains a digit). Each adjustment counts at most two
        signals, and the result is clamped to [0.2, 0.9].
        """
        tokens = set(_tok(query))
        visual_hits = len(tokens & VISUAL_CUES)
        technical_hits = len(tokens & TECHNICAL_CUES)
        has_digit = re.search(r"\d", query) is not None

        # With zero hits each term is 0.0, so no branching is needed.
        alpha = ALPHA_DEFAULT_CFG - 0.15 * min(visual_hits, 2)
        alpha = alpha + 0.10 * min(technical_hits + int(has_digit), 2)
        return max(0.2, min(0.9, alpha))

    def apply_rerank(results, query, top_k):
        """Re-score and re-order candidate rows using lexical and intent signals.

        Args:
            results: Candidate rows; must carry a "hybrid_score" column. Text
                columns (item_name_flat, enriched_text, product_type_flat) are
                read with .get(), so a missing one is treated as the string "".
            query: Raw user query.
            top_k: Number of rows to return.

        Returns:
            At most top_k rows sorted by the adjusted score, with
            "hybrid_score" overwritten by that score and a 1-based "_rank"
            column added. An empty input is returned unchanged.
        """
        if results.empty: return results
        # Category words in the query (expanded through hyphenated tokens) and
        # the product types those words imply.
        hn = _expand({t for t in _tok(query) if t in ABO_CATEGORY_HINTS})
        exp_types = set()
        for t in _tok(query): exp_types |= ABO_CATEGORY_HINTS.get(t, set())
        # Gender intent is only inferred when the query names exactly one gender.
        q_g = set(_tok(query))
        gi = "female" if (q_g & FEMALE_TOKENS) and not (q_g & MALE_TOKENS) else \
             "male"   if (q_g & MALE_TOKENS)   and not (q_g & FEMALE_TOKENS) else None
        adj = results.copy()
        # Token-overlap features: combined text (lex), title only (tol), type only (tyo).
        adj["lex"] = adj.apply(lambda r: _overlap(query, " ".join([
            str(r.get("item_name_flat","")), str(r.get("enriched_text","")),
            str(r.get("product_type_flat",""))])), axis=1)
        adj["tol"] = adj.apply(lambda r: _overlap(query, str(r.get("item_name_flat",""))), axis=1)
        adj["tyo"] = adj.apply(lambda r: _overlap(query, str(r.get("product_type_flat",""))), axis=1)
        # Base score plus weighted lexical bonuses.
        adj["fs"] = adj["hybrid_score"] + LEXICAL_WEIGHT*adj["lex"] + TITLE_WEIGHT*adj["tol"] + TYPE_WEIGHT*adj["tyo"]
        # Per-row multipliers: hm for a missing head noun, gm for a gender mismatch.
        hm, gm = [], []
        for _, row in adj.iterrows():
            hay = " ".join([str(row.get("item_name_flat","")),str(row.get("enriched_text","")),
                            str(row.get("product_type_flat",""))]).lower()
            ht = set(_tok(hay)); he = _expand(ht)
            h = 1.0
            # Penalise only when none of the query's category words appear in the
            # product text AND the product type matches none of the implied types.
            if hn and not (hn & he):
                pt = str(row.get("product_type_flat","")).upper()
                if not (exp_types and any(t in pt for t in exp_types)):
                    h = 1.0 - HEAD_NOUN_MISS_PENALTY
            g = 1.0
            # Penalise rows that mention only the opposite gender to the query's.
            if gi=="female" and (ht & MALE_TOKENS) and not (ht & FEMALE_TOKENS): g=1.0-GENDER_MISS_PENALTY
            elif gi=="male" and (ht & FEMALE_TOKENS) and not (ht & MALE_TOKENS): g=1.0-GENDER_MISS_PENALTY
            hm.append(h); gm.append(g)
        adj["fs"] *= np.array(hm)*np.array(gm)
        adj = adj.sort_values("fs", ascending=False).reset_index(drop=True)
        # Use only rows at or above the cutoff when there are at least top_k of
        # them; otherwise fall back to the plain top_k so the result is not short.
        strong = adj[adj["fs"]>=LOW_CONFIDENCE_CUTOFF]
        out = strong.head(top_k).copy() if len(strong)>=top_k else adj.head(top_k).copy()
        out["hybrid_score"] = out["fs"]; out["_rank"] = np.arange(1,len(out)+1)
        return out

    def hybrid_search(query, df, text_index, image_index, encode_text_fn,
                      encode_clip_fn, top_k=5, alpha=None, price_max=None,
                      category=None, rerank=True, **kw):
        """Score every product against *query* and return the best matches.

        Args:
            query: Raw user query.
            df: Product table; rows are selected positionally (iloc) by score order.
            text_index, image_index: Embedding matrices multiplied against the
                encoded query. Assumed to be (n_products, dim) and row-aligned
                with df — TODO confirm against the NB03 export.
            encode_text_fn, encode_clip_fn: Callables mapping the query string
                to its text / CLIP embedding.
            top_k: Number of rows to return.
            alpha: Weight of the text score. When None it comes from
                compute_dynamic_alpha() if rerank is on, else ALPHA_DEFAULT_CFG.
            price_max: Optional upper bound on the "price" column.
            category: Optional substring matched (upper-cased) against
                "product_type_flat".
            rerank: Whether to post-process candidates with apply_rerank().
            **kw: Extra keyword arguments are accepted and ignored by this
                inline version.

        Returns:
            DataFrame of up to top_k rows with "hybrid_score" and "_rank"
            columns and attrs["alpha_used"]; an empty frame is returned as-is
            when the filters remove every candidate.
        """
        if alpha is None: alpha = compute_dynamic_alpha(query) if rerank else ALPHA_DEFAULT_CFG
        qt = encode_text_fn(query); qc = encode_clip_fn(query)
        # Per-product similarity from each index, then the alpha-weighted blend.
        ts = (text_index @ qt.T).squeeze(); ims = (image_index @ qc.T).squeeze()
        scores = alpha*ts + (1-alpha)*ims
        # Candidate pool: the larger of 12 * top_k and 80, capped at the catalogue size.
        nf = min(len(df), max(top_k*12, 80))
        ti = np.argsort(scores)[::-1][:nf]
        res = df.iloc[ti].copy(); res["hybrid_score"] = scores[ti]; res = res.reset_index(drop=True)
        # Filters run on the candidate pool only, so fewer than top_k rows may survive.
        if price_max is not None and "price" in res.columns:
            res = res[res["price"]<=price_max].reset_index(drop=True)
        if category and "product_type_flat" in res.columns:
            res = res[res["product_type_flat"].str.upper().str.contains(category.upper(),na=False)].reset_index(drop=True)
        if res.empty: return res
        res.attrs["alpha_used"] = alpha
        if rerank: res = apply_rerank(res, query, top_k)
        else: res = res.sort_values("hybrid_score",ascending=False).head(top_k).copy(); res["_rank"]=np.arange(1,len(res)+1)
        # Set again on the final frame, which is a new object after rerank/sort.
        res.attrs["alpha_used"] = alpha
        return res

    print("  \u2713 Inline search pipeline defined (all functions available)")

# ============================================================
# Query encoder closures (capture the loaded models from Step 2)
# ============================================================
def encode_text(query: str) -> np.ndarray:
    """Embed *query* with the loaded SentenceTransformer and return float32.

    The query is encoded as a one-item batch; normalisation is delegated to
    the encoder via normalize_embeddings=True.
    """
    return st_model.encode(
        [query],
        show_progress_bar=False,
        normalize_embeddings=True,
    ).astype(np.float32)

def encode_clip(query: str) -> np.ndarray:
    """Embed *query* with the CLIP text encoder and L2-normalise the result.

    The query is processed as a one-item batch on DEVICE with gradients
    disabled; the features are moved to the CPU and returned as float32.
    """
    batch = clip_processor(
        text=[query], return_tensors="pt", padding=True, truncation=True
    )
    batch = batch.to(DEVICE)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**batch)
    return l2_normalize(text_features.cpu().numpy().astype(np.float32))

# ============================================================
# Try to connect to NB03's persisted ChromaDB
# ============================================================
# Handles stay None unless a ChromaDB directory is opened completely and
# successfully; search() forwards them as-is.
TEXT_COLLECTION = None
IMAGE_COLLECTION = None
ITEM_ID_TO_IDX = None

chroma_candidates = [
    DATA_DIR / "chroma_db",
    DATA_DIR / ".." / "chroma_db",
    Path("/kaggle/input/shoptalk-rag-prototype/chroma_db"),
]

for chroma_path in chroma_candidates:
    # is_dir() guards iterdir(), which raises on a path that is not a directory.
    if not (chroma_path.is_dir() and any(chroma_path.iterdir())):
        continue
    try:
        import chromadb
        chroma_client = chromadb.PersistentClient(path=str(chroma_path))
        _text_collection = chroma_client.get_collection("products_text_v1")
        _image_collection = chroma_client.get_collection("products_image_v1")
        _text_count = _text_collection.count()
        _image_count = _image_collection.count()
        # Maps str(item_id) -> the DataFrame index label of that row.
        # NOTE(review): labels equal row positions only for a default
        # RangeIndex — confirm that is what the search backend expects.
        _item_id_to_idx = {
            str(item_id): idx for idx, item_id in zip(df.index, df["item_id"])
        }
    except Exception as e:
        # Best-effort: report and try the next candidate. Nothing has been
        # published to the module-level handles yet, so a partial failure
        # cannot leave them half-initialised.
        print(f"  \u26a0 ChromaDB at {chroma_path} failed: {e}")
        continue

    # Publish all three handles together, only after every step succeeded.
    TEXT_COLLECTION = _text_collection
    IMAGE_COLLECTION = _image_collection
    ITEM_ID_TO_IDX = _item_id_to_idx
    print(f"\u2713 Connected to ChromaDB at {chroma_path}")
    print(f"  Text collection:  {_text_count:,} docs")
    print(f"  Image collection: {_image_count:,} docs")
    break

if TEXT_COLLECTION is None:
    print("\u2139 ChromaDB not found — using in-memory NumPy backend (from .npy files)")

# ============================================================
# Convenience wrapper — binds loaded data/models so callers
# only need to pass the query string
# ============================================================
def search(
    query: str,
    top_k: int = 5,
    price_max: float = None,
    category: str = None,
) -> pd.DataFrame:
    """Run hybrid_search() for *query* with this notebook's data and models bound.

    The product table, both embedding indexes, both query encoders and the
    ChromaDB handles (None when no database was connected) are passed through
    unchanged. Which backend is actually used is decided by hybrid_search().
    """
    bound_resources = dict(
        df=df,
        text_index=TEXT_INDEX,
        image_index=IMAGE_INDEX,
        encode_text_fn=encode_text,
        encode_clip_fn=encode_clip,
        text_collection=TEXT_COLLECTION,
        image_collection=IMAGE_COLLECTION,
        item_id_to_idx=ITEM_ID_TO_IDX,
    )
    return hybrid_search(
        query=query,
        top_k=top_k,
        price_max=price_max,
        category=category,
        **bound_resources,
    )

# --- Smoke test ---
# Run one query end-to-end and print a compact line per hit.
test_results = search("red shoes for women", top_k=3)
print(f"\n\u2713 search() working | Test query returned {len(test_results)} results")
for _, r in test_results.iterrows():
    hit_rank = int(r["_rank"])
    hit_title = r["item_name_flat"][:70]
    hit_type = r["product_type_flat"]
    hit_price = r.get("price", 0)
    print(f"  [{hit_rank}] {hit_title:70s}  type={hit_type}  ${hit_price:.2f}")

---

## Step 4 – LLM Setup & API Configuration

Per `constitution.md`: *"LLM: Comparative Study — Proprietary: GPT-4o. Open Source: Llama 3 (via Ollama or Groq)."*

| Provider | Model | Where it runs | Setup |
|----------|-------|---------------|-------|
| **Ollama** (default local) | `llama3.2` / `llama3.1` | Your Mac/PC | `brew install ollama && ollama pull llama3.2` |
| **OpenAI** (proprietary benchmark) | `gpt-4o-mini` | OpenAI cloud | API key in `.env` |
| **Groq** (Kaggle fallback) | `llama-3.3-70b-versatile` | Groq cloud | API key in Kaggle Secrets |

**Priority order:** Ollama (free, local) → OpenAI → Groq.  
On **Kaggle**, Ollama is skipped (no local server), so the default becomes OpenAI when `OPENAI_API_KEY` is set and Groq otherwise.

In [None]:
# ============================================================
# Step 4: LLM Client Setup (LangChain orchestration)
# ============================================================
# Priority: Ollama (free, local) → OpenAI → Groq
# On Kaggle: Ollama unavailable, falls back to Groq/OpenAI

from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser

# --- API Key Setup ---
# Both keys are optional: a provider whose key is empty is skipped later on.
if ON_KAGGLE:
    from kaggle_secrets import UserSecretsClient
    secrets = UserSecretsClient()

    def _kaggle_secret_or_empty(name):
        """Return the Kaggle secret *name*, or "" when it cannot be read.

        Looking up a secret that is not attached to the notebook raises, and
        only one of the two keys is required, so a failed lookup must not
        abort the cell. The provider sections below report a missing key.
        """
        try:
            return secrets.get_secret(name) or ""
        except Exception:
            return ""

    OPENAI_API_KEY = _kaggle_secret_or_empty("OPENAI_API_KEY")
    GROQ_API_KEY = _kaggle_secret_or_empty("GROQ_API_KEY")
else:
    from dotenv import load_dotenv
    # Load variables from a local .env file (if any) into the environment.
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

# --- Initialize LLM registry ---
# Registry of usable chat models, keyed by display name.
llm_registry = {}

# ---- 1. Ollama (local, free — default for local dev) ----
# Requires: brew install ollama && ollama pull llama3.2
# Ollama runs a local server at http://localhost:11434
if not ON_KAGGLE:
    try:
        from langchain_ollama import ChatOllama
        import httpx
    except ImportError:
        print("\u2139 langchain-ollama not installed — run: pip install langchain-ollama")
    else:
        # Ask the local server which models are installed.
        _ollama_ok = False
        _ollama_reachable = False
        available_models = []
        try:
            resp = httpx.get("http://localhost:11434/api/tags", timeout=3.0)
            if resp.status_code == 200:
                _ollama_reachable = True
                available_models = [m["name"] for m in resp.json().get("models", [])]
                _ollama_ok = len(available_models) > 0
        except (httpx.HTTPError, ValueError, KeyError, TypeError, AttributeError):
            # Any transport failure or unexpected payload only means "skip
            # Ollama"; it must not abort setup of the other providers.
            available_models = []
            _ollama_ok = False

        if _ollama_ok:
            print(f"\u2713 Ollama server running — models: {available_models[:5]}")

            # Pick the first installed model matching the preference list.
            OLLAMA_MODEL = None
            for candidate in ["llama3.2", "llama3.1", "llama3", "llama2", "mistral", "phi3"]:
                matching = [m for m in available_models if candidate in m]
                if matching:
                    OLLAMA_MODEL = matching[0]
                    break

            if OLLAMA_MODEL is None:
                OLLAMA_MODEL = available_models[0]  # Use whatever is available

            if OLLAMA_MODEL:
                llm_ollama = ChatOllama(
                    model=OLLAMA_MODEL,
                    temperature=0.3,
                    num_predict=512,
                )
                llm_registry[f"ollama/{OLLAMA_MODEL}"] = llm_ollama
                print(f"\u2713 Ollama/{OLLAMA_MODEL} initialized (local, free)")
        elif _ollama_reachable:
            # The server answered but lists no usable models; say so rather
            # than claiming it is not running.
            print("\u2139 Ollama server running but no models available — skipping (run: ollama pull llama3.2)")
        else:
            print("\u2139 Ollama server not running — skipping (start with: ollama serve)")
else:
    print("\u2139 Ollama not available on Kaggle (no local server)")

# ---- 2. OpenAI GPT-4o-mini (proprietary benchmark) ----
# A provider is registered only when its key is non-empty and its LangChain
# integration package can be imported.
if not OPENAI_API_KEY:
    print("\u2139 OPENAI_API_KEY not set — GPT-4o-mini skipped")
else:
    try:
        from langchain_openai import ChatOpenAI
        _openai_settings = dict(
            model="gpt-4o-mini",
            api_key=OPENAI_API_KEY,
            temperature=0.3,
            max_tokens=512,
            request_timeout=30,
        )
        llm_gpt4o_mini = ChatOpenAI(**_openai_settings)
        llm_registry["gpt-4o-mini"] = llm_gpt4o_mini
        print("\u2713 GPT-4o-mini initialized (OpenAI)")
    except ImportError:
        print("\u26a0 langchain-openai not installed — run: pip install langchain-openai")

# ---- 3. Groq Llama-3.3 (cloud, free tier — Kaggle fallback) ----
if not GROQ_API_KEY:
    print("\u2139 GROQ_API_KEY not set — Groq skipped")
else:
    try:
        from langchain_groq import ChatGroq
        _groq_settings = dict(
            model="llama-3.3-70b-versatile",
            api_key=GROQ_API_KEY,
            temperature=0.3,
            max_tokens=512,
        )
        llm_llama_groq = ChatGroq(**_groq_settings)
        llm_registry["groq/llama-3.3-70b"] = llm_llama_groq
        print("\u2713 Llama-3.3-70B initialized (Groq cloud)")
    except ImportError:
        print("\u26a0 langchain-groq not installed — run: pip install langchain-groq")

# --- Select default LLM ---
# Priority: Ollama (free) → OpenAI (quality) → Groq (Kaggle fallback)
# With nothing registered there is no way to continue: stop with
# platform-specific setup instructions.
if not llm_registry:
    if ON_KAGGLE:
        _setup_help = (
            "  KAGGLE SETUP REQUIRED:\n"
            "    1. Click 'Add-ons' → 'Secrets' in the right sidebar\n"
            "    2. Add secret: GROQ_API_KEY  (free at console.groq.com)\n"
            "       OR:        OPENAI_API_KEY (paid at platform.openai.com)\n"
            "    3. Toggle the secret ON for this notebook\n"
            "    4. Re-run this cell"
        )
    else:
        _setup_help = (
            "  LOCAL:  Install Ollama → brew install ollama && ollama pull llama3.2 && ollama serve\n"
            "  CLOUD:  Set OPENAI_API_KEY or GROQ_API_KEY in .env"
        )
    raise RuntimeError("No LLM available!\n" + _setup_help)

# Pick default: prefer Ollama for local, OpenAI for quality, Groq as fallback
_priority = ["ollama/", "gpt-4o-mini", "groq/"]
# First registry key matching the highest-priority prefix; if no prefix
# matches at all, the first registered model is used.
DEFAULT_LLM_NAME = next(
    (
        name
        for prefix in _priority
        for name in llm_registry
        if name.startswith(prefix)
    ),
    next(iter(llm_registry)),
)

DEFAULT_LLM = llm_registry[DEFAULT_LLM_NAME]
print(f"\n\u2705 Default LLM: {DEFAULT_LLM_NAME}")
print(f"   Available:   {list(llm_registry.keys())}")

---

## Step 5 – Prompt Engineering

Designing the system prompt, context formatting, and few-shot templates  
for the shopping assistant persona.

In [None]:
# ============================================================
# Step 5: Prompt Engineering
# ============================================================

# System message for the assistant: persona, grounding rules (recommend only
# products present in the supplied context), tone, length limit and response
# layout. This text is prompt content, so any edit changes model behaviour.
SYSTEM_PROMPT = """You are ShopTalk, a friendly and knowledgeable AI shopping assistant.
Your job is to help customers find products from our catalog and answer questions about them.

RULES:
1. ONLY recommend products from the provided context — never invent products or details.
2. Be concise and helpful: aim for 2-4 sentences per recommendation.
3. Highlight relevant features that match the customer's query (color, material, brand, size, etc.).
4. When showing multiple products, briefly explain WHY each is relevant to the query.
5. If no products match the query well, honestly say so and suggest trying different keywords.
6. Use product IDs when referencing products so the UI can display product cards.
7. Use a warm, conversational tone — like a helpful store associate.
8. For follow-up questions, use conversation history for context.
9. When prices are available, mention them to help the customer.
10. Keep responses under 150 words unless the customer asks for details.

FORMAT:
- Lead with a brief, friendly answer to the question.
- Then mention the top 2-3 most relevant products with key details.
- End with a helpful suggestion or question if appropriate."""


def format_product_context(results: pd.DataFrame, max_products: int = 5) -> str:
    """Format retrieved products into a structured context string for the LLM.

    Args:
        results: Retrieved products, one row per product. Columns are read by
            name; a column that is missing or null for a row is skipped.
        max_products: Maximum number of leading rows to include.

    Returns:
        One text block per product, separated by blank lines. Each block has
        a "Product N: <name>" line and an "ID" line, followed by whichever of
        Brand, Category, Color, Price, Features (first 400 characters) and
        Appearance are available. A fixed "no products" sentence is returned
        when *results* is empty.
    """
    if results.empty:
        return "No products found matching this query."

    def _present(value, non_blank=False):
        """True when *value* is non-null and, if requested, not whitespace-only."""
        if not pd.notna(value):
            return False
        return bool(str(value).strip()) if non_blank else True

    blocks = []
    top_rows = results.head(max_products)
    for position, (_, row) in enumerate(top_rows.iterrows(), start=1):
        # A null or blank name/ID would otherwise reach the LLM as the literal
        # text "nan"/"None"; use the intended placeholders instead.
        name = row.get("item_name_flat")
        if not _present(name, non_blank=True):
            name = "Unknown Product"
        item_id = row.get("item_id")
        if not _present(item_id, non_blank=True):
            item_id = "N/A"

        parts = [f"Product {position}: {name}", f"  ID: {item_id}"]

        brand = row.get("brand_flat")
        if _present(brand, non_blank=True):
            parts.append(f"  Brand: {brand}")
        product_type = row.get("product_type_flat")
        if _present(product_type):
            parts.append(f"  Category: {product_type}")
        color = row.get("color_flat")
        if _present(color, non_blank=True):
            parts.append(f"  Color: {color}")
        price = row.get("price")
        if _present(price):
            parts.append(f"  Price: ${price:.2f}")
        bullets = row.get("bullet_point_flat")
        if _present(bullets):
            # Cap the feature text so one product cannot dominate the prompt.
            parts.append(f"  Features: {str(bullets)[:400]}")
        caption = row.get("image_caption")
        if _present(caption, non_blank=True):
            parts.append(f"  Appearance: {caption}")

        blocks.append("\n".join(parts))

    return "\n\n".join(blocks)


# Per-turn user message: the formatted product context followed by the
# customer's query. Placeholders: {product_context} and {query}.
USER_TEMPLATE = """Here are the top matching products from our catalog:

{product_context}

---
Customer query: {query}

Provide a helpful, concise recommendation based on the products above."""


# --- Quick test of context formatting ---
test_ctx = format_product_context(test_results, max_products=3)
_rule = "=" * 60
print(
    "Sample product context (3 products):",
    _rule,
    test_ctx,
    _rule,
    sep="\n",
)
print(f"\nContext length: {len(test_ctx)} chars")

---

## Step 6 – End-to-End RAG Pipeline

The complete pipeline: `query → hybrid_search → context assembly → LLM generation → response`  
With support for filters, session context (follow-ups), and error handling.

In [None]:
# ============================================================
# Step 6: End-to-End RAG Pipeline
# ============================================================

def rag_query(
    query: str,
    llm = None,
    llm_name: str = None,
    top_k: int = 5,
    price_max: float = None,
    category: str = None,
    session_history: List[Dict] = None,
) -> Dict[str, Any]:
    """Run the end-to-end RAG pipeline: query -> search -> context -> LLM -> response.

    Args:
        query: Customer query text. Used for retrieval and inserted into the
            final user message.
        llm: Chat model exposing ``invoke(messages)``. Defaults to DEFAULT_LLM.
        llm_name: Label stored in the result. Defaults to DEFAULT_LLM_NAME.
        top_k: Number of products to retrieve and to put into the context.
        price_max: Optional max price filter, forwarded to ``search``.
        category: Optional category filter, forwarded to ``search``.
        session_history: List of ``{"role", "content"}`` dicts for multi-turn
            context. Only the last 6 entries are replayed, and only entries
            whose role is "user" or "assistant".

    Returns:
        Dict with the same keys on every path:
        ``query``, ``llm_name``, ``response_text``, ``product_ids``,
        ``products``, ``status``, ``retrieval_time``, ``generation_time``,
        ``total_time``.

        ``status`` is one of:
          * ``"ok"`` - retrieval and generation both succeeded;
            ``product_ids`` / ``products`` hold the retrieved items.
          * ``"no_results"`` - search returned an empty frame; no LLM call.
          * ``"pipeline_error"`` - search or generation raised; the error is
            printed and ``response_text`` holds a user-facing apology.

        ``products`` is ``None`` and ``product_ids`` is ``[]`` unless the
        status is ``"ok"``. ``total_time`` is always
        ``retrieval_time + generation_time``.
    """
    if llm is None:
        llm = DEFAULT_LLM
    if llm_name is None:
        llm_name = DEFAULT_LLM_NAME

    # Every key is pre-populated so callers can index the result without
    # checking which exit path produced it.
    result = {
        "query": query,
        "llm_name": llm_name,
        "response_text": "",
        "product_ids": [],
        "products": None,
        "status": "ok",
        "retrieval_time": 0.0,
        "generation_time": 0.0,
        "total_time": 0.0,
    }

    # --- Stage 1: Retrieve ---
    # `search` is defined earlier in the notebook; the original note says it
    # uses ChromaDB if available, else the in-memory index - confirm there.
    t0 = time.time()
    try:
        search_results = search(
            query, top_k=top_k, price_max=price_max, category=category
        )
    except Exception as e:
        # Pipeline boundary: any retrieval failure becomes a structured error
        # result, but the cause is still surfaced for debugging.
        result["status"] = "pipeline_error"
        result["response_text"] = "Something went wrong with the search. Please try again."
        result["retrieval_time"] = time.time() - t0
        result["total_time"] = result["retrieval_time"]
        print(f"\u26a0 Search Error: {e}")
        return result

    result["retrieval_time"] = time.time() - t0

    if search_results.empty:
        result["status"] = "no_results"
        result["response_text"] = (
            "No products match that query. Try different keywords or "
            "broaden your filters."
        )
        # No generation happened, so the total is the retrieval time alone.
        result["total_time"] = result["retrieval_time"]
        return result

    # --- Stage 2: Build Context ---
    product_context = format_product_context(search_results, max_products=top_k)
    user_message = USER_TEMPLATE.format(
        product_context=product_context, query=query
    )

    # --- Stage 3: Build Messages ---
    messages = [SystemMessage(content=SYSTEM_PROMPT)]

    # Replay recent turns so the model can resolve follow-up questions.
    if session_history:
        for msg in session_history[-6:]:  # Last 3 turns (6 messages)
            if msg["role"] == "user":
                messages.append(HumanMessage(content=msg["content"]))
            elif msg["role"] == "assistant":
                messages.append(AIMessage(content=msg["content"]))

    messages.append(HumanMessage(content=user_message))

    # --- Stage 4: Generate ---
    t1 = time.time()
    try:
        response = llm.invoke(messages)
        result["response_text"] = response.content
    except Exception as e:
        result["status"] = "pipeline_error"
        result["response_text"] = "Something went wrong. Please try again."
        result["generation_time"] = time.time() - t1
        result["total_time"] = result["retrieval_time"] + result["generation_time"]
        print(f"\u26a0 LLM Error: {e}")
        return result

    result["generation_time"] = time.time() - t1
    result["total_time"] = result["retrieval_time"] + result["generation_time"]
    result["product_ids"] = search_results["item_id"].tolist()[:top_k]
    result["products"] = search_results.head(top_k)

    return result


print("\u2713 rag_query() defined — complete RAG pipeline ready")

In [None]:
# ============================================================
# Step 6b: Test the RAG Pipeline with Sample Queries
# ============================================================
# Runs a handful of demo queries through rag_query() with the default LLM,
# prints each response with its timings, then summarises latency.

TEST_QUERIES = [
    "red shoes for women",
    "3D printer filament PLA",
    "phone case with colorful design",
    "comfortable cotton t-shirt",
    "kitchen drawer slides hardware",
]

print("=" * 80)
print("RAG PIPELINE — DEMO QUERIES")
print("=" * 80)

demo_results = []
_LINE = "─" * 80

for q in TEST_QUERIES:
    print(f"\n{_LINE}")
    print(f"\u2753 Query: \"{q}\"")
    print(_LINE)

    result = rag_query(q, top_k=3)
    demo_results.append(result)

    # Characters outside the BMP need the 8-digit \U escape in Python source.
    # A UTF-16 surrogate pair written as two \u escapes yields two lone
    # surrogates, which cannot be encoded to UTF-8 when printed.
    print(f"\n\U0001f4ac Response ({result['llm_name']}):")
    print(f"  {result['response_text']}")
    print(f"\n\u23f1 Retrieval: {result['retrieval_time']:.3f}s | "
          f"Generation: {result['generation_time']:.3f}s | "
          f"Total: {result['total_time']:.3f}s")
    print(f"  Products: {result['product_ids'][:3]}")
    print(f"  Status: {result['status']}")

# Summary: average and p95 per stage over all demo queries (any status).
print(f"\n{'=' * 80}")
print("TIMING SUMMARY")
print(f"{'=' * 80}")
retrieval_times = [r["retrieval_time"] for r in demo_results]
generation_times = [r["generation_time"] for r in demo_results]
total_times = [r["total_time"] for r in demo_results]
print(f"  Retrieval:  avg={np.mean(retrieval_times):.3f}s  p95={np.percentile(retrieval_times, 95):.3f}s")
print(f"  Generation: avg={np.mean(generation_times):.3f}s  p95={np.percentile(generation_times, 95):.3f}s")
print(f"  Total:      avg={np.mean(total_times):.3f}s  p95={np.percentile(total_times, 95):.3f}s")
# The pass/fail gate compares the p95 of the total latency against 5 seconds.
_pass_fail = "✅ PASS" if np.percentile(total_times, 95) < 5 else "❌ FAIL"
print(f"  Target: <5s total (per requirements.md) → {_pass_fail}")

In [None]:
# ============================================================
# Step 6c: Multi-Turn Conversation Test
# ============================================================
# Three-turn demo of session context. The history records what the customer
# typed, while turns 2 and 3 pass a hand-rewritten standalone query
# ("red women's shoes") to rag_query() for retrieval.
#
# Emoji are written with 8-digit \U escapes: a surrogate pair spelled as two
# \u escapes produces lone surrogates that cannot be encoded when printed.

print("=" * 80)
print("MULTI-TURN CONVERSATION TEST")
print("=" * 80)

session_history = []

# Turn 1: Initial query
q1 = "Show me women's shoes"
print(f"\n\U0001f464 Turn 1: \"{q1}\"")
r1 = rag_query(q1, top_k=3, session_history=session_history)
print(f"\U0001f916 {r1['response_text']}")
session_history.append({"role": "user", "content": q1})
session_history.append({"role": "assistant", "content": r1['response_text']})

# Turn 2: Follow-up with context
q2 = "Do you have any in red?"
print(f"\n\U0001f464 Turn 2: \"{q2}\"")
r2 = rag_query("red women's shoes", top_k=3, session_history=session_history)
print(f"\U0001f916 {r2['response_text']}")
session_history.append({"role": "user", "content": q2})
session_history.append({"role": "assistant", "content": r2['response_text']})

# Turn 3: Price filter (last turn, so it is not appended to the history)
q3 = "Show me something under $50"
print(f"\n\U0001f464 Turn 3: \"{q3}\"")
r3 = rag_query("red women's shoes", top_k=3, price_max=50.0, session_history=session_history)
print(f"\U0001f916 {r3['response_text']}")

print("\n\u2713 Multi-turn conversation working with session context")

---

## Step 7 – Model Comparison (all available LLMs)

Per `constitution.md`: *"LLM: Comparative Study — Proprietary: GPT-4o. Open Source: Llama 3 (via Ollama or Groq)."*  
We compare **all available models** on the same queries for quality, latency, and response length.  
Locally this may be Ollama vs OpenAI; on Kaggle it may be GPT-4o-mini vs Groq Llama-3.3.

In [None]:
# ============================================================
# Step 7: Model Comparison (every model in llm_registry)
# ============================================================
# Each comparison query is sent through rag_query() once per registered
# model. One wide row is collected per query, with columns prefixed by the
# model name.

COMPARISON_QUERIES = [
    "red shoes for women under $80",
    "3D printer PLA filament",
    "phone case with colorful design",
    "comfortable cotton t-shirt for men",
    "kitchen drawer slides",
    "waterproof backpack",
    "bedding set queen size",
    "modern table lamp",
    "stainless steel watch",
    "leather ottoman for living room",
]

comparison_results = []

print("Running model comparison...\n")
for q in COMPARISON_QUERIES:
    record = {"query": q}

    for name, llm in llm_registry.items():
        try:
            outcome = rag_query(q, llm=llm, llm_name=name, top_k=3)
            record[f"{name}_response"] = outcome["response_text"]
            record[f"{name}_retrieval_s"] = outcome["retrieval_time"]
            record[f"{name}_generation_s"] = outcome["generation_time"]
            record[f"{name}_total_s"] = outcome["total_time"]
            record[f"{name}_status"] = outcome["status"]
            record[f"{name}_word_count"] = len(outcome["response_text"].split())
        except Exception as e:
            # Keep the row: record the failure text and flag the status.
            record[f"{name}_response"] = f"ERROR: {e}"
            record[f"{name}_status"] = "error"

    comparison_results.append(record)
    print(f"  \u2713 {q[:50]}")

df_comparison = pd.DataFrame(comparison_results)

# --- Per-model latency / length / success summary ---
print(f"\n{'=' * 80}")
print("MODEL COMPARISON SUMMARY")
print(f"{'=' * 80}")

for name in llm_registry:
    gen_col = f"{name}_generation_s"
    total_col = f"{name}_total_s"
    wc_col = f"{name}_word_count"

    # A model with no timing column never completed a rag_query() call.
    if gen_col not in df_comparison.columns:
        continue

    gen_series = df_comparison[gen_col].dropna()
    total_series = df_comparison[total_col].dropna()
    words_series = df_comparison[wc_col].dropna()

    print(f"\n  {name}:")
    print(f"    Generation: avg={gen_series.mean():.3f}s  p95={gen_series.quantile(0.95):.3f}s")
    print(f"    Total:      avg={total_series.mean():.3f}s  p95={total_series.quantile(0.95):.3f}s")
    print(f"    Words/resp: avg={words_series.mean():.0f}  min={words_series.min():.0f}  max={words_series.max():.0f}")
    ok_count = df_comparison[f"{name}_status"].eq("ok").sum()
    print(f"    Success:    {ok_count}/{len(df_comparison)} queries")

In [None]:
# ============================================================
# Step 7b: Side-by-Side Response Comparison (First 3 queries)
# ============================================================
# For the first three comparison rows, print each model's response under the
# query, together with that model's generation time.

model_names = list(llm_registry)

for _, record in df_comparison.head(3).iterrows():
    banner = "=" * 80
    print("\n" + banner)
    print(f"Query: \"{record['query']}\"")
    print(banner)

    for name in model_names:
        resp_col = f"{name}_response"
        gen_col = f"{name}_generation_s"
        if resp_col not in record:
            continue
        print(f"\n  [{name}] ({record.get(gen_col, 0):.2f}s):")
        # Indent every line of the response by four spaces (no re-wrapping).
        print("\n".join(f"    {line}" for line in str(record[resp_col]).split("\n")))
    print()

---

## Step 8 – Generation Quality Evaluation

Evaluate the RAG pipeline on faithfulness, relevance, and helpfulness (scored automatically by an LLM judge in the cells below).  
Per `test_strategy.md`: *"Manually score 50 query responses for Helpfulness and Naturalness."*

In [None]:
# ============================================================
# Step 8: Automated Quality Evaluation
# ============================================================
# We evaluate using LLM-as-judge (RAGAS-style). The judge prompt below asks
# for three 1-5 scores:
# - Faithfulness: Does the response only use info from context?
# - Relevance: Does the response address the query?
# - Helpfulness: Is the response helpful, natural, and well-formatted?

# Judge prompt, filled with str.format(query=..., context=..., response=...).
# The doubled braces on the last line are str.format escapes, so the judge
# sees a literal single-brace JSON skeleton.
EVAL_PROMPT = """You are an evaluation judge for a shopping assistant chatbot.
Rate the following response on three criteria (score 1-5 each):

1. FAITHFULNESS: Does the response ONLY mention products/details from the provided context?
   5=perfect, 1=invents products/details
2. RELEVANCE: Does the response address the customer's query well?
   5=perfectly relevant, 1=completely off-topic
3. HELPFULNESS: Is the response helpful, natural, and well-formatted?
   5=excellent shopping advice, 1=unhelpful/robotic

CUSTOMER QUERY: {query}

RETRIEVED PRODUCTS (context):
{context}

ASSISTANT RESPONSE:
{response}

Return ONLY a JSON object with scores, no other text:
{{"faithfulness": <1-5>, "relevance": <1-5>, "helpfulness": <1-5>, "reasoning": "<brief explanation>"}}"""


# Extended evaluation queries (50 queries per test_strategy.md).
# Each one is run through rag_query() and then scored by the judge.
EVAL_QUERIES = [
    "red shoes for women",
    "3D printer filament PLA",
    "phone case with colorful design",
    "comfortable cotton t-shirt",
    "kitchen drawer slides hardware",
    "waterproof backpack for hiking",
    "queen size bedding set",
    "modern desk lamp LED",
    "stainless steel men's watch",
    "leather ottoman",
    "blue running sneakers",
    "iPhone case clear",
    "bathroom towel set",
    "wooden dining table",
    "women's sandals summer",
    "ABS filament black",
    "kids' backpack school",
    "decorative throw pillow",
    "men's leather boots",
    "Samsung Galaxy case",
    "ergonomic office chair",
    "kitchen cabinet handles",
    "curtains for bedroom",
    "white cotton polo shirt",
    "ring gold plated",
    "outdoor patio furniture",
    "bathroom shelf hardware",
    "women's winter boots",
    "floor lamp living room",
    "travel luggage carry-on",
    "shower curtain waterproof",
    "men's running shoes Nike",
    "soft bed sheets queen",
    "wall mount shelf bracket",
    "colorful phone case floral",
    "wooden coffee table modern",
    "men's casual t-shirt",
    "silver watch women's",
    "kitchen faucet handle",
    "decorative table lamp",
    "hiking boots waterproof men",
    "silk pillowcase set",
    "phone case with kickstand",
    "velvet ottoman round",
    "drawer pull handles brass",
    "women's athletic sneakers",
    "3D printer nozzle",
    "home storage basket",
    "accent chair for bedroom",
    "men's dress watch classic",
]


def _extract_json_object(text: str) -> dict:
    """Return the first JSON object embedded in *text*.

    Tolerates markdown code fences and any prose before or after the object
    by trying to decode from each ``{`` in turn.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever follows it (closing fence, trailing commentary, ...).
            obj, _ = decoder.raw_decode(text, start)
            return obj
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
    raise ValueError("no JSON object found in judge output")


def _coerce_score(value):
    """Normalise one judge score to a number in [1, 5], or 0 if unusable."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0
    # The prompt asks for 1-5; anything else (including NaN, which fails this
    # comparison) is treated like a parse failure.
    if not 1 <= score <= 5:
        return 0
    return int(score) if score.is_integer() else score


def evaluate_response(query: str, context: str, response: str, judge_llm=None) -> dict:
    """Use LLM-as-judge to evaluate a RAG response.

    Args:
        query: The customer query that produced the response.
        context: The formatted product context the response was based on.
        response: The assistant response to be judged.
        judge_llm: Chat model exposing ``invoke(messages)``. Defaults to
            DEFAULT_LLM.

    Returns:
        The dict parsed from the judge's JSON, with ``faithfulness``,
        ``relevance`` and ``helpfulness`` normalised to numbers in [1, 5]
        and ``reasoning`` normalised to a string. A score that is missing,
        non-numeric or out of range becomes 0. If the judge call fails or no
        JSON object can be found, all three scores are 0 and ``reasoning``
        carries the error text.
    """
    if judge_llm is None:
        judge_llm = DEFAULT_LLM

    eval_message = EVAL_PROMPT.format(
        query=query, context=context, response=response
    )
    try:
        result = judge_llm.invoke([HumanMessage(content=eval_message)])
        scores = _extract_json_object(result.content.strip())
    except Exception as e:
        # Evaluation is best-effort: one bad judge reply must not abort the
        # whole run, so the failure is recorded as an all-zero row.
        return {"faithfulness": 0, "relevance": 0, "helpfulness": 0, "reasoning": f"Parse error: {e}"}

    for metric in ("faithfulness", "relevance", "helpfulness"):
        scores[metric] = _coerce_score(scores.get(metric))
    scores["reasoning"] = str(scores.get("reasoning", ""))
    return scores


# --- Generate and judge a response for every evaluation query ---
print(f"Running evaluation on {len(EVAL_QUERIES)} queries with {DEFAULT_LLM_NAME}...\n")

eval_results = []

for done, q in enumerate(EVAL_QUERIES, start=1):
    # 1) Produce the answer with the default LLM.
    rag_result = rag_query(q, top_k=3)

    # 2) Rebuild the context the judge should compare the answer against.
    retrieved = rag_result.get("products")
    if retrieved is None:
        context = "No products found."
    else:
        context = format_product_context(retrieved, max_products=3)

    # 3) Score the answer with the LLM judge.
    scores = evaluate_response(q, context, rag_result["response_text"])

    # 4) Flatten everything into one row; key order fixes the column order.
    record = {
        "query": q,
        "response_text": rag_result["response_text"],
        "product_ids": str(rag_result["product_ids"][:3]),
        "status": rag_result["status"],
    }
    record.update(
        {key: rag_result[key] for key in ("retrieval_time", "generation_time", "total_time")}
    )
    record.update(
        {metric: scores.get(metric, 0) for metric in ("faithfulness", "relevance", "helpfulness")}
    )
    record["reasoning"] = scores.get("reasoning", "")
    eval_results.append(record)

    # Progress line after every tenth query.
    if done % 10 == 0:
        print(f"  \u2713 {done}/{len(EVAL_QUERIES)} queries evaluated")

df_eval = pd.DataFrame(eval_results)
print(f"\n\u2713 Evaluation complete: {len(df_eval)} queries")

In [None]:
# ============================================================
# Step 8b: Evaluation Summary & Metrics
# ============================================================
# Prints judge-score statistics, per-stage latency, the status breakdown and
# a check of the p95 latencies against the 1s / 5s targets.

print("=" * 70, "EVALUATION RESULTS SUMMARY", "=" * 70, sep="\n")

# Quality scores; rows with a 0 score are judge/parse failures and are skipped.
for metric in ("faithfulness", "relevance", "helpfulness"):
    vals = df_eval.loc[df_eval[metric] > 0, metric]
    if vals.empty:
        continue
    print(f"\n  {metric.upper():15s}  mean={vals.mean():.2f}/5  "
          f"median={vals.median():.1f}  min={vals.min():.0f}  max={vals.max():.0f}  "
          f"(n={len(vals)})")

# Latency per pipeline stage
print("\n  LATENCY:")
for label, column in (
    ("Retrieval", "retrieval_time"),
    ("Generation", "generation_time"),
    ("Total", "total_time"),
):
    timings = df_eval[column]
    print(f"    {label:<10s} — avg: {timings.mean():.3f}s  "
          f"p95: {timings.quantile(0.95):.3f}s  "
          f"p99: {timings.quantile(0.99):.3f}s")

# Status distribution
print("\n  STATUS:")
status_counts = df_eval['status'].value_counts()
for status, count in status_counts.items():
    print(f"    {status}: {count}/{len(df_eval)}")

# Performance vs requirements
p95_total = df_eval['total_time'].quantile(0.95)
p99_total = df_eval['total_time'].quantile(0.99)
p95_retrieval = df_eval['retrieval_time'].quantile(0.95)
print("\n  REQUIREMENTS CHECK:")
_ret_icon = "✅" if p95_retrieval < 1 else "❌"
_tot_icon = "✅" if p95_total < 5 else "❌"
print(f"    RAG retrieval < 1s:   {_ret_icon}  "
      f"(p95={p95_retrieval:.3f}s)")
print(f"    Total < 5s:           {_tot_icon}  (p95={p95_total:.3f}s)")
print(f"    P99 total:            {p99_total:.3f}s")

---

## Step 9 – Export Artifacts

In [None]:
# ============================================================
# Step 9: Export Artifacts
# ============================================================
# Writes the product table (pickle + CSV), the evaluation and comparison
# results (CSV) and a JSON config with prompts and summary metrics into
# EXPORT_DIR.


def _mean_valid_score(metric):
    """Mean judge score for *metric*, ignoring rows scored 0.

    A 0 marks a failed or unparseable judge reply, so it is excluded here the
    same way the Step 8b summary excludes it. Returns None when no row has a
    valid score, which keeps NaN out of the JSON file.
    """
    valid = df_eval.loc[df_eval[metric] > 0, metric]
    if valid.empty:
        return None
    return round(float(valid.mean()), 2)


# 1. Products with prices
products_path = EXPORT_DIR / "products_with_prices.pkl"
df.to_pickle(products_path)
print(f"\u2713 {products_path.name:35s}  {products_path.stat().st_size / 1e6:.1f} MB")

# Also save as CSV for portability (only the columns that actually exist)
csv_path = EXPORT_DIR / "products_with_prices.csv"
export_cols = [
    "item_id", "item_name_flat", "brand_flat", "product_type_flat",
    "color_flat", "price", "bullet_point_flat", "item_keywords_flat",
    "image_caption", "enriched_text", "main_image_id", "path", "country",
]
available_cols = [c for c in export_cols if c in df.columns]
df[available_cols].to_csv(csv_path, index=False)
print(f"\u2713 {csv_path.name:35s}  {csv_path.stat().st_size / 1e6:.1f} MB")

# 2. Evaluation results
eval_path = EXPORT_DIR / "llm_evaluation.csv"
df_eval.to_csv(eval_path, index=False)
print(f"\u2713 {eval_path.name:35s}  {eval_path.stat().st_size / 1e3:.1f} KB")

# 3. Model comparison results
comp_path = EXPORT_DIR / "llm_comparison.csv"
df_comparison.to_csv(comp_path, index=False)
print(f"\u2713 {comp_path.name:35s}  {comp_path.stat().st_size / 1e3:.1f} KB")

# 4. LLM config
llm_config = {
    "notebook": "04-llm-integration",
    "timestamp": datetime.now().isoformat(),
    "default_llm": DEFAULT_LLM_NAME,
    "available_llms": list(llm_registry.keys()),
    "system_prompt": SYSTEM_PROMPT,
    "user_template": USER_TEMPLATE,
    "rag_config": RAG_CONFIG,
    "evaluation_summary": {
        "n_queries": len(df_eval),
        # Means exclude failed judge rows so the exported numbers match the
        # ones printed in the evaluation summary.
        "faithfulness_mean": _mean_valid_score("faithfulness"),
        "relevance_mean": _mean_valid_score("relevance"),
        "helpfulness_mean": _mean_valid_score("helpfulness"),
        "retrieval_p95_s": round(float(df_eval["retrieval_time"].quantile(0.95)), 3),
        "generation_p95_s": round(float(df_eval["generation_time"].quantile(0.95)), 3),
        "total_p95_s": round(float(df_eval["total_time"].quantile(0.95)), 3),
    },
    "price_ranges": CATEGORY_PRICE_RANGES,
    "device": str(DEVICE),
    "gpu_name": GPU_NAME,
}

config_path = EXPORT_DIR / "llm_config.json"
with open(config_path, "w") as f:
    # default=str stringifies any value json cannot serialise natively.
    json.dump(llm_config, f, indent=2, default=str)
print(f"\u2713 {config_path.name:35s}  {config_path.stat().st_size / 1e3:.1f} KB")

print(f"\n\u2713 All artifacts exported to {EXPORT_DIR}/")

In [None]:
# ============================================================
# Final Summary
# ============================================================
# Recaps the pipeline set-up, judge scores, latency percentiles and the
# files present in EXPORT_DIR.

print("=" * 60, "SHOPTALK LLM INTEGRATION \u2014 SUMMARY", "=" * 60, sep="\n")

print("\n--- Pipeline ---")
print(f"  Products:        {len(df):,}")
print("  Search:          Hybrid (text + image) with production reranking")
print(f"  Default LLM:     {DEFAULT_LLM_NAME}")
print(f"  Available LLMs:  {list(llm_registry.keys())}")
print(f"  Device:          {DEVICE} ({GPU_NAME})")

print("\n--- Quality (LLM-as-Judge, 50 queries) ---")
for m in ("faithfulness", "relevance", "helpfulness"):
    # Rows scored 0 are judge failures and do not count towards the mean.
    v = df_eval.loc[df_eval[m] > 0, m]
    if len(v):
        print(f"  {m:15s}  {v.mean():.2f}/5")

print("\n--- Latency ---")
retrieval_latency = df_eval['retrieval_time']
total_latency = df_eval['total_time']
print(f"  Retrieval p95:   {retrieval_latency.quantile(0.95):.3f}s  (target < 1s)")
print(f"  Total p95:       {total_latency.quantile(0.95):.3f}s  (target < 5s)")
print(f"  Total p99:       {total_latency.quantile(0.99):.3f}s")

print("\n--- Exports ---")
for p in sorted(EXPORT_DIR.glob("*")):
    # List only artifact files with a known data suffix.
    if not p.is_file():
        continue
    if p.suffix in (".pkl", ".csv", ".json", ".npy"):
        print(f"  {p.name:35s}  {p.stat().st_size / 1e6:>6.1f} MB")

print("\n" + "=" * 60)
print("LLM integration complete. Next: 05-fine-tuning.ipynb")
print("=" * 60)