# Modelling

## Data Loading & Libraries

In [1]:
import json, pandas as pd, numpy as np, random
from typing import Dict, List, Any
from pathlib import Path

from scipy import sparse
from joblib import dump, load
from sklearn.metrics.pairwise import cosine_similarity

# KNN + SVD
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from explanations import attach_llm_explanations

ART = Path('../../data/processed')

W_NOTE = {"top": 0.35, "mid": 0.40, "base": 0.25}

W_BLOCK = {"accord": 0.80, "meta": 0.20}

ACCORD_POS_WEIGHTS = np.array([1.0, 0.8, 0.6, 0.4, 0.2], dtype=np.float32)
DEFAULT_ACCORD_WEIGHT = 0.6

SEASON_TO_ACCORD_HINTS = {
    "summer": {
        "boost": {
            "citrus", "aquatic", "ozonic", "green", "aromatic", "fresh", "fresh spicy",
            "fruity", "marine", "soapy", "tropical", "salty", "coconut", "musky"
        },
        "penalize": {
            "amber", "sweet", "gourmand", "smoky", "leather", "tobacco", "vanilla", "balsamic",
            "oud", "chocolate", "honey", "coffee", "oriental"
        },
    },
    "spring": {
        "boost": {
            "floral", "white floral", "green", "citrus", "aromatic", "fresh", "fresh spicy",
            "violet", "rose", "herbal", "aldehydic", "powdery", "lavender"
        },
        "penalize": {
            "animalic", "leather", "oud", "smoky", "tobacco", "gourmand"
        },
    },
    "fall": {
        "boost": {
            "woody", "warm spicy", "amber", "tobacco", "leather", "balsamic",
            "patchouli", "vanilla", "cinnamon", "honey", "earthy", "mossy",
            "oriental", "coffee", "chocolate"
        },
        "penalize": {
            "aquatic", "ozonic", "marine", "soapy", "fresh"
        },
    },
    "winter": {
        "boost": {
            "amber", "vanilla", "sweet", "smoky", "leather", "balsamic", "oud", "tobacco",
            "whiskey", "rum", "wine", "chocolate", "cacao", "coffee",
            "spicy", "warm spicy", "oriental", "honey", "almond", "nutty", "powdery", "musky"
        },
        "penalize": {
            "aquatic", "green", "ozonic", "citrus", "fresh", "marine", "tropical", "coconut"
        },
    },
}

USE_CASE_HINTS = {
    "office": {
        "boost": {
            "citrus", "aromatic", "green", "woody", "fresh spicy", "musky", "powdery", "soapy", "ozonic"
        },
        "penalize": {
            "animalic", "oud", "smoky", "leather", "gourmand", "sweet", "tobacco",
            "alcohol", "rum", "whiskey", "wine", "vodka", "champagne",
            "coffee", "chocolate", "honey", "cannabis"
        },
    },
    "date": {
        "boost": {
            "vanilla", "amber", "sweet", "fruity", "warm spicy", "soft spicy",
            "white floral", "rose", "musky", "powdery",
            "chocolate", "honey", "coconut", "tropical", "almond", "caramel", "lactonic", "creamy"
        },
        "penalize": {
            "aquatic", "ozonic", "green", "aldehydic", "soapy", "metallic", "marine", "fresh"
        },
    },
    "gym": {
        "boost": {
            "citrus", "green", "aquatic", "ozonic", "aromatic", "fresh", "fresh spicy",
            "soapy", "musky", "marine", "herbal"
        },
        "penalize": {
            "sweet", "gourmand", "amber", "vanilla", "oud", "smoky", "leather", "tobacco",
            "honey", "chocolate", "coffee", "coconut", "oriental", "balsamic"
        },
    },
    "casual": {
        "boost": {
            "citrus", "fruity", "aromatic", "green", "aquatic", "fresh", "fresh spicy",
            "musky", "woody", "soapy", "ozonic"
        },
        "penalize": {
            "animalic", "oud", "leather", "tobacco", "smoky", "gourmand", "sweet", "coffee", "cannabis"
        },
    },
    "formal": {
        "boost": {
            "woody", "iris", "aldehydic", "amber", "leather", "powdery", "rose",
            "spicy", "soft spicy", "warm spicy", "patchouli", "musky", "balsamic", "violet", "oriental"
        },
        "penalize": {
            "gourmand", "sweet", "aquatic", "ozonic", "fruity", "tropical", "coconut", "cherry"
        },
    },
    "signature": {
        "boost": {
            "citrus", "aromatic", "woody", "green", "fresh spicy", "musky", "powdery", "soapy",
            "floral", "white floral", "ozonic"
        },
        "penalize": {
            "animalic", "oud", "smoky", "tobacco", "gourmand", "sweet", "leather",
            "coffee", "cannabis", "oriental"
        },
    },
}

INTENSITY_WEIGHT = {"soft": -0.10, "moderate": 0.0, "loud": +0.10}

CFG = {
    #Seed
    'seed': 42,
    # embeddings
    "embed_dim": 256,          # try 128/256/384 if needed

    # retrieval
    "knn_neighbors": 1000,      # recall pool size before MMR
    "mmr_lambda": 0.40,         # 0.6 relevance / 0.4 diversity
    "topk": 20,                # final list length

    # AE training
    "ae_epochs": 10,
    "ae_batch": 256,
    "ae_lr": 1e-3,
    "ae_p_mask": 0.15,         # denoising: randomly drop inputs

    # evaluation
    "eval_k": 20               # evaluate top-k lists
}

In [2]:
# Set seed for reproducibility
random.seed(CFG['seed']); np.random.seed(CFG['seed']); torch.manual_seed(CFG['seed'])

# load data
X = sparse.load_npz(ART/'X_sparse.npz').tocsr()
personas = pd.read_parquet(ART/'personas_v3.parquet')
feature_meta = json.loads((ART/'feature_meta.json').read_text())
items = pd.read_parquet(ART/'items.parquet')
bridge  = pd.read_parquet(ART/'fragrance_note_bridge.parquet')

feat_names = feature_meta['feature_names']
feat_pos = {c:i for i,c in enumerate(feat_names)}

In [3]:
personas['liked_accords_ranked'][0]

array([{'name': 'woody', 'rank': 1}, {'name': 'warm spicy', 'rank': 2},
       {'name': 'amber', 'rank': 3}, {'name': 'leather', 'rank': 4},
       {'name': 'smoky', 'rank': 5}], dtype=object)

## Utility Functions (L2 , MMR)

Since the Sparse Matrix and the queries are l2 normalized, the cosine similarity turns into a dot product.

In [4]:
# this function l2 normalizes a row in a csr matrix
def l2_normalize_row(q: sparse.csr_matrix) -> sparse.csr_matrix:
    n = np.sqrt(q.multiply(q).sum())
    return q if n == 0 else q.multiply(1.0/float(n))

def mmr_from_relevance(rel_scores: np.ndarray,
                       cand_vecs: np.ndarray,
                       cand_ids: np.ndarray,
                       lambda_relevance: float = CFG["mmr_lambda"],
                       top_k: int = CFG["topk"]) -> list:
    """
    MMR using precomputed relevance scores for each candidate (rel_scores ~ length m),
    and candidate embeddings for diversity (cand_vecs ~ (m×d), L2-normalized rows).
    """
    rel = np.asarray(rel_scores).ravel()
    m = len(cand_ids)
    assert rel.shape[0] == m and cand_vecs.shape[0] == m, "rel_scores and cand_vecs must align"

    selected, rest = [], list(range(m))
    for _ in range(min(top_k, m)):
        if not selected:
            j = int(np.argmax(rel[rest]))
            selected.append(rest.pop(j))
            continue

        # max similarity to the already selected set (diversity term)
        S = cand_vecs[selected] @ cand_vecs[rest].T          # (|S| × |rest|)
        max_sim = S.max(axis=0)                              # (|rest|,)

        # MMR score = λ·relevance − (1−λ)·redundancy
        score = lambda_relevance * rel[rest] - (1.0 - lambda_relevance) * max_sim
        j = int(np.argmax(score))
        selected.append(rest.pop(j))

    return [cand_ids[i] for i in selected]

## Pipeline A: SVD -> KNN -> MMR

In [5]:
svd = TruncatedSVD(n_components=CFG['embed_dim'],random_state=42)
svd_pipe = make_pipeline(svd, Normalizer(copy=False))
Z_svd = svd_pipe.fit_transform(X)

knn_svd = NearestNeighbors(n_neighbors=CFG['knn_neighbors'], metric='cosine').fit(Z_svd)

np.save(ART/'svd_embeddings.npy', Z_svd.astype('float32'))
dump(svd_pipe, ART/'svd_pipe.joblib')
dump(knn_svd, ART/'svd_knn.joblib')

['../../data/processed/svd_knn.joblib']

We normalize after SVD to keep the direction our focus instead of magnitudes as component magnitudes will skew similarities in kNN search. Normalizing insures that you get more diverse neighbors, basically unclustering by size of embeddings, so just direction matters.

## Pipeline B: Masked Autoencoder -> KNN -> MMR

In [6]:
class CSRDataset(Dataset):
    def __init__(self, X : sparse.csr_matrix, p_mask=0.15, to_tensor: bool = True):
        self.X, self.p_mask, self.D, self.to_tensor = X, p_mask, X.shape[1], to_tensor
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, i):
        row = self.X.getrow(i).toarray().astype(np.float32).ravel()
        if self.p_mask>0:
            m = (np.random.rand(self.D) < self.p_mask)
            x = row.copy()
            x[m] = 0.0
        else:
            x = row

        if self.to_tensor:
            x = torch.from_numpy(x)
            row = torch.from_numpy(row)
        return x, row

class Autoencoder(nn.Module):
    def __init__(self, D, d=256):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(D, 1024), nn.ReLU(), nn.Linear(1024, d))
        self.dec = nn.Sequential(nn.Linear(d,1024), nn.ReLU(), nn.Linear(1024, D))
    def forward(self,x):
        z = self.enc(x)
        return z, self.dec(z)

We will use a Masked Autoencoder (MAE) to make the model more robust at handling incomplete data, as well as generalize better and avoid overfitting.

In [7]:
device = "mps" if torch.mps.is_available() else "cpu"
ds = CSRDataset(X, p_mask=CFG['ae_p_mask'], to_tensor=True)
dl = DataLoader(ds, batch_size=CFG['ae_batch'],shuffle= True, num_workers=0)

model = Autoencoder(D=X.shape[1],d=CFG['embed_dim']).to(device)
opt = torch.optim.Adam(model.parameters(),lr = CFG['ae_lr'])
loss_fn = nn.MSELoss()

model.train()
for epoch in range(1, CFG['ae_epochs']+1):
    tot = 0.0
    for xb, yb in dl:
        xb, yb  = xb.to(device, non_blocking=True).float(), yb.to(device,non_blocking=True).float()
        z, xhat = model(xb)
        loss = loss_fn(xhat, yb)
        opt.zero_grad(); loss.backward(); opt.step()
        tot += loss.item()*xb.size(0)
    print(f"epoch {epoch:02d}: {tot/len(dl.dataset):.6f}")

epoch 01: 0.000392
epoch 02: 0.000266
epoch 03: 0.000217
epoch 04: 0.000194
epoch 05: 0.000179
epoch 06: 0.000168
epoch 07: 0.000160
epoch 08: 0.000153
epoch 09: 0.000148
epoch 10: 0.000144


In [8]:
@torch.no_grad()
def batch_encode_csr(model, X_csr, batch_size=CFG['ae_batch'], device = device):
    model.eval()
    emb=[]

    loader = DataLoader(CSRDataset(X_csr, p_mask=0.0, to_tensor=True),
                        batch_size=batch_size, shuffle=False, num_workers=0)

    for xb, _ in loader:
        xb = xb.to(device, non_blocking=True)
        z, _ = model(xb)
        z = z / (z.norm(dim=1,keepdim=True) + 1e-9)
        emb.append(z.cpu().numpy())

    return np.vstack(emb).astype('float32')

Z_ae = batch_encode_csr(model, X, batch_size=CFG['ae_batch'], device=device)
knn_ae = NearestNeighbors(n_neighbors=CFG['knn_neighbors'], metric='cosine').fit(Z_ae)

In [9]:
torch.save(model.state_dict(), ART/'ae.pt')
np.save(ART/'ae_embeddings.npy', Z_ae)
dump(knn_ae, ART/'ae_knn.joblib')

['../../data/processed/ae_knn.joblib']

## Query Building from User/Persona

In [10]:
note_to_col = {
    "top":  {n: f"{n}_top"  for n in feature_meta["top_mlb_classes"]},
    "mid":  {n: f"{n}_mid"  for n in feature_meta["mid_mlb_classes"]},
    "base": {n: f"{n}_base" for n in feature_meta["base_mlb_classes"]},
}

def _rank_weight(rank: int) -> float:
    return float(ACCORD_POS_WEIGHTS[rank-1]) if 1 <= rank <= 5 else 0.0

def build_query(pref: dict) -> sparse.csr_matrix:
    """
    Returns a (1×D) CSR row in the **same feature space and order** as X_sparse.
    Blocks & weights mirror training:
      - Notes: per-level weight (top/mid/base) + per-block L2
      - Avoid notes: same as liked but NEGATIVE weight
      - Accords: rank weights [1,.8,.6,.4,.2] + per-block L2, then × W_BLOCK["accord"]
      - Disliked accords: NEGATIVE default weight (no rank) in accord block
      - Meta: neutral (zeros unless provided separately)
    """
    D = len(feat_names)

    # ----- Notes block (liked + avoid) -----
    cols_notes, data_notes = [], []

    def add_note(level: str, name: str, weight: float):
        name = str(name).strip().lower()
        col_name = note_to_col[level].get(name)
        if col_name is None:
            return
        j = feat_pos.get(col_name)
        if j is None:
            return
        cols_notes.append(j)
        data_notes.append(weight)

    # liked notes (positive weights)
    for level_key, level in [
        ("liked_notes_top",  "top"),
        ("liked_notes_mid",  "mid"),
        ("liked_notes_base", "base"),
    ]:
        for n in map(str, pref.get(level_key, [])):
            add_note(level, n, W_NOTE[level])

    # avoid notes (negative weights)
    for n in map(str, pref.get("avoid_notes", [])):
        # check all three levels (avoid could be in any vocab)
        for level in ("top", "mid", "base"):
            if n in note_to_col[level]:
                add_note(level, n, -W_NOTE[level])

    notes_row = sparse.csr_matrix(
        (data_notes, ([0]*len(cols_notes), cols_notes)),
        shape=(1, D), dtype=np.float32
    )
    notes_row = l2_normalize_row(notes_row)

    # ----- Accords block (liked + disliked) -----
    cols_acc, data_acc = [], []
    used_ranks = set()
    for entry in pref.get("liked_accords_ranked", []):
        name = str(entry.get("name", "")).strip().lower()
        rank = int(entry.get("rank", 0))
        if not (1 <= rank <= 5):
            continue
        if rank in used_ranks:
            continue
        j = feat_pos.get(f"accord_{name}")
        if j is None:
            continue
        cols_acc.append(j)
        data_acc.append(_rank_weight(rank))
        used_ranks.add(rank)

    # disliked accords (negative default weight)
    for name in map(str, pref.get("disliked_accords", [])):
        j = feat_pos.get(f"accord_{name.strip().lower()}")
        if j is None:
            continue
        cols_acc.append(j)
        data_acc.append(-DEFAULT_ACCORD_WEIGHT)

    acc_row = sparse.csr_matrix(
        (data_acc, ([0]*len(cols_acc), cols_acc)),
        shape=(1, D), dtype=np.float32
    )
    acc_row = l2_normalize_row(acc_row).multiply(W_BLOCK["accord"])

    meta_row = sparse.csr_matrix((1, D), dtype=np.float32)

    q = notes_row + acc_row + meta_row
    return q.tocsr()

### Convert all personas to queries

In [11]:
def personas_to_csr_from_df(df: pd.DataFrame, feature_meta: dict) -> sparse.csr_matrix:
    rows = []
    for _, r in df.iterrows():
        # Reuse your build_query directly; we only encode taste (notes/accords).
        pref = {
            "liked_accords_ranked": r["liked_accords_ranked"],
            "disliked_accords": r['disliked_accords'],
            "liked_notes_top": r["liked_notes_top"],
            "liked_notes_mid": r["liked_notes_mid"],
            "liked_notes_base": r["liked_notes_base"],
            "avoid_notes": r['avoid_notes'],
            'gender_focus': r['gender_focus'],
            'season' : r['season'],
            'use_case' : r['use_case'],
            'intensity': r['intensity']
        }
        q = build_query(pref)   # (1×D) CSR in the same feature space as X_sparse
        rows.append(q)
    return sparse.vstack(rows).tocsr()

P_csr = personas_to_csr_from_df(personas, feature_meta)
sparse.save_npz(ART/'personas_csr.npz',P_csr)
print("Personas CSR shape:", P_csr.shape)

Personas CSR shape: (92, 3476)


## Query Encodings for Both Pipelines

In [12]:
ZP_ae = batch_encode_csr(model, P_csr, batch_size=CFG['ae_batch'], device=device)
ZP_svd = svd_pipe.transform(P_csr).astype('float32')

# Save
np.save(ART/'persona_ae_embeddings.npy', ZP_ae)
np.save(ART/'persona_svd_embeddings.npy', ZP_svd)

## Persona + Fragrance Affinity

In [13]:
A_item_persona_ae  = Z_ae  @ ZP_ae.T   # (N_items × P)
A_item_persona_svd = Z_svd @ ZP_svd.T

# Save
np.save(ART/'A_item_persona_ae.npy',  A_item_persona_ae.astype('float32'))
np.save(ART/'A_item_persona_svd.npy', A_item_persona_svd.astype('float32'))

## Recomendation Engine

### Encoding Functions for Preferences

In [14]:
def encode_query_ae(preference: dict, model, device: str = device) -> np.ndarray:
    q = build_query(preference)  # (1×D) CSR aligned to X_sparse
    with torch.no_grad():
        xb = torch.from_numpy(q.toarray().astype(np.float32)).to(device)
        z, _ = model(xb)
        z = z / (z.norm(dim=1, keepdim=True) + 1e-9)
    return z.cpu().numpy()  # (1×d)

def encode_query_svd(preference: dict, svd_pipe) -> np.ndarray:
    q = build_query(preference)  # (1×D) CSR
    z = svd_pipe.transform(q)
    return z.astype('float32')  # (1×d)

### Persona Boost: Blend content relevance with a collaborative persona signal.

This makes sure that our recommender gets the collaborative signal from other people and adds the 'others liked this, so you might as well' part to the recommendations. As well as adding context bias such as filters for Season and Use Cases by boosting/penalizing certain accords.

In [15]:
def persona_boost(zq_vec: np.ndarray,
                  Z_persona: np.ndarray,
                  A_item_persona: np.ndarray,
                  cand_ids: np.ndarray,
                  top_personas: int = 12,
                  temperature: float = 0.1) -> np.ndarray:
    """
    Compute a collaborative 'people-like-you' score for candidate items using precomputed item-to-persona affinities.
    """
    #similiarity of query to each persona
    sim_p = Z_persona @ zq_vec

    # focus on top persona matches; suppress noise from weakly similar personas
    if top_personas and top_personas < sim_p.size:
        keep = np.argpartition(-sim_p,top_personas)[:top_personas]
        mask = np.full_like(sim_p, -np.inf, dtype=np.float32)
        mask[keep] = sim_p[keep]
        sim_p=mask

    # softmax weights over personas
    x = sim_p / max(1e-6, float(temperature))
    x = x - np.nanmax(x[np.isfinite(x)])
    w = np.exp(np.where(np.isfinite(x), x, -1e9))
    w = w / (w.sum() + 1e-9)  # (P,)

    # persona boost for the specific candidates: (m×P) @ (P,) → (m,)
    boost = A_item_persona[cand_ids] @ w

    # scale to 0..1 for stable blending
    bmin, bmax = boost.min(), boost.max()
    return ((boost - bmin) / (bmax - bmin + 1e-9)).astype('float32')

def inject_context_bias(rel_vec: np.ndarray, cand_ids: np.ndarray, items_df: pd.DataFrame,
                        preference: dict, bonus=0.01, penalty=0.01) -> np.ndarray:
    """Light, optional pre-MMR nudge using season/use_case hints."""
    season   = (preference.get("season") or "").strip().lower()
    use_case = (preference.get("use_case") or "").strip().lower()

    rel = rel_vec.copy()
    for i, idx in enumerate(cand_ids):
        accs = accords_set_from_row(items_df.iloc[idx])
        if season in SEASON_TO_ACCORD_HINTS:
            b = SEASON_TO_ACCORD_HINTS[season]["boost"]; p = SEASON_TO_ACCORD_HINTS[season]["penalize"]
            if accs & b: rel[i] += bonus
            if accs & p: rel[i] -= penalty
        if use_case in USE_CASE_HINTS:
            b = USE_CASE_HINTS[use_case]["boost"]; p = USE_CASE_HINTS[use_case]["penalize"]
            if accs & b: rel[i] += bonus
            if accs & p: rel[i] -= penalty
    return rel

### Recommendation Helpers

In [16]:
def gender_ok(row_gender: str, pref: str) -> bool:
    g = (row_gender or "").strip().lower()
    p = (pref or "").strip().lower()
    if not p: return True
    if p == "unisex":         return g in {"unisex"}
    if p == "men":            return g in {"men","unisex"}
    if p == "women":          return g in {"women","unisex"}
    # if user is strict, you could enforce equality only:
    return g == p

def get_item_notes(fragrance_id: str, bridge_df: pd.DataFrame, k_per_level=4) -> dict:
    if bridge_df is None: return {}
    sub = bridge_df[bridge_df['fragrance_id'] == fragrance_id]
    out = {}
    for level in ('top','mid','base'):
        notes = sub.loc[sub['level'] == level,'note'].tolist()
        out[level] = notes[:k_per_level]
    return out

def accords_set_from_row(row: pd.Series) -> set:
    acc = []
    for k in ("mainaccord1","mainaccord2","mainaccord3","mainaccord4","mainaccord5"):
        v = row.get(k, None)
        if pd.isna(v):
            continue
        s = str(v).strip().lower()
        if s and s != "nan":
            acc.append(s)
    return set(acc)

def prefilter_candidates(cand_ids: np.ndarray, items_df: pd.DataFrame, preference: Dict[str, Any]) -> List[int]:
    gender_pref = (preference.get("gender_focus") or "").strip().lower()
    keep = []
    for idx in cand_ids:
        row = items_df.iloc[idx]
        if gender_pref and not gender_ok(str(row.get("Gender","")), gender_pref):
            continue
        keep.append(idx)
    return keep if keep else list(cand_ids)

def soft_context_rerank(selected_ids: List[int], items_df: pd.DataFrame, preference: Dict[str, Any]) -> List[int]:
    season   = (preference.get("season") or "").strip().lower()
    use_case = (preference.get("use_case") or "").strip().lower()
    intensity= (preference.get("intensity") or "").strip().lower()

    scores = np.zeros(len(selected_ids), dtype=np.float32)
    for i, idx in enumerate(selected_ids):
        row = items_df.iloc[idx]
        accs = accords_set_from_row(row)

        if season in SEASON_TO_ACCORD_HINTS:
            b = SEASON_TO_ACCORD_HINTS[season]["boost"]
            p = SEASON_TO_ACCORD_HINTS[season]["penalize"]
            if accs & b: scores[i] += 0.05
            if accs & p: scores[i] -= 0.05

        if use_case in USE_CASE_HINTS:
            b = USE_CASE_HINTS[use_case]["boost"]
            p = USE_CASE_HINTS[use_case]["penalize"]
            if accs & b: scores[i] += 0.05
            if accs & p: scores[i] -= 0.05

        if intensity in INTENSITY_WEIGHT:
            scores[i] += INTENSITY_WEIGHT[intensity]

    order = np.argsort(-scores, kind="stable")
    return [selected_ids[j] for j in order]

### Recommender

In [17]:
def recommend(preference: dict,
              mode: str = 'ae',
              top_k: int = CFG['topk'],
              beta_persona: float = 0.35,
              mmr_lambda: float = CFG['mmr_lambda'],
              top_personas: int = 20,
              temperature: float = 0.2,
              use_context_bias: bool = True) -> pd.DataFrame:
    """
    preference: dict with liked_accords_ranked, liked_notes_top/mid/base, and optional filters:
      gender_focus ∈ {"men","women","unisex","any"}, season, use_case, intensity
    """
    # 1) encode query
    if mode == 'ae':
        zq = encode_query_ae(preference,model,device)
        Z_catalog, knn_index = Z_ae, knn_ae
        Z_persona = ZP_ae; A_item_persona = A_item_persona_ae
    else:
        zq = encode_query_svd(preference,svd_pipe)
        Z_catalog, knn_index = Z_svd, knn_svd
        Z_persona = ZP_svd; A_item_persona = A_item_persona_svd

    zqv = zq.ravel()

    # 2) Get large candidate pool, hard prefilter(gender)
    _, nbrs = knn_index.kneighbors(zq,n_neighbors=CFG['knn_neighbors'])
    cand_ids = nbrs[0]
    cand_ids = np.array(prefilter_candidates(cand_ids,items,preference),dtype=int)
    if cand_ids.size == 0:
        return items.iloc[:0].copy()

    # 3) Compute relevance signals
    Z_cand = Z_catalog[cand_ids]
    rel_content = Z_cand @ zqv
    rel_persona = persona_boost(zqv, Z_persona,A_item_persona,cand_ids, top_personas, temperature)
    rel_fused = (1.0 - beta_persona) * rel_content + beta_persona * rel_persona

    # optional to add context bias
    if use_context_bias:
        rel_fused = inject_context_bias(rel_fused,cand_ids,items,preference,bonus=0.02,penalty=0.02)

    # 4) Diversify with MMR
    selected = mmr_from_relevance(rel_fused,Z_cand, cand_ids,lambda_relevance=mmr_lambda, top_k=top_k)

    # 5) Soft rerank (season/use_case/intensity)
    selected = soft_context_rerank(selected,items,preference)

    # 6) results + explanations
    pos = {cid:i for i,cid in enumerate(cand_ids)}
    cols = ["fragrance_id","Brand","Perfume","Year","Gender",
            "mainaccord1","mainaccord2","mainaccord3","mainaccord4","mainaccord5",
            "Weighted Rating","Rating Count","url"]
    df = items.iloc[selected][cols].copy()

    df['score_content'] = [float(rel_content[pos[i]]) for i in selected]
    df['score_persona'] = [float(rel_persona[pos[i]]) for i in selected]
    df['score_fused'] = [float(rel_fused[pos[i]]) for i in selected]

    # explainability
    query_accords = {str(a.get('name','')).strip().lower() for a in (preference.get('liked_accords_ranked')) if isinstance(a,dict)}

    why_acc = []
    why_notes = []
    for _, r in df.iterrows():
        accs = accords_set_from_row(r)
        why_acc.append(', '.join(sorted(accs & query_accords)))
        if bridge is not None:
            notes = get_item_notes(str(r['fragrance_id']),bridge,k_per_level=3)
            why_notes.append(f"top: {', '.join(notes.get('top',[]))} | mid: {', '.join(notes.get('mid',[]))} | base: {', '.join(notes.get('base',[]))}")
        else:
            why_notes.append("")
    df['why_accords_overlap'] = why_acc
    df['sample_notes'] = why_notes

    return df.reset_index(drop=True)

### Sanity Check

In [18]:
# Minimal preference (accords only)
test_pref = {
    "liked_accords_ranked": [
        {"name": "woody", "rank": 1},
        {"name": "citrus", "rank": 2},
        {"name": "aromatic", "rank": 3},
        {"name": "fresh spicy", "rank": 4},
        {"name": "vanilla", "rank": 5},
    ],
    "gender_focus": "any",
    "season": "summer",
    "use_case": "office",
    "intensity": "moderate",
}

out_ae = recommend(test_pref, mode="ae", top_k=CFG['topk'])
out_svd = recommend(test_pref, mode="svd", top_k=CFG['topk'])

out_ae = attach_llm_explanations(out_ae,test_pref, model='gpt-4o-mini')
out_svd = attach_llm_explanations(out_svd,test_pref, model='gpt-4o-mini')

display(out_ae.head(5)[["Brand","Perfume","Year","why_accords_overlap","sample_notes","explanation_llm"]])
display(out_svd.head(5)[["Brand","Perfume","Year","why_accords_overlap","sample_notes","explanation_llm"]])

assert out_ae.shape[0] > 0 and out_svd.shape[0] > 0, "Empty results — check query build or filters."

Unnamed: 0,Brand,Perfume,Year,why_accords_overlap,sample_notes,explanation_llm
0,benetton,benetton-sport-man,2001.0,"aromatic, citrus, fresh spicy, woody","top: lime, lemon, clementine | mid: ginger flo...","This captures the essence of summer vitality, ..."
1,azzaro,chrome-eau-de-parfum,2022.0,"aromatic, citrus, woody",top: green mandarin | mid: lavender | base: pine,This captures the essence of summer in the off...
2,issey-miyake,l-eau-d-issey-pour-homme-summer-edition-by-kev...,2022.0,"aromatic, fresh spicy, woody","top: watery notes, bergamot, lemon | mid: euca...",This captures the essence of summer in the off...
3,acqua-di-parma,note-di-colonia-ii,2016.0,"aromatic, citrus, fresh spicy, woody","top: basil, bergamot, orange | mid: sandalwood...",This captures the essence of summer in an offi...
4,vertus,bois-et-cuir,2015.0,"aromatic, citrus, fresh spicy, woody","top: grapefruit, ivy, apple | mid: cedar, lave...",This captures the essence of summer in the off...


Unnamed: 0,Brand,Perfume,Year,why_accords_overlap,sample_notes,explanation_llm
0,alain-delon,samourai-47,2002.0,"aromatic, citrus, woody","top: japanese plum, tarragon, lemon | mid: lav...",This captures the essence of summer in an offi...
1,ds-durga,grapefruit-generation,2021.0,"citrus, fresh spicy, woody","top: pomelo, elm | mid: paradisone, hawthorn, ...","This captures the essence of summer vitality, ..."
2,comme-des-garcons,scent-three-sugi,2013.0,"aromatic, fresh spicy, woody","top: cypress, madagascar pepper | mid: iris, v...",This captures the essence of a refreshing summ...
3,fossil,fossil-1954-for-women,2014.0,"aromatic, citrus, woody","top: ginger, grapefruit, mandarin blossom | mi...",This captures the essence of summer in the off...
4,avon,instinct-for-him,2013.0,"aromatic, citrus, woody",top: mandarin orange | mid: sage | base: ebony,This captures the essence of a refreshing summ...


In [19]:
out_ae.head(2)[['explanation_llm']]

Unnamed: 0,explanation_llm
0,"This captures the essence of summer vitality, ..."
1,This captures the essence of summer in the off...


"This captures the essence of summer vitality, making it a refreshing companion for the office.\n\n- The zesty citrus notes of lime and lemon invigorate your senses, creating an uplifting atmosphere.\n- Aromatic hints of ginger flower and geranium add a sophisticated twist, ensuring you stand out in a professional setting.\n- The woody base of cedar and sandalwood grounds the fragrance, lending a subtle warmth that resonates throughout the day."

"This captures the essence of summer in the office with its invigorating freshness.  \n- The watery notes and citrusy bergamot create an uplifting atmosphere, energizing your workday.  \n- Eucalyptus and basil in the heart add a refreshing twist, making every interaction feel vibrant.  \n- The woody base of cypress and pine grounds the fragrance, ensuring it remains sophisticated and professional."


### Evaluation Metrics

- **Precision@k:** Fraction of recommended items in the top-k that are relevant.
- **nDCG:** Captures ranking quality; relevant items at rank 1 count more than at rank 10.
- **Intra-List Diversity (ILD):** 1- average pairwise cosine among results -> higher is better variety.
- **Novelty:**  penalize super-popular items (log Rating Count proxy).
- **Persona Alignment:** do the results reflect a user's high-level taste (accords)?

In [20]:
# Rank
def precision_at_k(rels: np.ndarray, k: int) -> float:
    k = min(k, len(rels))
    return 0.0 if k == 0 else float(rels[:k].sum()) / float(k)

def dcg_at_k(gains: np.ndarray, k: int) -> float:
    k = min(k, len(gains))
    if k == 0: return 0.0
    denom = np.log2(np.arange(2, k+2))
    return float((gains[:k] / denom).sum())

def ndcg_at_k(gains: np.ndarray, k: int) -> float:
    k = min(k, len(gains))
    if k == 0: return 0.0
    ideal = np.sort(gains)[::-1]
    idcg  = dcg_at_k(ideal, k)
    return 0.0 if idcg == 0.0 else float(dcg_at_k(gains, k) / idcg)

# Diversity
def intra_list_diversity(ids: list, Z_catalog: np.ndarray) -> float:
    """ILD = 1 − mean cosine similarity among results (higher → more variety)."""
    if len(ids) <= 1: return 0.0
    M = Z_catalog[ids]  # (m×d)
    S = cosine_similarity(M)
    tri = S[np.triu_indices(len(ids), k=1)]
    return float(1.0 - tri.mean()) if tri.size else 0.0

# Novelty
def avg_novelty_log_pop(recs_df, eps: float = 1.0) -> float:
    """
    Novelty via -log(popularity), with popularity proxied by Rating Count.
    Higher = more novel (less globally popular).
    """
    if recs_df is None or recs_df.empty: return 0.0
    pop = recs_df["Rating Count"].to_numpy(dtype=float)
    return float((-np.log(pop + eps)).mean())

# Persona Alignement with accords per recommendation
def persona_alignment_jaccard(recs_df: pd.DataFrame, persona: dict) -> float:
    """
    Mean Jaccard similarity between persona's liked accords (names only)
    and each item's {mainaccord1..5}. Higher = better alignment with high-level taste.
    """
    ranked = persona.get('liked_accords_ranked', [])
    P = {
        str(e.get('name', '')).strip().lower()
        for e in ranked if isinstance(e, dict) and str(e.get('name', '')).strip()
    }
    if not P or recs_df is None or recs_df.empty:
        return 0.0

    sims = []
    for _, r in recs_df.iterrows():
        A = {
            str(r.get(f"mainaccord{j}", "")).strip().lower()
            for j in range(1, 6)
            if isinstance(r.get(f"mainaccord{j}"), str) and str(r.get(f"mainaccord{j}", "")).strip()
        }
        inter = len(P & A)
        union = max(1, len(P | A))
        sims.append(inter / union)
    return float(np.mean(sims)) if sims else 0.0

These are helpers for getting the relevance score of a persona against each item in the recommendations.

In [21]:
def build_note_index(bridge_df: pd.DataFrame) -> dict:
    if bridge_df is None or bridge_df.empty: return {}
    g = bridge_df.groupby('fragrance_id')['note'].apply(lambda s: set(map(str.lower, s.astype(str))))
    return {str(k): set(v) for k,v in g.items()}

NOTE_INDEX = build_note_index(bridge)

def item_accord_weight(row: pd.Series) -> dict:
    d = {}
    for pos, w in enumerate(ACCORD_POS_WEIGHTS, start=1):  # 1..5
        a = row.get(f"mainaccord{pos}", "")
        if isinstance(a, str) and a.strip():
            d[a.strip().lower()] = float(w)
    return d

ITEM_ACCORD_WEIGHTS = [item_accord_weight(items.iloc[i]) for i in range(len(items))]

# Penalties (tunable, mirroring block magnitudes)
LAMBDA_ACC_PENALTY  = 0.80   # disliked accords
LAMBDA_NOTE_PENALTY = 0.35   # avoid notes

def persona_item_score(persona: dict, row_idx: int) -> float:
    """
    Continuous relevance s(i|persona): accord gains + note gains - penalties.
    Matches weighting: accords use rank weights; notes use W_NOTE per level.
    """
    pw = {}
    for e in persona.get('liked_accords_ranked',[]):
        name = str(e.get('name','')).strip().lower()
        rank = int(e.get('rank',0))
        if (1<= rank <=5) and name:
            pw[name] = float(ACCORD_POS_WEIGHTS[rank-1])
    liked_top = set(map(str.lower, persona.get("liked_notes_top", [])))
    liked_mid = set(map(str.lower, persona.get("liked_notes_mid", [])))
    liked_base = set(map(str.lower, persona.get("liked_notes_base", [])))
    avoid_notes = set(map(str.lower, persona.get("avoid_notes", [])))
    disliked_accs = set(map(str.lower, persona.get("disliked_accords", [])))

    fid = str(items.iloc[row_idx]['fragrance_id'])
    notes_i = NOTE_INDEX.get(fid, set())
    acc_i_w = ITEM_ACCORD_WEIGHTS[row_idx]

    # Accord gain
    acc_gain = 0.0
    for a, w_p in pw.items():
        w_i = acc_i_w.get(a, 0.0)
        if w_i > 0:
            acc_gain += w_p * w_i

    # Note gain
    note_gain = (
        W_NOTE["top"]  * len(liked_top  & notes_i) +
        W_NOTE["mid"]  * len(liked_mid  & notes_i) +
        W_NOTE["base"] * len(liked_base & notes_i)
    )

    # Penalties for explicit dislikes/avoids
    acc_pen = LAMBDA_ACC_PENALTY  * len(disliked_accs & set(acc_i_w.keys()))
    note_pen= LAMBDA_NOTE_PENALTY * len(avoid_notes  & notes_i)

    return float(acc_gain + note_gain - acc_pen - note_pen)

def persona_catalog_scores(persona: dict, gain_cap: float = 6.0, thresh: float = 0.0):
    """
    Returns arrays aligned to `items` rows:
      - scores_all  : continuous s(i|persona)
      - binary_all  : 1 if s > thresh (for precision/recall)
      - gains_all   : max(0, s) clipped to gain_cap (for nDCG idealization)
    """
    N = len(items)
    scores = np.fromiter((persona_item_score(persona,i) for i in range(N)), dtype=float, count=N)
    binary = (scores > float(thresh)).astype(np.float32)
    gains = np.maximum(scores, 0.0)
    if gain_cap is not None:
        gains = np.minimum(gains, float(gain_cap))
    return scores, binary, gains

Evaluate functions

In [22]:
def evaluate_recommender(
    personas_input,                  # dict (single persona) OR pd.DataFrame (many)
    mode: str = "ae",
    k: int = CFG["eval_k"],
    sample_n: int = None,           # only used if personas_input is a DataFrame
    seed: int = 42,
    gain_cap: float = 6.0,
    thresh: float = 0.0,
    recs_df: pd.DataFrame = None,
    recs_map: dict = None
):
    """
    Inputs:
      personas_input : dict OR pd.DataFrame
          - dict: a single persona in the schema used by `recommend()`
          - DataFrame: many personas
      mode           : "ae" or "svd"
      k              : top-k to evaluate
      sample_n       : subsample personas when DataFrame is passed
      gain_cap       : cap for graded gain used in nDCG
      thresh         : threshold for binary relevance (precision/recall)
      recs_df        : precomputed recs DF for the single-persona case
      recs_map       : dict of {persona_idx: recs DF} for the multi-persona case

    Returns:
      summary_dict, details_df
    """
    if isinstance(personas_input, dict):
        personas_iter = [personas_input]
        persona_indices = [0]
        # Single persona: allow a single precomputed recs_df
        precomputed_for_idx = {0: recs_df} if recs_df is not None else {}
    elif isinstance(personas_input, pd.DataFrame):
        rng = np.random.default_rng(seed)
        idxs = personas_input.index.to_list()
        if sample_n and sample_n < len(idxs):
            idxs = list(rng.choice(idxs, size=sample_n, replace=False))
        personas_iter = [personas_input.loc[i].to_dict() for i in idxs]
        persona_indices = idxs
        precomputed_for_idx = recs_map or {}
    else:
        raise TypeError('personas_input must be a dict or pd.DataFrame')
    # Choose Catalog embedding space for ILD
    Z_catalog = Z_ae if mode == 'ae' else Z_svd

    rows, all_ids = [], []
    for i, persona in zip(persona_indices, personas_iter):
        # Persona to catalog relevance (for P/R/nDCG ground-truth)
        scores_all, binary_all, gains_all = persona_catalog_scores(
            persona, gain_cap=gain_cap, thresh=thresh
        )
        gt_count = int(binary_all.sum())

        # use precomputer recs if provided, otherwise compute using recommend function
        recs = precomputed_for_idx.get(i, None)
        if recs is None:
            recs = recommend(persona, mode=mode, top_k=k)
        # keep only top-k rows if a larger DF was supplied
        if recs is not None and not recs.empty and len(recs) > k:
            recs = recs.iloc[:k].copy()

        if recs is None or recs.empty:
            rows.append({
                "persona_idx": i, "returned": 0, "gt_relevant": gt_count,
                "precision": 0.0, "nDCG": 0.0,
                "ILD": 0.0, "Novelty": 0.0, "PersonaAlign": 0.0
            })
            all_ids.append([])
            continue

        # Align recs to catalog row indices
        rec_row_idx = recs.index.to_list()
        all_ids.append(rec_row_idx)

        #Ranked relevance vectors (from persona scores)
        binary_ranked = (scores_all[rec_row_idx] > thresh).astype(float)
        gains_ranked = np.minimum(np.maximum(scores_all[rec_row_idx], 0.0), gain_cap)

        # Metrics
        prec = precision_at_k(binary_ranked, k)
        ideal = np.sort(gains_all)[::-1]
        idcg = dcg_at_k(ideal,k)
        ndcg = (dcg_at_k(gains_ranked, k) / idcg) if idcg > 0 else 0.0

        ild = intra_list_diversity(rec_row_idx, Z_catalog)
        nov = avg_novelty_log_pop(recs)
        pal = persona_alignment_jaccard(recs,persona)

        rows.append({
            "persona_idx": i,
            "returned": len(recs), "gt_relevant": gt_count,
            "precision": prec, "nDCG": ndcg,
            "ILD": ild, "Novelty": nov, "PersonaAlign": pal
        })

    #Aggregate
    details = pd.DataFrame(rows)
    summary = {
        "mode": mode, "k": k, "n_personas": len(details),
        "Precision@k_mean": float(details["precision"].mean()) if len(details) else 0.0,
        "nDCG@k_mean":      float(details["nDCG"].mean())      if len(details) else 0.0,
        "ILD_mean":         float(details["ILD"].mean())        if len(details) else 0.0,
        "Novelty_mean":     float(details["Novelty"].mean())    if len(details) else 0.0,
        "PersonaAlign_mean":float(details["PersonaAlign"].mean()) if len(details) else 0.0,
    }
    return summary, details

### A/B Testing using personas

In [23]:
personas_ae = personas[:50].reset_index(drop=True)
personas_svd = personas[50:].reset_index(drop=True)
summary_ae, details_ae = evaluate_recommender(personas_ae, mode="ae", k=CFG['eval_k'], sample_n=20, seed=42)
summary_svd, details_svd = evaluate_recommender(personas_svd, mode="svd", k=CFG['eval_k'], sample_n=20, seed=42)

display(summary_ae, details_ae)
display(summary_svd, details_svd)

{'mode': 'ae',
 'k': 20,
 'n_personas': 20,
 'Precision@k_mean': 0.7799999999999999,
 'nDCG@k_mean': 0.1710616278250811,
 'ILD_mean': 0.6599939465522766,
 'Novelty_mean': -5.067697349080225,
 'PersonaAlign_mean': 0.3928194444444444}

Unnamed: 0,persona_idx,returned,gt_relevant,precision,nDCG,ILD,Novelty,PersonaAlign
0,24,20,19067,1.0,0.147584,0.659994,-5.065103,0.339286
1,32,20,20661,0.85,0.190617,0.659994,-5.313338,0.347778
2,7,20,18539,1.0,0.156103,0.659994,-4.971043,0.378968
3,41,20,22979,0.85,0.214093,0.659994,-4.959819,0.470794
4,48,20,17661,0.85,0.168787,0.659994,-5.107889,0.390079
5,33,20,19312,0.55,0.139539,0.659994,-4.674151,0.309524
6,6,20,18975,0.6,0.149171,0.659994,-5.764262,0.404405
7,2,20,17204,0.55,0.145305,0.659994,-4.859031,0.415675
8,21,20,18975,0.6,0.149171,0.659994,-5.764262,0.404405
9,46,20,16179,0.7,0.177933,0.659994,-5.208414,0.364484


{'mode': 'svd',
 'k': 20,
 'n_personas': 20,
 'Precision@k_mean': 0.7575000000000001,
 'nDCG@k_mean': 0.18201216094934677,
 'ILD_mean': 0.8594247229515221,
 'Novelty_mean': -5.1630123895791,
 'PersonaAlign_mean': 0.4322123015873016}

Unnamed: 0,persona_idx,returned,gt_relevant,precision,nDCG,ILD,Novelty,PersonaAlign
0,18,20,20705,0.65,0.185071,0.859425,-4.935901,0.515119
1,36,20,17014,0.8,0.154744,0.859425,-5.134513,0.503214
2,6,20,16179,0.7,0.177933,0.859425,-5.24864,0.440079
3,33,20,19921,0.65,0.187527,0.859425,-5.371157,0.335317
4,34,20,19037,0.8,0.211152,0.859425,-4.856849,0.422619
5,27,20,19670,0.65,0.161399,0.859425,-4.993696,0.425595
6,5,20,23339,0.95,0.208932,0.859425,-5.025529,0.448413
7,2,20,18959,0.55,0.123572,0.859425,-5.282807,0.39746
8,16,20,17658,0.85,0.170144,0.859425,-5.187578,0.466667
9,38,20,19037,0.8,0.211152,0.859425,-4.856849,0.422619


## Evaluation Results (k = 20, n_personas = 20)

### Autoencoder (AE)
- **Precision@20:** **0.78**, strong; AE retrieves slightly more relevant items than SVD.
- **nDCG@20:** **0.171**, moderate ranking quality; relevant items are included but not always prioritized.
- **ILD:** **0.66**, fair diversity, but significantly lower than SVD.
- **Novelty:** **-5.07** → moderate novelty; slightly better than SVD (less skewed toward popularity).
- **Persona Alignment:** **0.39** → weaker alignment compared to SVD → **could improve by giving more weight to persona preferences during query building or persona boosting**.

### Truncated SVD
- **Precision@20:** **0.76**, slightly lower than AE, but still strong.
- **nDCG@20:** **0.182**, higher than AE; better ranking of relevant items.
- **ILD:** **0.86**, excellent diversity, much higher than AE → strong exploration benefit.
- **Novelty:** **-5.16**, a bit lower (worse) than AE, indicating heavier reliance on popular items
- **Persona Alignment:** **0.43**, better than AE, showing stronger reflection of user taste.

### Takeaways & Improvements
- **AE**: Best at raw relevance (precision) and slightly better novelty, but weaker at diversity and persona alignment.
   *Improvements*: tune **MMR λ** to favor more diversity, strengthen **persona boost** to align better with stated tastes.

- **SVD**: Best at ranking quality (nDCG), diversity (ILD), and persona alignment, but weaker at precision and novelty.
  *Improvements*: add **novelty-aware re-ranking** (e.g., penalize popular fragrances), and rebalance towards precision by slightly increasing **relevance weight** in MMR.

- **Overall**:
  - AE is safer for users wanting highly relevant & slightly less mainstream picks.
  - SVD is better for variety and taste alignment but risks over-recommending popular items.

### Save Manifest

In [24]:
manifest = {
    "feature_meta_hash": hash(json.dumps(feature_meta, sort_keys=True)),
    "X_shape": list(X.shape),
    "Z_ae_shape": list(Z_ae.shape),
    "Z_svd_shape": list(Z_svd.shape),
    "ZP_ae_shape": list(ZP_ae.shape),
    "ZP_svd_shape": list(ZP_svd.shape),
    "A_item_persona_ae_shape": list(A_item_persona_ae.shape),
    "A_item_persona_svd_shape": list(A_item_persona_svd.shape),
    "cfg": CFG,
}
(ART / 'model_manifest.json').write_text(json.dumps(manifest, indent=2))

583

## References

[The Use of MMR, Diversity-Based Reranking for Reordering Documents and Producing Summaries. Jamie Carbonell, and Jade Goldstein.](https://www.cs.cmu.edu/~jgc/publication/The_Use_MMR_Diversity_Based_LTMIR_1998.pdf)

[Masked Autoencoders: A Simple Yet Powerful Approach to Self-Supervised Vision Learning](https://medium.com/@jimcanary/masked-autoencoders-a-simple-yet-powerful-approach-to-self-supervised-vision-learning-0ec9dc849dd2)