<a href="https://colab.research.google.com/github/AarishB/FragrAI/blob/main/FragrAIi_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import ast
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display
from collections import Counter
import unicodedata

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# === STEP 1: Load and clean dataset ===
# Only the first 20,000 rows of the CSV are read.
df = pd.read_csv('perfumes_table.csv', nrows=20000)
# errors='ignore' lets the drop succeed when some of these columns are absent.
df = df.drop(columns=['url', 'rating', 'reviews'], errors='ignore')
pd.set_option('display.max_colwidth', None)

# Move the preferred columns to the front. Preferred columns that are missing
# from the file are skipped instead of raising KeyError, which matches the
# later `'notes' in df.columns` guard that treats `notes` as optional.
desired_order = [col for col in ('title', 'notes', 'description') if col in df.columns]
remaining_cols = [col for col in df.columns if col not in desired_order]
df = df[desired_order + remaining_cols]

# Title-case free-text names (applied to the designer column below)
def capitalize(s):
    """Return *s* in title case, keeping small connecting words lowercase.

    The first word is always capitalized. Any later word that is one of the
    small connecting words (compared case-insensitively) is emitted in
    lowercase, so "house OF dior" and "House of Dior" normalize to the same
    string. Runs of whitespace collapse to single spaces. Non-string input
    (e.g. NaN) is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    exceptions = {'and', 'for', 'of', 'the', 'in', 'on', 'at', 'with', 'a', 'an'}
    return ' '.join(
        # `i` is falsy only for the first word, which is always capitalized.
        word.lower() if i and word.lower() in exceptions else word.capitalize()
        for i, word in enumerate(s.split())
    )
# Normalize the casing of every designer name (non-strings pass through unchanged).
df['designer'] = df['designer'].map(capitalize)

def strip_accents(s):
    """Return *s* with combining marks removed after NFKD decomposition."""
    decomposed = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))


def canon_note(x):
    """Map a raw note phrase to a canonical, title-cased note name.

    Steps: strip accents, lowercase, replace every character outside
    ``[a-z0-9]`` and whitespace with a space, collapse whitespace, resolve
    synonyms, then capitalize each word.

    The synonym table is consulted twice: once on the full phrase (so entries
    that contain the word "notes", such as "sea notes", can match) and once
    after the generic word "note"/"notes" has been removed (so "woody notes"
    becomes "Woody" and "white musk notes" becomes "Musk").

    Returns None for non-string input and '' when nothing is left after
    cleaning; both are falsy, so callers can filter with a truth test.
    """
    if not isinstance(x, str):
        return None

    synonyms = {
        'calabrian bergamot':'bergamot','sicilian bergamot':'bergamot','italian bergamot':'bergamot',
        'cedarwood':'cedar','virginia cedar':'cedar','atlas cedar':'cedar',
        'oudh':'oud','agarwood':'oud','oud wood':'oud',
        'pink pepper':'pepper','black pepper':'pepper',
        'white musk':'musk','ambroxan':'amber','ambergris':'amber',
        'sea notes':'marine','marine notes':'marine','watery notes':'marine','aqua':'marine','calone':'marine',
        'orange blossom':'neroli','orris root':'iris','cashmeran':'woody'
    }  # Start of the synonyms, more to come soon

    n = strip_accents(x).lower()
    # Collapse whitespace so multi-word synonym keys match regardless of the
    # punctuation or spacing used in the raw text.
    n = ' '.join(re.sub(r'[^a-z0-9\s]', ' ', n).split())

    # Drop the generic word "note(s)" only when the full phrase is not itself
    # a synonym key; stripping first would make the "... notes" keys unreachable.
    if n not in synonyms:
        n = ' '.join(re.sub(r'\bnotes?\b', ' ', n).split())

    n = synonyms.get(n, n)
    return ' '.join(w.capitalize() for w in n.split())

def _extract_layer(text, layer_aliases, stop_regex): #Notes will be extracted layer by layer, not all at once. GPT provided an efficient pattern
   def _extract_layer(text, layer_aliases, stop_regex):
    """
    Extract a comma/semicolon/pipe separated list of notes after a layer alias.
    Adds more natural-language boundaries so we don't stop too late/too early.
    """
    if not isinstance(text, str):
        return []
    s = ' '.join(text.split())  # flatten whitespace

    # Add richer "stop" phrases that often separate layers
    # (e.g., "followed by", "transitioning into", "settling into", etc.)
    extra_stops = r"|".join([
        r"\bfollowed by\b",
        r"\btransition(?:ing)? into\b",
        r"\bsettling (?:down|into)\b",
        r"\bsupported by\b",
        r"\bgrounded in\b",
        r"\banchored by\b",
        r"\bon (?:a|the) (?:bed|base) of\b",
        r"\bthen\b",  # often used as a weak layer separator
    ])

    # Merge with provided stop regex
    full_stop = rf"(?:{stop_regex}|{extra_stops}|\.|$)"

    # e.g., "Top notes are: A, B and C" / "Opening: A; B | C"
    pat = rf"\b({'|'.join(layer_aliases)})\s*(?:note|notes)?\s*(?:are|include|:)?\s*(.*?)(?={full_stop})"
    m = re.search(pat, s, flags=re.IGNORECASE)
    if not m:
        return []
    chunk = m.group(2)
    toks = re.split(r'[;,/|]', chunk)
    return [canon_note(t) for t in toks if canon_note(t)]

def extract_notes(desc, note_type):
    """Extract the notes of one fragrance layer from a description.

    Args:
        desc: Free-text description (non-strings yield an empty list via
            ``_extract_layer``).
        note_type: 'Top', 'Middle' or 'Base'; any other value raises KeyError.

    Returns:
        Whatever ``_extract_layer`` returns for the chosen layer.

    The stop pattern for a layer is built from the aliases of every later
    layer, so each word that can introduce a later layer (including "core"
    and "foundation") also ends the current one. The base layer has no later
    layer and therefore runs to the next period or the end of the text.
    """
    # Insertion order doubles as the layer order: Top -> Middle -> Base.
    aliases = {
        'Top': ['top','opening','head'],
        'Middle': ['middle','heart', 'core'],
        'Base': ['base','drydown', 'foundation']
    }
    layer_aliases = aliases[note_type]

    layer_order = list(aliases)
    later_layers = layer_order[layer_order.index(note_type) + 1:]
    later_aliases = [alias for layer in later_layers for alias in aliases[layer]]

    if later_aliases:
        stop = rf"\b(?:{'|'.join(later_aliases)})\b"
    else:
        # Intended as a pattern that never matches, leaving only the period /
        # end-of-string stops.
        stop = r"$^"

    return _extract_layer(desc, layer_aliases, stop)

# Build one list-valued column per fragrance layer from the free-text description.
df['top_notes']    = [extract_notes(text, "Top") for text in df['description']]
df['middle_notes'] = [extract_notes(text, "Middle") for text in df['description']]
df['base_notes']   = [extract_notes(text, "Base") for text in df['description']]

if 'notes' in df.columns:  # the fallback needs the raw `notes` column
    def fallback(row):
        """Fill empty layer columns of *row* from its raw `notes` value.

        Rows where any layer is already truthy are returned untouched.
        Otherwise the raw notes (a delimited string, or a list/tuple) are
        canonicalized and split positionally: roughly the first third becomes
        the top layer, the second third the middle, and the rest the base.
        With two notes the first is top and the second is base; a single note
        is top only.
        """
        already_layered = row['top_notes'] or row['middle_notes'] or row['base_notes']
        if already_layered:
            return row

        raw = row['notes']
        if isinstance(raw, str):
            pieces = re.split(r'[;,/|]', raw)
        elif isinstance(raw, (list, tuple)):
            pieces = raw
        else:
            pieces = []
        toks = [note for note in map(canon_note, pieces) if note]

        k = len(toks)
        if k == 0:
            return row

        if k == 1:
            layers = ([toks[0]], [], [])
        elif k == 2:
            layers = ([toks[0]], [], [toks[1]])
        else:
            first_cut = max(1, k // 3)
            second_cut = max(2, 2 * k // 3)
            layers = (toks[:first_cut], toks[first_cut:second_cut], toks[second_cut:])

        row['top_notes'], row['middle_notes'], row['base_notes'] = layers
        return row
    df = df.apply(fallback, axis=1)

# === STEP 3: Create combined notes with weight for base notes ===
# Coerce every layer cell to a list: lists pass through, a non-missing string
# becomes a one-item list, anything else becomes an empty list.
for col in ['top_notes', 'middle_notes', 'base_notes']:
    df[col] = df[col].apply(
        lambda cell: cell if isinstance(cell, list) else (
            [cell] if not (pd.isna(cell) or cell is None) and isinstance(cell, str) else []
        )
    )
# Base notes are listed twice so they carry double weight in the combined list.
df['combined_notes'] = df.apply(
    lambda row: [*row['top_notes'], *row['middle_notes'], *row['base_notes'], *row['base_notes']],
    axis=1,
)

# === STEP 4: Assign seasons for supervised learning ===
# Season -> notes that vote for it. A note may appear under several seasons.
seasonal_notes = {
    'Summer': ['Citrus', 'Aquatic', 'Mint', 'Neroli', 'Grapefruit', 'Orange', 'Lemon', 'Bergamot', 'Melon', 'Pineapple', 'Coconut'],
    'Winter': ['Vanilla', 'Amber', 'Oud', 'Leather', 'Tobacco', 'Cinnamon', 'Tonka','Incense','Coffee'],
    'Spring': ['Jasmine', 'Rose', 'Green Tea', 'Lily', 'Pear', 'Apple', 'Lavender', 'Magnolia'],
    'Fall': ['Sandalwood', 'Patchouli', 'Nutmeg', 'Cardamom', 'Clove', 'Plum', 'Lavender', 'Cinnamon'],
}


def assign_weighted_season_from_layers(top, mid, base):
    """Pick the season whose notes score highest across the three layers.

    Each layer gets a weight of ``max(1, round(share * 10))`` where *share* is
    that layer's fraction of all notes. Every note that belongs to a season
    adds ``weight * multiplier`` to that season, with multipliers 2 (top),
    1.5 (middle) and 2.5 (base). Returns 'Inconclusive' when there are no
    notes or no season scores above zero; ties go to the season listed first
    in ``seasonal_notes``.
    """
    total_notes = len(top) + len(mid) + len(base)
    if total_notes == 0:
        return 'Inconclusive'

    season_scores = dict.fromkeys(seasonal_notes, 0)
    for layer, multiplier in ((top, 2), (mid, 1.5), (base, 2.5)):
        layer_wt = max(1, round((len(layer) / total_notes) * 10))
        for note in layer:
            for season, group in seasonal_notes.items():
                if note in group:
                    season_scores[season] += layer_wt * multiplier

    if not any(season_scores.values()):
        return 'Inconclusive'
    return max(season_scores, key=season_scores.get)

# Label every perfume with its heuristic season, then drop the rows that
# could not be labelled.
df['season'] = df.apply(
    lambda r: assign_weighted_season_from_layers(r['top_notes'], r['middle_notes'], r['base_notes']),
    axis=1,
)
df = df[df['season'].ne('Inconclusive')]

# === STEP 5: Train ML model ===

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['combined_notes'])
y = df['season']

# Split BEFORE any resampling or fitting so the held-out rows never influence
# a model that is later scored on them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample minority seasons on the training split only. Resampling the full
# data first would place duplicates of test rows in the training set.
X_bal, y_bal = ros.fit_resample(X_train, y_train)

# The random forest has no class weighting of its own, so it trains on the
# balanced training data.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_bal, y_bal)

# XGBoost needs integer class labels.
le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    objective="multi:softprob",   # multiclass probabilities
    eval_metric="mlogloss"
)
xgb.fit(X_train_xgb, y_train_xgb)
y_pred_xgb = le.inverse_transform(xgb.predict(X_test_xgb))

# LinearSVC -> can help reduce inconclusives
# It is fitted on the un-resampled training split: class_weight='balanced'
# already handles the imbalance, and duplicated rows would leak between the
# calibration folds. Fitting on all of X (as before) made the test-set scores
# below meaningless because the model had already seen the test rows.
_base_svm = LinearSVC(C=1.0, class_weight='balanced', random_state=42)
svm = CalibratedClassifierCV(_base_svm, cv=5)  # sigmoid by default
svm.fit(X_train, y_train)

# === STEP 6: Evaluate model ===
y_pred_svm = svm.predict(X_test)
print("\n[SVM] Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\n[SVM] Classification Report:\n", classification_report(y_test, y_pred_svm))

# === STEP 7: Predict on full dataset and show notes ===
# The raw SVM output is kept under its own name; `predicted_season` is the
# default prediction column shown later.
df['predicted_season_svm_raw'] = svm.predict(X)
df['predicted_season'] = df["predicted_season_svm_raw"]




# Force a model to answer with one of the four real seasons.
REAL_SEASONS = ['Spring', 'Summer', 'Fall', 'Winter']

def restrict_to_real_seasons(model, X, real_labels=REAL_SEASONS):
    """Return, per row of *X*, the most probable class among *real_labels*.

    *model* must expose ``classes_`` and ``predict_proba``. Probability
    columns whose class is not in *real_labels* are ignored; the highest
    remaining column wins (the first one on ties).
    """
    labels = model.classes_
    probabilities = model.predict_proba(X)

    # Column positions of the classes we are allowed to return.
    permitted = [pos for pos, label in enumerate(labels) if label in real_labels]

    winners = probabilities[:, permitted].argmax(axis=1)
    return labels[permitted][winners]
# Predictions restricted to the four real seasons, for the SVM and the forest.
df['predicted_season_svm'] = restrict_to_real_seasons(svm, X, REAL_SEASONS)
df['predicted_season_rf_no_inc'] = restrict_to_real_seasons(clf, X, REAL_SEASONS)


# === Evaluate individual models ===
print("SVM acc:", svm.score(X_test, y_test))
print("RF acc:", clf.score(X_test, y_test))

# === Ensemble ===
# Soft vote: 60% calibrated SVM, 40% random forest. The winning column is
# translated through svm.classes_, which assumes both models expose their
# classes in the same order.
p_svm = svm.predict_proba(X_test)
p_rf = clf.predict_proba(X_test)
p_ens = 0.6 * p_svm + 0.4 * p_rf
y_pred = list(svm.classes_[p_ens.argmax(axis=1)])

from sklearn.metrics import accuracy_score, classification_report
print("Ensemble acc:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# === STEP 8: Display results with notes used ===
# Widen the pandas display limits so long note lists are shown in full.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)

_result_columns = ['title', 'designer', 'top_notes', 'middle_notes', 'base_notes', 'combined_notes', 'predicted_season']
display(df[_result_columns].head(500))


# ======================================================================
# =========================
# === ADD-ON: V2 PIPELINE ===
# (Appended AFTER your current code; nothing above is modified)
# =========================
# ======================================================================

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# --- A) Extra semantic features (keep originals intact) ---

# 1) Olfactory families mentioned in the description
families_vocab = [
    'floral','amber','citrus','woody','spicy','aromatic','fresh','oriental',
    'green','fruity','gourmand','leather','aquatic','powdery','smoky','earthy'
]

def extract_families(desc):
    """Return the capitalized families whose name occurs in *desc*.

    Matching is a case-insensitive substring test, so a family also matches
    inside a longer word. Results follow the order of ``families_vocab``.
    Non-string input yields an empty list.
    """
    if not isinstance(desc, str):
        return []
    lowered = desc.lower()
    found = []
    for family in families_vocab:
        if family in lowered:
            found.append(family.capitalize())
    return found

df['families'] = df['description'].map(extract_families)

# New list column: the combined notes followed by the detected families.
# `combined_notes` itself is left untouched.
df['combined_notes_plus'] = [
    notes + fams for notes, fams in zip(df['combined_notes'], df['families'])
]

# 2) === Build a LEAK-FREE designer_bias using a stratified split mask ===
# Positional row ids, so the split can be expressed as a boolean mask.
df['_row_id'] = np.arange(len(df))

# sklearn's splitter is used because it can stratify on the season label.
train_ids, test_ids = train_test_split(
    df['_row_id'],
    test_size=0.2,
    random_state=42,
    stratify=df['season'],
)

# True for rows in the training part of the split.
train_mask = np.isin(np.arange(len(df)), train_ids)

# Most common season per designer, computed from training rows only so the
# feature carries no information from held-out labels.
designer_mode_train = {
    designer: Counter(seasons).most_common(1)[0][0]
    for designer, seasons in df.loc[train_mask].groupby('designer')['season']
}

# Designers not seen in the training rows get the placeholder 'Unknown'.
df['designer_bias'] = df['designer'].map(designer_mode_train).fillna('Unknown')

# 3) Warm / Cool ratios and layer counts (domain priors)
warm_notes = {'Amber','Vanilla','Oud','Leather','Tobacco','Cinnamon','Sandalwood','Tonka','Incense','Coffee','Myrrh'}
cool_notes = {'Citrus','Mint','Marine','Green','Lavender','Grapefruit','Bergamot','Neroli','Aqua','Calone','Melon'}

def ratio_counts(notes, pos_set):
    """Return the fraction of entries in *notes* that belong to *pos_set*.

    Anything that is not a non-empty list yields 0.0. Repeated notes count
    once per occurrence.
    """
    if not isinstance(notes, list) or not notes:
        return 0.0
    hits = [t for t in notes if t in pos_set]
    return len(hits) / len(notes)

# Numeric priors: share of warm / cool notes plus per-layer and total counts.
df['warm_ratio'] = df['combined_notes_plus'].apply(lambda n: ratio_counts(n, warm_notes))
df['cool_ratio'] = df['combined_notes_plus'].apply(lambda n: ratio_counts(n, cool_notes))
df['top_count'] = df['top_notes'].apply(lambda x: len(x) if isinstance(x, list) else 0)
df['mid_count'] = df['middle_notes'].apply(lambda x: len(x) if isinstance(x, list) else 0)
df['base_count'] = df['base_notes'].apply(lambda x: len(x) if isinstance(x, list) else 0)
df['note_count'] = df['combined_notes_plus'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# --- B) Build NEW features (TF-IDF over notes + categorical + numeric) ---
df['note_string_plus'] = df['combined_notes_plus'].apply(lambda lst: ' '.join(lst))

tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=3)
X_tfidf = tfidf.fit_transform(df['note_string_plus'])

# Categorical one-hots (designer, designer_bias).
# The encoder returns a sparse matrix by default. The former `sparse=True`
# keyword was renamed `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so relying on the default works across versions.
ohe = OneHotEncoder(handle_unknown='ignore')
X_cat = ohe.fit_transform(df[['designer','designer_bias']])

# Numeric block (with_mean=False keeps the scaling sparse-friendly)
num_cols = ['warm_ratio','cool_ratio','top_count','mid_count','base_count','note_count']
scaler = StandardScaler(with_mean=False)
X_num = scaler.fit_transform(df[num_cols])

# Final augmented matrix
X_v2 = sp.hstack([X_tfidf, X_cat, X_num], format='csr')
y_v2 = df['season'].values

# --- C) Use the same indices from the mask for train/test split ---
# X_v2 and y_v2 are indexed by row POSITION. `df.index` holds labels that are
# no longer contiguous after the earlier row filtering, so using them here
# selected the wrong rows or ran past the end of the matrix.
_row_positions = np.arange(len(df))
train_row_idx = _row_positions[train_mask]
test_row_idx  = _row_positions[~train_mask]

X_train_v2 = X_v2[train_row_idx, :]
X_test_v2  = X_v2[test_row_idx, :]
y_train_v2 = y_v2[train_row_idx]
y_test_v2  = y_v2[test_row_idx]

# --- D) Models (separate names so the earlier models stay untouched) ---
rf_v2 = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42,
)

# XGBoost needs integer labels; the other models use the string labels.
le_v2 = LabelEncoder()
y_enc_v2 = le_v2.fit_transform(y_v2)

# XGBoost uses the same train/test rows as the other V2 models.
X_tr_xgb_v2, X_te_xgb_v2 = X_v2[train_row_idx, :], X_v2[test_row_idx, :]
y_tr_xgb_v2, y_te_xgb_v2 = y_enc_v2[train_row_idx], y_enc_v2[test_row_idx]

xgb_v2 = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
)
xgb_v2.fit(X_tr_xgb_v2, y_tr_xgb_v2)

# Calibrated LinearSVC, fitted from scratch on the V2 features
_base_svm_v2 = LinearSVC(C=1.0, class_weight='balanced', random_state=42)
svm_v2 = CalibratedClassifierCV(_base_svm_v2, cv=5)
svm_v2.fit(X_train_v2, y_train_v2)

# Quick individual evaluations on the held-out rows
rf_v2.fit(X_train_v2, y_train_v2)
print("\n[V2] RF accuracy:", rf_v2.score(X_test_v2, y_test_v2))

y_pred_xgb_v2 = xgb_v2.predict(X_te_xgb_v2)
acc_xgb_v2 = (y_pred_xgb_v2 == y_te_xgb_v2).mean()
print("\n[V2] XGB accuracy:", acc_xgb_v2)

print("\n[V2] SVM accuracy:", svm_v2.score(X_test_v2, y_test_v2))

# --- E) Stacking ensemble (RF + XGB + SVM -> LogisticRegression) ---
from sklearn.base import BaseEstimator, ClassifierMixin, clone


class XGBProbaWrapper(ClassifierMixin, BaseEstimator):
    """scikit-learn compatible wrapper that lets XGBoost train on any labels.

    StackingClassifier clones every base estimator and requires each one to
    be a classifier that exposes ``classes_`` after fitting. Deriving from
    ClassifierMixin/BaseEstimator provides ``get_params``/``set_params``, the
    classifier tag and an accuracy-based ``score``.

    Args:
        model: Classifier with ``fit``/``predict_proba`` that expects integer
            labels 0..K-1 (an XGBClassifier here).
        label_encoder: Encoder used as a template for mapping labels to
            integers, and as the decoder before ``fit`` has been called.

    ``fit`` trains clones of both objects, so the instances passed in are
    left as they were. Before ``fit`` is called, ``predict``/``predict_proba``
    use the objects passed to the constructor, so a wrapper built around an
    already-fitted model and encoder can predict immediately.
    """

    def __init__(self, model, label_encoder):
        # Stored unchanged under the parameter names, as get_params()/clone()
        # require.
        self.model = model
        self.label_encoder = label_encoder

    @property
    def le(self):
        """Alias for ``label_encoder`` (the attribute name used previously)."""
        return self.label_encoder

    def fit(self, X, y):
        """Fit a fresh encoder and a fresh model on (X, y); return self."""
        # The encoder is re-fitted on the labels actually received: inside a
        # StackingClassifier these are already integer-encoded, which an
        # encoder fitted on the string labels would reject.
        self.label_encoder_ = clone(self.label_encoder).fit(y)
        self.classes_ = self.label_encoder_.classes_
        self.model_ = clone(self.model)
        self.model_.fit(X, self.label_encoder_.transform(y))
        return self

    def _active_parts(self):
        """Return (model, encoder): fitted clones if present, else the originals."""
        return (
            getattr(self, 'model_', self.model),
            getattr(self, 'label_encoder_', self.label_encoder),
        )

    def predict_proba(self, X):
        """Return class probabilities, one column per encoded class."""
        model, _ = self._active_parts()
        return model.predict_proba(X)

    def predict(self, X):
        """Return the most probable label per row, in the original label space."""
        _, encoder = self._active_parts()
        return encoder.inverse_transform(np.argmax(self.predict_proba(X), axis=1))

xgb_wrap_v2 = XGBProbaWrapper(xgb_v2, le_v2)

# Base learners whose predicted probabilities feed the meta-learner.
estimators_v2 = [('rf', rf_v2), ('svm', svm_v2), ('xgb', xgb_wrap_v2)]

stack_v2 = StackingClassifier(
    estimators=estimators_v2,
    final_estimator=LogisticRegression(max_iter=400, class_weight='balanced'),
    passthrough=False,
    stack_method='predict_proba',
    n_jobs=-1,
)
stack_v2.fit(X_train_v2, y_train_v2)

print("\n[V2] Stacked accuracy:", stack_v2.score(X_test_v2, y_test_v2))
print(
    "\n[V2] Stacked classification report:\n",
    classification_report(y_test_v2, stack_v2.predict(X_test_v2)),
)

# --- F) Full-dataset predictions (new columns; the earlier ones remain) ---
df['predicted_season_tfidf_svm'] = svm_v2.predict(X_v2)
df['predicted_season_stack'] = stack_v2.predict(X_v2)

# Optional: compare old vs new on the first 20 rows
cols_to_show = ['title', 'designer', 'top_notes', 'middle_notes', 'base_notes']
cols_to_show += ['combined_notes', 'combined_notes_plus']
cols_to_show += [
    'predicted_season',             # from the original run
    'predicted_season_tfidf_svm',   # calibrated SVM on TF-IDF + extra features
    'predicted_season_stack',       # stacked ensemble
]
display(df[cols_to_show].head(20))


# ===============================
# === MINI PERSONALIZER (2–3 Qs)
# ===============================
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1) The three quick questions. Each entry maps a question key to the prompt
#    shown to the user and the list of accepted answers.
QUESTIONS = {
    "event": {
        "prompt": "Where are you wearing this?",
        "choices": ["Date night", "Family gathering", "School/Work", "Party/Going out"]
    },
    "vibe": {
        "prompt": "Pick a vibe:",
        "choices": ["Sweet/Comforting", "Fresh/Clean", "Woody/Spicy", "Floral/Soft"]
    },
    "strength": {
        "prompt": "Projection preference?",
        "choices": ["Skin-scent", "Moderate", "Strong"]
    }
}

# 2) For every question key and answer: the notes that answer prefers
#    ("notes") and the seasons it nudges towards ("season_bias"). An empty
#    list means the answer expresses no preference of that kind.
ANSWER_TO_NOTES = {
    "event": {
        "Date night":         {"notes": ["Vanilla","Amber","Tonka","Lavender","Cinnamon"], "season_bias": ["Fall","Winter"]},
        "Family gathering":   {"notes": ["Citrus","Bergamot","Grapefruit","Green","Neroli"], "season_bias": ["Spring","Summer"]},
        "School/Work":        {"notes": ["Fresh","Lavender","Tea","Aromatic","Woody"], "season_bias": ["Spring","Fall"]},
        "Party/Going out":    {"notes": ["Sweet","Amber","Oud","Leather","Pineapple"], "season_bias": ["Fall","Winter"]},
    },
    "vibe": {
        "Sweet/Comforting":   {"notes": ["Vanilla","Tonka","Amber","Cinnamon","Cocoa"], "season_bias": ["Fall","Winter"]},
        "Fresh/Clean":        {"notes": ["Citrus","Marine","Mint","Green","Neroli"], "season_bias": ["Spring","Summer"]},
        "Woody/Spicy":        {"notes": ["Sandalwood","Cedar","Cardamom","Pepper","Incense"], "season_bias": ["Fall"]},
        "Floral/Soft":        {"notes": ["Jasmine","Rose","Iris","Peony","Lily"], "season_bias": ["Spring"]},
    },
    "strength": {
        "Skin-scent":         {"notes": ["Musk","Iris","Tea","Soft Floral"], "season_bias": []},
        "Moderate":           {"notes": [], "season_bias": []},
        "Strong":             {"notes": ["Amber","Oud","Leather","Incense"], "season_bias": ["Fall","Winter"]},
    }
}

# 3) Utility: safe lowercase title-cased matching against your canonicalized notes
def _canon_list(lst):
    return [str(x).strip() for x in (lst or []) if isinstance(x, str) and str(x).strip()]

def _notes_string(lst):
    return " ".join(_canon_list(lst))

# 4) Ask questions (interactive) OR pass answers in programmatically
def ask_user(min_questions=2):
    """Prompt on the console and return ``{question_key: chosen_answer}``.

    Asks the first ``max(2, min_questions)`` of the questions "event", "vibe"
    and "strength". A typed answer is matched against the choices ignoring
    case and surrounding whitespace, and the canonical spelling of the choice
    is stored. Anything that matches no choice falls back to the first choice.
    """
    answers = {}
    for k in ["event", "vibe", "strength"][:max(2, min_questions)]:
        ch = QUESTIONS[k]["choices"]
        prompt = f"{QUESTIONS[k]['prompt']} {ch}  -> "
        # Basic console input; can be replaced with a UI later.
        ans = input(prompt).strip()
        # Case-insensitive lookup: previously "strong" failed the exact match
        # and was silently replaced by the first choice.
        by_folded = {choice.casefold(): choice for choice in ch}
        answers[k] = by_folded.get(ans.casefold(), ch[0])
    return answers

# 5) Convert answers into a preference profile (notes + season weights)
def answers_to_profile(answers):
    """Turn questionnaire answers into ``(target_notes, season_weights)``.

    *target_notes* lists the preferred notes of all answers, de-duplicated in
    first-seen order. *season_weights* maps each of the four seasons to its
    share of the season nudges (summing to 1), or to 0.0 everywhere when no
    answer nudges any season. Unknown questions or answers contribute nothing.
    """
    no_preference = {"notes":[], "season_bias":[]}
    season_weights = dict.fromkeys(["Spring","Summer","Fall","Winter"], 0.0)
    target_notes = []

    for q, ans in answers.items():
        m = ANSWER_TO_NOTES.get(q, {}).get(ans, no_preference)
        target_notes.extend(m["notes"])
        for sb in m["season_bias"]:
            season_weights[sb] += 1.0

    # Normalize to shares; with no nudges at all every weight stays 0.0.
    total = sum(season_weights.values())
    if total != 0:
        season_weights = {season: weight / total for season, weight in season_weights.items()}
    else:
        season_weights = {season: 0.0 for season in season_weights}

    unique_notes = []
    for note in target_notes:
        if note not in unique_notes:
            unique_notes.append(note)
    return unique_notes, season_weights

# 6) Rank perfumes by: (A) note similarity to the user's preferences and
#    (B) how well the model-predicted season fits the user's season bias
def recommend_from_answers(answers, top_k=5):
    """Return the *top_k* best-scoring perfumes for the given answers.

    Score = 0.7 * note similarity + 0.3 * season preference.

    * Note similarity is the cosine similarity between the TF-IDF vector of
      the preferred notes and each perfume's TF-IDF row. When there are no
      preferred notes (or no fitted ``tfidf``), Jaccard overlap with
      ``combined_notes_plus`` is used instead.
    * Season preference is the user's weight for the perfume's predicted
      season, or a flat 0.5 for every perfume when the answers carry no
      season bias. The predicted season comes from the first available of
      the stacked, TF-IDF SVM and original prediction columns, with missing
      values filled from ``season``.
    """
    target_notes, season_w = answers_to_profile(answers)

    # (A) similarity between the preferred notes and every perfume
    query_doc = " ".join(target_notes) if target_notes else ""
    if query_doc and 'tfidf' in globals():
        sims = cosine_similarity(tfidf.transform([query_doc]), X_tfidf).ravel()
    else:
        wanted = set(target_notes)

        def _jaccard(notes):
            have = set(notes or [])
            return len(wanted.intersection(have)) / max(1, len(wanted.union(have)))

        sims = df['combined_notes_plus'].apply(_jaccard).to_numpy()

    # (B) season preference per perfume
    model_season = None
    for column in ('predicted_season_stack', 'predicted_season_tfidf_svm', 'predicted_season'):
        if column in df.columns:
            model_season = df[column]
            break
    model_season = model_season.fillna(df['season'])  # fall back to the labelled season

    if any(season_w.values()):
        season_score = [season_w.get(s, 0.0) for s in model_season]
    else:
        # No season bias expressed: the same neutral value for every perfume.
        season_score = [0.5] * len(model_season)
    season_score = np.array(season_score, dtype=float)

    # Simple weighted blend of the two signals
    score = 0.7 * sims + 0.3 * season_score

    ranked = df.assign(
        _score=score,
        _sim=sims,
        _season_pref=season_score,
    ).sort_values("_score", ascending=False)

    wanted_cols = [
        "title","designer","predicted_season","predicted_season_tfidf_svm",
        "predicted_season_stack","top_notes","middle_notes","base_notes","combined_notes_plus","_score"
    ]
    present = [c for c in wanted_cols if c in ranked.columns]
    return ranked[present].head(top_k)

# 7) Example usage:
#    a) Interactive (console in Jupyter/Colab/terminal):
# answers = ask_user(min_questions=3)
# recs = recommend_from_answers(answers, top_k=5)
# display(recs)

#    b) Programmatic (no input() calls): a fixed set of answers
example_answers = dict(
    event="Date night",
    vibe="Sweet/Comforting",
    strength="Moderate",
)
recs = recommend_from_answers(example_answers, top_k=5)
display(recs)
