<a href="https://colab.research.google.com/github/AarishB/FragrAI/blob/main/FragrAIi_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import ast
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display
from collections import Counter
import unicodedata

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# === STEP 1: Load and clean dataset ===
df = pd.read_csv('perfumes_table.csv', nrows=10000)
df = df.drop(columns=['url', 'rating', 'reviews'], errors='ignore')
pd.set_option('display.max_colwidth', None)

# Reorder columns in the order we see please
desired_order = ['title', 'notes', 'description']
remaining_cols = [col for col in df.columns if col not in desired_order]
df = df[desired_order + remaining_cols]

# Capitalize notes
def capitalize(s):
    if not isinstance(s, str): return s
    exceptions = {'and', 'for', 'of', 'the', 'in', 'on', 'at', 'with', 'a', 'an'}
    words = s.split()
    return ' '.join([
        word.capitalize() if i == 0 or word not in exceptions else word
        for i, word in enumerate(words)
    ])
df['designer'] = df['designer'].apply(capitalize)

def strip_accents(s): #turns the notes into unicode to start stripping unneccesary patterns
    return ''.join(c for c in unicodedata.normalize('NFKD', s) if not unicodedata.combining(c))

def canon_note(x): #these two methods help make unusual notes appear readable, used help from ChatGPT to get a regex pattern.
    if not isinstance(x, str): return None
    n = strip_accents(x).lower()
    n = re.sub(r'[^a-z0-9\s]', ' ', n).strip()


   #Takes cares of plurals and phrases of notes
    n = re.sub(r'\bnotes?\b', '', n).strip()
    n = re.sub(r'\bwoody notes?\b', 'woody', n)
    n = re.sub(r'\bgreen notes?\b', 'green', n)
    n = re.sub(r'\bfruity notes?\b', 'fruity', n)
    n = re.sub(r'\bfloral notes?\b', 'floral', n)
    n = re.sub(r'\bspicy notes?\b', 'spicy', n)
    n = re.sub(r'\baromatic notes?\b', 'aromatic', n)
    n = re.sub(r'\bsweet notes?\b', 'sweet', n)
    
    # üî• EXPANDED SYNONYMS - More variations = better matching
    synonyms = {
        'calabrian bergamot':'bergamot','sicilian bergamot':'bergamot','italian bergamot':'bergamot',
        'cedarwood':'cedar','virginia cedar':'cedar','atlas cedar':'cedar',
        'oudh':'oud','agarwood':'oud','oud wood':'oud','aoud':'oud',
        'pink pepper':'pepper','black pepper':'pepper','red pepper':'pepper',
        'white musk':'musk','ambroxan':'amber','ambergris':'amber','grey amber':'amber','amber wood':'amber',
        'sea notes':'marine','marine notes':'marine','watery notes':'marine','aqua':'marine','calone':'marine',
        'aquatic':'marine','ocean':'marine','water':'marine',
        'orange blossom':'neroli','orris root':'iris','cashmeran':'woody',
        'tonka bean':'tonka','tonka beans':'tonka',
        'vanilla absolute':'vanilla','madagascar vanilla':'vanilla','tahitian vanilla':'vanilla',
        'sandalwood':'sandalwood','mysore sandalwood':'sandalwood',
        'patchouli':'patchouli','dark patchouli':'patchouli',
        'leather accord':'leather','suede':'leather',
        'tobacco':'tobacco','tobacco leaf':'tobacco',
        'incense':'incense','frankincense':'incense',
        'jasmine sambac':'jasmine','jasmine absolute':'jasmine',
        'rose absolute':'rose','damascus rose':'rose','turkish rose':'rose',
        'lavender':'lavender','french lavender':'lavender',
        'grapefruit':'grapefruit','pink grapefruit':'grapefruit',
        'mandarin':'mandarin','tangerine':'mandarin',
        'lemon':'lemon','sicilian lemon':'lemon',
        'lime':'lime','persian lime':'lime',
        'mint':'mint','peppermint':'mint','spearmint':'mint',
        'eucalyptus':'eucalyptus','eucalyptol':'eucalyptus',
        'ginger':'ginger','fresh ginger':'ginger',
        'cardamom':'cardamom','green cardamom':'cardamom',
        'cinnamon':'cinnamon','ceylon cinnamon':'cinnamon',
        'clove':'clove','clove bud':'clove',
        'nutmeg':'nutmeg','nutmeg oil':'nutmeg',
        'saffron':'saffron','spanish saffron':'saffron',
        'cocoa':'cocoa','chocolate':'cocoa','dark chocolate':'cocoa',
        'coffee':'coffee','espresso':'coffee',
        'honey':'honey','beeswax':'honey',
        'caramel':'caramel','burnt caramel':'caramel',
        'praline':'praline','hazelnut':'praline',
        'vetiver':'vetiver','haitian vetiver':'vetiver',
        'myrrh':'myrrh','sweet myrrh':'myrrh',
        'fig':'fig','fig leaf':'fig',
        'violet':'violet','violet leaf':'violet',
        'iris':'iris','orris':'iris',
        'peony':'peony','pink peony':'peony',
        'magnolia':'magnolia','white magnolia':'magnolia',
        'lily':'lily','water lily':'lily','lily of the valley':'lily',
        'tea':'tea','green tea':'tea','black tea':'tea',
        'sage':'sage','clary sage':'sage',
        'thyme':'thyme','wild thyme':'thyme',
        'rosemary':'rosemary','fresh rosemary':'rosemary',
        'basil':'basil','sweet basil':'basil',
        'geranium':'geranium','rose geranium':'geranium',
        'ylang ylang':'ylang ylang','cananga':'ylang ylang',
        'neroli':'neroli','orange flower':'neroli',
        'petitgrain':'petitgrain','bitter orange':'petitgrain',
        'bergamot':'bergamot','earl grey':'bergamot',
        'orange':'orange','sweet orange':'orange','blood orange':'orange',
        'apple':'apple','green apple':'apple','red apple':'apple',
        'pear':'pear','asian pear':'pear',
        'peach':'peach','white peach':'peach',
        'plum':'plum','purple plum':'plum',
        'cherry':'cherry','black cherry':'cherry',
        'raspberry':'raspberry','red raspberry':'raspberry',
        'strawberry':'strawberry','wild strawberry':'strawberry',
        'blackcurrant':'blackcurrant','cassis':'blackcurrant',
        'blackberry':'blackberry','wild blackberry':'blackberry',
        'pineapple':'pineapple','fresh pineapple':'pineapple',
        'mango':'mango','alphonso mango':'mango',
        'papaya':'papaya','tropical papaya':'papaya',
        'coconut':'coconut','coconut milk':'coconut',
        'melon':'melon','watermelon':'melon','cantaloupe':'melon',
        'bamboo':'bamboo','bamboo leaf':'bamboo',
        'moss':'moss','oakmoss':'moss','tree moss':'moss',
        'lichen':'lichen','forest lichen':'lichen',
        'pine':'pine','pine needle':'pine','scots pine':'pine',
        'fir':'fir','balsam fir':'fir','silver fir':'fir',
        'cypress':'cypress','italian cypress':'cypress',
        'juniper':'juniper','juniper berry':'juniper',
        'birch':'birch','birch tar':'birch',
        'oakwood':'oakwood','oak':'oakwood',
        'ebony':'ebony','ebony wood':'ebony',
        'guaiac wood':'guaiac','guaiacwood':'guaiac',
        'papyrus':'papyrus','egyptian papyrus':'papyrus',
        'leather':'leather','russian leather':'leather','spanish leather':'leather',
        'animalic':'animalic','animalistic':'animalic',
        'castoreum':'castoreum','castor':'castoreum',
        'civet':'civet','civet cat':'civet',
        'musk':'musk','deer musk':'musk','white musk':'musk','black musk':'musk',
        'ambergris':'amber','grey amber':'amber','gray amber':'amber',
        'labdanum':'labdanum','cistus':'labdanum',
        'benzoin':'benzoin','styrax':'benzoin',
        'balsam':'balsam','peru balsam':'balsam','tolu balsam':'balsam',
        'resin':'resin','tree resin':'resin',
        'elemi':'elemi','manila elemi':'elemi',
        'copal':'copal','white copal':'copal',
        'olibanum':'incense','frankincense':'incense',
        'myrrh':'myrrh','somalian myrrh':'myrrh',
        'opoponax':'opoponax','sweet myrrh':'opoponax'
    }
    n = synonyms.get(n, n)
    return ' '.join(w.capitalize() for w in n.split())

def _extract_layer(text, layer_aliases, stop_regex):
    """
    Extract a comma/semicolon/pipe separated list of notes after a layer alias.
    Adds more natural-language boundaries so we don't stop too late/too early.
    """
    if not isinstance(text, str):
        return []
    s = ' '.join(text.split())  # flatten whitespace

    # Add richer "stop" phrases that often separate layers
    extra_stops = r"|".join([
        r"\bfollowed by\b",
        r"\btransition(?:ing)? into\b",
        r"\bsettling (?:down|into)\b",
        r"\bsupported by\b",
        r"\bgrounded in\b",
        r"\banchored by\b",
        r"\bon (?:a|the) (?:bed|base) of\b",
        r"\bthen\b",
    ])

    # Merge with provided stop regex
    full_stop = rf"(?:{stop_regex}|{extra_stops}|\.|$)"

    pat = rf"\b({'|'.join(layer_aliases)})\s*(?:note|notes)?\s*(?:are|include|:)?\s*(.*?)(?={full_stop})"
    m = re.search(pat, s, flags=re.IGNORECASE)
    if not m:
        return []
    chunk = m.group(2)
    toks = re.split(r'[;,/|]', chunk)
    return [canon_note(t) for t in toks if canon_note(t)]

def extract_notes(desc, note_type):
    aliases = {
        'Top': ['top','opening','head'],
        'Middle': ['middle','heart', 'core'],
        'Base': ['base','drydown', 'foundation']
    }
    if note_type == 'Top':
        stop = r"\b(?:middle|heart|base|drydown)\b"
    elif note_type == 'Middle':
        stop = r"\b(?:base|drydown)\b"
    else:
        stop = r"$^"

    return _extract_layer(desc, aliases[note_type], stop)

# Re-run extraction with the new regex pattern
df['top_notes']    = df['description'].apply(lambda x: extract_notes(x, "Top"))
df['middle_notes'] = df['description'].apply(lambda x: extract_notes(x, "Middle"))
df['base_notes']   = df['description'].apply(lambda x: extract_notes(x, "Base"))

if 'notes' in df.columns:
    def fallback(row):
        if row['top_notes'] or row['middle_notes'] or row['base_notes']:
            return row
        raw = row['notes']
        toks = []
        if isinstance(raw, str):
            toks = [canon_note(t) for t in re.split(r'[;,/|]', raw) if canon_note(t)]
        elif isinstance(raw, (list, tuple)):
            toks = [canon_note(t) for t in raw if canon_note(t)]

        if toks:
            k = len(toks)
            if k >= 3:
                t1 = toks[:max(1, k//3)]
                t2 = toks[max(1, k//3):max(2, 2*k//3)]
                t3 = toks[max(2, 2*k//3):]
            elif k == 2:
                t1, t2, t3 = [toks[0]], [], [toks[1]]
            else:  # k == 1
                t1, t2, t3 = [toks[0]], [], []
            row['top_notes'], row['middle_notes'], row['base_notes'] = t1, t2, t3
        return row
    df = df.apply(fallback, axis=1)

# === STEP 3: Create combined notes with weight for base notes ===
for col in ['top_notes', 'middle_notes', 'base_notes']:
    df[col] = df[col].apply(lambda v: v if isinstance(v, list) else ([] if pd.isna(v) or v is None else [v] if isinstance(v, str) else []))
df['combined_notes'] = df.apply(lambda row: row['top_notes'] + row['middle_notes'] + row['base_notes'] * 2, axis=1)

# === STEP 4: Assign seasons for supervised learning ===
seasonal_notes = {
    'Summer': ['Citrus', 'Aquatic', 'Mint', 'Neroli', 'Grapefruit', 'Orange', 'Lemon', 'Bergamot', 'Melon', 'Pineapple', 'Coconut'],
    'Winter': ['Vanilla', 'Amber', 'Oud', 'Leather', 'Tobacco', 'Cinnamon', 'Tonka','Incense','Coffee'],
    'Spring': ['Jasmine', 'Rose', 'Green Tea', 'Lily', 'Pear', 'Apple', 'Lavender', 'Magnolia'],
    'Fall': ['Sandalwood', 'Patchouli', 'Nutmeg', 'Cardamom', 'Clove', 'Plum', 'Lavender', 'Cinnamon']
}

def assign_weighted_season_from_layers(top, mid, base):
    season_scores = {season: 0 for season in seasonal_notes}
    total_notes = len(top) + len(mid) + len(base)

    if total_notes == 0:
        return 'Inconclusive'

    top_wt = max(1, round((len(top)/total_notes) * 10))
    mid_wt = max(1, round((len(mid)/total_notes) * 10))
    base_wt = max(1, round((len(base)/total_notes) * 10))

    for note in top:
        for season, group in seasonal_notes.items():
            if note in group:
                season_scores[season] += (top_wt*2)
    for note in mid:
        for season, group in seasonal_notes.items():
            if note in group:
                season_scores[season] += (mid_wt*1.5)
    for note in base:
        for season, group in seasonal_notes.items():
            if note in group:
                season_scores[season] += (base_wt*2.5)

    return max(season_scores, key=season_scores.get) if any(season_scores.values()) else 'Inconclusive'

df['season'] = df.apply(
    lambda row: assign_weighted_season_from_layers(row['top_notes'], row['middle_notes'], row['base_notes']),
    axis=1
)
df = df[df['season'] != 'Inconclusive']
df = df.reset_index(drop=True)

# === STEP 5: Train ML model ===

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
mlb = MultiLabelBinarizer()

# üî• CRITICAL FIX: Create combined_notes_plus BEFORE training
# (This is created in V2 pipeline but needed here for consistency)
families_vocab = [
    'floral','amber','citrus','woody','spicy','aromatic','fresh','oriental',
    'green','fruity','gourmand','leather','aquatic','powdery','smoky','earthy'
]
def extract_families(desc):
    if not isinstance(desc, str): return []
    s = desc.lower()
    return [w.capitalize() for w in families_vocab if w in s]

df['families'] = df['description'].apply(extract_families)
df['combined_notes_plus'] = df['combined_notes'] + df['families']

# üî• FIX: Use combined_notes_plus for training (includes olfactory families)
X = mlb.fit_transform(df['combined_notes_plus'])
y = df['season']

X_bal, y_bal = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    objective="multi:softprob",
    eval_metric="mlogloss"
)
xgb.fit(X_train_xgb, y_train_xgb)
y_pred_xgb = le.inverse_transform(xgb.predict(X_test_xgb))

# LinearSVC -> Can help reduce inconvlusives
_base_svm = LinearSVC(C=1.0, class_weight='balanced', random_state=42)
svm = CalibratedClassifierCV(_base_svm, cv=5)
svm.fit(X, y)

# === STEP 6: Predict on full dataset ===
df['predicted_season_svm_raw'] = svm.predict(X)
df['predicted_season'] = df["predicted_season_svm_raw"]

# Restrict the SVM to choose a season
REAL_SEASONS = ['Spring', 'Summer', 'Fall', 'Winter']

def restrict_to_real_seasons(model, X, real_labels=REAL_SEASONS):
    classes = model.classes_
    proba = model.predict_proba(X)
    keep_idx = [i for i, c in enumerate(classes) if c in real_labels]
    allowed_classes = classes[keep_idx]
    allowed_proba = proba[:, keep_idx]
    best_idx = allowed_proba.argmax(axis=1)
    return allowed_classes[best_idx]

df['predicted_season_svm'] = restrict_to_real_seasons(svm, X, REAL_SEASONS)
df['predicted_season_rf_no_inc'] = restrict_to_real_seasons(clf, X, REAL_SEASONS)

# === STEP 7: Ensemble (silently) ===
p_svm = svm.predict_proba(X_test)
p_rf = clf.predict_proba(X_test)
p_ens = (0.6 * p_svm + 0.4 * p_rf)
y_pred = [svm.classes_[i] for i in p_ens.argmax(axis=1)]

print("‚úì Model training complete! SVM and ensemble models ready.")


# ======================================================================
# =========================
# === ADD-ON: V2 PIPELINE ===
# =========================
# ======================================================================

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# 1) Build designer bias (leak-free)
df['_row_id'] = np.arange(len(df))
train_ids, test_ids = train_test_split(
    df['_row_id'],
    test_size=0.2,
    random_state=42,
    stratify=df['season']
)

train_mask = np.zeros(len(df), dtype=bool)
train_mask[train_ids] = True

designer_mode_train = (
    df.loc[train_mask].groupby('designer')['season']
      .agg(lambda x: Counter(x).most_common(1)[0][0])
      .to_dict()
)

df['designer_bias'] = df['designer'].map(designer_mode_train).fillna('Unknown')

# 2) Warm/Cool ratios and layer counts
warm_notes = {'Amber','Vanilla','Oud','Leather','Tobacco','Cinnamon','Sandalwood','Tonka','Incense','Coffee','Myrrh'}
cool_notes = {'Citrus','Mint','Marine','Green','Lavender','Grapefruit','Bergamot','Neroli','Aqua','Calone','Melon'}

def ratio_counts(notes, pos_set):
    n = len(notes) if isinstance(notes, list) else 0
    if n == 0: return 0.0
    return sum(1 for t in notes if t in pos_set) / n

df['warm_ratio'] = df['combined_notes_plus'].apply(lambda n: ratio_counts(n, warm_notes))
df['cool_ratio'] = df['combined_notes_plus'].apply(lambda n: ratio_counts(n, cool_notes))
df['top_count'] = df['top_notes'].apply(lambda x: len(x) if isinstance(x, list) else 0)
df['mid_count'] = df['middle_notes'].apply(lambda x: len(x) if isinstance(x, list) else 0)
df['base_count'] = df['base_notes'].apply(lambda x: len(x) if isinstance(x, list) else 0)
df['note_count'] = df['combined_notes_plus'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# 3) üî• FIX: Build TF-IDF on combined_notes_plus (not combined_notes)
df['note_string_plus'] = df['combined_notes_plus'].apply(lambda lst: ' '.join(lst))
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_features=5000)
X_tfidf = tfidf.fit_transform(df['note_string_plus'])

# 4) Categorical and numeric features
ohe = OneHotEncoder(handle_unknown='ignore')
X_cat = ohe.fit_transform(df[['designer','designer_bias']])

num_cols = ['warm_ratio','cool_ratio','top_count','mid_count','base_count','note_count']
scaler = StandardScaler(with_mean=False)
X_num = scaler.fit_transform(df[num_cols])

# 5) Combine all features
X_v2 = sp.hstack([X_tfidf, X_cat, X_num], format='csr')
y_v2 = df['season'].values

# 6) Train/test split
train_row_idx = df.index[train_mask].to_numpy()
test_row_idx  = df.index[~train_mask].to_numpy()

X_train_v2 = X_v2[train_row_idx, :]
X_test_v2  = X_v2[test_row_idx, :]
y_train_v2 = y_v2[train_row_idx]
y_test_v2  = y_v2[test_row_idx]

# 7) Train V2 models (silently)
rf_v2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
)
rf_v2.fit(X_train_v2, y_train_v2)

le_v2 = LabelEncoder()
y_enc_v2 = le_v2.fit_transform(y_v2)

X_tr_xgb_v2 = X_v2[train_row_idx, :]
X_te_xgb_v2 = X_v2[test_row_idx, :]
y_tr_xgb_v2 = y_enc_v2[train_row_idx]
y_te_xgb_v2 = y_enc_v2[test_row_idx]

xgb_v2 = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42
)
xgb_v2.fit(X_tr_xgb_v2, y_tr_xgb_v2)

_base_svm_v2 = LinearSVC(C=1.0, class_weight='balanced', random_state=42)
svm_v2 = CalibratedClassifierCV(_base_svm_v2, cv=5)
svm_v2.fit(X_train_v2, y_train_v2)

# 8) Generate predictions
df['predicted_season_tfidf_svm'] = svm_v2.predict(X_v2)

print("‚úì V2 models training complete!")
print(f"   - TF-IDF features: {X_tfidf.shape[1]} dimensions")
print(f"   - Total features: {X_v2.shape[1]} dimensions")
print(f"   - Notes vocabulary size: {len(mlb.classes_)} unique notes")
print(f"   - Combined_notes_plus includes {len(families_vocab)} olfactory families")


‚úì Model training complete! SVM and ensemble models ready.




‚úì V2 models training complete!
   - TF-IDF features: 4720 dimensions
   - Total features: 6791 dimensions
   - Notes vocabulary size: 6285 unique notes
   - Combined_notes_plus includes 16 olfactory families




In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from IPython.display import display
import pandas as pd

QUESTIONS = {
    "gender": {
        "prompt": "What gender describes you best?",
        "type": "multiple_choice",
        "choices": ["Male", "Female", "Unisex/Any"]
    },
    
    # EXISTING RESEARCH-BASED QUESTIONS
    "personality": {
        "prompt": "How would you describe your personality?",
        "type": "multiple_choice",
        "choices": [
            "Bold & Adventurous (confident, outgoing, takes charge)",
            "Calm & Thoughtful (organized, reliable, introspective)", 
            "Energetic & Social (enthusiastic, expressive, loves attention)",
            "Creative & Open-minded (curious, artistic, unconventional)"
        ]
    },
    
    "scent_family": {
        "prompt": "Which scent profile appeals to you most?",
        "type": "multiple_choice",
        "choices": [
            "Warm & Rich (vanilla, amber, tonka, chocolate)",
            "Fresh & Clean (citrus, aquatic, mint, green notes)",
            "Deep & Intense (oud, leather, tobacco, incense)",
            "Soft & Floral (jasmine, rose, lavender, lily)"
        ]
    },
    
    "event": {
        "prompt": "Where are you wearing this?",
        "type": "multiple_choice",
        "choices": ["Date night", "Family gathering", "School/Work", "Party/Going out"]
    },
    
    "vibe": {
        "prompt": "Pick a vibe:",
        "type": "multiple_choice",
        "choices": ["Sweet/Comforting", "Fresh/Clean", "Woody/Spicy", "Floral/Soft"]
    },
    
    "strength": {
        "prompt": "Projection preference?",
        "type": "multiple_choice",
        "choices": ["Skin-scent", "Moderate", "Strong"]
    },
    
    #EveryHuman coming in clutch rn
    
    "energy_level": {
        "prompt": "How would you describe your energy?",
        "type": "slider",
        "left_label": "Calm & Relaxed",
        "right_label": "Energetic & Vibrant",
        "left_notes": ["Lavender", "Sandalwood", "Vanilla", "Musk", "Chamomile", "Tea"],
        "right_notes": ["Citrus", "Bergamot", "Ginger", "Mint", "Grapefruit", "Mandarin"],
        "left_season": ["Fall", "Winter"],
        "right_season": ["Spring", "Summer"]
    },
    
    "sophistication": {
        "prompt": "Your style preference:",
        "type": "slider",
        "left_label": "Classic & Timeless",
        "right_label": "Bold & Unconventional",
        "left_notes": ["Rose", "Iris", "Amber", "Cedar", "Sandalwood", "Neroli"],
        "right_notes": ["Oud", "Leather", "Tobacco", "Incense", "Patchouli", "Pepper"],
        "left_season": ["Spring", "Fall"],
        "right_season": ["Fall", "Winter"]
    },
    
    "mood_preference": {
        "prompt": "What mood do you want to project?",
        "type": "slider",
        "left_label": "Warm & Comforting",
        "right_label": "Fresh & Invigorating",
        "left_notes": ["Vanilla", "Tonka", "Amber", "Cinnamon", "Cocoa", "Caramel", "Honey"],
        "right_notes": ["Marine", "Aqua", "Mint", "Green", "Lime", "Eucalyptus", "Ozone"],
        "left_season": ["Fall", "Winter"],
        "right_season": ["Spring", "Summer"]
    },
    
    "complexity": {
        "prompt": "Fragrance complexity:",
        "type": "slider",
        "left_label": "Simple & Clean",
        "right_label": "Complex & Layered",
        "left_notes": ["Lemon", "Bergamot", "Marine", "Green", "Cotton"],
        "right_notes": ["Oud", "Incense", "Myrrh", "Cardamom", "Saffron", "Jasmine", "Patchouli"],
        "left_season": ["Spring", "Summer"],
        "right_season": ["Fall", "Winter"]
    }
}

# === ENHANCED NOTE MAPPINGS ===
ANSWER_TO_NOTES = {
    "personality": {
        "Bold & Adventurous (confident, outgoing, takes charge)": {
            "notes": ["Oud","Leather","Cedar","Pepper","Tobacco","Vetiver","Bergamot","Ginger"],
            "season_bias": ["Fall","Winter"]
        },
        "Calm & Thoughtful (organized, reliable, introspective)": {
            "notes": ["Lavender","Tea","Iris","Musk","Sandalwood","Sage","Neroli"],
            "season_bias": ["Spring","Fall"]
        },
        "Energetic & Social (enthusiastic, expressive, loves attention)": {
            "notes": ["Citrus","Bergamot","Grapefruit","Pineapple","Ginger","Pepper","Marine"],
            "season_bias": ["Spring","Summer"]
        },
        "Creative & Open-minded (curious, artistic, unconventional)": {
            "notes": ["Jasmine","Cardamom","Incense","Patchouli","Violet","Fig","Iris"],
            "season_bias": ["Spring","Fall"]
        }
    },
    
    "scent_family": {
        "Warm & Rich (vanilla, amber, tonka, chocolate)": {
            "notes": ["Vanilla","Amber","Tonka","Cinnamon","Cocoa","Praline","Caramel","Honey"],
            "season_bias": ["Fall","Winter"]
        },
        "Fresh & Clean (citrus, aquatic, mint, green notes)": {
            "notes": ["Bergamot","Lemon","Lime","Grapefruit","Marine","Mint","Green","Neroli","Aqua"],
            "season_bias": ["Spring","Summer"]
        },
        "Deep & Intense (oud, leather, tobacco, incense)": {
            "notes": ["Oud","Leather","Tobacco","Incense","Myrrh","Vetiver","Patchouli","Cedar"],
            "season_bias": ["Fall","Winter"]
        },
        "Soft & Floral (jasmine, rose, lavender, lily)": {
            "notes": ["Jasmine","Rose","Lavender","Lily","Peony","Magnolia","Violet","Iris"],
            "season_bias": ["Spring","Summer"]
        }
    },
    
    "event": {
        "Date night": {
            "notes": ["Vanilla","Amber","Tonka","Lavender","Cinnamon", "Oud"],
            "season_bias": ["Fall","Winter"]
        },
        "Family gathering": {
            "notes": ["Citrus","Bergamot","Grapefruit","Green","Neroli"],
            "season_bias": ["Spring","Summer"]
        },
        "School/Work": {
            "notes": ["Fresh","Lavender","Tea","Aromatic","Woody"],
            "season_bias": ["Spring","Fall"]
        },
        "Party/Going out": {
            "notes": ["Sweet","Amber","Oud","Leather","Pineapple"],
            "season_bias": ["Fall","Winter"]
        }
    },
    
    "vibe": {
        "Sweet/Comforting": {
            "notes": ["Vanilla","Tonka","Amber","Cinnamon","Cocoa","Lavender"],
            "season_bias": ["Fall","Winter"]
        },
        "Fresh/Clean": {
            "notes": ["Citrus","Marine","Mint","Green","Neroli","Aqua"],
            "season_bias": ["Spring","Summer"]
        },
        "Woody/Spicy": {
            "notes": ["Sandalwood","Cedar","Cardamom","Pepper","Incense", "Oud"],
            "season_bias": ["Fall", "Winter"]
        },
        "Floral/Soft": {
            "notes": ["Jasmine","Rose","Iris","Peony","Lily"],
            "season_bias": ["Spring", "Winter"]
        }
    },
    
    "strength": {
        "Skin-scent": {
            "notes": [],
            "season_bias": []
        },
        "Moderate": {
            "notes": [],
            "season_bias": []
        },
        "Strong": {
            "notes": ["Amber","Oud","Leather","Incense"],
            "season_bias": ["Fall","Winter"]
        }
    }
}

GENDER_KEYWORDS = {
    "Male": [
        "men", "man", "homme", "male", "masculin", 
        "pour homme", "for men", "for him", "men's",
        "gentlemen", "gentleman", "hombre",
        "sauvage", "invictus", "eros", "acqua di gio",
        "bleu de", "versace man", "prada l'homme",
        "aventus", "spicebomb", "le male", "jazz",
        "1 million", "sport", "intense", "black",
        "gentleman only", "wanted", "polo", "boss bottled"
    ],
    "Female": [
        "women", "woman", "femme", "female", "feminine",
        "pour femme", "for women", "for her", "women's",
        "lady", "ladies", "mademoiselle", "mujer", "donna",
        "coco", "miss dior", "flowerbomb", "black opium",
        "la vie est belle", "good girl", "si rose", "angel",
        "olympea", "scandal", "idole", "bright crystal",
        "si passione", "alien", "crystal noir", "hypnotic poison",
        "j'adore", "flora", "gucci bloom", "valentina",
        "bombshell", "chanel no", "mon paris", "narciso",
        "crystal", "her", "because it's you"
    ],
    "Unisex": [
        "unisex", "ungendered", "for all", "everyone",
        "neutral", "non-binary"
    ]
}


def infer_gender_from_row(row):
    """Infer gender from perfume title/description."""
    text_to_search = ""
    
    if 'title' in row.index and isinstance(row['title'], str):
        text_to_search += " " + row['title'].lower()
    
    if 'description' in row.index and isinstance(row['description'], str):
        text_to_search += " " + row['description'].lower()[:200]
    
    for keyword in GENDER_KEYWORDS["Unisex"]:
        if keyword in text_to_search:
            return "Unisex"
    
    male_matches = sum(1 for kw in GENDER_KEYWORDS["Male"] if kw in text_to_search)
    female_matches = sum(1 for kw in GENDER_KEYWORDS["Female"] if kw in text_to_search)
    
    if female_matches > male_matches:
        return "Female"
    elif male_matches > female_matches:
        return "Male"
    
    if any(word in text_to_search for word in ["floral", "rose", "peony", "jasmine", "lily"]) and male_matches == 0:
        return "Female"
    
    if any(word in text_to_search for word in ["woody", "leather", "tobacco", "oud"]) and female_matches == 0:
        return "Male"
    
    return "Unisex"


def slider_value_to_notes(question_key, slider_value, question_data):
    """
    Convert slider value (0-100) to weighted notes blend.
    """
    left_notes = question_data.get("left_notes", [])
    right_notes = question_data.get("right_notes", [])
    left_season = question_data.get("left_season", [])
    right_season = question_data.get("right_season", [])
    
    right_weight = slider_value / 100.0
    left_weight = 1.0 - right_weight
    
    total_notes_to_pick = 6
    num_right = int(round(right_weight * total_notes_to_pick))
    num_left = total_notes_to_pick - num_right
    
    selected_notes = []
    if num_left > 0 and left_notes:
        selected_notes += left_notes[:min(num_left, len(left_notes))]
    if num_right > 0 and right_notes:
        selected_notes += right_notes[:min(num_right, len(right_notes))]
    
    season_bias = []
    if left_weight > 0.3 and left_season:
        season_bias += left_season
    if right_weight > 0.3 and right_season:
        season_bias += right_season
    
    return {
        "notes": selected_notes,
        "season_bias": season_bias
    }


def answers_to_profile(answers, canon_note_func=None):
    """
    Convert user answers (multiple choice + sliders) into target notes and season weights.
    üî• FIX: Now properly canonicalizes notes if function is provided
    """
    target_notes = []
    season_weights = {s: 0.0 for s in ["Spring","Summer","Fall","Winter"]}

    for q, ans in answers.items():
        if q == "gender":
            continue
        
        q_data = QUESTIONS.get(q, {})
        q_type = q_data.get("type", "multiple_choice")
        
        if q_type == "slider":
            slider_value = ans
            result = slider_value_to_notes(q, slider_value, q_data)
            target_notes += result["notes"]
            for sb in result["season_bias"]:
                season_weights[sb] += 1.0
        else:
            mapping = ANSWER_TO_NOTES.get(q, {}).get(ans, {"notes":[], "season_bias":[]})
            target_notes += mapping["notes"]
            for sb in mapping["season_bias"]:
                season_weights[sb] += 1.0

    # üî• FIX: Apply canonicalization PROPERLY
    if canon_note_func:
        target_notes = [canon_note_func(n) for n in target_notes]
        target_notes = [n for n in target_notes if n]  # Remove None values
    
    # Remove duplicates while preserving order
    target_notes = list(dict.fromkeys(target_notes))

    # Normalize season weights
    total = sum(season_weights.values())
    if total > 0:
        season_weights = {k: v/total for k,v in season_weights.items()}
    
    return target_notes, season_weights


def predict_user_season_with_svm(target_notes, svm_model, mlb):
    """Use the trained SVM model to predict user's preferred season."""
    if not target_notes:
        return None, {}
    
    user_notes_encoded = mlb.transform([target_notes])
    predicted_season = svm_model.predict(user_notes_encoded)[0]
    
    probabilities = {}
    if hasattr(svm_model, 'predict_proba'):
        proba = svm_model.predict_proba(user_notes_encoded)[0]
        class_labels = svm_model.classes_
        probabilities = {label: prob for label, prob in zip(class_labels, proba)}
    
    return predicted_season, probabilities


def ask_user_questions():
    """Ask all 10 questions (6 multiple choice + 4 sliders)."""
    print("\n" + "="*60)
    print("FragrAI : Your next signature scent, picked before you spray")
    print("="*60)
    
    answers = {}
    
    question_order = [
        "gender",
        "personality", 
        "scent_family",
        "energy_level",      
        "sophistication",   
        "event",
        "vibe",
        "mood_preference",  
        "complexity",        
        "strength"
    ]
    
    for q_key in question_order:
        q_data = QUESTIONS[q_key]
        q_type = q_data.get("type", "multiple_choice")
        
        if q_type == "slider":
            print(f"\n {q_data['prompt']}")
            print(f"   ‚Üê {q_data['left_label']}  |  {q_data['right_label']} ‚Üí")
            print(f"   Enter a value from 0-100 (0 = left, 100 = right, 50 = balanced):")
            
            while True:
                ans = input(f"Your answer (0-100): ").strip()
                try:
                    value = int(ans)
                    if 0 <= value <= 100:
                        answers[q_key] = value
                        break
                except:
                    pass
                print("‚ùå dude try again, enter a number between 0-100")
        else:
            choices = q_data["choices"]
            print(f"\n {q_data['prompt']}")
            for i, choice in enumerate(choices, 1):
                print(f"   {i}. {choice}")
            
            while True:
                ans = input(f"Enter number (1-{len(choices)}): ").strip()
                try:
                    idx = int(ans) - 1
                    if 0 <= idx < len(choices):
                        answers[q_key] = choices[idx]
                        break
                except:
                    pass
                print("‚ùå dude try again")
    
    return answers


def recommend_from_answers(
    answers, 
    df,
    svm_model,
    mlb,
    top_k=3,
    canon_note_func=None,
    tfidf_vectorizer=None,
    X_tfidf=None,
    min_similarity=0.10,
    use_svm_season=True,
    debug=False
):
    """
    Recommend perfumes with SLIDER-ENHANCED note matching.
    üî• FIX: Now uses combined_notes_plus and properly canonicalizes notes
    """
    
    # Convert answers to profile (with canonicalization!)
    target_notes, season_w = answers_to_profile(answers, canon_note_func)
    user_gender = answers.get('gender', 'Unisex/Any')
    
    print(f"\n" + "="*60)
    print("üéØ YOUR ENHANCED PREFERENCE PROFILE:")
    print("="*60)
    print(f"   Gender: {user_gender}")
    print(f"   Target Notes ({len(target_notes)}): {', '.join(target_notes) if target_notes else 'None specified'}")
    
    # === USE SVM TO PREDICT SEASON ===
    svm_predicted_season = None
    season_probabilities = {}
    
    if use_svm_season and target_notes:
        print(f"\n Predicting season...")
        svm_predicted_season, season_probabilities = predict_user_season_with_svm(
            target_notes, svm_model, mlb
        )
        
        print(f"   ‚ú® You might prefer: {svm_predicted_season}")
        
        if season_probabilities:
            print(f"   üìä Season probabilities:")
            for season in ['Spring', 'Summer', 'Fall', 'Winter']:
                if season in season_probabilities:
                    prob = season_probabilities[season] * 100
                    bar = '‚ñà' * int(prob / 5)
                    print(f"      {season:8s}: {prob:5.1f}% {bar}")
    
    # === GENDER FILTERING ===
    if 'gender' not in df.columns:
        print("\n   üîç Inferring perfume genders...")
        df['gender'] = df.apply(infer_gender_from_row, axis=1)
    
    df_working = df.copy()
    
    if user_gender in ["Male", "Female"]:
        print(f"\n   üöπüö∫ Filtering to {user_gender} and Unisex fragrances...")
        df_working = df_working[
            (df_working['gender'] == user_gender) | 
            (df_working['gender'] == 'Unisex')
        ]
        print(f"   üìä Found {len(df_working)} {user_gender}/Unisex fragrances")
        
        if len(df_working) == 0:
            print(f"   ‚ö†Ô∏è  No {user_gender} fragrances found, showing all...")
            df_working = df.copy()
    
    # === SEASON FILTERING ===
    season_col = None
    for col in ['predicted_season_tfidf_svm', 'predicted_season_svm', 'predicted_season', 'season']:
        if col in df_working.columns:
            season_col = col
            break
    
    if season_col is None:
        raise ValueError("... season not found...")
    
    if svm_predicted_season and season_probabilities:
        sorted_seasons = sorted(season_probabilities.items(), key=lambda x: x[1], reverse=True)
        top_seasons = [s for s, p in sorted_seasons[:2] if p > 0.15]
        
        if top_seasons:
            print(f"\n   üîç Season filtering: {', '.join(top_seasons)}")
            season_filtered = df_working[df_working[season_col].isin(top_seasons)]
            
            if len(season_filtered) > 0:
                df_working = season_filtered
                print(f"    {len(df_working)} fragrances match predicted season(s)")
    
    # === ENHANCED NOTE SIMILARITY ===
    if target_notes:
        query_doc = " ".join(target_notes)
        
        if tfidf_vectorizer is not None and X_tfidf is not None:
            try:
                q_vec = tfidf_vectorizer.transform([query_doc])
                working_indices = df_working.index
                X_tfidf_working = X_tfidf[working_indices]
                sims = cosine_similarity(q_vec, X_tfidf_working).ravel()
            except Exception as e:
                if debug:
                    print(f"   ‚ö†Ô∏è  TF-IDF failed, using Jaccard: {str(e)}")
                # üî• FIX: Use combined_notes_plus (not combined_notes)
                target_set = set(target_notes)
                notes_col = 'combined_notes_plus' if 'combined_notes_plus' in df_working.columns else 'combined_notes'
                sims = df_working[notes_col].apply(
                    lambda n: len(target_set.intersection(set(n or []))) / max(1, len(target_set.union(set(n or []))))
                ).to_numpy()
        else:
            # üî• FIX: Use combined_notes_plus
            target_set = set(target_notes)
            notes_col = 'combined_notes_plus' if 'combined_notes_plus' in df_working.columns else 'combined_notes'
            sims = df_working[notes_col].apply(
                lambda n: len(target_set.intersection(set(n or []))) / max(1, len(target_set.union(set(n or []))))
            ).to_numpy()
    else:
        sims = np.ones(len(df_working)) * 0.5
    
    # === SEASON SCORING ===
    model_seasons = df_working[season_col].fillna('Unknown')
    
    if svm_predicted_season and season_probabilities:
        season_scores = np.array([season_probabilities.get(s, 0.0) for s in model_seasons])
    elif any(season_w.values()):
        season_scores = np.array([season_w.get(s, 0.0) for s in model_seasons])
    else:
        season_scores = np.array([0.5 if s in season_w else 0.0 for s in model_seasons])
    
    # Apply minimum similarity threshold
    valid_mask = sims >= min_similarity
    
    # üî• ENHANCED SCORING: 85% notes, 15% season (more weight on notes!)
    combined_score = 0.85 * sims + 0.15 * season_scores
    combined_score[~valid_mask] = -1.0
    
    # Build output
    result = df_working.copy()
    result['Gender'] = result['gender']
    result['Note Match'] = (sims * 100).round(1).astype(str) + '%'
    result['Season Match'] = (season_scores * 100).round(1).astype(str) + '%'
    result['Total Score'] = combined_score.round(3)
    
    # Sort and get top K
    result = result[combined_score >= 0].sort_values('Total Score', ascending=False)
    
    if len(result) > 0:
        print(f"\n   üéØ Top match score: {result['Total Score'].iloc[0]:.3f}")
        avg_match = result['Note Match'].head(3).apply(lambda x: float(x.strip('%'))).mean()
        print(f"   üìä Average note match in top 3: {avg_match:.1f}%")
    else:
        print(f"\n   ‚ö†Ô∏è  No matches found above threshold")
    
    # Select columns
    display_cols = ['title', 'designer', 'Gender', season_col, 'top_notes', 'middle_notes', 'base_notes', 
                    'Note Match', 'Season Match', 'Total Score']
    display_cols = [c for c in display_cols if c in result.columns]
    
    return result[display_cols].head(top_k)


def get_perfume_recommendations(
    df, 
    svm_model, 
    mlb,
    canon_note_func=None, 
    tfidf_vectorizer=None, 
    X_tfidf=None, 
    top_k=3,
    debug=False
):
    """
    Main function with 10 questions (6 multiple choice + 4 sliders).
    Returns TOP 3 recommendations.
    
    üî• CRITICAL: Pass canon_note from your training model!
    Example usage:
        recs = get_perfume_recommendations(df, svm, mlb, canon_note, tfidf, X_tfidf)
    """
    answers = ask_user_questions()
    
    print(f"\n" + "="*60)
    print(" YOUR TOP 3 RECOMMENDATIONS: ")
    print("="*60 + "\n")
    
    recs = recommend_from_answers(
        answers,
        df,
        svm_model,
        mlb,
        top_k=top_k,
        canon_note_func=canon_note_func,
        tfidf_vectorizer=tfidf_vectorizer,
        X_tfidf=X_tfidf,
        use_svm_season=True,
        debug=debug
    )
    
    return recs

In [9]:
recs = get_perfume_recommendations(
    df, 
    svm,
    mlb,
    canon_note, 
    tfidf, 
    X_tfidf
)
display(recs)


FragrAI : Your next signature scent, picked before you spray

 What gender describes you best?
   1. Male
   2. Female
   3. Unisex/Any

 How would you describe your personality?
   1. Bold & Adventurous (confident, outgoing, takes charge)
   2. Calm & Thoughtful (organized, reliable, introspective)
   3. Energetic & Social (enthusiastic, expressive, loves attention)
   4. Creative & Open-minded (curious, artistic, unconventional)

 Which scent profile appeals to you most?
   1. Warm & Rich (vanilla, amber, tonka, chocolate)
   2. Fresh & Clean (citrus, aquatic, mint, green notes)
   3. Deep & Intense (oud, leather, tobacco, incense)
   4. Soft & Floral (jasmine, rose, lavender, lily)

 How would you describe your energy?
   ‚Üê Calm & Relaxed  |  Energetic & Vibrant ‚Üí
   Enter a value from 0-100 (0 = left, 100 = right, 50 = balanced):

 Your style preference:
   ‚Üê Classic & Timeless  |  Bold & Unconventional ‚Üí
   Enter a value from 0-100 (0 = left, 100 = right, 50 = balanced):


Unnamed: 0,title,designer,Gender,predicted_season_tfidf_svm,top_notes,middle_notes,base_notes,Note Match,Season Match,Total Score
3836,Ambero Bvlgari for men,Bvlgari Perfumes and Colognes,Male,Winter,[Incense],[Saffron],"[Ginger, Pepper]",23.8%,100.0%,0.352
2843,Professor Portland General Store for men,Portland General Store Perfumes and Colognes,Male,Winter,[Leather],[],[Tobacco],17.5%,100.0%,0.298
690,Roman Essence de Roumanie for men,Essence De Roumanie Perfumes and Colognes,Male,Winter,"[S Are Coriander Extract, Pepper And Tangerine]","[S Are Patchouli, Cedar, Grass And Rose]","[S Are Leather, Tobacco, Vanilla, Amber And Musk]",17.1%,100.0%,0.295
