<a href="https://colab.research.google.com/github/AarishB/FragrAI/blob/main/FragrAIi_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import ast
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display
from collections import Counter
import unicodedata

# === STEP 1: Load and clean dataset ===
# Read the first 10,000 rows, then discard the metadata columns that are
# present (errors='ignore' tolerates any of them being absent).
df = (
    pd.read_csv('perfumes_table.csv', nrows=10000)
    .drop(columns=['url', 'rating', 'reviews'], errors='ignore')
)
# Never truncate cell contents when a frame is displayed.
pd.set_option('display.max_colwidth', None)

# Reorder columns: title / notes / description lead, every other column
# follows in its original relative order.
desired_order = ['title', 'notes', 'description']
remaining_cols = [col for col in df.columns if col not in desired_order]
df = df.loc[:, desired_order + remaining_cols]

# Capitalize designer
def capitalize(s):
    """Capitalize each word of *s*, leaving lowercase minor words untouched.

    Non-string values are returned unchanged.  The string is split on
    whitespace, so leading/trailing/repeated spaces collapse to single
    spaces.  The first word is always capitalized; a later word is kept
    as-is only when it exactly equals one of the lowercase minor words
    below.  ``str.capitalize`` also lowercases the remainder of each word
    (e.g. 'YSL' becomes 'Ysl').
    """
    if not isinstance(s, str):
        return s

    minor_words = {'and', 'for', 'of', 'the', 'in', 'on', 'at', 'with', 'a', 'an'}
    pieces = []
    for position, token in enumerate(s.split()):
        keep_as_is = position > 0 and token in minor_words
        pieces.append(token if keep_as_is else token.capitalize())
    return ' '.join(pieces)
# Normalize the casing of every designer name; capitalize() returns
# non-string cells unchanged.
df['designer'] = df['designer'].apply(capitalize)

# Canonical names for common note variants.  Keys are in normalized form:
# accent-free, lowercase, punctuation-free and single-spaced.
_NOTE_SYNONYMS = {
    'calabrian bergamot': 'bergamot', 'sicilian bergamot': 'bergamot', 'italian bergamot': 'bergamot',
    'cedarwood': 'cedar', 'virginia cedar': 'cedar', 'atlas cedar': 'cedar',
    'oudh': 'oud', 'agarwood': 'oud', 'oud wood': 'oud',
    'pink pepper': 'pepper', 'black pepper': 'pepper',
    'white musk': 'musk', 'ambroxan': 'amber', 'ambergris': 'amber',
    'sea notes': 'marine', 'marine notes': 'marine', 'watery notes': 'marine', 'aqua': 'marine', 'calone': 'marine',
    'orange blossom': 'neroli', 'orris root': 'iris', 'cashmeran': 'woody',
}

# Matches every character that is not a lowercase ASCII letter, a digit or
# whitespace; compiled once instead of on every call.
_NOTE_JUNK_RE = re.compile(r'[^a-z0-9\s]')


def strip_accents(s):
    """Return *s* with combining marks removed after NFKD decomposition.

    An accented letter therefore becomes its plain base letter.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def canon_note(x):
    """Return the canonical, display-ready form of a single note name.

    Steps: strip accents, lowercase, replace punctuation with spaces,
    collapse whitespace, map known variants to one canonical name, then
    capitalize each word.

    Returns None when *x* is not a string, and '' when nothing is left
    after cleaning (both falsy, so callers can filter with ``if note``).
    """
    if not isinstance(x, str):
        return None
    cleaned = _NOTE_JUNK_RE.sub(' ', strip_accents(x).lower())
    # Collapse whitespace BEFORE the synonym lookup: punctuation replaced by a
    # space next to an existing space (e.g. "oud (wood)") would otherwise
    # leave a double space and miss the 'oud wood' entry.
    normalized = ' '.join(cleaned.split())
    normalized = _NOTE_SYNONYMS.get(normalized, normalized)
    return ' '.join(word.capitalize() for word in normalized.split())

def _extract_layer(text, layer_aliases, stop_regex):
    if not isinstance(text, str): return []
    s = ' '.join(text.split())  # flatten newlines/spaces

    # RegEX pattern for identification
    pat = rf"\b({'|'.join(layer_aliases)})\s*(?:notes?)?\s*(?:are|include|:)?\s*(.*?)(?={stop_regex}|\.|$)"

    m = re.search(pat, s, flags=re.IGNORECASE)
    if not m:
        return []
    chunk = m.group(2)
    toks = re.split(r'[;,/|]', chunk)
    return [canon_note(t) for t in toks if canon_note(t)]

def extract_notes(desc, note_type): #Better extraction of notes, which is more efficient and structured.
    aliases = {
        'Top': ['top','opening','head'],
        'Middle': ['middle','heart'],
        'Base': ['base','drydown']
    }
    if note_type == 'Top':
        stop = r"\b(?:middle|heart|base|drydown)\b"
    elif note_type == 'Middle':
        stop = r"\b(?:base|drydown)\b"
    else:
        # match until period or end (stop regex that never matches)
        stop = r"$^"

    return _extract_layer(desc, aliases[note_type], stop)

# Re-run extraction: one list-valued column per fragrance layer, each parsed
# from the free-text description.
for _column, _layer in (('top_notes', 'Top'), ('middle_notes', 'Middle'), ('base_notes', 'Base')):
    df[_column] = df['description'].apply(extract_notes, args=(_layer,))

if 'notes' in df.columns:
    # When a description yields no layered notes, fall back to the raw 'notes'
    # column: canonicalize each entry and file all of them under top_notes.
    def fallback(row):
        """Fill top_notes from the raw 'notes' cell when every layer is empty.

        Rows that already have a note in any layer are returned untouched, as
        are rows whose 'notes' cell is neither a string nor a list/tuple.
        String cells are split on ; , / and | before canonicalization.
        """
        if row['top_notes'] or row['middle_notes'] or row['base_notes']:
            return row
        raw = row['notes']
        if isinstance(raw, str):
            candidates = re.split(r'[;,/|]', raw)
        elif isinstance(raw, (list, tuple)):
            candidates = raw
        else:
            return row
        # Work on a copy: pandas documents that functions mutating the object
        # passed in by DataFrame.apply are not supported.
        row = row.copy()
        # canon_note runs once per entry; empty / None results are dropped.
        row['top_notes'] = [note for note in map(canon_note, candidates) if note]
        return row
    df = df.apply(fallback, axis=1)

# === STEP 3: Create combined notes with weight for base notes ===
def _combine_layers(row):
    """Concatenate a row's top, middle and base notes, listing base notes twice."""
    doubled_base = row['base_notes'] * 2
    return row['top_notes'] + row['middle_notes'] + doubled_base

# NOTE(review): STEP 5 feeds this column to MultiLabelBinarizer, which encodes
# presence/absence of a label; the repeated base notes may therefore have no
# effect on the features -- confirm.
df['combined_notes'] = df.apply(_combine_layers, axis=1)

# === STEP 4: Assign seasons for supervised learning ===
# Notes taken as indicative of each season.  Matching against these lists is
# exact and case-sensitive.
seasonal_notes = {
    'Summer': ['Citrus', 'Aquatic', 'Mint', 'Neroli', 'Grapefruit', 'Orange', 'Lemon', 'Bergamot'],
    'Winter': ['Vanilla', 'Amber', 'Oud', 'Leather', 'Tobacco', 'Cinnamon', 'Tonka'],
    'Spring': ['Jasmine', 'Rose', 'Green Tea', 'Lily', 'Pear', 'Apple'],
    'Fall': ['Sandalwood', 'Patchouli', 'Nutmeg', 'Cardamom', 'Clove', 'Plum']
}

def assign_weighted_season_from_layers(top, mid, base, top_bonus=365):
    """Pick the season whose indicator notes score highest across the layers.

    Each layer gets a weight of ``max(1, round(share * 10))``, where *share*
    is the fraction of all notes that sit in that layer.  Every note found in
    a season's list adds its layer's weight to that season's score.

    Args:
        top: List of top notes.
        mid: List of middle notes.
        base: List of base notes.
        top_bonus: Extra score added for every matching top note.  The default
            of 365 reproduces the original hard-coded value, which makes any
            top-note match outweigh middle and base matches; pass 0 to score
            all layers on the same scale.

    Returns:
        'Summer', 'Winter', 'Spring' or 'Fall', or 'Inconclusive' when there
        are no notes at all or no season has a non-zero score.  Ties go to the
        season listed first in ``seasonal_notes``.
    """
    total_notes = len(top) + len(mid) + len(base)
    if total_notes == 0:
        return 'Inconclusive'

    def layer_weight(layer):
        # Share of all notes in this layer, scaled to 0-10 and floored at 1.
        return max(1, round((len(layer) / total_notes) * 10))

    weighted_layers = (
        (top, layer_weight(top) + top_bonus),
        (mid, layer_weight(mid)),
        (base, layer_weight(base)),
    )

    season_scores = {season: 0 for season in seasonal_notes}
    for notes, weight in weighted_layers:
        for note in notes:
            for season, group in seasonal_notes.items():
                if note in group:
                    season_scores[season] += weight

    if not any(season_scores.values()):
        return 'Inconclusive'
    return max(season_scores, key=season_scores.get)

def _season_for_row(row):
    """Return the rule-based season label for one fragrance row."""
    return assign_weighted_season_from_layers(
        row['top_notes'], row['middle_notes'], row['base_notes']
    )

# Label every fragrance; these labels are the training target in STEP 5.
df['season'] = df.apply(_season_for_row, axis=1)

# === STEP 5: Train ML model ===
# Features: the combined note lists encoded by MultiLabelBinarizer.
# Target: the rule-based 'season' label assigned above, so the classifier is
# trained to reproduce that heuristic.  'Inconclusive' rows are not filtered
# out and are therefore learned as a class of their own.
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['combined_notes'])
y = df['season']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 100-tree random forest, seeded for reproducible results.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# === STEP 6: Evaluate model ===
# Score the model on the held-out rows only.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# === STEP 7: Predict on full dataset and show notes ===
# Predictions cover every row, including those the model was trained on.
df['predicted_season'] = clf.predict(X)

# === STEP 8: Display results with notes used ===
# Lift pandas' display limits so long note lists and wide frames are not truncated.
for _option, _value in (
    ('display.max_rows', 5000),
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', None),
):
    pd.set_option(_option, _value)

result_columns = [
    'title', 'designer', 'top_notes', 'middle_notes', 'base_notes',
    'combined_notes', 'predicted_season',
]
display(df[result_columns].head(10000))

# === STEP 9: Analyze no-note descriptions ===
# Rows for which all three note layers are empty.
no_notes_df = df[
    (df['top_notes'].apply(len) == 0) &
    (df['middle_notes'].apply(len) == 0) &
    (df['base_notes'].apply(len) == 0)
]

print(f"\nTotal fragrances with no extracted notes: {len(no_notes_df)}\n")

# Count recurring fragments (even without filtering): lowercase the first 200
# non-null descriptions, split them on periods and tally each non-empty piece.
phrases = []
for desc in no_notes_df['description'].dropna().head(200):
    fragments = (piece.strip() for piece in desc.lower().split('.'))
    phrases.extend(fragment for fragment in fragments if fragment)

common = Counter(phrases)
print("Most common sentence fragments in no-note descriptions:")
for phrase, count in common.most_common(30):
    # \u2192 is the right arrow; the previous literal was its mis-decoded
    # (mojibake) byte sequence.  The escape keeps the source encoding-proof.
    print(f"{count} \u2192 {phrase}")

# === STEP 10: Show 5 full descriptions that had no extracted notes ===
print("\n\nExamples of full descriptions with no extracted notes:")
for i, desc in enumerate(no_notes_df['description'].dropna().head(5), 1):
    print(f"\n{i}. {desc}")