<a href="https://colab.research.google.com/github/AliAI11/fragranceBERT/blob/main/notebooks/01_dataset_exploration_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# imports
import pandas as pd
import numpy as np
import json
import random
from tqdm import tqdm
import os
import time

# seed python's and numpy's global random generators with a fixed value
random.seed(42)
np.random.seed(42)

In [2]:
# load dataset
import kagglehub

# fetch the fragrantica dataset through kagglehub and report where it landed
path = kagglehub.dataset_download("olgagmiufana1/fragrantica-com-fragrance-dataset")
print(f"dataset path: {path}")

# the csv is semicolon-separated and read with the cp1252 codec
csv_file = f'{path}/fra_cleaned.csv'
df = pd.read_csv(csv_file, sep=';', encoding='cp1252')

print(f'loaded {len(df)} fragrances')
column_names = df.columns.tolist()
print(f'\ncolumns: {column_names}')

Using Colab cache for faster access to the 'fragrantica-com-fragrance-dataset' dataset.
dataset path: /kaggle/input/fragrantica-com-fragrance-dataset
loaded 24063 fragrances

columns: ['url', 'Perfume', 'Brand', 'Country', 'Gender', 'Rating Value', 'Rating Count', 'Year', 'Top', 'Middle', 'Base', 'Perfumer1', 'Perfumer2', 'mainaccord1', 'mainaccord2', 'mainaccord3', 'mainaccord4', 'mainaccord5']


In [3]:
# first look at the raw frame: leading rows, dimensions, nulls per column
head_rows = df.head()
null_counts = df.isnull().sum()

print('\ndata overview:')
print(head_rows)
print(f'\nshape: {df.shape}')
print('\nmissing values:')
print(null_counts)

# frequency of the primary accord, shown only when that column exists
if 'mainaccord1' in df.columns:
    accord_counts = df['mainaccord1'].value_counts()
    print('\ntop 10 main accords:')
    print(accord_counts.head(10))


data overview:
                                                 url  \
0  https://www.fragrantica.com/perfume/xerjoff/ac...   
1  https://www.fragrantica.com/perfume/jean-paul-...   
2  https://www.fragrantica.com/perfume/jean-paul-...   
3  https://www.fragrantica.com/perfume/bruno-bana...   
4  https://www.fragrantica.com/perfume/jean-paul-...   

                          Perfume               Brand  Country  Gender  \
0  accento-overdose-pride-edition             xerjoff    Italy  unisex   
1            classique-pride-2024  jean-paul-gaultier   France   women   
2            classique-pride-2023  jean-paul-gaultier   France  unisex   
3               pride-edition-man        bruno-banani  Germany     men   
4         le-male-pride-collector  jean-paul-gaultier   France     men   

  Rating Value  Rating Count    Year  \
0         1,42           201  2022.0   
1         1,86            70  2024.0   
2         1,91           285  2023.0   
3         1,92            59  2019.0   
4 

In [4]:
print('\ncleaning data...')

# keep only rows with essential info.
# the index is renumbered 0..n-1 afterwards: later cells use the index label
# as perfume_id while perfumes_all.csv is written without the index, so the
# label has to equal the row position even if dropna removes rows.
df_clean = df.dropna(subset=['Perfume', 'Brand']).reset_index(drop=True)

# fill missing notes and missing accords with empty string so that the
# truthiness checks in the description / completeness helpers treat them as absent
for col in ['Top', 'Middle', 'Base', 'mainaccord1', 'mainaccord2', 'mainaccord3']:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].fillna('')

print(f'cleaned dataset: {len(df_clean)} fragrances')


cleaning data...
cleaned dataset: 24063 fragrances


In [5]:
def create_description(row):
    """create detailed perfume description for embedding

    builds one string of the form
    "<perfume> by <brand>. accords: a, b, c. top notes: ... . middle notes: ... . base notes: ... ."
    where every part after the name is optional.

    args:
        row: mapping-like record (pandas Series or dict). must provide
            'Perfume' and 'Brand'; may provide 'mainaccord1'..'mainaccord3',
            'Top', 'Middle' and 'Base'.

    returns:
        str: the available parts joined with '. ' and closed with '.'.
        a field that is absent, None, NaN or blank is skipped.
    """
    def field_text(col):
        # normalise one optional field to stripped text; '' means "missing".
        # NaN needs explicit handling: it is truthy, so a plain `if value`
        # would let it through and break ', '.join or print as "nan".
        if col not in row:
            return ''
        value = row[col]
        if isinstance(value, str):
            return value.strip()
        if value is None or pd.isna(value):
            return ''
        return str(value).strip()

    # basic info
    parts = [f"{row['Perfume']} by {row['Brand']}"]

    # accords
    accords = [
        text
        for text in (field_text(col) for col in ('mainaccord1', 'mainaccord2', 'mainaccord3'))
        if text
    ]
    if accords:
        parts.append(f"accords: {', '.join(accords)}")

    # notes
    for label, col in (('top', 'Top'), ('middle', 'Middle'), ('base', 'Base')):
        text = field_text(col)
        if text:
            parts.append(f"{label} notes: {text}")

    return '. '.join(parts) + '.'

# build the embedding text for every perfume, one row at a time
df_clean['description'] = df_clean.apply(create_description, axis=1)

# show the first three descriptions, cut to 200 characters each
print('\nsample descriptions:')
for position in range(3):
    sample_text = df_clean['description'].iloc[position]
    print(f'\n{position + 1}. {sample_text[:200]}...')


sample descriptions:

1. accento-overdose-pride-edition by xerjoff. accords: rose, woody, fruity. top notes: fruity notes, aldehydes, green notes. middle notes: bulgarian rose, egyptian jasmine, lily-of-the-valley. base notes...

2. classique-pride-2024 by jean-paul-gaultier. accords: citrus, white floral, sweet. top notes: yuzu, citruses. middle notes: orange blossom, neroli. base notes: musk, blonde woods....

3. classique-pride-2023 by jean-paul-gaultier. accords: citrus, white floral, sweet. top notes: blood orange, yuzu. middle notes: neroli, orange blossom. base notes: musk, white woods....


In [6]:
# use top 1000 most complete perfumes for query generation
# based on best buy approach of selecting most-used documents

def completeness_score(row):
    """score based on how much info is available

    one point for each of 'Top', 'Middle', 'Base', 'mainaccord1' and
    'mainaccord2' that carries a real value.

    args:
        row: mapping-like record (pandas Series or dict) exposing `.get`.

    returns:
        int: number of populated fields, from 0 to 5. a field that is
        absent, None, NaN or a blank string scores nothing.
    """
    def has_value(col):
        value = row.get(col, '')
        if isinstance(value, str):
            return bool(value.strip())
        # NaN is truthy, so it must be tested explicitly or an unfilled
        # column would count as populated
        return not pd.isna(value)

    scored_fields = ('Top', 'Middle', 'Base', 'mainaccord1', 'mainaccord2')
    return sum(1 for col in scored_fields if has_value(col))

# score every perfume, then keep the 1000 rows with the highest score
# NOTE(review): many rows probably tie on the top score; which of the tied rows
# survive is decided by nlargest's `keep` setting - confirm this is the intended sample
df_clean['completeness'] = df_clean.apply(completeness_score, axis=1)
df_sampled = df_clean.nlargest(n=1000, columns='completeness').copy()

sampled_total = len(df_sampled)
print(f'\nsampled {sampled_total} most complete perfumes for query generation')


sampled 1000 most complete perfumes for query generation


In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# checkpoint that writes the synthetic queries
model_name = "Qwen/Qwen2.5-7B-Instruct"
print('\nloading qwen2.5-7b-instruct model...')

tokenizer = AutoTokenizer.from_pretrained(model_name)

# weights are requested in bfloat16 with device_map="auto"
load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

print('model loaded')



loading qwen2.5-7b-instruct model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

model loaded


In [14]:
def generate_queries_for_perfume(description, n=10):
    """
    generate n diverse search queries for a perfume
    following best buy's llm-based synthetic query generation

    relies on the notebook-level `tokenizer` and `model` objects.

    args:
        description: perfume description text that is inserted into the prompt.
        n: number of queries requested from the model; also the maximum
            number of queries returned.

    returns:
        list of str: at most n queries. an empty list when generation fails
        or the response holds no json array. when the array is malformed the
        double-quoted substrings of the response are returned instead.
        the function reports problems via print and does not raise.
    """
    import re  # function-scope import: only the malformed-json fallback needs it

    prompt = f"""Generate {n} diverse natural language search queries that someone might use to find this perfume:

{description}

Requirements:
- queries should vary in style: emotional (feel romantic), seasonal (winter evening), descriptive (warm vanilla), occasion-based (date night)
- use natural language like real users
- include different lengths: short (3-5 words) and longer (8-15 words)
- vary specificity: some generic, some very specific

Return ONLY a JSON array of {n} query strings, nothing else.
Example format: ["query 1", "query 2", ...]"""

    # defined up front so the json fallback below can always read it
    response_text = ''

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # the generated sequence starts with the prompt tokens, so drop them
        # by position. removing the prompt by string matching on the decoded
        # text is fragile: if decoding does not reproduce the prompt exactly,
        # the example array inside the prompt gets parsed as the answer.
        prompt_length = inputs["input_ids"].shape[-1]
        response_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        if '[' not in response_text or ']' not in response_text:
            print(f"no json array found in response")
            return []

        # raw_decode parses exactly one json value starting at the first '['
        # and ignores whatever follows it; unlike counting brackets by hand it
        # is not confused by '[' or ']' inside the query strings.
        start = response_text.index('[')
        queries, _ = json.JSONDecoder().raw_decode(response_text, start)

        # ensure we return strings and the right number
        return [str(q) for q in queries if q][:n]

    except json.JSONDecodeError as e:
        print(f"json decode error: {e}")
        # try to extract queries with regex as fallback
        matches = re.findall(r'"([^"]+)"', response_text)
        return matches[:n]
    except Exception as e:
        # best effort by design: one bad generation must not stop the batch
        print(f"error generating queries: {e}")
        return []

In [15]:
# generate queries for every sampled perfume, retrying failed generations
print('\ngenerating synthetic queries for all perfumes...')

# generation samples (do_sample=True) from torch's random generator, which the
# random / numpy seeds at the top of the notebook do not cover
torch.manual_seed(42)

# sampling is stochastic, so an empty / unparsable response is worth retrying
max_attempts = 3

all_training_pairs = []
failed_count = 0

for idx, row in tqdm(df_sampled.iterrows(), total=len(df_sampled), desc='generating'):
    queries = []
    for _ in range(max_attempts):
        queries = generate_queries_for_perfume(row['description'], n=10)
        if queries:
            break

    if not queries:
        failed_count += 1
        # skip this perfume if every attempt failed
        continue

    # idx is the df_sampled index label; it is stored as perfume_id
    for query in queries:
        all_training_pairs.append({
            'query': query,
            'perfume_id': idx,
            'perfume_name': row['Perfume'],
            'brand': row['Brand'],
            'description': row['description']
        })

    # no sleep between perfumes: the model runs locally, so there is no
    # remote rate limit to respect

print(f'\ngenerated {len(all_training_pairs)} training pairs')
print(f'unique queries: {len({p["query"] for p in all_training_pairs})}')
print(f'perfumes covered: {len({p["perfume_id"] for p in all_training_pairs})}')
print(f'failed generations: {failed_count}')


generating synthetic queries for all perfumes...


generating: 100%|██████████| 1000/1000 [1:27:48<00:00,  5.27s/it]


generated 10000 training pairs
unique queries: 7517
perfumes covered: 1000
failed generations: 0





In [16]:
# one row per (query, perfume) record collected by the generation loop
training_df = pd.DataFrame(data=all_training_pairs)

# peek at the first ten pairs
first_pairs = training_df.head(10)
print('\nsample training pairs:')
print(first_pairs)


sample training pairs:
                                               query  perfume_id  \
0                          romantic winter fragrance           0   
1                 elegant winter perfume for evening           0   
2                         woody fruity perfume notes           0   
3                      bulgarian rose perfume review           0   
4             xerjoff accento overdose pride edition           0   
5                     date night fragrance with rose           0   
6                      eucalyptus pine perfume notes           0   
7  perfume with egyptian jasmine and lily-of-the-...           0   
8               warm vanilla perfume for cold nights           0   
9                      woody fruity aldehyde perfume           0   

                     perfume_name    brand  \
0  accento-overdose-pride-edition  xerjoff   
1  accento-overdose-pride-edition  xerjoff   
2  accento-overdose-pride-edition  xerjoff   
3  accento-overdose-pride-edition  xerjoff 

In [17]:
# train/val/test split

# split by perfume to prevent leakage: all queries of one perfume land in the
# same partition.
# a dedicated generator seeded here keeps the split identical every time this
# cell runs; shuffling with the global numpy state would depend on what ran
# before and would change on a second execution of the cell.
split_rng = np.random.RandomState(42)
unique_perfumes = split_rng.permutation(training_df['perfume_id'].unique())

# 70% / 15% of the perfumes (rounded down); the test set takes the remainder
n_train = int(len(unique_perfumes) * 0.7)
n_val = int(len(unique_perfumes) * 0.15)

train_perfumes = unique_perfumes[:n_train]
val_perfumes = unique_perfumes[n_train:n_train+n_val]
test_perfumes = unique_perfumes[n_train+n_val:]

train_df = training_df[training_df['perfume_id'].isin(train_perfumes)]
val_df = training_df[training_df['perfume_id'].isin(val_perfumes)]
test_df = training_df[training_df['perfume_id'].isin(test_perfumes)]

# sanity checks: the partitions share no perfume and together cover every pair
assert not set(train_perfumes) & set(val_perfumes)
assert not set(train_perfumes) & set(test_perfumes)
assert not set(val_perfumes) & set(test_perfumes)
assert len(train_df) + len(val_df) + len(test_df) == len(training_df)

print(f'\ntrain: {len(train_df)} pairs from {len(train_perfumes)} perfumes')
print(f'val: {len(val_df)} pairs from {len(val_perfumes)} perfumes')
print(f'test: {len(test_df)} pairs from {len(test_perfumes)} perfumes')


train: 7000 pairs from 700 perfumes
val: 1500 pairs from 150 perfumes
test: 1500 pairs from 150 perfumes


In [18]:
os.makedirs('data', exist_ok=True)

# frames and their destinations, written in this order without the index column
exports = [
    (df_clean, 'data/perfumes_all.csv'),
    (train_df, 'data/train.csv'),
    (val_df, 'data/val.csv'),
    (test_df, 'data/test.csv'),
]
for frame, destination in exports:
    frame.to_csv(destination, index=False)

# list what was written
print('\nsaved:')
for _, destination in exports:
    print(f'  - {destination}')



saved:
  - data/perfumes_all.csv
  - data/train.csv
  - data/val.csv
  - data/test.csv


In [19]:
# hand-written queries kept aside for evaluation
eval_queries = [
    "warm cozy scent for winter mornings by the fireplace",
    "fresh citrus for spring afternoons",
    "romantic floral for date night",
    "professional clean scent for office",
    "sweet vanilla cookies baking",
    "masculine woody leather",
    "sensual amber evening",
    "energizing morning coffee and bergamot",
    "summer beach coconut and salt",
    "elegant powdery iris",
    "spicy cinnamon autumn",
    "calming lavender bedtime",
    "confident oud and tobacco",
    "playful fruity peach",
    "sophisticated rose and musk"
]

# serialise first, then write the text out in one go (2-space indented json)
eval_payload = json.dumps(eval_queries, indent=2)
with open('data/eval_queries.json', 'w') as out_file:
    out_file.write(eval_payload)

print('  - data/eval_queries.json')

  - data/eval_queries.json


In [20]:
# ============================================================================
# summary
# ============================================================================

# 80-character rule drawn above and below the report
rule = '=' * 80

print('\n' + rule)
print('data preparation complete')
print(rule)

# one line per headline count, in reporting order
summary_lines = [
    f'total perfumes in catalog: {len(df_clean)}',
    f'perfumes with synthetic queries: {len(df_sampled)}',
    f'total training pairs: {len(training_df)}',
    f'train: {len(train_df)} pairs',
    f'val: {len(val_df)} pairs',
    f'test: {len(test_df)} pairs',
    f'evaluation queries: {len(eval_queries)}',
]
for line in summary_lines:
    print(line)

print(rule)


data preparation complete
total perfumes in catalog: 24063
perfumes with synthetic queries: 1000
total training pairs: 10000
train: 7000 pairs
val: 1500 pairs
test: 1500 pairs
evaluation queries: 15
