<a href="https://colab.research.google.com/github/AliAI11/fragranceBERT/blob/main/notebooks/03_evaluation_and_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers scikit-learn torch pandas numpy tqdm



In [2]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from tqdm import tqdm
import os
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

random.seed(42)
np.random.seed(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'using device: {device}')

using device: cuda


In [6]:
from google.colab import files
import zipfile
import os

os.makedirs('./data', exist_ok=True)
os.makedirs('./models', exist_ok=True)

print('\nupload files:')
print('1. perfumes_with_ids.csv')
print('2. perfume_embeddings.npy')
print('3. test.csv')
print('4. fragrance-retriever.zip (model)')

uploaded = files.upload()

# handle each file
for filename in uploaded.keys():
    if filename.endswith('.zip'):
        # extract zip to models directory
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall('./models/')
        print(f'extracted {filename} to ./models/')

        # check what was extracted
        extracted_files = os.listdir('./models/')
        print(f'extracted files: {extracted_files}')
    else:
        # move data files
        target_path = f'./data/{filename}'
        with open(target_path, 'wb') as f:
            f.write(uploaded[filename])
        print(f'moved {filename} to ./data/')

print('\nupload complete')

# verify model directory structure
print('\nmodel directory structure:')
for root, dirs, files in os.walk('./models/'):
    level = root.replace('./models/', '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files:
        print(f'{subindent}{file}')


upload files:
1. perfumes_with_ids.csv
2. perfume_embeddings.npy
3. test.csv
4. fragrance-retriever.zip (model)


Saving fragrance-retriever.zip to fragrance-retriever (1).zip
Saving perfume_embeddings.npy to perfume_embeddings.npy
Saving perfumes_with_ids.csv to perfumes_with_ids.csv
Saving test.csv to test.csv
extracted fragrance-retriever (1).zip to ./models/
extracted files: ['2_Normalize', 'sentence_bert_config.json', 'model.safetensors', 'config_sentence_transformers.json', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'README.md', 'config.json', '1_Pooling', 'modules.json', 'eval', 'vocab.txt']
moved perfume_embeddings.npy to ./data/
moved perfumes_with_ids.csv to ./data/
moved test.csv to ./data/

upload complete

model directory structure:
/
  sentence_bert_config.json
  model.safetensors
  config_sentence_transformers.json
  special_tokens_map.json
  tokenizer.json
  tokenizer_config.json
  README.md
  config.json
  modules.json
  vocab.txt
2_Normalize/
1_Pooling/
  config.json
eval/
  Information-Retrieval_evaluation_val_results.csv


In [8]:
perfumes_df = pd.read_csv('./data/perfumes_with_ids.csv', index_col=0)
perfume_embeddings = np.load('./data/perfume_embeddings.npy')
test_df = pd.read_csv('./data/test.csv')

print(f'\nloaded {len(perfumes_df)} perfumes')
print(f'embeddings shape: {perfume_embeddings.shape}')
print(f'test set: {len(test_df)} examples')

# load trained model - files are directly in ./models/
model = SentenceTransformer('./models/')
print(f'loaded model: {model.get_sentence_embedding_dimension()}-dim embeddings')


loaded 24063 perfumes
embeddings shape: (24063, 384)
test set: 1500 examples
loaded model: 384-dim embeddings


In [9]:
# hand-crafted realistic queries for comprehensive evaluation
eval_queries = [
    "warm cozy scent for winter mornings by the fireplace",
    "fresh citrus for spring afternoons",
    "romantic floral for date night",
    "professional clean scent for office",
    "sweet vanilla cookies baking",
    "masculine woody leather",
    "sensual amber evening",
    "energizing morning coffee and bergamot",
    "summer beach coconut and salt",
    "elegant powdery iris",
    "spicy cinnamon autumn",
    "calming lavender bedtime",
    "confident oud and tobacco",
    "playful fruity peach",
    "sophisticated rose and musk"
]

print(f'\nevaluation queries: {len(eval_queries)}')


evaluation queries: 15


In [10]:
# ============================================================================
# baseline 1: random selection
# ============================================================================

class RandomBaseline:
    """randomly select perfumes - worst case baseline"""

    def __init__(self, perfumes_df):
        self.perfumes_df = perfumes_df
        self.indices = list(range(len(perfumes_df)))

    def search(self, query: str, top_k: int = 10) -> List[int]:
        """return random perfume indices"""
        return random.sample(self.indices, top_k)

In [11]:
# ============================================================================
# baseline 2: tf-idf with cosine similarity
# ============================================================================

class TfidfBaseline:
    """traditional information retrieval with tf-idf"""

    def __init__(self, perfumes_df):
        self.perfumes_df = perfumes_df
        self.descriptions = perfumes_df['description'].tolist()

        # fit tf-idf vectorizer
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words='english'
        )
        self.doc_vectors = self.vectorizer.fit_transform(self.descriptions)
        print('tf-idf vectorizer fitted')

    def search(self, query: str, top_k: int = 10) -> List[int]:
        """search using tf-idf cosine similarity"""
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.doc_vectors)[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return top_indices.tolist()

In [12]:
# ============================================================================
# baseline 3: keyword matching with rules
# ============================================================================

class KeywordBaseline:
    """rule-based keyword matching"""

    def __init__(self, perfumes_df):
        self.perfumes_df = perfumes_df
        self.descriptions = perfumes_df['description'].str.lower().tolist()

        # keyword categories
        self.keywords = {
            'vanilla': ['vanilla', 'vanille'],
            'citrus': ['citrus', 'lemon', 'orange', 'bergamot', 'grapefruit'],
            'floral': ['floral', 'rose', 'jasmine', 'lily', 'iris', 'violet'],
            'woody': ['woody', 'wood', 'cedar', 'sandalwood', 'vetiver'],
            'fresh': ['fresh', 'aquatic', 'marine', 'water'],
            'spicy': ['spicy', 'cinnamon', 'pepper', 'ginger', 'cardamom'],
            'sweet': ['sweet', 'honey', 'caramel', 'sugar'],
            'leather': ['leather', 'suede'],
            'amber': ['amber', 'resin'],
            'musk': ['musk', 'musky']
        }

    def search(self, query: str, top_k: int = 10) -> List[int]:
        """match keywords and score perfumes"""
        query_lower = query.lower()
        scores = np.zeros(len(self.descriptions))

        # score based on keyword matches
        for desc_idx, desc in enumerate(self.descriptions):
            for category, keywords in self.keywords.items():
                # check if query mentions this category
                category_in_query = any(kw in query_lower for kw in keywords)
                # check if description contains this category
                category_in_desc = any(kw in desc for kw in keywords)

                if category_in_query and category_in_desc:
                    scores[desc_idx] += 1

        # if no matches, return random
        if scores.sum() == 0:
            return random.sample(range(len(self.descriptions)), top_k)

        top_indices = np.argsort(scores)[-top_k:][::-1]
        return top_indices.tolist()

In [13]:
# ============================================================================
# initialize baselines
# ============================================================================

print('\ninitializing baselines...')
random_baseline = RandomBaseline(perfumes_df)
tfidf_baseline = TfidfBaseline(perfumes_df)
keyword_baseline = KeywordBaseline(perfumes_df)
print('baselines ready')


initializing baselines...
tf-idf vectorizer fitted
baselines ready


In [14]:
# ============================================================================
# bi-encoder retrieval system
# ============================================================================

class BiEncoderRetriever:
    """trained sentence-bert retrieval"""

    def __init__(self, model, perfume_embeddings, perfumes_df):
        self.model = model
        self.perfume_embeddings = perfume_embeddings
        self.perfumes_df = perfumes_df

    def search(self, query: str, top_k: int = 10) -> List[int]:
        """encode query and find nearest neighbors"""
        query_embedding = self.model.encode([query], convert_to_tensor=False)
        similarities = cosine_similarity(query_embedding, self.perfume_embeddings)[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return top_indices.tolist()

bi_encoder = BiEncoderRetriever(model, perfume_embeddings, perfumes_df)
print('bi-encoder retriever ready')

bi-encoder retriever ready


In [15]:
# ============================================================================
# evaluation metrics
# ============================================================================

def precision_at_k(relevant: set, retrieved: List, k: int) -> float:
    """fraction of top-k results that are relevant"""
    retrieved_k = set(retrieved[:k])
    if len(retrieved_k) == 0:
        return 0.0
    return len(relevant & retrieved_k) / k

def mean_reciprocal_rank(relevant: set, retrieved: List) -> float:
    """inverse rank of first relevant item"""
    for i, item in enumerate(retrieved, 1):
        if item in relevant:
            return 1.0 / i
    return 0.0

def ndcg_at_k(relevant: set, retrieved: List, k: int) -> float:
    """normalized discounted cumulative gain"""
    dcg = 0.0
    for i, item in enumerate(retrieved[:k], 1):
        if item in relevant:
            dcg += 1.0 / np.log2(i + 1)

    # ideal dcg
    idcg = sum(1.0 / np.log2(i + 1) for i in range(1, min(len(relevant), k) + 1))

    return dcg / idcg if idcg > 0 else 0.0

def evaluate_retrieval(retriever, test_df, k_values=[5, 10]):
    """evaluate retrieval system on test set"""

    results = {
        'precision@5': [],
        'precision@10': [],
        'mrr': [],
        'ndcg@10': []
    }

    # group by query to get relevant perfumes
    query_groups = test_df.groupby('query')['perfume_id'].apply(set).to_dict()

    for query, relevant_ids in tqdm(query_groups.items(), desc='evaluating'):
        # retrieve top-k
        retrieved = retriever.search(query, top_k=10)

        # calculate metrics
        results['precision@5'].append(precision_at_k(relevant_ids, retrieved, 5))
        results['precision@10'].append(precision_at_k(relevant_ids, retrieved, 10))
        results['mrr'].append(mean_reciprocal_rank(relevant_ids, retrieved))
        results['ndcg@10'].append(ndcg_at_k(relevant_ids, retrieved, 10))

    # aggregate
    return {k: np.mean(v) for k, v in results.items()}


In [17]:
# ============================================================================
# evaluate all systems
# ============================================================================

print('\n' + '='*80)
print('evaluating all systems on test set')
print('='*80)

systems = {
    'bi-encoder': bi_encoder,
    'tf-idf': tfidf_baseline,
    'keyword matching': keyword_baseline,
    'random': random_baseline
}

all_results = {}

for name, system in systems.items():
    print(f'\nevaluating {name}...')
    results = evaluate_retrieval(system, test_df)
    all_results[name] = results

    print(f'  precision@5:  {results["precision@5"]:.3f}')
    print(f'  precision@10: {results["precision@10"]:.3f}')
    print(f'  mrr:          {results["mrr"]:.3f}')
    print(f'  ndcg@10:      {results["ndcg@10"]:.3f}')


evaluating all systems on test set

evaluating bi-encoder...


evaluating: 100%|██████████| 1286/1286 [01:03<00:00, 20.25it/s]


  precision@5:  0.047
  precision@10: 0.028
  mrr:          0.192
  ndcg@10:      0.211

evaluating tf-idf...


evaluating: 100%|██████████| 1286/1286 [00:23<00:00, 54.76it/s]


  precision@5:  0.030
  precision@10: 0.020
  mrr:          0.104
  ndcg@10:      0.125

evaluating keyword matching...


evaluating: 100%|██████████| 1286/1286 [08:49<00:00,  2.43it/s]


  precision@5:  0.001
  precision@10: 0.001
  mrr:          0.005
  ndcg@10:      0.005

evaluating random...


evaluating: 100%|██████████| 1286/1286 [00:00<00:00, 75542.35it/s]

  precision@5:  0.000
  precision@10: 0.000
  mrr:          0.000
  ndcg@10:      0.000





In [19]:
# ============================================================================
# results comparison table
# ============================================================================

print('\n' + '='*80)
print('results comparison')
print('='*80)

results_df = pd.DataFrame(all_results).T
results_df = results_df[['precision@5', 'precision@10', 'mrr', 'ndcg@10']]
print(results_df.to_string())

# calculate improvements
bi_encoder_results = all_results['bi-encoder']
tfidf_results = all_results['tf-idf']

improvements = {}
for metric in bi_encoder_results.keys():
    if tfidf_results[metric] > 0:
        improvement = (bi_encoder_results[metric] - tfidf_results[metric]) / tfidf_results[metric] * 100
        improvements[metric] = improvement

print(f'\nbi-encoder vs tf-idf improvements:')
for metric, improvement in improvements.items():
    print(f'  {metric}: +{improvement:.1f}%')


results comparison
                  precision@5  precision@10       mrr   ndcg@10
bi-encoder           0.047278      0.027605  0.191644  0.211436
tf-idf               0.030482      0.019518  0.103725  0.125231
keyword matching     0.000933      0.000778  0.004518  0.005238
random               0.000000      0.000000  0.000000  0.000000

bi-encoder vs tf-idf improvements:
  precision@5: +55.1%
  precision@10: +41.4%
  mrr: +84.8%
  ndcg@10: +68.8%


In [20]:
# ============================================================================
# qualitative evaluation on hand crafted queries
# ============================================================================

print('\n' + '='*80)
print('qualitative evaluation on hand-crafted queries')
print('='*80)

def display_results(query: str, retriever, top_k: int = 3):
    """show top-k results for a query"""
    print(f'\nquery: "{query}"')
    print('-' * 80)

    indices = retriever.search(query, top_k)
    for i, idx in enumerate(indices, 1):
        perfume = perfumes_df.iloc[idx]
        print(f'{i}. {perfume["Perfume"]} by {perfume["Brand"]}')

        # extract accords
        desc = perfume['description']
        if 'accords:' in desc:
            accords = desc.split('accords:')[1].split('.')[0].strip()
            print(f'   accords: {accords}')

# test on 5 diverse queries
test_queries = [
    "warm vanilla for cozy winter evenings",
    "fresh citrus for spring afternoons",
    "romantic floral for date night",
    "masculine woody leather",
    "energizing morning coffee and bergamot"
]

for query in test_queries:
    display_results(query, bi_encoder, top_k=3)



qualitative evaluation on hand-crafted queries

query: "warm vanilla for cozy winter evenings"
--------------------------------------------------------------------------------
1. vanille-passion by comptoir-sud-pacifique
   accords: vanilla, powdery, musky
2. nature-s-sexy by linn-young
   accords: vanilla, floral, fresh
3. vanille by molinard
   accords: vanilla, powdery, almond

query: "fresh citrus for spring afternoons"
--------------------------------------------------------------------------------
1. fresh-life by fresh
   accords: citrus, floral, fresh
2. cologne-summer-flash by mugler
   accords: citrus, green, fresh
3. soleil-de-capri by montale
   accords: citrus, fresh, white floral

query: "romantic floral for date night"
--------------------------------------------------------------------------------
1. kenzo-amour-florale by kenzo
   accords: citrus, white floral, floral
2. elixir-charnel-floral-romantique by guerlain
   accords: white floral, sweet, floral
3. beautiful-

In [21]:
# ============================================================================
# ablation study: pretrained vs fine-tuned
# ============================================================================

print('\n' + '='*80)
print('ablation study: pretrained vs fine-tuned')
print('='*80)

# load pretrained model without fine-tuning
pretrained_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# encode with pretrained model
print('encoding with pretrained model...')
pretrained_embeddings = pretrained_model.encode(
    perfumes_df['description'].tolist(),
    show_progress_bar=True,
    batch_size=64
)

pretrained_retriever = BiEncoderRetriever(pretrained_model, pretrained_embeddings, perfumes_df)

# evaluate
print('\nevaluating pretrained model...')
pretrained_results = evaluate_retrieval(pretrained_retriever, test_df)

print('\ncomparison:')
print(f'{"metric":<15} {"pretrained":<12} {"fine-tuned":<12} {"improvement":<12}')
print('-' * 55)
for metric in ['precision@5', 'precision@10', 'mrr', 'ndcg@10']:
    pretrained_val = pretrained_results[metric]
    finetuned_val = bi_encoder_results[metric]
    improvement = (finetuned_val - pretrained_val) / pretrained_val * 100 if pretrained_val > 0 else 0
    print(f'{metric:<15} {pretrained_val:<12.3f} {finetuned_val:<12.3f} +{improvement:<11.1f}%')



ablation study: pretrained vs fine-tuned


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

encoding with pretrained model...


Batches:   0%|          | 0/376 [00:00<?, ?it/s]


evaluating pretrained model...


evaluating: 100%|██████████| 1286/1286 [01:03<00:00, 20.17it/s]


comparison:
metric          pretrained   fine-tuned   improvement 
-------------------------------------------------------
precision@5     0.026        0.047        +81.0       %
precision@10    0.015        0.028        +84.9       %
mrr             0.106        0.192        +80.4       %
ndcg@10         0.116        0.211        +81.8       %



