<a href="https://colab.research.google.com/github/AliAI11/fragranceBERT/blob/main/notebooks/02_train_bi_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers scikit-learn torch pandas numpy tqdm



In [2]:
import pandas as pd
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from sentence_transformers.util import cos_sim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from tqdm import tqdm
import os
from typing import List
import warnings
# suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# seed the stdlib, NumPy and PyTorch RNGs with one shared value
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'using device: {device}')

using device: cuda


In [3]:
# Colab-only module that provides the browser upload/download helpers.
from google.colab import files

# Single source of truth for the files the later cells read from ./data/.
required_files = ['perfumes_all.csv', 'train.csv', 'val.csv', 'test.csv']

os.makedirs('./data', exist_ok=True)

print(f'upload {len(required_files)} data files:')
for position, required_name in enumerate(required_files, start=1):
    print(f'{position}. {required_name}')

# files.upload() returns a dict keyed by the uploaded file names.
uploaded = files.upload()

for filename in uploaded.keys():
    # os.replace overwrites a stale copy from an earlier run instead of failing.
    os.replace(filename, os.path.join('./data', filename))
    print(f'moved {filename} to ./data/')

# Fail here, with the exact file names, rather than in the loading cell.
missing_files = [
    required_name
    for required_name in required_files
    if not os.path.exists(os.path.join('./data', required_name))
]
if missing_files:
    raise FileNotFoundError(f'missing required data files in ./data/: {missing_files}')

print('\nupload complete')

upload 4 data files:
1. perfumes_all.csv
2. train.csv
3. val.csv
4. test.csv


Saving perfumes_all.csv to perfumes_all.csv
Saving test.csv to test.csv
Saving train.csv to train.csv
Saving val.csv to val.csv
moved perfumes_all.csv to ./data/
moved test.csv to ./data/
moved train.csv to ./data/
moved val.csv to ./data/

upload complete


In [4]:
data_dir = './data/'


def _read_split(file_name):
    """Read one CSV file located under ``data_dir`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_dir, file_name))


# full perfume catalogue, then the three query/description splits
perfumes_df = _read_split('perfumes_all.csv')
train_df, val_df, test_df = (
    _read_split(f'{split}.csv') for split in ('train', 'val', 'test')
)

print(f'loaded {len(perfumes_df)} perfumes')
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    print(f'{split_name}: {len(split_df)} examples')

loaded 24063 perfumes
train: 7000 examples
val: 1500 examples
test: 1500 examples


In [5]:
def create_input_examples(df: pd.DataFrame) -> "List[InputExample]":
    """Convert (query, description) rows into sentence-transformers examples.

    Args:
        df: frame with a ``query`` column and a ``description`` column; each
            row is one pair.

    Returns:
        One ``InputExample`` per row, in row order, whose ``texts`` is
        ``[query, description]``. No label is set.

    Raises:
        KeyError: if either column is missing from ``df``.
    """
    # Zip the two columns directly instead of using DataFrame.iterrows():
    # iterrows builds a Series per row (slow) and does not preserve dtypes.
    return [
        InputExample(texts=[query, description])
        for query, description in zip(df['query'], df['description'])
    ]

print('\npreparing training examples...')
train_examples, val_examples = (
    create_input_examples(split_frame) for split_frame in (train_df, val_df)
)

for label, examples in (('train', train_examples), ('val', val_examples)):
    print(f'{label} examples: {len(examples)}')

# show the first training pair; the description is cut to 100 characters
first_query, first_description = train_examples[0].texts
print('\nsample training example:')
print(f'query: {first_query}')
print(f'perfume: {first_description[:100]}...')


preparing training examples...
train examples: 7000
val examples: 1500

sample training example:
query: romantic winter fragrance
perfume: accento-overdose-pride-edition by xerjoff. accords: rose, woody, fruity. top notes: fruity notes, al...


In [6]:
# hub identifier of the pretrained checkpoint that gets fine-tuned below
base_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

print('\nloading base model: all-MiniLM-L6-v2')
model = SentenceTransformer(base_model_id)

embedding_dim = model.get_sentence_embedding_dimension()
print(f'model embedding dimension: {embedding_dim}')


loading base model: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model embedding dimension: 384


In [9]:
# multiple negatives ranking loss
# pulls positives closer, pushes negatives apart in embedding space
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)

# validation evaluator (`evaluation` is already imported in the import cell)
#
# IDs are keyed by *text*, not by row position. If val_df repeats a query or a
# description, position-based IDs would create several entries for the same
# text and score an identical hit as a miss. When every text is unique the
# IDs below are the same row positions as before.
query_ids = {}         # query text       -> string id
corpus_ids = {}        # description text -> string id
val_relevant_docs = {}  # query id        -> set of relevant corpus ids
for query_text, description_text in zip(val_df['query'].tolist(),
                                        val_df['description'].tolist()):
    query_id = query_ids.setdefault(query_text, str(len(query_ids)))
    corpus_id = corpus_ids.setdefault(description_text, str(len(corpus_ids)))
    val_relevant_docs.setdefault(query_id, set()).add(corpus_id)

# the evaluator expects id -> text mappings
val_queries_dict = {query_id: text for text, query_id in query_ids.items()}
val_corpus_dict = {corpus_id: text for text, corpus_id in corpus_ids.items()}

evaluator = evaluation.InformationRetrievalEvaluator(
    val_queries_dict,
    val_corpus_dict,
    val_relevant_docs,
    name='val'
)

In [10]:
num_epochs = 3
# len(train_dataloader) is the number of batches per epoch
total_steps = len(train_dataloader) * num_epochs
# warmup length = 10% of the total number of training batches
warmup_steps = int(total_steps * 0.1)

print('\ntraining configuration:')
print(f'epochs: {num_epochs}')
# read the batch size from the loader so the report cannot drift from the real value
print(f'batch size: {train_dataloader.batch_size}')
print(f'warmup steps: {warmup_steps}')
print(f'total training steps: {total_steps}')



training configuration:
epochs: 3
batch size: 16
warmup steps: 131
total training steps: 1314


In [11]:
print('\nstarting training...')

# The recorded run of this cell stopped at an interactive Weights & Biases
# API-key prompt, which blocks an unattended "Run all". Default wandb to
# disabled; setdefault keeps any WANDB_MODE the user exported beforehand
# (e.g. WANDB_MODE=online to keep experiment logging).
os.environ.setdefault('WANDB_MODE', 'disabled')

# Fine-tune with the single (dataloader, loss) objective. The evaluator runs
# every 500 steps, and save_best_model=True stores the best model under output_path.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=500,
    output_path='./models/fragrance-retriever',
    save_best_model=True,
    show_progress_bar=True
)

print('\ntraining complete')


starting training...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmmava2004[0m ([33mmmava2004-virginia-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@3,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@1,Val Cosine Precision@3,Val Cosine Precision@5,Val Cosine Precision@10,Val Cosine Recall@1,Val Cosine Recall@3,Val Cosine Recall@5,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
438,No log,No log,0.034,0.112,0.183333,0.366667,0.034,0.037333,0.036667,0.036667,0.034,0.112,0.183333,0.366667,0.166338,0.107067,0.118261
500,1.362500,No log,0.030667,0.106667,0.175333,0.366,0.030667,0.035556,0.035067,0.0366,0.030667,0.106667,0.175333,0.366,0.163068,0.10315,0.11435
876,1.362500,No log,0.032667,0.108,0.180667,0.377333,0.032667,0.036,0.036133,0.037733,0.032667,0.108,0.180667,0.377333,0.168404,0.106768,0.117952
1000,1.120900,No log,0.028667,0.107333,0.18,0.379333,0.028667,0.035778,0.036,0.037933,0.028667,0.107333,0.18,0.379333,0.166802,0.10408,0.115323
1314,1.120900,No log,0.031333,0.109333,0.18,0.383333,0.031333,0.036444,0.036,0.038333,0.031333,0.109333,0.18,0.383333,0.169244,0.10619,0.117126



training complete


In [12]:
print('\nloading best model...')
# rebind `model` to whatever training saved under its output_path
best_model_dir = './models/fragrance-retriever'
model = SentenceTransformer(best_model_dir)


loading best model...


In [13]:
print('\nencoding all perfumes...')
perfume_descriptions = perfumes_df['description'].tolist()

# convert_to_tensor=False returns an array-like rather than a torch tensor
encode_options = dict(
    convert_to_tensor=False,
    show_progress_bar=True,
    batch_size=64,
)
perfume_embeddings = model.encode(perfume_descriptions, **encode_options)

encoded_count = len(perfume_embeddings)
print(f'encoded {encoded_count} perfumes')
print(f'embedding shape: {perfume_embeddings.shape}')


encoding all perfumes...


Batches:   0%|          | 0/376 [00:00<?, ?it/s]

encoded 24063 perfumes
embedding shape: (24063, 384)


In [14]:
embeddings_path = './data/perfume_embeddings.npy'
perfumes_csv_path = './data/perfumes_with_ids.csv'

np.save(embeddings_path, perfume_embeddings)
# index=True writes the frame's row index as the first CSV column
perfumes_df.to_csv(perfumes_csv_path, index=True)

print('\nsaved:')
for saved_path in (embeddings_path, perfumes_csv_path):
    print(f'  - {saved_path}')


saved:
  - ./data/perfume_embeddings.npy
  - ./data/perfumes_with_ids.csv


In [15]:
print('\ntesting retrieval...')

test_query = "warm vanilla for cozy winter evenings"
query_embedding = model.encode([test_query], convert_to_tensor=False)

# cosine_similarity comes from the notebook's import cell; row 0 of the
# result holds the scores of the single query against every perfume
similarities = cosine_similarity(query_embedding, perfume_embeddings)[0]
top_k = 5
# argsort is ascending: reverse it to best-first, then keep the first top_k
top_indices = np.argsort(similarities)[::-1][:top_k]

print(f'\nquery: "{test_query}"')
print(f'\ntop {top_k} results:')
for rank, idx in enumerate(top_indices, start=1):
    perfume = perfumes_df.iloc[idx]
    score = similarities[idx]
    print(f'\n{rank}. {perfume["Perfume"]} by {perfume["Brand"]}')
    print(f'   similarity: {score:.3f}')
    print(f'   notes: {perfume["description"][:100]}...')



testing retrieval...

query: "warm vanilla for cozy winter evenings"

top 5 results:

1. vanille-passion by comptoir-sud-pacifique
   similarity: 0.335
   notes: vanille-passion by comptoir-sud-pacifique. accords: vanilla, powdery, musky. top notes: vanilla. mid...

2. nature-s-sexy by linn-young
   similarity: 0.292
   notes: nature-s-sexy by linn-young. accords: vanilla, floral, fresh. top notes: tea, bergamot, orange. midd...

3. vanille by molinard
   similarity: 0.283
   notes: vanille by molinard. accords: vanilla, powdery, almond. top notes: vanilla. middle notes: vanilla. b...

4. vanille-extreme-eau-de-parfum by comptoir-sud-pacifique
   similarity: 0.279
   notes: vanille-extreme-eau-de-parfum by comptoir-sud-pacifique. accords: vanilla, powdery, musky. top notes...

5. amour-de-cacao by comptoir-sud-pacifique
   similarity: 0.278
   notes: amour-de-cacao by comptoir-sud-pacifique. accords: cacao, vanilla, warm spicy. top notes: orange. mi...


In [16]:
print('\ndownloading model and embeddings...')

import shutil

# pack the saved model directory into fragrance-retriever.zip in the working directory
shutil.make_archive('fragrance-retriever', 'zip', './models/fragrance-retriever')

# hand each artifact to the Colab download helper, in the same order as before
artifacts = (
    'fragrance-retriever.zip',
    './data/perfume_embeddings.npy',
    './data/perfumes_with_ids.csv',
)
for artifact in artifacts:
    files.download(artifact)

print('\ndownload complete')


downloading model and embeddings...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


download complete


In [18]:
# closing banner: where the model was saved and the embedding matrix shape
divider = '=' * 80
summary_lines = [
    divider,
    'trained model saved to: ./models/fragrance-retriever',
    f'perfume embeddings: {perfume_embeddings.shape}',
    divider,
]
print('\n'.join(summary_lines))

trained model saved to: ./models/fragrance-retriever
perfume embeddings: (24063, 384)
