In [1]:
import pandas as pd

from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer

# Load the annotated keyphrase/synonym pairs.
data = pd.read_csv('AnnotatedSynonyms.csv')

# Build one InputExample per CSV row: the two texts to compare plus the
# annotated score as the training label.
# - Iterating the columns directly avoids the per-row Series that
#   DataFrame.iterrows() constructs.
# - float() keeps every label a plain Python float even if the score column
#   is parsed as integers, so the batched label tensor is floating point.
training_examples = [
    InputExample(texts=[keyphrase, synonym], label=float(score))
    for keyphrase, synonym, score in zip(
        data['keyphrases'], data['synonyms'], data['score']
    )
]

# Load the pre-trained checkpoint that the later cells fine-tune.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sentence_transformers import losses
from torch.utils.data import DataLoader

# Loss: CosineSimilarityLoss wrapped around the model being trained.
train_loss = losses.CosineSimilarityLoss(model=model)

# Batches of 16 training examples, drawn in shuffled order.
train_dataloader = DataLoader(
    dataset=training_examples,
    batch_size=16,
    shuffle=True,
)

In [3]:
from sentence_transformers import evaluation, LoggingHandler
import logging

# Send INFO-level log records through sentence-transformers' LoggingHandler,
# each prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Fine-tune on the single (dataloader, loss) objective and write the model to
# output_path.
# NOTE(review): no evaluator is passed to fit(); confirm whether
# save_best_model has any effect without one.
fit_options = dict(
    epochs=10,
    warmup_steps=100,
    output_path='output/mpnetv2-finetuned',
    save_best_model=True,
    show_progress_bar=True,
)
model.fit(train_objectives=[(train_dataloader, train_loss)], **fit_options)


Iteration: 100%|██████████| 23/23 [01:01<00:00,  2.67s/it]
Iteration: 100%|██████████| 23/23 [00:59<00:00,  2.60s/it]
Iteration: 100%|██████████| 23/23 [01:09<00:00,  3.01s/it]
Iteration: 100%|██████████| 23/23 [01:02<00:00,  2.71s/it]
Iteration: 100%|██████████| 23/23 [00:59<00:00,  2.57s/it]
Iteration: 100%|██████████| 23/23 [00:59<00:00,  2.58s/it]
Iteration: 100%|██████████| 23/23 [01:01<00:00,  2.66s/it]
Iteration: 100%|██████████| 23/23 [01:03<00:00,  2.74s/it]
Iteration: 100%|██████████| 23/23 [01:02<00:00,  2.72s/it]
Iteration: 100%|██████████| 23/23 [01:00<00:00,  2.64s/it]
Epoch: 100%|██████████| 10/10 [10:18<00:00, 61.90s/it]


2024-06-25 17:03:20 - Save model to output/mpnetv2-finetuned


In [5]:
# Reload the fine-tuned model from the directory written by training.
model = SentenceTransformer(model_name_or_path='output/mpnetv2-finetuned')

2024-06-25 17:28:21 - Load pretrained SentenceTransformer: output/mpnetv2-finetuned
2024-06-25 17:28:22 - Use pytorch device_name: cpu
