In [1]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

In [2]:
dataset = load_dataset("mteb/cqadupstack-physics", "corpus")

In [3]:
model_name = 'sentence-transformers/multi-qa-distilbert-cos-v1'
train_batch_size = 16
num_epochs = 4
model_save_path = (
    "output/fune_tuning_model-multi-qa-distilbert-cos-v1-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

In [4]:
# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)



In [5]:
index = dataset['corpus'].shape[0]
train_index = int(index*0.6)
dev_index = int(index*0.8)

In [6]:
print(index, train_index, dev_index)

38316 22989 30652


In [63]:
dataset_shuffled = dataset['corpus'].shuffle()
train_samples = Dataset.from_dict(dataset_shuffled[0:train_index])
dev_samples = Dataset.from_dict(dataset_shuffled[train_index:dev_index])
test_samples = Dataset.from_dict(dataset_shuffled[dev_index:index])

In [8]:
train_samples

Dataset({
    features: ['_id', 'title', 'text'],
    num_rows: 22989
})

In [62]:
scores = 0
for i in range(dev_samples.shape[0]):
    query_emb = model.encode(dev_samples['title'][i])
    doc_emb = model.encode(dev_samples['text'][i])
    scores = scores + util.dot_score(query_emb, doc_emb)[0].cpu().tolist()[0]
average_scores = scores/dev_samples.shape[0]
average_scores

0.6247630876302719

In [55]:
title_emb = model.encode(test_samples['title'])
text_emb = model.encode(test_samples['text'])

In [60]:
util.dot_score(title_emb, text_emb)

tensor([[ 0.5289,  0.0455,  0.0390, -0.0522],
        [ 0.1713,  0.9204,  0.0970, -0.0586],
        [ 0.0643,  0.0635,  0.7392, -0.0274],
        [ 0.0803, -0.0065, -0.0295,  0.5844]])

In [30]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

In [31]:
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="dev")

AttributeError: 'dict' object has no attribute 'texts'

In [None]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

In [None]:
# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

In [32]:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="test")
test_evaluator(model, output_path=model_save_path)



OSError: output/fune_tuning_model-multi-qa-distilbert-cos-v1-2024-05-09_15-48-55 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`