In [1]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

In [69]:
dataset = load_dataset("mteb/cqadupstack-physics", "corpus")

In [70]:
model_name = 'sentence-transformers/multi-qa-distilbert-cos-v1'
train_batch_size = 16
num_epochs = 4
model_save_path = (
    "output/fune_tuning_model-multi-qa-distilbert-cos-v1-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

In [71]:
# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name, device="cuda")



In [104]:
model_ori = SentenceTransformer(model_name, device="cuda")



In [72]:
index = dataset['corpus'].shape[0]
train_index = int(index*0.6)
dev_index = int(index*0.8)

In [73]:
index = int(index*0.1)
train_index = int(train_index*0.1)
dev_index = int(dev_index*0.1)

In [74]:
print(index, train_index, dev_index)

3831 2298 3065


In [75]:
dataset_shuffled = dataset['corpus'].shuffle()
train_samples = Dataset.from_dict(dataset_shuffled[0:train_index])
dev_samples = Dataset.from_dict(dataset_shuffled[train_index:dev_index])
test_samples = Dataset.from_dict(dataset_shuffled[dev_index:index])

In [76]:
def InputData(dataset):
    samples_set = []
    for title, text in zip(dataset['title'], dataset['text']):
        input = InputExample(texts=[title, text], label = 1.0)
        samples_set.append(input)
    return samples_set   

In [77]:
def InputData_tri(dataset):
    samples_set = []
    l = len(dataset['title'])
    h = int(l/2)
    for title, text in zip(dataset['title'], dataset['text']):
        input = InputExample(texts=[title, text], label = 1.0)
        samples_set.append(input)
    for title, text in zip(dataset['title'][0:h], dataset['text'][h:h*2]):
        input = InputExample(texts=[title, text], label = 0.0)
        samples_set.append(input)
    for title, text in zip(dataset['title'][h:h*2], dataset['text'][0:h]):
        input = InputExample(texts=[title, text], label = 0.0)
        samples_set.append(input)
    return samples_set  

In [78]:
train_samples_set = InputData_tri(train_samples)
test_samples_set = InputData_tri(test_samples)
dev_samples_set = InputData_tri(dev_samples)

In [79]:
scores = 0
for i in range(test_samples.shape[0]):
    query_emb = model.encode(test_samples['title'][i])
    doc_emb = model.encode(test_samples['text'][i])
    scores = scores + util.dot_score(query_emb, doc_emb)[0].cpu().tolist()[0]
average_scores = scores/test_samples.shape[0]
average_scores

0.6346251873333523

In [87]:
scores = 0
for i in range(test_samples.shape[0]-2):
    query_emb = model.encode(test_samples['title'][i])
    doc_emb = model.encode(test_samples['text'][i+2])
    scores = scores + util.dot_score(query_emb, doc_emb)[0].cpu().tolist()[0]
average_scores = scores/test_samples.shape[0]
average_scores

0.1123767632303146

In [None]:
scores = 0
for i in range(test_samples.shape[0]):
    query_emb = model.encode(test_samples['title'][i])
    doc_emb = model.encode(test_samples['text'][i])
    scores = scores + util.dot_score(query_emb, doc_emb)[0].cpu().tolist()[0]
average_scores = scores/test_samples.shape[0]
average_scores

In [94]:
scores = []
label = []
for data in test_samples_set:
    query_emb = model.encode(data.texts[0])
    doc_emb = model.encode(data.texts[1])
    scores.append(util.dot_score(query_emb, doc_emb)[0].cpu().tolist()[0])
    label.append(data.label)

In [99]:
from sklearn.metrics import roc_auc_score
roc_auc_score(label, scores)

0.9947286435929074

In [107]:
i = 0
j = 0
for data in test_samples_set:
    query_emb = model_ori.encode(data.texts[0])
    doc_emb = model_ori.encode(data.texts[1])
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()[0]
    label = data.label
    i += 1
    if scores > 0.5:
        if label == 1.0:
            j += 1
    else:
        if label == 0.0:
            j += 1
j/i


0.9118798955613577

In [108]:
i = 0
j = 0
for data in test_samples_set:
    query_emb = model.encode(data.texts[0])
    doc_emb = model.encode(data.texts[1])
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()[0]
    label = data.label
    i += 1
    if scores > 0.5:
        if label == 1.0:
            j += 1
    else:
        if label == 0.0:
            j += 1
j/i

0.9634464751958225

In [80]:
train_dataloader = DataLoader(train_samples_set, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

In [81]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x15e48c4f250>

In [82]:
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples_set, name="dev")

In [83]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

In [84]:
# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/288 [00:00<?, ?it/s]

Iteration:   0%|          | 0/288 [00:00<?, ?it/s]

Iteration:   0%|          | 0/288 [00:00<?, ?it/s]

Iteration:   0%|          | 0/288 [00:00<?, ?it/s]

In [39]:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples_set, name="test")
# test_evaluator(model, output_path=model_save_path)



OSError: output/fune_tuning_model-multi-qa-distilbert-cos-v1-2024-05-18_16-40-38 does not appear to have a file named config.json. Checkout 'https://huggingface.co/output/fune_tuning_model-multi-qa-distilbert-cos-v1-2024-05-18_16-40-38/tree/None' for available files.

In [27]:
input = InputExample(texts=[train_samples['title'][0], train_samples['text'][0]])

In [28]:
input.texts[0]

'Is there a [set of] rules/patterns that apply to elements'