In [None]:
import os
import warnings

from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import login
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

In [None]:
# Load environment variables from a local .env file so the Hugging Face token
# is never hard-coded in the notebook. `load_dotenv` (python-dotenv) is
# imported in the notebook's top import cell.
load_dotenv()

# Token passed to `login(...)` near the end of the notebook.
huggingface_key = os.getenv("HUGGINGFACE_KEY")
if not huggingface_key:
    # Flag the missing token now instead of discovering it only after the
    # training cells have already run.
    warnings.warn(
        "HUGGINGFACE_KEY is not set; the Hugging Face Hub login cell will "
        "not receive a token.",
        stacklevel=2,
    )

In [None]:
# Base checkpoint that the rest of the notebook fine-tunes.
model_name = "sentence-transformers/all-distilroberta-v1"
model = SentenceTransformer(model_name_or_path=model_name)

In [None]:
# Movie dataset; later cells index its "train", "validation" and "test" splits.
dataset = load_dataset(path="trihoang131/movie_dataset_50K")

In [None]:
# Triplet evaluator built from the validation split: every anchor overview is
# paired with the positive and negative overview stored alongside it.
validation_split = dataset["validation"]
evaluator_valid = TripletEvaluator(
    name="ai-movie-validation",
    anchors=validation_split["overview"],
    positives=validation_split["positive_overview"],
    negatives=validation_split["negative_overview"],
)

In [None]:
# Hyperparameters for the fine-tuning run.
num_epochs = 1
batch_size = 32
lr = 2e-5
# Also reused as the repository name when the model is pushed to the Hub.
finetuned_model_name = "distilroberta-movies-embeddings"

train_args = SentenceTransformerTrainingArguments(
    # Output location for this run.
    output_dir=f"models/{finetuned_model_name}",
    # Optimisation schedule.
    num_train_epochs=num_epochs,
    learning_rate=lr,
    warmup_ratio=0.1,
    # Batching: same size for training and evaluation. Duplicate-free batches
    # are used because MultipleNegativesRankingLoss benefits from having no
    # duplicate samples in a batch.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Evaluate and log on the same 100-step cadence.
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    report_to="none",
)

In [None]:
# Training objective, bound to the model that is being fine-tuned.
loss = MultipleNegativesRankingLoss(model=model)

In [None]:
# Wire the model, loss, data splits and validation evaluator into one trainer,
# then run the fine-tuning.
# NOTE(review): the splits are passed as-is, with no column selection; confirm
# their columns are exactly the inputs MultipleNegativesRankingLoss expects.
trainer = SentenceTransformerTrainer(
    model=model,
    args=train_args,
    loss=loss,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    evaluator=evaluator_valid,
)
trainer.train()

In [None]:
# Triplet evaluator over the held-out test split, built the same way as the
# validation evaluator.
test_split = dataset["test"]
evaluator_test = TripletEvaluator(
    name="ai-movie-test",
    anchors=test_split["overview"],
    positives=test_split["positive_overview"],
    negatives=test_split["negative_overview"],
)

# Report the trained model's results on both splits, validation first.
validation_results = evaluator_valid(model)
print("Validation:", validation_results)
test_results = evaluator_test(model)
print("Test:", test_results)

In [None]:
# Authenticate against the Hugging Face Hub with the token read from the
# environment earlier in the notebook. `login` is imported in the top import
# cell so that every dependency is visible in one place.
login(token=huggingface_key)

In [None]:
# Publish the fine-tuned model to the Hub under the author's namespace.
model.push_to_hub(repo_id=f"trihoang131/{finetuned_model_name}")