# Fine-tuning the base bi-encoder on the glaiveai/godot_4_docs dataset from the Hugging Face Hub

In [None]:
from datasets import load_dataset

# Hub identifier of the dataset used for fine-tuning.
dataset_id = "glaiveai/godot_4_docs"

# No split is requested, so `ds` holds whatever load_dataset returns for the
# whole repository.
ds = load_dataset(dataset_id)

In [None]:
# Switch the dataset's output format to "torch" (in place).
ds.set_format(type="torch")

In [None]:
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, SentenceTransformerModelCardData
from sentence_transformers.training_args import BatchSamplers

In [None]:
# Hub id of the pretrained bi-encoder that is loaded and fine-tuned below.
checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

In [None]:
# Load the pretrained bi-encoder that will be fine-tuned.
model = SentenceTransformer(model_name_or_path=checkpoint)

In [None]:
# The checkpoint is a dot-product model (note the "-dot-" in its name), so
# score pairs with the dot product instead of the loss's default cosine
# similarity. The default scale of 20 is tuned for cosine similarity; the
# library documentation recommends scale=1 when the dot product is used.
from sentence_transformers.util import dot_score

loss = MultipleNegativesRankingLoss(model, scale=1.0, similarity_fct=dot_score)

In [None]:
# Optimisation settings for the single fine-tuning pass.
optimisation_options = dict(
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Mixed precision: fp16 on, bf16 off. Set fp16 to False if the GPU
    # reports it cannot run FP16; set bf16 to True on GPUs that support BF16.
    fp16=True,
    bf16=False,
    # Losses that rely on in-batch negatives benefit from duplicate-free batches.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

# Evaluation, checkpointing and logging cadence.
tracking_options = dict(
    # Evaluation is switched off; eval_steps only takes effect if
    # eval_strategy is changed to "steps".
    eval_strategy="no",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
)

args = SentenceTransformerTrainingArguments(
    output_dir="models/multi-qa-mpnet-base-dot-v1",  # the only required argument
    **optimisation_options,
    **tracking_options,
)

In [None]:
# Wire the model, loss, training arguments and data into a trainer.
# NOTE(review): `ds` is passed exactly as returned by load_dataset (no split
# selected) -- confirm that training on the whole object is intended.
trainer = SentenceTransformerTrainer(args=args, model=model, loss=loss, train_dataset=ds)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Persist the fine-tuned model to a local directory.
final_model_dir = "models/multi-qa-mpnet-glaive-godotdocs-dot/final"
model.save_pretrained(final_model_dir)



In [None]:
# Optional: publish the fine-tuned model to the Hugging Face Hub.
hub_repo_name = "multi-qa-mpnet-glaive-godotdocs-dot"
model.push_to_hub(hub_repo_name)