In [None]:
import os
import json
import torch
import datasets
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import (
    SentenceTransformer, models,
    losses, util,
    InputExample, evaluation,
    SentenceTransformerTrainingArguments, SentenceTransformerTrainer
)
from accelerate import Accelerator
from datasets import load_dataset

## Parameters

In [None]:
# Identifier of the base model to fine-tune, plus a short label for it
st_model = 'myrkur/sentence-transformer-parsbert-fa'
st_name = 'parsbert-fa'

# Directory that holds the fine-tuning CSV files
_ft_data_dir = 'Fine_tune_data/FT_Data_Mordad_1403'

# Alternative "CV" train/validation files (disabled):
# train_data_file = 'Fine_tune_data/FT_Data_Mordad_1403/Fine_tune_data_17-Mordad_CV/Ft_CV_train_data.csv'
# val_data_file = 'Fine_tune_data/FT_Data_Mordad_1403/Fine_tune_data_17-Mordad_CV/Ft_CV_val_data.csv'

# Train and validation data actually used by this notebook
train_data_file = f'{_ft_data_dir}/Ft_train_data.csv'
val_data_file = f'{_ft_data_dir}/Ft_val_data.csv'


In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the
# dataset and the model are loaded.
torch.cuda.empty_cache()

In [None]:
# Read the train/validation CSV files into one dataset object keyed by split
# name and drop the 'Unnamed: 0' column in the same expression (presumably
# the index column written when the CSVs were exported -- confirm).
# `load_dataset` is already imported in the imports cell at the top.
data_files = dict(train=train_data_file, validation=val_data_file)
data = load_dataset("csv", data_files=data_files).remove_columns('Unnamed: 0')

In [None]:
# Display the loaded `data` object to inspect what was read from the CSV files.
data

In [None]:
# Pull the two splits out of `data` under the names used by the rest of the notebook.
train_data, val_data = data['train'], data['validation']

In [None]:
# Check which Python type the training split object has.
type(train_data)

In [None]:
# Check which type is returned when the 'sentence1' column of the training split is selected.
type(train_data['sentence1'])

In [None]:
# Show the shape reported for `data` (presumably rows x columns per split -- confirm).
data.shape

In [None]:
# Display the training split object.
train_data

In [None]:
# Show one training example; the record at index 5 is an arbitrary pick, used
# only for display. The row is fetched once and its fields are read from it,
# instead of selecting three whole columns just to read a single element each.
example = train_data[5]
print("Sentence 1: ", example['sentence1'], "\nSentence 2: ", example['sentence2'], "\nScore: ", example['score'])

In [None]:
# Load the base model that will be fine-tuned, using a custom cache folder.
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally -- confirm this model actually needs it.
embedder = SentenceTransformer(
    st_model,
    trust_remote_code=True,
    cache_folder='XXXX-1/.cache/huggingface/hub',
)
# Original note (Windows-style form of the cache path): XXXXX-1\.cache\huggingface\hub

In [None]:
def main(model, st, train_dataset=None, eval_dataset=None):
    """Fine-tune ``model`` with CoSENT loss and save it under ``./FT_models/``.

    Args:
        model: SentenceTransformer instance to fine-tune. The same object is
            handed to the loss, the evaluator and the trainer, and it is the
            object saved at the end.
        st: String used as the sub-directory name below ``./FT_models/``
            where the fine-tuned model is written.
        train_dataset: Training split given to the trainer. Defaults to the
            notebook-level ``train_data``.
        eval_dataset: Validation split. It is given to the trainer and also
            feeds the similarity evaluator through its ``sentence1``,
            ``sentence2`` and ``score`` columns. Defaults to the
            notebook-level ``val_data``.

    Returns:
        None. Progress is printed and the model is written to disk.
    """
    # Fall back to the notebook globals so existing ``main(model, st)`` calls
    # behave exactly as before, while making the data dependency explicit.
    if train_dataset is None:
        train_dataset = train_data
    if eval_dataset is None:
        eval_dataset = val_data

    # Report how many processes Accelerate sees for this run.
    accelerator = Accelerator()
    print(f"Using GPUs: {accelerator.num_processes}")

    # Training objective.
    loss = losses.CoSENTLoss(model)

    # Evaluator comparing embedding similarity (cosine) with the gold scores.
    evaluator = evaluation.EmbeddingSimilarityEvaluator(
        sentences1=eval_dataset['sentence1'],
        sentences2=eval_dataset['sentence2'],
        scores=eval_dataset['score'],
        main_similarity=evaluation.SimilarityFunction.COSINE,
        name="semantic"
    )

    # Baseline before any fine-tuning. The result used to be thrown away;
    # print it so the pre-training score is visible in the notebook.
    baseline_metrics = evaluator(model)
    print(f"Baseline validation metrics: {baseline_metrics}")

    # Training arguments.
    # NOTE(review): an earlier tuning note in this notebook recorded
    # lr = 1e-6 (warmup_ratio = 0.1, num_epochs = 3, batch_size = 2,
    # weight_decay = 0.01) as the optimized set, but the learning rate used
    # here is 5e-6 -- confirm which value is intended.
    training_args = SentenceTransformerTrainingArguments(
        output_dir='./XXX-4-checkpoint',  # checkpoints are written here
        num_train_epochs=3,
        # seed=33  (disabled; the library default seed applies)
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        learning_rate=5e-6,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=True,  # mixed-precision (fp16) training
        # Evaluate and log every 50 steps, checkpoint every 100 steps.
        # save_steps stays a multiple of eval_steps, which
        # load_best_model_at_end relies on.
        eval_strategy="steps",
        eval_steps=50,
        logging_steps=50,
        save_steps=100,
        save_total_limit=3,
        save_only_model=True,
        # At the end of training, reload the checkpoint with the lowest
        # validation loss (lower is better, hence greater_is_better=False).
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        # auto_find_batch_size=True  (disabled)
    )

    # Train the model.
    trainer = SentenceTransformerTrainer(
        model=model,
        evaluator=evaluator,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        loss=loss
    )
    trainer.train()

    # Save the fine-tuned model; the path is the same as before.
    save_dir = f"./FT_models/{st}/"
    model.save_pretrained(save_dir)
    print(f"Fine-tuned model saved to: {save_dir}")

In [None]:
# Fine-tune the loaded embedder and save it under ./FT_models/<st_model>/.
# NOTE(review): `st_name` is defined in the parameters cell but never used;
# passing `st_model` (which contains a '/') nests the output directory under
# 'myrkur/' -- confirm this is the intended save location.
main(model=embedder, st=st_model)