finetuning all-MiniLM-L6-v2 model using sentence_transformers library.

the training process in this heavily inspired by instructions in [Training and Finetuning Embedding Models with Sentence Transformers v3](https://huggingface.co/blog/train-sentence-transformers#trainer)

In [1]:
import util 
import numpy as np
import pandas as pd


## results of all-MiniLM-L6-v2 before finetuning

In [2]:
model_name = 'all-MiniLM-L6-v2'

from sentence_transformers import SentenceTransformer

file_path = '../data/datasets/mohler_dataset.csv'

exams = pd.read_csv(file_path)

LIMIT = exams.shape[0]
data = exams.iloc[:,1:7]

data = data.drop(columns=['score_me','score_other'])

correct_answers = data.iloc[:,1].to_numpy().reshape(-1,1)
student_answers = data.iloc[:,2].to_numpy().reshape(-1,1)


Model = SentenceTransformer(model_name)

s = student_answers.reshape(-1).tolist()
c = correct_answers.reshape(-1).tolist()

correct_answers_embedded = Model.encode(c)
student_answers_embedded = Model.encode(s)

dot_score_matrice = np.dot(correct_answers_embedded , student_answers_embedded.T)
dot_scores = np.diag(dot_score_matrice).reshape(-1,1) 

score_avg = data['score_avg'].to_numpy().reshape(-1,1)

util.print_scores(score_avg, dot_scores * 5) # result for rounding up to 0.25. the result wasn't any better for 0.5



rmse:  1.716176163092704 r:  0.49212198221556186


## preparing data

In [3]:
from datasets import Dataset
from sentence_transformers import SentenceTransformer

# path = 'U:\\gradingPapers\\datasets\\xbai-train-answers_embedded-k-5.npy'
train_path , test_path  = '../data/datasets/train_all_row_k_is_11.csv' , '../data/datasets/test_with_eval_all_row_k_is_14.csv'


data_train = pd.read_csv(train_path)
data_train = data_train[['desired_answer', 'student_answer', 'score_avg']]


data_train['score_avg'] = data_train['score_avg']/5
data_train  = data_train.rename(columns={'desired_answer': 'sentence1', 'student_answer': 'sentence2', 'score_avg' :'label'})
data_train.to_csv('../data/datasets/train_triplets_row_k_is_11.csv')

#########

data_test = pd.read_csv(test_path)
data_test = data_test[['desired_answer', 'student_answer', 'score_avg']]


data_test['score_avg'] = data_test['score_avg']/5

data_test  = data_test.rename(columns={'desired_answer': 'sentence1', 'student_answer': 'sentence2', 'score_avg' :'label'})
data_test.to_csv('../data/datasets/test_triplets_row_k_is_14.csv')


dataset = Dataset.from_pandas(data_train, split='train')
dataset = Dataset.from_pandas(data_test, split='test')

#####

data_eval = pd.read_csv('../data/datasets/eval_all_row_k_is_3.csv')
data_eval = data_eval[['desired_answer', 'student_answer', 'score_avg']]


data_eval['score_avg'] = data_eval['score_avg']/5

data_eval  = data_eval.rename(columns={'desired_answer': 'sentence1', 'student_answer': 'sentence2', 'score_avg' :'label'})
data_eval.to_csv('../data/datasets/eval_triplets_row_k_is_3.csv')

#####


dataset = Dataset.from_pandas(data_train, split='train')
dataset = Dataset.from_pandas(data_eval, split='eval')
dataset = Dataset.from_pandas(data_test, split='test')


## train with evaluation

In [4]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator


# 1. Load a model to finetune with 2. (Optional) model card data
model = SentenceTransformer(model_name) # changed
# 3. Load a dataset to finetune on
# dataset = load_dataset("sentence-transformers/all-nli", "triplet")

train_dataset  = Dataset.from_pandas(data_train, split='train')


#eval_dataset = dataset["dev"]
eval_dataset = Dataset.from_pandas(data_eval, split='eval')


#test_dataset = dataset["test"]
test_dataset = Dataset.from_pandas(data_test, split='test')


# 4. Define a loss function
loss = CoSENTLoss(model)

# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"models/{model_name}",
    # Optional training parameters:
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if GPU can't handle FP16
    bf16=False,  # Set to True if GPU supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy='steps',#'no', #"steps",
    eval_steps=40,
    save_strategy="steps",
    save_steps=40,
    save_total_limit=2,
    logging_steps=40,
    run_name=model_name,  # Used in W&B if `wandb` is installed
)


# 6. (Optional) Create an evaluator & evaluate the base model
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1 =eval_dataset["sentence1"],
    sentences2 =eval_dataset["sentence2"],
    scores =eval_dataset["label"],
    name=model_name,
)
dev_evaluator(model)

# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,#None, #args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset, #
    loss=loss,
    evaluator=dev_evaluator, #
)
trainer.train()


# (Optional) Evaluate the trained model on the test set, after training completes
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1 =eval_dataset["sentence1"],
    sentences2 =eval_dataset["sentence2"],
    scores =eval_dataset["label"],
    name=model_name,
)



test_evaluator(model)


# (Optional) Evaluate the trained model on the test set, after training completes
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1 =test_dataset["sentence1"],
    sentences2 =test_dataset["sentence2"],
    scores =test_dataset["label"],
    name=model_name,
)



test_evaluator(model)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,All-minilm-l6-v2 Pearson Cosine,All-minilm-l6-v2 Spearman Cosine
40,5.6989,4.980604,0.531558,0.571732
80,4.289,3.428425,0.576483,0.612186
120,2.6065,2.923556,0.62889,0.666007
160,2.594,2.884269,0.639173,0.687426
200,2.4098,2.9749,0.693015,0.730666
240,2.0921,3.139904,0.683579,0.736058
280,1.9158,3.101959,0.701266,0.744399
320,2.1554,2.940845,0.714505,0.752634
360,1.8955,2.99681,0.71765,0.743283
400,1.6914,3.183284,0.722582,0.750845


{'all-MiniLM-L6-v2_pearson_cosine': 0.7287760514161874,
 'all-MiniLM-L6-v2_spearman_cosine': 0.7665893036676932}

## results after finetuning

In [5]:

file_path = '../data/datasets/test_with_eval_all_row_k_is_14.csv'

exams = pd.read_csv(file_path)

exams = exams[['desired_answer', 'student_answer', 'score_avg']]

correct_answers = exams['desired_answer'].to_numpy().reshape(-1,1)
student_answers = exams['student_answer'].to_numpy().reshape(-1,1)


c = correct_answers.reshape(-1).tolist()
s = student_answers.reshape(-1).tolist()

correct_answers_embedded = model.encode(c)
student_answers_embedded = model.encode(s)

dot_score_matrice = np.dot(correct_answers_embedded , student_answers_embedded.T)
dot_scores = np.diag(dot_score_matrice).reshape(-1,1)    


score_avg = exams['score_avg'].to_numpy().reshape(-1,1)
predicted_scores = (dot_scores * 5 ) 

predicted_scores = np.clip(predicted_scores , 0, 5)

util.print_scores(score_avg, predicted_scores)

rmse:  0.974232003614517 r:  0.7287608095510645


an obvious improvement

## simple regression model on scores.

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

file_path = '../data/datasets/test_with_eval_all_row_k_is_14.csv' # './dataset/test_all_row_k_is_11.csv



exams = pd.read_csv(file_path)

exams = exams[['desired_answer', 'student_answer', 'score_avg']]

correct_answers = exams['desired_answer'].to_numpy().reshape(-1,1)
student_answers = exams['student_answer'].to_numpy().reshape(-1,1)


c = correct_answers.reshape(-1).tolist()
s = student_answers.reshape(-1).tolist()

correct_answers_embedded = model.encode(c)
student_answers_embedded = model.encode(s)

dot_score_matrice = np.dot(correct_answers_embedded , student_answers_embedded.T)
dot_scores = np.diag(dot_score_matrice).reshape(-1,1)    

X = dot_scores * 5
y = exams['score_avg'].to_numpy().reshape(-1,1)


for i in range(1,5):
    poly = PolynomialFeatures(degree=i, include_bias=False)
    poly_features = poly.fit_transform(X)

    # X = poly_features # uncomment to train a pure linear regression of change degree to 1
    X_train , X_test, y_train, y_test = train_test_split(X , y ,shuffle=True ,train_size=0.8)

    linear_model = LinearRegression()
    
    linear_model.fit(X_train, y_train)
    
    y_pred_train = linear_model.predict(X_train)
    y_pred_train = np.clip(y_pred_train , 0, 5)

    y_pred_test = linear_model.predict(X_test)
    y_pred_test = np.clip(y_pred_test , 0, 5)
    print(f'result for poly nomial regression with degree {i}')
    util.print_scores(y_test, y_pred_test)
    print()


result for poly nomial regression with degree 1
rmse:  0.7244350771730226 r:  0.7374733667698069

result for poly nomial regression with degree 2
rmse:  0.7391949523750205 r:  0.7897683189393312

result for poly nomial regression with degree 3
rmse:  0.6943670756622625 r:  0.778751187551213

result for poly nomial regression with degree 4
rmse:  0.7902873767186833 r:  0.7479107166436902



In [7]:
# poly = PolynomialFeatures(degree=3, include_bias=False)
# poly_features = poly.fit_transform(X)

# X = poly_features # uncomment to train a pure linear regression of change degree to 1
X_train , X_test, y_train, y_test = train_test_split(X , y ,shuffle=True ,train_size=0.8)

linear_model = LinearRegression()


linear_model.fit(X_train, y_train)


y_pred_train = linear_model.predict(X_train)
y_pred_train = np.clip(y_pred_train , 0, 5)

y_pred_test = linear_model.predict(X_test)
y_pred_test = np.clip(y_pred_test , 0, 5)



##### eval on train
print(f'loss function on train:')
# Calculate and print R^2 score.
r2 = r2_score(y_train, y_pred_train)
print(f"R-squared: {r2:.4f}")

# Calculate and print RMSE for train
rmse = mean_squared_error(y_train, y_pred_train)  ** 0.5
print(f"Root mean squared error: {rmse:.4f}")

######  eval on test
print('\n' , 10*'-','\n')
print(f'loss function on test:')


# Calculate and print R^2 score.
r2 = r2_score(y_test, y_pred_test)
print(f"R-squared: {r2:.4f}")

# Calculate and print RMSE for test
rmse = mean_squared_error(y_test, y_pred_test)  ** 0.5
print(f"Root mean squared error: {rmse:.4f}")


loss function on train:
R-squared: 0.5372
Root mean squared error: 0.7769

 ---------- 

loss function on test:
R-squared: 0.6225
Root mean squared error: 0.6630
