In [2]:
import logging
import sys
import traceback
from datetime import datetime
import datasets
from datasets import load_dataset
import sentence_transformers
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

In [3]:
# Local folder holding the base checkpoint; a raw string keeps the Windows backslashes literal.
path = r'D:\SBERT-Training\distilbert-base-uncased'
# Per the message this cell prints, the folder is not a saved sentence-transformers
# model, so the library builds a new one around it with mean pooling.
model = SentenceTransformer(model_name_or_path=path)

No sentence-transformers model found with name D:\SBERT-Training\distilbert-base-uncased. Creating a new one with mean pooling.


In [4]:
# Run configuration: model label used for naming, batch size and number of epochs.
model_name = "distilbert-base-uncased"
train_batch_size = 16
num_epochs = 1

# Timestamped INFO-level logging so progress messages are shown.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

# The output directory name embeds the model name (slashes replaced by dashes)
# followed by the current date and time.
output_dir = "output/training_stsbenchmark_{}-{}".format(
    model_name.replace("/", "-"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
)


In [5]:
# Load the train / validation / test splits of the sentence-transformers/stsb
# dataset, one load_dataset call per split, in that order.
train_dataset, eval_dataset, test_dataset = (
    load_dataset("sentence-transformers/stsb", split=split_name)
    for split_name in ("train", "validation", "test")
)
# Log the summary (features and row count) of the training split.
logging.info(train_dataset)

2024-06-28 23:34:26 - Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 5749
})


### Save the datasets to local disk and load them back

In [21]:
# Persist the training split to a local folder (raw string instead of doubled backslashes).
train_dataset.save_to_disk(r'D:\SBERT-Training\datasets\train')

Saving the dataset (0/1 shards):   0%|          | 0/5749 [00:00<?, ? examples/s]

In [22]:
# Persist the validation split to a local folder (raw string instead of doubled backslashes).
eval_dataset.save_to_disk(r'D:\SBERT-Training\datasets\eval')

Saving the dataset (0/1 shards):   0%|          | 0/1500 [00:00<?, ? examples/s]

In [23]:
# Persist the test split to a local folder (raw string instead of doubled backslashes).
test_dataset.save_to_disk(r'D:\SBERT-Training\datasets\test')

Saving the dataset (0/1 shards):   0%|          | 0/1379 [00:00<?, ? examples/s]

In [24]:
# Rebind train_dataset to the copy stored on disk; the bare name on the
# last line makes the notebook display the dataset summary.
train_dataset = datasets.load_from_disk(r"D:\SBERT-Training\datasets\train")
train_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 5749
})

In [25]:
# Rebind test_dataset to the copy stored on disk; the bare name on the
# last line makes the notebook display the dataset summary.
test_dataset = datasets.load_from_disk(r"D:\SBERT-Training\datasets\test")
test_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 1379
})

In [26]:
# Rebind eval_dataset to the copy stored on disk; the bare name on the
# last line makes the notebook display the dataset summary.
eval_dataset = datasets.load_from_disk(r"D:\SBERT-Training\datasets\eval")
eval_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 1500
})

### Converting the splits to pandas DataFrames

In [27]:
import pandas as pd

In [28]:
# Copy the sentence1 / sentence2 / score columns of the training split into
# a pandas DataFrame (column order preserved) and display it.
train_data = pd.DataFrame(
    {column: train_dataset[column] for column in ("sentence1", "sentence2", "score")}
)
train_data

Unnamed: 0,sentence1,sentence2,score
0,A plane is taking off.,An air plane is taking off.,1.00
1,A man is playing a large flute.,A man is playing a flute.,0.76
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,0.76
3,Three men are playing chess.,Two men are playing chess.,0.52
4,A man is playing the cello.,A man seated is playing the cello.,0.85
...,...,...,...
5744,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia,0.00
5745,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...,0.00
5746,President heading to Bahrain,President Xi: China to continue help to fight ...,0.00
5747,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders,0.00


In [29]:
# Copy the sentence1 / sentence2 / score columns of the validation split into
# a pandas DataFrame (column order preserved) and display it.
eval_data = pd.DataFrame(
    {column: eval_dataset[column] for column in ("sentence1", "sentence2", "score")}
)
eval_data

Unnamed: 0,sentence1,sentence2,score
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,1.00
1,A young child is riding a horse.,A child is riding a horse.,0.95
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,1.00
3,A woman is playing the guitar.,A man is playing guitar.,0.48
4,A woman is playing the flute.,A man is playing a flute.,0.55
...,...,...,...
1495,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?,0.40
1496,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations,0.00
1497,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o...",0.40
1498,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...,0.00


In [30]:
# Copy the sentence1 / sentence2 / score columns of the test split into
# a pandas DataFrame (column order preserved) and display it.
test_data = pd.DataFrame(
    {column: test_dataset[column] for column in ("sentence1", "sentence2", "score")}
)
test_data

Unnamed: 0,sentence1,sentence2,score
0,A girl is styling her hair.,A girl is brushing her hair.,0.50
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,0.72
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,1.00
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,0.84
4,A man is playing a harp.,A man is playing a keyboard.,0.30
...,...,...,...
1374,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks,0.00
1375,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe...",0.20
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...,0.20
1377,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...,0.00


### Training, evaluation and saving (continued)

In [31]:
# 3. Training loss
# CosineSimilarityLoss takes two text columns plus one similarity score column
# (scores between 0 and 1); reference:
# https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosinesimilarityloss
# Alternative left by the author: losses.CoSENTLoss(model=model)
train_loss = losses.CosineSimilarityLoss(model)

In [32]:
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
# The sentences and gold scores are read straight from `eval_dataset`, the same
# way the test evaluator reads `test_dataset`. The pandas copy (`eval_data`)
# would hand the evaluator pandas Series, which only behave like plain lists
# while the DataFrame keeps its default 0..n-1 index.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

# Upgrade/install `accelerate` and install `transformers` with its `torch` extra
# into the environment of the running kernel.
# NOTE(review): no versions are pinned and this cell sits after the import cell;
# presumably it was added to satisfy a Trainer dependency error. Confirm, then
# consider pinning versions and moving it to the top of the notebook.
%pip install accelerate -U
%pip install transformers[torch]

In [33]:
# 5. Training arguments

args = SentenceTransformerTrainingArguments(
    # Destination for checkpoints (the one required argument).
    output_dir=output_dir,
    # Schedule and batching; evaluation reuses the training batch size.
    num_train_epochs=num_epochs,
    warmup_ratio=0.1,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    # Mixed precision: switch fp16 off if the GPU reports it cannot run FP16,
    # and switch bf16 on only for GPUs that support BF16.
    bf16=False,
    fp16=True,
    # Tracking / debugging: log, evaluate and checkpoint every 100 steps.
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # Run name; picked up by W&B when `wandb` is installed.
    run_name="sts",
)

In [34]:
# 6. Wire the model, arguments, datasets, loss and evaluator into a trainer,
#    then run the training loop.
trainer = SentenceTransformerTrainer(
    args=args,
    model=model,
    loss=train_loss,
    evaluator=dev_evaluator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Last expression of the cell, so the value returned by train() is displayed.
trainer.train()

  0%|          | 0/360 [00:00<?, ?it/s]

{'loss': 0.0569, 'grad_norm': 0.6537118554115295, 'learning_rate': 4.012345679012346e-05, 'epoch': 0.28}


  0%|          | 0/94 [00:00<?, ?it/s]

2024-06-28 23:41:55 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset:
2024-06-28 23:42:32 - Cosine-Similarity :	Pearson: 0.8316	Spearman: 0.8311
2024-06-28 23:42:32 - Manhattan-Distance:	Pearson: 0.7852	Spearman: 0.7905
2024-06-28 23:42:32 - Euclidean-Distance:	Pearson: 0.7850	Spearman: 0.7899
2024-06-28 23:42:32 - Dot-Product-Similarity:	Pearson: 0.7634	Spearman: 0.7733
2024-06-28 23:42:34 - Saving model checkpoint to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-100
2024-06-28 23:42:34 - Save model to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-100


{'eval_loss': 0.03234798088669777, 'eval_sts-dev_pearson_cosine': 0.8315716867771941, 'eval_sts-dev_spearman_cosine': 0.8311043630445402, 'eval_sts-dev_pearson_manhattan': 0.7852257137637472, 'eval_sts-dev_spearman_manhattan': 0.7904576983895036, 'eval_sts-dev_pearson_euclidean': 0.7850433859759056, 'eval_sts-dev_spearman_euclidean': 0.7899307151847849, 'eval_sts-dev_pearson_dot': 0.7634203913355301, 'eval_sts-dev_spearman_dot': 0.7732731832401658, 'eval_sts-dev_pearson_max': 0.8315716867771941, 'eval_sts-dev_spearman_max': 0.8311043630445402, 'eval_runtime': 96.1762, 'eval_samples_per_second': 15.596, 'eval_steps_per_second': 0.977, 'epoch': 0.28}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 0.0294, 'grad_norm': 0.7777989506721497, 'learning_rate': 2.4691358024691357e-05, 'epoch': 0.56}


  0%|          | 0/94 [00:00<?, ?it/s]

2024-06-28 23:47:32 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset:
2024-06-28 23:48:09 - Cosine-Similarity :	Pearson: 0.8540	Spearman: 0.8538
2024-06-28 23:48:09 - Manhattan-Distance:	Pearson: 0.8223	Spearman: 0.8248
2024-06-28 23:48:09 - Euclidean-Distance:	Pearson: 0.8224	Spearman: 0.8253
2024-06-28 23:48:09 - Dot-Product-Similarity:	Pearson: 0.7917	Spearman: 0.7953
2024-06-28 23:48:11 - Saving model checkpoint to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-200
2024-06-28 23:48:11 - Save model to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-200


{'eval_loss': 0.028163645416498184, 'eval_sts-dev_pearson_cosine': 0.8540389125075275, 'eval_sts-dev_spearman_cosine': 0.8537676301971472, 'eval_sts-dev_pearson_manhattan': 0.8222901771642033, 'eval_sts-dev_spearman_manhattan': 0.8247671160730421, 'eval_sts-dev_pearson_euclidean': 0.8223848586207668, 'eval_sts-dev_spearman_euclidean': 0.8252880529621309, 'eval_sts-dev_pearson_dot': 0.7917213035466448, 'eval_sts-dev_spearman_dot': 0.7953351854069197, 'eval_sts-dev_pearson_max': 0.8540389125075275, 'eval_sts-dev_spearman_max': 0.8537676301971472, 'eval_runtime': 96.3017, 'eval_samples_per_second': 15.576, 'eval_steps_per_second': 0.976, 'epoch': 0.56}
{'loss': 0.0271, 'grad_norm': 0.7825676798820496, 'learning_rate': 9.259259259259259e-06, 'epoch': 0.83}


  0%|          | 0/94 [00:00<?, ?it/s]

2024-06-28 23:53:24 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset:
2024-06-28 23:54:25 - Cosine-Similarity :	Pearson: 0.8657	Spearman: 0.8657
2024-06-28 23:54:25 - Manhattan-Distance:	Pearson: 0.8298	Spearman: 0.8348
2024-06-28 23:54:25 - Euclidean-Distance:	Pearson: 0.8303	Spearman: 0.8354
2024-06-28 23:54:25 - Dot-Product-Similarity:	Pearson: 0.8093	Spearman: 0.8120
2024-06-28 23:54:25 - Saving model checkpoint to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-300
2024-06-28 23:54:25 - Save model to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-300


{'eval_loss': 0.025070011615753174, 'eval_sts-dev_pearson_cosine': 0.8656936470200879, 'eval_sts-dev_spearman_cosine': 0.8657437835817434, 'eval_sts-dev_pearson_manhattan': 0.8298011668556344, 'eval_sts-dev_spearman_manhattan': 0.8348366586643715, 'eval_sts-dev_pearson_euclidean': 0.8302750243599466, 'eval_sts-dev_spearman_euclidean': 0.835359842903855, 'eval_sts-dev_pearson_dot': 0.8093013609209458, 'eval_sts-dev_spearman_dot': 0.812021947141667, 'eval_sts-dev_pearson_max': 0.8656936470200879, 'eval_sts-dev_spearman_max': 0.8657437835817434, 'eval_runtime': 125.2928, 'eval_samples_per_second': 11.972, 'eval_steps_per_second': 0.75, 'epoch': 0.83}


2024-06-28 23:57:06 - Saving model checkpoint to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-360
2024-06-28 23:57:06 - Save model to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56\checkpoint-360


{'train_runtime': 1217.1452, 'train_samples_per_second': 4.723, 'train_steps_per_second': 0.296, 'train_loss': 0.03560277389155494, 'epoch': 1.0}


TrainOutput(global_step=360, training_loss=0.03560277389155494, metrics={'train_runtime': 1217.1452, 'train_samples_per_second': 4.723, 'train_steps_per_second': 0.296, 'total_flos': 0.0, 'train_loss': 0.03560277389155494, 'epoch': 1.0})

In [35]:
# 7. Score the trained model on the STS Benchmark test split.
test_evaluator = EmbeddingSimilarityEvaluator(
    name="sts-test",
    main_similarity=SimilarityFunction.COSINE,
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
)
# Calling the evaluator on the model runs the evaluation; as the cell's last
# expression, the returned metrics are displayed.
test_evaluator(model)

2024-06-28 23:57:15 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-test dataset:
2024-06-28 23:57:43 - Cosine-Similarity :	Pearson: 0.8254	Spearman: 0.8233
2024-06-28 23:57:43 - Manhattan-Distance:	Pearson: 0.8143	Spearman: 0.8110
2024-06-28 23:57:43 - Euclidean-Distance:	Pearson: 0.8142	Spearman: 0.8110
2024-06-28 23:57:43 - Dot-Product-Similarity:	Pearson: 0.7421	Spearman: 0.7348


{'sts-test_pearson_cosine': 0.8254450627634602,
 'sts-test_spearman_cosine': 0.8233011242085118,
 'sts-test_pearson_manhattan': 0.8142549198916363,
 'sts-test_spearman_manhattan': 0.8109806921769,
 'sts-test_pearson_euclidean': 0.8141837739915558,
 'sts-test_spearman_euclidean': 0.811002712881752,
 'sts-test_pearson_dot': 0.7420972823132005,
 'sts-test_spearman_dot': 0.7348270129340309,
 'sts-test_pearson_max': 0.8254450627634602,
 'sts-test_spearman_max': 0.8233011242085118}

In [36]:
# 8. Write the trained and evaluated model into a "final" subfolder of the run's output directory.
final_output_dir = output_dir + "/final"
model.save(final_output_dir)


2024-06-28 23:57:43 - Save model to output/training_stsbenchmark_distilbert-base-uncased-2024-06-28_23-33-56/final
