In [1]:
import logging
import sys
import traceback
from datetime import datetime
import datasets
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

In [2]:
# Route INFO-level records to the notebook output with a timestamp prefix.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


In [3]:
# Run configuration: model label used in output names, local checkpoint location, batch size.
model_name = 'distilbert-base-uncased'
path = 'D:\\SBERT-Training\\distilbert-base-uncased'
train_batch_size = 16

# Load the checkpoint from the local directory. Per the recorded load log, no
# sentence-transformers model is found at this path, so a new one is created
# with mean pooling.
model = SentenceTransformer(path)

2024-07-10 18:49:11 - Use pytorch device_name: cpu
2024-07-10 18:49:11 - Load pretrained SentenceTransformer: D:\SBERT-Training\distilbert-base-uncased
2024-07-10 18:49:11 - No sentence-transformers model found with name D:\SBERT-Training\distilbert-base-uncased. Creating a new one with mean pooling.


In [4]:
# Timestamped run directory: "output/training_nli_<model name>-<YYYY-MM-DD_HH-MM-SS>".
# Any "/" in the model name is replaced so the name stays a single path component.
output_dir = "-".join(
    (
        "output/training_nli_" + model_name.replace("/", "-"),
        datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
    )
)

In [28]:
# Load the "contractnli_a" configuration of the kiddothe2b/contract-nli dataset:
# the "train" split for fitting and the "validation" split for in-training evaluation.
# The log message names ContractNLI (it previously said AllNLI, which is not what is loaded).
logging.info("Read ContractNLI train dataset")
train_dataset = load_dataset("kiddothe2b/contract-nli", "contractnli_a", split="train")
eval_dataset = load_dataset("kiddothe2b/contract-nli", "contractnli_a", split="validation")
# Log the dataset summary (features and row count) for a quick sanity check.
logging.info(train_dataset)


2024-07-10 19:56:00 - Read AllNLI train dataset
2024-07-10 19:56:05 - Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 6819
})


In [29]:
# Persist the training split to a local directory so later cells can reload it from disk.
train_dataset.save_to_disk(dataset_path='D:\\SBERT-Training\\datasets\\ContractNLI\\train')

Saving the dataset (0/1 shards):   0%|          | 0/6819 [00:00<?, ? examples/s]

In [30]:
# Persist the validation split to a local directory so later cells can reload it from disk.
eval_dataset.save_to_disk(dataset_path='D:\\SBERT-Training\\datasets\\ContractNLI\\eval')

Saving the dataset (0/1 shards):   0%|          | 0/978 [00:00<?, ? examples/s]

In [31]:
# Reload the training split from its on-disk copy; the bare name on the last
# line makes the notebook display the dataset summary.
train_data = datasets.load_from_disk(
    dataset_path="D:\\SBERT-Training\\datasets\\ContractNLI\\train"
)
train_data

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 6819
})

In [32]:
# Reload the validation split from its on-disk copy; the bare name on the last
# line makes the notebook display the dataset summary.
eval_data = datasets.load_from_disk(
    dataset_path="D:\\SBERT-Training\\datasets\\ContractNLI\\eval"
)
eval_data

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 978
})

In [33]:
# Softmax classification objective over sentence-pair embeddings.
# The classifier input width comes from the model's own embedding dimension,
# and num_labels=3 sets the number of output classes for the `label` column.
train_loss = losses.SoftmaxLoss(
    model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=3,
)

2024-07-10 20:00:59 - Softmax loss: #Vectors concatenated: 3


In [34]:
# STS-B evaluation split, read from a local on-disk copy.
stsb_eval_dataset = datasets.load_from_disk(
    dataset_path="D:\\SBERT-Training\\datasets\\stsb\\eval"
)

# Evaluator that compares embedding similarity of each sentence pair against
# the gold scores; cosine similarity is the primary metric.
dev_evaluator = EmbeddingSimilarityEvaluator(
    stsb_eval_dataset["sentence1"],
    stsb_eval_dataset["sentence2"],
    stsb_eval_dataset["score"],
    name="sts-dev",
    main_similarity=SimilarityFunction.COSINE,
)

In [35]:
# Baseline: score the model with the STS dev evaluator before training starts,
# so post-training numbers have something to be compared against.
logging.info("Evaluation before training:")
# Left as the cell's last expression so the returned metrics dict is displayed.
dev_evaluator(model)

2024-07-10 20:01:15 - Evaluation before training:
2024-07-10 20:01:15 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset:
2024-07-10 20:01:50 - Cosine-Similarity :	Pearson: 0.6370	Spearman: 0.6524
2024-07-10 20:01:50 - Manhattan-Distance:	Pearson: 0.6728	Spearman: 0.6793
2024-07-10 20:01:50 - Euclidean-Distance:	Pearson: 0.6723	Spearman: 0.6787
2024-07-10 20:01:50 - Dot-Product-Similarity:	Pearson: 0.2970	Spearman: 0.2951


{'sts-dev_pearson_cosine': 0.6369675487602559,
 'sts-dev_spearman_cosine': 0.652439417702261,
 'sts-dev_pearson_manhattan': 0.6727626699554619,
 'sts-dev_spearman_manhattan': 0.6792686799546075,
 'sts-dev_pearson_euclidean': 0.6722744772739301,
 'sts-dev_spearman_euclidean': 0.6787143086126965,
 'sts-dev_pearson_dot': 0.2970442554404871,
 'sts-dev_spearman_dot': 0.29508480972211193,
 'sts-dev_pearson_max': 0.6727626699554619,
 'sts-dev_spearman_max': 0.6792686799546075}

In [36]:
# Training configuration for the SentenceTransformerTrainer.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=output_dir,
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_ratio=0.1,
    learning_rate=1e-6,
    # FP16 mixed precision is a GPU feature, and the model-load log in this
    # notebook reports a CPU device. Request it only when the model actually
    # sits on a CUDA device instead of hard-coding True.
    fp16=model.device.type == "cuda",
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="nli-v1",  # Will be used in W&B if `wandb` is installed
)


In [38]:
# Wire everything together: the model and its softmax loss, the run arguments,
# the reloaded ContractNLI splits, and the STS dev evaluator.
trainer = SentenceTransformerTrainer(
    model=model,
    loss=train_loss,
    args=args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    evaluator=dev_evaluator,
)

# Start fine-tuning; as the last expression, the call's return value is displayed.
trainer.train()


  0%|          | 0/427 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
# Held-out check: load the STS-B test split and score the model on it with the
# same evaluator type used for the dev set (cosine similarity as primary metric).
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
    test_dataset["sentence1"],
    test_dataset["sentence2"],
    test_dataset["score"],
    name="sts-test",
    main_similarity=SimilarityFunction.COSINE,
)
# Left as the last expression so the returned metrics dict is displayed.
test_evaluator(model)

2024-07-10 19:35:30 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-test dataset:
2024-07-10 19:36:01 - Cosine-Similarity :	Pearson: 0.5368	Spearman: 0.5421
2024-07-10 19:36:01 - Manhattan-Distance:	Pearson: 0.5727	Spearman: 0.5690
2024-07-10 19:36:01 - Euclidean-Distance:	Pearson: 0.5716	Spearman: 0.5678
2024-07-10 19:36:01 - Dot-Product-Similarity:	Pearson: 0.1173	Spearman: 0.0953


{'sts-test_pearson_cosine': 0.5367877980116353,
 'sts-test_spearman_cosine': 0.5420947097068436,
 'sts-test_pearson_manhattan': 0.5726520266708057,
 'sts-test_spearman_manhattan': 0.569024490085097,
 'sts-test_pearson_euclidean': 0.5716196483917076,
 'sts-test_spearman_euclidean': 0.5678475769867374,
 'sts-test_pearson_dot': 0.11725843845834863,
 'sts-test_spearman_dot': 0.0952750432122054,
 'sts-test_pearson_max': 0.5726520266708057,
 'sts-test_spearman_max': 0.569024490085097}

In [15]:
# Store the model in a "final" subfolder of this run's output directory.
final_output_dir = output_dir + "/final"
model.save(final_output_dir)

2024-07-10 19:36:18 - Save model to output/training_nli_distilbert-base-uncased-2024-07-10_18-49-15/final
