In [1]:
!pip install -q huggingface_hub[hf_xet]

In [2]:
from datasets import load_dataset

daa_file_path='/kaggle/input/pair-dataset/paired_dataset_2025-04-10_06-34-03.csv'

train_ds=load_dataset('csv', data_files=daa_file_path)['train']
train_ds

Dataset({
    features: ['anchor', 'positive'],
    num_rows: 4988
})

In [3]:
train_ds[0]

{'anchor': 'R: In April, Natalia sold 48 clips. In May, she sold half as many, which is 48 / 2.\nSo, the total number of clips sold in April and May is 48 + (48 / 2).\n\nA: 72',
 'positive': 'R: In April, Natalia sold 48 clips. In May, she sold half as many, which is 48 / 2.\nSo, the total number of clips sold in April and May is 48 + (48 / 2).\n\nA: 72'}

# Pre-processing dataset

We will split dataset into training, testing and evaluation.

In [4]:
train_test_split=train_ds.train_test_split(test_size=0.1, seed=42)
train_test_split

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 4489
    })
    test: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 499
    })
})

In [5]:
train_eval_split=train_test_split['train'].train_test_split(test_size=0.1, seed=42)
train_eval_split

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 4040
    })
    test: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 449
    })
})

In [6]:
from datasets import DatasetDict

findal_ds=DatasetDict({
    'train': train_eval_split['train'],
    'test': train_eval_split['test'],
    'eval': train_test_split['test']
})
findal_ds

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 4040
    })
    test: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 449
    })
    eval: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 499
    })
})

In [7]:
MODEL_NAME='all-MiniLM-L6-v2'
OUTPUT_DIR='encoder-L6-V2/training'
MODEL_DIR='aisuko/encoder-L6-V2'

# Load the Model

In [8]:
# https://www.sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss
import torch
from sentence_transformers import SentenceTransformer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model=SentenceTransformer(MODEL_NAME).to(device)
model

2025-04-10 08:59:28.643472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744275568.666644     119 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744275568.673666     119 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [9]:
from sentence_transformers.losses import MultipleNegativesRankingLoss

loss=MultipleNegativesRankingLoss(model)

In [10]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")

dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

In [11]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformerTrainer
from transformers import EarlyStoppingCallback


train_args=SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    eval_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    report_to='tensorboard',
    run_name='encoder-L6-V2'
)

# evaluate the base model
dev_evaluator(model)

trainer=SentenceTransformerTrainer(
    model=model,
    args=train_args,
    train_dataset=findal_ds['train'],
    eval_dataset=findal_ds['test'],
    loss=loss,
    evaluator=dev_evaluator,
    
)

# early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# trainer.train(callbacks=[early_stopping_callback])

trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine
100,0.003,1.5e-05,0.874435,0.871489
200,0.0,8e-06,0.876419,0.873709
300,0.0,7e-06,0.876496,0.873711
400,0.0,6e-06,0.876508,0.873731
500,0.0,6e-06,0.8765,0.8737


TrainOutput(global_step=506, training_loss=0.0006024160485919954, metrics={'train_runtime': 65.4171, 'train_samples_per_second': 123.515, 'train_steps_per_second': 7.735, 'total_flos': 0.0, 'train_loss': 0.0006024160485919954, 'epoch': 2.0})

In [12]:
# evaluate the trained model on the test set
test_dataset = load_dataset("sentence-transformers/stsb", split="test")

test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)

test_evaluator(model)

{'sts-test_pearson_cosine': 0.8765015560278244,
 'sts-test_spearman_cosine': 0.8737031292128038}

In [13]:
model.save_pretrained(MODEL_DIR)

In [14]:
import kagglehub

VARIATION_SLUG='v0.1.3'

handler = f'aisuko/encoder-L6-V2/transformers/{VARIATION_SLUG}'

kagglehub.model_upload(
    handle=handler,
    local_model_dir=MODEL_DIR,
    license_name='MIT',
    version_notes='v0.1.0'
)

Uploading Model https://www.kaggle.com/models/aisuko/encoder-L6-V2/transformers/v0.1.3 ...
Starting upload for file aisuko/encoder-L6-V2/tokenizer.json


Uploading: 100%|██████████| 712k/712k [00:00<00:00, 853kB/s]

Upload successful: aisuko/encoder-L6-V2/tokenizer.json (695KB)
Starting upload for file aisuko/encoder-L6-V2/modules.json



Uploading: 100%|██████████| 349/349 [00:00<00:00, 1.12kB/s]

Upload successful: aisuko/encoder-L6-V2/modules.json (349B)
Starting upload for file aisuko/encoder-L6-V2/model.safetensors



Uploading: 100%|██████████| 90.9M/90.9M [00:03<00:00, 28.7MB/s]

Upload successful: aisuko/encoder-L6-V2/model.safetensors (87MB)
Starting upload for file aisuko/encoder-L6-V2/README.md



Uploading: 100%|██████████| 28.8k/28.8k [00:00<00:00, 86.9kB/s]

Upload successful: aisuko/encoder-L6-V2/README.md (28KB)
Starting upload for file aisuko/encoder-L6-V2/tokenizer_config.json



Uploading: 100%|██████████| 1.46k/1.46k [00:00<00:00, 4.41kB/s]

Upload successful: aisuko/encoder-L6-V2/tokenizer_config.json (1KB)
Starting upload for file aisuko/encoder-L6-V2/config_sentence_transformers.json



Uploading: 100%|██████████| 205/205 [00:00<00:00, 638B/s]

Upload successful: aisuko/encoder-L6-V2/config_sentence_transformers.json (205B)
Starting upload for file aisuko/encoder-L6-V2/special_tokens_map.json



Uploading: 100%|██████████| 695/695 [00:00<00:00, 2.10kB/s]

Upload successful: aisuko/encoder-L6-V2/special_tokens_map.json (695B)
Starting upload for file aisuko/encoder-L6-V2/config.json



Uploading: 100%|██████████| 617/617 [00:00<00:00, 1.80kB/s]

Upload successful: aisuko/encoder-L6-V2/config.json (617B)
Starting upload for file aisuko/encoder-L6-V2/sentence_bert_config.json



Uploading: 100%|██████████| 53.0/53.0 [00:00<00:00, 155B/s]

Upload successful: aisuko/encoder-L6-V2/sentence_bert_config.json (53B)
Starting upload for file aisuko/encoder-L6-V2/vocab.txt



Uploading: 100%|██████████| 232k/232k [00:00<00:00, 338kB/s]

Upload successful: aisuko/encoder-L6-V2/vocab.txt (226KB)
Starting upload for file aisuko/encoder-L6-V2/1_Pooling/config.json



Uploading: 100%|██████████| 296/296 [00:00<00:00, 885B/s]

Upload successful: aisuko/encoder-L6-V2/1_Pooling/config.json (296B)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/aisuko/encoder-L6-V2/transformers/v0.1.3
