In [1]:
from datasets import load_dataset
from sentence_transformers import (
SentenceTransformer,
SentenceTransformerTrainer,
SentenceTransformerTrainingArguments
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator



The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [2]:
checkpoint = 'sentence-transformers/all-distilroberta-v1'
model = SentenceTransformer(checkpoint)



In [3]:
dataset = load_dataset("shawhin/ai-job-embedding-finetuning")

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 809
    })
    validation: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 101
    })
    test: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 102
    })
})


# Checking untrained Model

In [5]:
evaluater_valid = TripletEvaluator(
    anchors=dataset['validation']['query'],
    positives=dataset['validation']['job_description_pos'],
    negatives=dataset['validation']['job_description_neg'],
    name = 'si-job-validation'
)
evaluater_valid(model)

{'si-job-validation_cosine_accuracy': 0.8811880946159363}

Define Loss Function

In [6]:
loss = MultipleNegativesRankingLoss(model)

Defining Training Args

In [7]:
num_epochs = 1
batch_size= 16
learning_rate= 2e-5
FT_model_name = 'Job-Embedding-Fine-Tuned-distilroberta-v2'
training_args = SentenceTransformerTrainingArguments(
    output_dir = f'Fine_Tuned/{FT_model_name}',
    num_train_epochs = num_epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    learning_rate = learning_rate,
    warmup_ratio=0.1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy='steps',
    eval_steps=100,
    logging_steps=100,
)

# Now Fine Tune the Model

In [9]:
%time
trainer = SentenceTransformerTrainer(
    model = model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    loss= loss,
    evaluator=evaluater_valid,
)
trainer.train()

CPU times: total: 0 ns
Wall time: 0 ns


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss


TrainOutput(global_step=51, training_loss=0.8848036971746707, metrics={'train_runtime': 255.4748, 'train_samples_per_second': 3.167, 'train_steps_per_second': 0.2, 'total_flos': 0.0, 'train_loss': 0.8848036971746707, 'epoch': 1.0})

In [10]:
evaluater_test = TripletEvaluator(
    anchors=dataset['test']['query'],
    positives=dataset['test']['job_description_pos'],
    negatives=dataset['test']['job_description_neg'],
    name = 'si-job-test'
)
print('Validation: ', evaluater_valid(model))
print('Test: ', evaluater_test(model))

Validation:  {'si-job-validation_cosine_accuracy': 0.9900990128517151}
Test:  {'si-job-test_cosine_accuracy': 1.0}


In [12]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub(f'dawoodk/{FT_model_name}')