In [None]:
import os
import sys

# Find project root (cv-job-matcher-project/)
# Notebooks are at */notebooks/ so we need to go up TWO levels
cwd = os.getcwd()
if 'notebooks' in cwd:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(cwd))  # TWO levels up
else:
    PROJECT_ROOT = cwd
sys.path.insert(0, PROJECT_ROOT)

print(f"Project root: {PROJECT_ROOT}")

In [None]:
# QUICK_MODE toggle for demo/testing
# when True: single training run with best hyperparams (lr=5e-05, warmup=0.05)
# when False: full hyperparameter sweep (6 combinations)

QUICK_MODE = True  # set to False for full hyperparameter sweep

# best hyperparameters from sweep (user specified)
BEST_LR = 5e-05
BEST_WARMUP = 0.05

if QUICK_MODE:
    print(f'QUICK_MODE enabled: single run with lr={BEST_LR}, warmup={BEST_WARMUP}')
else:
    print('FULL MODE: running hyperparameter sweep (6 combinations)')

In [1]:
# Clean up previous sweep results before starting new sweep
import shutil
import os

sweep_dir = "output/models/sweep"
if os.path.exists(sweep_dir):
    print(f"Found existing sweep directory with:")
    for item in os.listdir(sweep_dir):
        print(f"  - {item}")
    shutil.rmtree(sweep_dir)
    print(f"\nDeleted {sweep_dir}")
else:
    print("No previous sweep results found")

# Also clean initial training model
initial_dir = os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5")
if os.path.exists(initial_dir):
    print(f"\nDeleting initial training model at: {initial_dir}")
    shutil.rmtree(initial_dir)
    print("Deleted")
else:
    print("\nNo initial training model found")

# Clean best model copy if exists
best_dir = os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5-best")
if os.path.exists(best_dir):
    print(f"\nDeleting previous best model at: {best_dir}")
    shutil.rmtree(best_dir)
    print("Deleted")

# Optional: clean local wandb logs (your runs are still saved online)
wandb_dir = "wandb"
if os.path.exists(wandb_dir):
    print(f"\nDeleting local W&B logs at: {wandb_dir}")
    shutil.rmtree(wandb_dir)
    print("Deleted (online logs preserved at wandb.ai)")

print("\n✓ Ready for fresh sweep")

No previous sweep results found

No initial training model found

✓ Ready for fresh sweep


In [2]:
# set working directory to project root
import os
os.chdir(PROJECT_ROOT)
print(f"Working directory: {os.getcwd()}")

# kill any lingering Spark sessions to free RAM
try:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    spark.stop()
    print("Stopped lingering Spark session")
except:
    print("No Spark session to stop")

# check available memory
import subprocess
result = subprocess.run(['free', '-h'], capture_output=True, text=True)
print(f"\nSystem memory:\n{result.stdout}")

Working directory: /home/developer/project


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/27 02:28:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Stopped lingering Spark session

System memory:
               total        used        free      shared  buff/cache   available
Mem:            30Gi        15Gi       1.5Gi       297Mi        13Gi        12Gi
Swap:           19Gi       3.1Gi        16Gi



In [3]:
from nbconvert import export
import pandas as pd
import numpy as np
import torch
import wandb
import gc

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import MultipleNegativesRankingLoss, MatryoshkaLoss
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from transformers import EarlyStoppingCallback
from datasets import Dataset
    
#CHECK: W&B API Key
if 'WANDB_API_KEY' not in os.environ:
    print("WARNING: WANDB_API_KEY not set in environment")
    print("Either set it with: export WANDB_API_KEY=your_key_here")
    print("Or run: wandb login")
else:
    print("WANDB_API_KEY found in environment")

# Check GPU
print(f"\nCUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("!!!No GPU available, training will be slow")

# CHECK: training data from Plan 01
assert os.path.exists(os.path.join(PROJECT_ROOT, 'training', 'output', 'training_data', 'training_dataset.parquet')), \
    "training data not found, run 08 first"
print("\nTraining data found, ready to proceed")

Either set it with: export WANDB_API_KEY=your_key_here
Or run: wandb login

CUDA available: True
GPU: NVIDIA GeForce RTX 3090
VRAM: 25.3 GB

Training data found, ready to proceed


In [4]:
# load training data from previous output
train_df = pd.read_parquet(os.path.join(PROJECT_ROOT, 'training', 'output', 'training_data', 'training_dataset.parquet'))
val_df = pd.read_parquet(os.path.join(PROJECT_ROOT, 'training', 'output', 'training_data', 'validation_dataset.parquet'))
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

# show sample
print("\nSample training pair:")
print(f"CV (anchor): {train_df.iloc[0]['anchor_text'][:300]}...")
print(f"Job (positive): {train_df.iloc[0]['positive_text'][:300]}...")

Training samples: 5839
Validation samples: 730

Sample training pair:
CV (anchor): query: I am a Devops Engineer with 3 years of experience, (mid-level). My skills include: GitLab, C++, English, Go, REST, Google Cloud, fluent, MongoDB, AWS, Agile, GraphQL, PostgreSQL, Jenkins, CI/CD, Node.js, Java, Django. I worked as Devops Engineer from 2019 2022 (N/A). I worked as Devops Engine...
Job (positive): passage: Role of DevOps/Software Engineer III at Steneral Consulting in Houston, TX. Required skills: Python, CI/CD, Azure DevOps, Jenkins, GitLab, Kubernetes, Helm, Ansible, Chef, Puppet. Experience level: Mid-level, 3-5 years experience....


In [5]:
# !!! sentence-transformers v3 uses Dataset, NOT InputExample
# column names must be 'anchor' and 'positive' for MNR loss
train_dataset = Dataset.from_dict({
    "anchor": train_df['anchor_text'].tolist(),
    "positive": train_df['positive_text'].tolist()
})

val_dataset = Dataset.from_dict({
    "anchor": val_df['anchor_text'].tolist(),
    "positive": val_df['positive_text'].tolist()
})

print(f"train dataset: {train_dataset}")
print(f"val dataset: {val_dataset}")

train dataset: Dataset({
    features: ['anchor', 'positive'],
    num_rows: 5839
})
val dataset: Dataset({
    features: ['anchor', 'positive'],
    num_rows: 730
})


In [6]:
# load base model
print("Loading e5-base-v2 model")
model = SentenceTransformer("intfloat/e5-base-v2")
print(f"Model embedding dimension: {model.get_sentence_embedding_dimension()}")

# MNR loss (uses in-batch negatives)
base_loss = MultipleNegativesRankingLoss(model)

# wrap with MatryoshkaLoss for multi-dimension training
loss = MatryoshkaLoss(
    model=model,
    loss=base_loss,
    matryoshka_dims=[768, 512, 256, 128, 64]  # train at all these dimensions
)
print("Configured MNR + MatryoshkaLoss")

Loading e5-base-v2 model
Model embedding dimension: 768
Configured MNR + MatryoshkaLoss


In [None]:
# training arguments (SentenceTransformerTrainer uses HF Trainer backend)
args = SentenceTransformerTrainingArguments(
    output_dir=os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5"),
    num_train_epochs=10,
    per_device_train_batch_size=64,  # fp16
    learning_rate=2e-5,
    warmup_ratio=0.1,  # 10% warmup to avoid forgetting
    fp16=True,  # mixed precision for speed

    # evaluation
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # keep only best checkpoint
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # logging
    logging_steps=10,
    run_name="cv-job-e5-mnr-matryoshka",  # W&B run name

    # Early stopping via Trainer
    greater_is_better=False,  # Lower loss is better
)

print("Training arguments configured")
print(f" epochs: {args.num_train_epochs}")
print(f" batch size: {args.per_device_train_batch_size}")
print(f" learning rate: {args.learning_rate}")

Training arguments configured
 epochs: 10
 batch size: 64
 learning rate: 2e-05


In [8]:
# initialize W&B
# will prompt for login if WANDB_API_KEY not set
wandb.init(
    project="talent-matching",
    name="cv-job-e5-mnr-matryoshka",
    config={
        "model": "intfloat/e5-base-v2",
        "loss": "MNR+Matryoshka",
        "matryoshka_dims": [768, 512, 256, 128, 64],
        "batch_size": 64,
        "learning_rate": 2e-5,
        "epochs": 10,
        "train_samples": len(train_df),
        "val_samples": len(val_df)
    }
)
print(f"W&B initialized: {wandb.run.url}")

[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/developer/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mscaranoalex[0m ([33mscaranoalex-university-of-trento[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W&B initialized: https://wandb.ai/scaranoalex-university-of-trento/talent-matching/runs/3225ik34


In [None]:
# early stopping callback, stops if val loss doesn't improve for 3 epochs
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.001
)

# trainer with early stopping
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=loss,
    callbacks=[early_stopping]
)

# train
print("Starting training with early stopping (patience=3)")
trainer.train()

print("Training complete")
print(f"Best validation loss: {trainer.state.best_metric:.4f}")
print(f"Epochs trained: {trainer.state.epoch:.0f}")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Starting training with early stopping (patience=3)


Epoch,Training Loss,Validation Loss
1,5.4085,1.23813
2,3.885,1.121766
3,3.2319,1.130548
4,2.8093,1.129452
5,2.4657,1.071146
6,2.3361,1.081982
7,2.0675,1.105268
8,1.8686,1.116916


Training complete
Best validation loss: 1.0711
Epochs trained: 8


In [10]:
# save the trained model
model.save(os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5"))
print(f"Model saved to: {os.path.join(PROJECT_ROOT, 'training', 'output', 'models', 'cv-job-matcher-e5')}")

# list saved files
import os
for f in os.listdir(os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5")):
    size = os.path.getsize(os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5", f)) / 1e6
    print(f"  {f}: {size:.1f} MB")

# finish W&B run
wandb.finish()
print("W&B run finished")

Model saved to: output/models/cv-job-matcher-e5
  2_Normalize: 0.0 MB
  README.md: 0.0 MB
  model.safetensors: 438.0 MB
  tokenizer.json: 0.7 MB
  config.json: 0.0 MB
  sentence_bert_config.json: 0.0 MB
  modules.json: 0.0 MB
  tokenizer_config.json: 0.0 MB
  checkpoint-460: 0.0 MB
  vocab.txt: 0.2 MB
  special_tokens_map.json: 0.0 MB
  1_Pooling: 0.0 MB
  config_sentence_transformers.json: 0.0 MB


0,1
eval/loss,█▃▃▃▁▁▂▃
eval/runtime,█▂▁▄▁▅▃▃
eval/samples_per_second,▁▇█▅█▄▆▆
eval/steps_per_second,▁▇█▅█▄▆▆
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
train/grad_norm,▄█▄▂▃▃▃▃▄▂▄▂▂▂▂▁▁▃▃▃▂▂▃▄▄▁▆▁▂▂▁▂▄▂▁▃▁▃▄▂
train/learning_rate,▁▃▄▅▇████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂
train/loss,█▆▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.11692
eval/runtime,1.7425
eval/samples_per_second,418.948
eval/steps_per_second,52.799
total_flos,0
train/epoch,8
train/global_step,736
train/grad_norm,33.20267
train/learning_rate,0.0
train/loss,1.8686


W&B run finished


In [11]:
# test the trained model
trained_model = SentenceTransformer(os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5"))

# encode a sample CV and job
sample_cv = "query: python developer with 5 years experience in Django and PostgreSQL"
sample_job = "passage: Title: Senior Python Developer. Required: Python, Django, PostgreSQL, 5+ years experience"

cv_emb = trained_model.encode(sample_cv)
job_emb = trained_model.encode(sample_job)

# compute similarity
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity([cv_emb], [job_emb])[0][0]
print(f"Sample similarity: {sim:.4f}")
print("(Higher is better, expect > 0.7 for good match)")

Sample similarity: 0.8006
(Higher is better, expect > 0.7 for good match)


In [None]:
# hyperparameter sweep configuration, controlled by QUICK_MODE
if QUICK_MODE:
    # single run with best hyperparameters
    LEARNING_RATES = [BEST_LR]
    WARMUP_RATIOS = [BEST_WARMUP]
    SWEEP_EPOCHS = 9
    SWEEP_PATIENCE = 3
else:
    # full sweep
    LEARNING_RATES = [2e-5, 1e-5, 5e-6]
    WARMUP_RATIOS = [0.1, 0.05]
    SWEEP_EPOCHS = 13
    SWEEP_PATIENCE = 3

total_runs = len(LEARNING_RATES) * len(WARMUP_RATIOS)
print(f"Sweep will run {total_runs} experiments:")
print(f"  learning rates: {LEARNING_RATES}")
print(f"  warmup ratios: {WARMUP_RATIOS}")
print(f"  max epochs per run: {SWEEP_EPOCHS}")
print(f"  early stopping patience: {SWEEP_PATIENCE}")

Sweep will run 6 experiments:
  learning rates: [2e-05, 1e-05, 5e-06]
  warmup ratios: [0.1, 0.05]
  max epochs per run: 13
  early stopping patience: 3


In [14]:
# run hyperparameter sweep with proper memory management
sweep_results = []
run_num = 1

# cleanup: free memory from initial training (Sections 4-8) before starting sweep
try:
    del model, trainer, loss, base_loss
    print("Cleaned up initial training objects")
except NameError:
    pass

gc.collect()
torch.cuda.empty_cache()

for lr in LEARNING_RATES:
    for warmup in WARMUP_RATIOS:
        # cleanup before each run
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        
        # Log memory status
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1e9
            print(f"\n{'='*50}")
            print(f"GPU memory before run: {allocated:.2f} GB allocated")
        
        run_name = f"sweep_lr{lr}_warmup{warmup}"
        output_dir = f"output/models/sweep/{run_name}"
        
        print(f"RUN {run_num}/{total_runs}: lr={lr}, warmup={warmup}")
        
        # init wandb for this run
        wandb.init(
            project="talent-matching-sweep",
            name=run_name,
            config={"learning_rate": lr, "warmup_ratio": warmup, "epochs": SWEEP_EPOCHS},
            reinit=True
        )
        
        # load fresh model
        sweep_model = SentenceTransformer("intfloat/e5-base-v2")
        sweep_base_loss = MultipleNegativesRankingLoss(sweep_model)
        sweep_loss = MatryoshkaLoss(
            model=sweep_model,
            loss=sweep_base_loss,
            matryoshka_dims=[768, 512, 256, 128, 64]
        )
        
        # training args for this run
        sweep_args = SentenceTransformerTrainingArguments(
            output_dir=output_dir,
            num_train_epochs=SWEEP_EPOCHS,
            per_device_train_batch_size=64,
            learning_rate=lr,
            warmup_ratio=warmup,
            fp16=True,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            logging_steps=20,
            report_to="wandb",
        )
        
        # early stopping
        sweep_early_stop = EarlyStoppingCallback(
            early_stopping_patience=SWEEP_PATIENCE,
            early_stopping_threshold=0.001
        )
        
        # trainer
        sweep_trainer = SentenceTransformerTrainer(
            model=sweep_model,
            args=sweep_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            loss=sweep_loss,
            callbacks=[sweep_early_stop]
        )
        
        # train
        sweep_trainer.train()
        
        # save result
        final_loss = sweep_trainer.state.best_metric
        epochs_done = sweep_trainer.state.epoch
        sweep_results.append({
            "run": run_name,
            "lr": lr,
            "warmup": warmup,
            "val_loss": final_loss,
            "epochs": epochs_done
        })
        
        print(f"Result: val_loss={final_loss:.4f}, epochs={epochs_done:.0f}")
        
        # save model
        sweep_model.save(output_dir)
        
        # finish wandb run
        wandb.finish()
        
        # aggressive cleanup after each run
        # must delete in correct order: trainer first (holds references), then model
        del sweep_trainer
        del sweep_loss, sweep_base_loss
        del sweep_model
        
        # force Python garbage collection
        gc.collect()
        
        # Clear CUDA cache
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        
        run_num += 1

print("\nSweep complete")

Cleaned up initial training objects

GPU memory before run: 0.47 GB allocated
RUN 1/6: lr=2e-05, warmup=0.1




Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,5.5572,1.245936
2,4.2097,1.137908
3,3.6083,1.123663
4,2.9411,1.127357
5,2.4994,1.088344
6,2.4628,1.092251
7,2.1332,1.118051
8,2.0253,1.102149


Result: val_loss=1.0883, epochs=8


0,1
eval/loss,█▃▃▃▁▁▂▂
eval/runtime,▄█▆▁▆▆▇▇
eval/samples_per_second,▅▁▃█▃▃▂▂
eval/steps_per_second,▅▁▃█▃▃▂▂
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▂▃▃▃▃▄▃▅▃▃▅▄▃▆▂▄▄▃▄▅▂▇▄▃▁▂▂▃▃▃▄▃▂▅▃
train/learning_rate,▁▂▄▅▇█████▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄
train/loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.10215
eval/runtime,1.7724
eval/samples_per_second,411.881
eval/steps_per_second,51.908
total_flos,0
train/epoch,8
train/global_step,736
train/grad_norm,27.682
train/learning_rate,1e-05
train/loss,2.0253



GPU memory before run: 0.47 GB allocated
RUN 2/6: lr=2e-05, warmup=0.05


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,4.943,1.208726
2,4.0026,1.112778
3,3.457,1.119361
4,2.8726,1.141682
5,2.4653,1.090458
6,2.443,1.09938
7,2.1325,1.121802
8,2.0175,1.120751


Result: val_loss=1.0905, epochs=8


0,1
eval/loss,█▂▃▄▁▂▃▃
eval/runtime,▄▁▄▃█▇▃▅
eval/samples_per_second,▅█▅▆▁▂▆▄
eval/steps_per_second,▅█▅▆▁▂▆▄
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▂▃▃▄▂▃▃▄▃▂▄▃▃▆▂▄▃▃▄▄▂▇▃▃▁▂▂▃▃▂▄▃▂▄▃
train/learning_rate,▁▄████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂
train/loss,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.12075
eval/runtime,1.7767
eval/samples_per_second,410.863
eval/steps_per_second,51.78
total_flos,0
train/epoch,8
train/global_step,736
train/grad_norm,29.51261
train/learning_rate,1e-05
train/loss,2.0175



GPU memory before run: 0.47 GB allocated
RUN 3/6: lr=1e-05, warmup=0.1


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,6.5524,1.372548
2,4.5746,1.119838
3,4.0142,1.091149
4,3.3781,1.09961
5,2.9345,1.045433
6,3.0162,1.028601
7,2.7172,1.060705
8,2.6021,1.069274
9,2.4844,1.052855


Result: val_loss=1.0286, epochs=9


0,1
eval/loss,█▃▂▂▁▁▂▂▁
eval/runtime,▂▁▁▂▃▅█▃█
eval/samples_per_second,▇██▇▆▄▁▆▁
eval/steps_per_second,▇██▇▆▄▁▆▁
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▆█▁▁▂▂▃▃▃▃▂▄▃▃▅▂▃▄▃▄▂▃▇▄▄▁▂▃▃▂▄▄▄▃▅▂▃▃▃▃
train/learning_rate,▁▂▄▅▇█████▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃
train/loss,█▇▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.05285
eval/runtime,1.8194
eval/samples_per_second,401.225
eval/steps_per_second,50.565
total_flos,0
train/epoch,9
train/global_step,828
train/grad_norm,31.92347
train/learning_rate,0.0
train/loss,2.4844



GPU memory before run: 0.47 GB allocated
RUN 4/6: lr=1e-05, warmup=0.05


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,5.5699,1.249349
2,4.3556,1.099825
3,3.8744,1.084962
4,3.3179,1.103624
5,2.9072,1.050397
6,2.9893,1.028154
7,2.7087,1.07045
8,2.6023,1.081128
9,2.4981,1.059621


Result: val_loss=1.0282, epochs=9


0,1
eval/loss,█▃▃▃▂▁▂▃▂
eval/runtime,▇█▄▂▁▁▂▂▂
eval/samples_per_second,▂▁▅▇██▇▇▇
eval/steps_per_second,▂▁▅▇██▇▇▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▁▂▁▂▁▃▃▃▂▂▄▃▂▅▁▄▄▃▄▂▃█▄▄▁▂▃▃▂▄▃▄▃▅▂▂▃▃▃
train/learning_rate,▁▄████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂
train/loss,█▅▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.05962
eval/runtime,1.7814
eval/samples_per_second,409.788
eval/steps_per_second,51.644
total_flos,0
train/epoch,9
train/global_step,828
train/grad_norm,31.84941
train/learning_rate,0.0
train/loss,2.4981



GPU memory before run: 0.47 GB allocated
RUN 5/6: lr=5e-06, warmup=0.1


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,8.6792,1.636586
2,5.141,1.19092
3,4.6139,1.090894
4,3.9636,1.065102
5,3.4658,1.069295
6,3.6383,1.023155
7,3.3311,1.053011
8,3.1866,1.070229
9,3.1343,1.035873


Result: val_loss=1.0232, epochs=9


0,1
eval/loss,█▃▂▁▂▁▁▂▁
eval/runtime,▄▅▆▅▄█▁▁▄
eval/samples_per_second,▅▃▃▄▅▁██▅
eval/steps_per_second,▅▄▃▄▅▁██▅
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▄▇▅▂▂▁▃▂▂▂▂▃▃▂▄▂▄▄▃▃▂▄█▄▄▂▃▃▄▂▅▃▄▄▅▃▃▃▄▃
train/learning_rate,▁▂▄▅▇█████▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃
train/loss,█▇▆▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.03587
eval/runtime,1.7731
eval/samples_per_second,411.7
eval/steps_per_second,51.886
total_flos,0
train/epoch,9
train/global_step,828
train/grad_norm,36.3521
train/learning_rate,0.0
train/loss,3.1343



GPU memory before run: 0.47 GB allocated
RUN 6/6: lr=5e-06, warmup=0.05


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,6.579,1.407976
2,4.9266,1.153861
3,4.4958,1.082108
4,3.918,1.061034
5,3.4435,1.06777
6,3.6249,1.023084
7,3.3303,1.052614
8,3.1921,1.071335
9,3.1423,1.038861


Result: val_loss=1.0231, epochs=9


0,1
eval/loss,█▃▂▂▂▁▂▂▁
eval/runtime,▃▃▂▃▂▁▂▁█
eval/samples_per_second,▆▆▇▆▇█▇█▁
eval/steps_per_second,▆▆▇▆▇█▇█▁
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▅▇▁▁▂▁▃▃▃▂▂▃▃▂▄▂▃▄▃▄▂▄█▄▄▂▃▃▄▂▅▃▄▄▅▃▃▃▄▃
train/learning_rate,▁▄████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁
train/loss,█▆▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.03886
eval/runtime,1.7851
eval/samples_per_second,408.94
eval/steps_per_second,51.538
total_flos,0
train/epoch,9
train/global_step,828
train/grad_norm,36.17868
train/learning_rate,0.0
train/loss,3.1423



Sweep complete


In [15]:
# compare sweep results
results_df = pd.DataFrame(sweep_results)
results_df = results_df.sort_values("val_loss")

print("SWEEP RESULTS (sorted by validation loss):")
print(results_df.to_string(index=False))

# best model
best = results_df.iloc[0]
print(f"\nBEST MODEL:")
print(f"  learning rate: {best['lr']}")
print(f"  warmup ratio: {best['warmup']}")
print(f"  validation loss: {best['val_loss']:.4f}")
print(f"  epochs trained: {best['epochs']:.0f}")

SWEEP RESULTS (sorted by validation loss):
                     run       lr  warmup  val_loss  epochs
sweep_lr5e-06_warmup0.05 0.000005    0.05  1.023084     9.0
 sweep_lr5e-06_warmup0.1 0.000005    0.10  1.023155     9.0
sweep_lr1e-05_warmup0.05 0.000010    0.05  1.028154     9.0
 sweep_lr1e-05_warmup0.1 0.000010    0.10  1.028601     9.0
 sweep_lr2e-05_warmup0.1 0.000020    0.10  1.088344     8.0
sweep_lr2e-05_warmup0.05 0.000020    0.05  1.090458     8.0

BEST MODEL:
  learning rate: 5e-06
  warmup ratio: 0.05
  validation loss: 1.0231
  epochs trained: 9


In [16]:
# copy best model to main location
import shutil

best_src = f"output/models/sweep/{best['run']}"
best_dst = os.path.join(PROJECT_ROOT, "training", "output", "models", "cv-job-matcher-e5-best")

if os.path.exists(best_dst):
    shutil.rmtree(best_dst)
shutil.copytree(best_src, best_dst)

print(f"Best model copied to: {best_dst}")

# save sweep results
os.makedirs("output/models/sweep", exist_ok=True)
results_df.to_csv("output/models/sweep/sweep_results.csv", index=False)
print(f"Results saved to: output/models/sweep/sweep_results.csv")

Best model copied to: output/models/cv-job-matcher-e5-best
Results saved to: output/models/sweep/sweep_results.csv
