In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_name = "squad_v2"
dataset = load_dataset(dataset_name, split="train")
eval_dataset = load_dataset(dataset_name, split="validation")
print("dataset: ",dataset)
print("eval_dataset: ",eval_dataset)

dataset:  Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})
eval_dataset:  Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})


In [3]:
import torch
cuda_available = torch.cuda.is_available()

if cuda_available:
    device_id = 0  # You can change to 1,2,3 if you want other GPUs
    torch.cuda.set_device(device_id)
    # device = torch.device(f"cuda:{device_id}")
    device = torch.device(f"cuda:{device_id}")
    print(f"🖥️ Using GPU {device_id}: {torch.cuda.get_device_name(device_id)}")
else:
    device = torch.device("cpu")
    print("⚙️ No GPU available, using CPU.")

print(f"Device selected: {device}")

🖥️ Using GPU 0: NVIDIA GeForce RTX 4070 SUPER
Device selected: cuda:0


In [4]:
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    max_seq_length = 4096,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 2. Max memory: 11.994 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.4.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [6]:
import neptune
import neptune.integrations.optuna as optuna_utils

run = neptune.init_run(
    project="casvi/CodeMedic",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzMTMzYjhhOC1jYzA1LTQ0YjAtOTJjNi1iY2EzM2VhMDY0OTcifQ=="
)




[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/casvi/CodeMedic/e/COD-33


In [8]:
import evaluate

rouge = evaluate.load('rouge')
bleu = evaluate.load("bleu")
acc = evaluate.load("accuracy")


def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    # argmax to get the token ids
    return logits.argmax(dim=-1)


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels,
    # after the argmax(-1) has been calculated by preprocess_logits_for_metrics
    # but we need to shift the labels
    labels = labels[:, 1:]
    preds = preds[:, :-1]

    # -100 is a default value for ignore_index used by DataCollatorForCompletionOnlyLM
    mask = labels == -100
    # replace -100 with a value that the tokenizer can decode
    labels[mask] = tokenizer.pad_token_id
    preds[mask] = tokenizer.pad_token_id

    # bleu takes in text, so we have to translate from token ids to text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    # accuracy takes in lists of integers,
    # and we want to evaluate only the parts that are not -100,
    # hence the mask negation (~)
    accuracy = acc.compute(predictions=preds[~mask], references=labels[~mask])

    return {**bleu_score, **accuracy}

In [9]:
from trl import SFTTrainer, SFTConfig
import time
def objective(trial):
    start=time.time()
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5,5e-4, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8,12,16])
    gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps",[8,12,16])
    num_epochs = trial.suggest_int("num_train_epochs", 3, 10)

    # SFT Config
    config = SFTConfig(
        dataset_num_proc = 1,
        output_dir="./outputs",
        dataset_text_field="question",#Depends on the colum of your data set
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_epochs,
        report_to="none",
        logging_steps=100,
        max_steps=1000,
        eval_accumulation_steps=100
    )

    trainer = SFTTrainer(
        model=model,  # base or PEFT model
        tokenizer=tokenizer,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        args=config,
        warmup_steps = 5,
        weight_decay = 0.01,
        compute_metrics = compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )
    metrics = trainer.evaluate()
    trainer.train()
    # Log trial info to Neptunef
    run[f"trial/{trial.number}/metrics"] = metrics
    run[f"trial/{trial.number}/params"] = {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
    }
    print("Metrics:",metrics)

    end = time.time()
    length = end - start

    hours = int(length // 3600)
    minutes = int((length % 3600) // 60)
    seconds = int(length % 60)

    print(f"It took {hours} hours, {minutes} minutes, and {seconds} seconds to train the model!")

    return metrics["eval_loss"]  # Or any other metric


In [10]:
import optuna
neptune_callback = optuna_utils.NeptuneCallback(run)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, callbacks=[neptune_callback], show_progress_bar=True)

[I 2025-05-09 14:00:41,726] A new study created in memory with name: no-name-f9fe4e58-8a97-443d-b8e7-9e908c325658
  0%|          | 0/10 [00:00<?, ?it/s]Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


  0%|          | 0/10 [03:35<?, ?it/s]


[W 2025-05-09 14:04:16,804] Trial 0 failed with parameters: {'learning_rate': 1.2745429344030691e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 16, 'num_train_epochs': 8} because of the following error: ValueError("invalid literal for int() with base 10: 'i'").
Traceback (most recent call last):
  File "C:\Users\casvi\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\casvi\AppData\Local\Temp\ipykernel_9668\805129094.py", line 37, in objective
    metrics = trainer.evaluate()
              ^^^^^^^^^^^^^^^^^^
  File "C:\Users\casvi\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\trainer.py", line 4154, in evaluate
    output = eval_loop(
             ^^^^^^^^^^
  File "C:\Users\casvi\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\trainer.py", line 4443, in evaluation_loop
    

ValueError: invalid literal for int() with base 10: 'i'

In [9]:
# Get the best parameters
best_trial = study.best_trial

best_params = best_trial.params
print("best_params: ",best_params)

best_value = best_trial.value
print("Eval loss:", best_value)
run.stop()

best_params:  {'learning_rate': 0.0001726174951678421, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 16, 'num_train_epochs': 7}
Eval loss: 3.690156936645508
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 56 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 56 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/casvi/CodeMedic/e/COD-13/metadata
