In [1]:
import os
from datasets import load_dataset
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import os
os.environ["WANDB_PROJECT"] = "qwen-coder-llm-fine-tuning"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import wandb
wandb.login()

wandb: Currently logged in as: casvi-sanchez (virtualtek) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

In [3]:
import wandb
wandb.login()

True

In [4]:
dataset_name = "squad_v2"
dataset = load_dataset(dataset_name, split="train")
eval_dataset = load_dataset(dataset_name, split="validation")
print("dataset: ",dataset)
print("eval_dataset: ",eval_dataset)


dataset:  Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})
eval_dataset:  Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})


In [5]:
import torch
cuda_available = torch.cuda.is_available()

if cuda_available:
    device_id = 0  # You can change to 1,2,3 if you want other GPUs
    torch.cuda.set_device(device_id)
    # device = torch.device(f"cuda:{device_id}")
    device = torch.device(f"cuda:{device_id}")
    print(f"🖥️ Using GPU {device_id}: {torch.cuda.get_device_name(device_id)}")
else:
    device = torch.device("cpu")
    print("⚙️ No GPU available, using CPU.")

print(f"Device selected: {device}")

🖥️ Using GPU 0: NVIDIA GeForce RTX 4070 SUPER
Device selected: cuda:0


In [6]:
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models

)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 2. Max memory: 11.994 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.4.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [16]:
import neptune
import neptune.integrations.optuna as optuna_utils

run = neptune.init_run(
    project="casvi/CodeMedic",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzMTMzYjhhOC1jYzA1LTQ0YjAtOTJjNi1iY2EzM2VhMDY0OTcifQ=="
)


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/casvi/CodeMedic/e/COD-5


In [17]:
from trl import SFTTrainer, SFTConfig
import time
def objective(trial):
    start=time.time()
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8])
    num_epochs = trial.suggest_int("num_train_epochs", 1, 3)

    # SFT Config
    config = SFTConfig(
        dataset_num_proc = 1,
        output_dir="./outputs",
        dataset_text_field="question",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        report_to="none",  # We log to Neptune manually
        logging_steps=10,
        max_steps=100,
    )

    trainer = SFTTrainer(
        model=model,  # base or PEFT model
        tokenizer=tokenizer,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        args=config,
        prediction_loss_only=False,
        eval_accumulation_steps=10,
    )

    trainer.train()
    metrics = trainer.evaluate()
    # Log trial info to Neptune
    run[f"trial/{trial.number}/metrics"] = metrics
    run[f"trial/{trial.number}/params"] = {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
    }

    end = time.time()
    length = end - start

    hours = int(length // 3600)
    minutes = int((length % 3600) // 60)
    seconds = int(length % 60)

    print(f"It took {hours} hours, {minutes} minutes, and {seconds} seconds to train the model!")

    return metrics["eval_loss"]  # Or any other metric


In [18]:
import optuna
neptune_callback = optuna_utils.NeptuneCallback(run)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=2, callbacks=[neptune_callback], show_progress_bar=True)

[I 2025-05-07 01:02:53,685] A new study created in memory with name: no-name-a4c602cf-57e0-49ff-9542-5cbd35a2bcba
  0%|          | 0/2 [00:00<?, ?it/s]==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 130,319 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 34,865,152/7,000,000,000 (0.50% trained)


Step,Training Loss
10,4.0653
20,3.8292
30,3.6923
40,3.8725
50,3.9206
60,3.8869
70,3.9084
80,3.6741
90,3.7531
100,3.7571


result:  TrainOutput(global_step=100, training_loss=3.8359455108642577, metrics={'train_runtime': 83.9233, 'train_samples_per_second': 9.533, 'train_steps_per_second': 1.192, 'total_flos': 116397026918400.0, 'train_loss': 3.8359455108642577})


                                     

[I 2025-05-07 01:07:03,327] Trial 0 finished with value: 3.820232629776001 and parameters: {'learning_rate': 0.00016478552271182332, 'per_device_train_batch_size': 2, 'num_train_epochs': 1}. Best is trial 0 with value: 3.820232629776001.


                                     

[W 2025-05-07 01:07:03,337] Param num_train_epochs unique value length is less than 2.


                                     

[W 2025-05-07 01:07:03,342] Param per_device_train_batch_size unique value length is less than 2.


                                     

[W 2025-05-07 01:07:03,346] Param learning_rate unique value length is less than 2.


                                     

[W 2025-05-07 01:07:03,351] Param per_device_train_batch_size unique value length is less than 2.


                                     

[W 2025-05-07 01:07:03,354] Param learning_rate unique value length is less than 2.


                                     

[W 2025-05-07 01:07:03,358] Param num_train_epochs unique value length is less than 2.


Best trial: 0. Best value: 3.82023:  50%|█████     | 1/2 [04:09<04:09, 249.93s/it]==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 130,319 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 34,865,152/7,000,000,000 (0.50% trained)


Step,Training Loss
10,3.1057
20,3.236
30,3.5365
40,3.6555
50,3.7804
60,3.858
70,3.8556
80,3.6786
90,3.7082
100,3.704


result:  TrainOutput(global_step=100, training_loss=3.611842384338379, metrics={'train_runtime': 82.7215, 'train_samples_per_second': 19.342, 'train_steps_per_second': 1.209, 'total_flos': 266347455283200.0, 'train_loss': 3.611842384338379})


                                                                                  

[I 2025-05-07 01:11:07,781] Trial 1 finished with value: 3.8060173988342285 and parameters: {'learning_rate': 0.0004614775923983271, 'per_device_train_batch_size': 4, 'num_train_epochs': 2}. Best is trial 1 with value: 3.8060173988342285.


Best trial: 1. Best value: 3.80602: 100%|██████████| 2/2 [08:14<00:00, 247.24s/it]


In [24]:
# Get the best parameters
best_trial = study.best_trial

best_params = best_trial.params
print("best_params: ",best_params)

best_value = best_trial.value
print("Eval loss:", best_value)
run.stop()

best_params:  {'learning_rate': 0.0004614775923983271, 'per_device_train_batch_size': 4, 'num_train_epochs': 2}
Eval loss: 3.8060173988342285
