## Fine Tuning

This notebook assumes it is running on a T4 GPU.

In [None]:
# Show the NVIDIA driver/GPU status for the current runtime.
!nvidia-smi
# Quietly install (or upgrade to) the pinned bitsandbytes and trl releases.
%pip install -q --upgrade bitsandbytes==0.48.2 trl==0.25.1

In [None]:
import os
from huggingface_hub import login
import torch
from google.colab import userdata
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import wandb
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datetime import datetime

In [None]:
# Constants

# Base model to fine-tune and the naming used for the run / Hub repo.
MODEL = "Qwen/Qwen3-4B-Instruct-2507"
PROJECT_NAME = "complexity"
HF_USER = "aslam-naseer"

# Dataset repo on the Hub.
DATA_USER = "aslam-naseer"
DATASET_NAME = f"{DATA_USER}/js-function-complexity-messages"

# Timestamped run name so that every execution gets its own output dir / repo.
RUN_NAME = f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Hyper-parameters - overall

EPOCHS = 3
BATCH_SIZE = 1
MAX_SEQUENCE_LENGTH = 3072
GRADIENT_ACCUMULATION_STEPS = 16

# Hyper-parameters - QLoRA

QUANT_4_BIT = True
LORA_R = 32
LORA_ALPHA = 16
TARGET_MODULES = "all-linear"
LORA_DROPOUT = 0.05

# Hyper-parameters - training

LEARNING_RATE = 1e-4
WARMUP_RATIO = 0.03
LR_SCHEDULER_TYPE = 'cosine'
WEIGHT_DECAY = 0.001
OPTIMIZER = "paged_adamw_32bit"

# bf16 is only selected on GPUs whose major compute capability is >= 8;
# anything older uses fp16 instead.
# get_device_capability() raises when no CUDA device is visible, so fall back
# to (0, 0) in that case: defining the constants should not crash by itself.
capability = (
    torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
)
use_bf16 = capability[0] >= 8

# Tracking

LOG_STEPS = 5
SAVE_STEPS = 100
LOG_TO_WANDB = True

#### Load API Keys

In [None]:
# Read the Hugging Face token from the Colab `userdata` store and log in.
hf_token = userdata.get("HF_TOKEN")
if not hf_token:
    # The token comes from google.colab.userdata, not from os.environ, so the
    # message points at the place the user actually has to fix.
    raise ValueError("HF_TOKEN not found in Colab secrets (google.colab.userdata).")

# Also store the token as a git credential.
login(hf_token, add_to_git_credential=True)

In [None]:
# Configure Weights & Biases only when tracking is requested; otherwise the
# notebook must not require a W&B key at all.
if LOG_TO_WANDB:
    wandb_api_key = userdata.get('WANDB_API_KEY')
    if not wandb_api_key:
        # The key is read from google.colab.userdata, not from os.environ.
        raise ValueError("WANDB_API_KEY not found in Colab secrets (google.colab.userdata).")
    os.environ["WANDB_API_KEY"] = wandb_api_key

    # Do not upload model checkpoints or watch gradients through W&B.
    os.environ["WANDB_LOG_MODEL"] = "false"
    os.environ["WANDB_WATCH"] = "false"
    os.environ["WANDB_ENTITY"] = "aslam-naseer-personal"

    wandb.login()
    wandb.init(project=PROJECT_NAME, name=RUN_NAME)
else:
    # Turn any later wandb.init() call (e.g. from a trainer integration) into
    # a no-op so nothing is logged and no interactive login prompt appears.
    os.environ["WANDB_MODE"] = "disabled"

### Load dataset and config objects

In [None]:
# Load the dataset named by DATASET_NAME and bind its three splits.
dataset = load_dataset(DATASET_NAME)

train, val, test = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
# Build the bitsandbytes quantization config: 4-bit NF4 with double
# quantization when QUANT_4_BIT is set, plain 8-bit otherwise.
if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        # Compute in bf16 where the GPU supports it, else fp16.
        bnb_4bit_compute_dtype=torch.bfloat16 if use_bf16 else torch.float16,
        bnb_4bit_quant_type='nf4',
    )
else:
    # The bnb_4bit_* options only apply to 4-bit loading, so none are passed
    # for the 8-bit configuration.
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )

In [None]:
import importlib.util

# Tokenizer: make sure a pad token exists and pad on the right for training.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Attention backend:
#   * GPUs without bf16 support keep the "eager" implementation;
#   * bf16-capable GPUs use FlashAttention-2 when the `flash_attn` package is
#     importable, and fall back to PyTorch SDPA otherwise (the install cell
#     does not install flash-attn, so requesting it unconditionally would
#     fail at load time).
if not use_bf16:
    attn_implementation = "eager"
elif importlib.util.find_spec("flash_attn") is not None:
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"

# Load the base model quantized according to quant_config.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    quantization_config=quant_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)

# NOTE(review): prepare_model_for_kbit_training appears to enable gradient
# checkpointing by default as well, so the explicit call may be redundant;
# it is kept to preserve the original behaviour - confirm against PEFT docs.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
# LoRA adapter settings, collected first and then handed to LoraConfig.
lora_kwargs = {
    "r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "target_modules": TARGET_MODULES,
    "lora_dropout": LORA_DROPOUT,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "use_rslora": True,
}
lora_config = LoraConfig(**lora_kwargs)

In [None]:
# Training configuration for the SFT run.
sft_config = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,

    # Sequence handling
    max_length=MAX_SEQUENCE_LENGTH,
    packing=False,
    group_by_length=True,

    # Batch size (per device) and gradient accumulation
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,

    # Optimisation
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    logging_steps=LOG_STEPS,
    learning_rate=LEARNING_RATE,
    # Use the shared constant instead of repeating the literal value.
    weight_decay=WEIGHT_DECAY,
    # Exactly one of fp16 / bf16 is enabled, depending on GPU support.
    fp16=not use_bf16,
    bf16=use_bf16,
    max_grad_norm=0.3,
    max_steps=-1,

    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type=LR_SCHEDULER_TYPE,

    # The string "none" disables reporting; passing None does not (in
    # Transformers 4.x None resolves to every installed integration).
    report_to="wandb" if LOG_TO_WANDB else "none",
    run_name=RUN_NAME,

    # Checkpointing: save every SAVE_STEPS and push each save to a private
    # Hub repo.
    save_strategy="steps",
    hub_strategy="every_save",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True,

    # Evaluate on the same cadence as checkpointing.
    eval_strategy="steps",
    eval_steps=SAVE_STEPS,

    # NOTE(review): recent TRL releases document `skip_prepare_dataset` as the
    # only supported dataset_kwargs key; these two may be ignored - confirm.
    dataset_kwargs={
        "append_concat_token": False,
        "add_special_tokens": False,
    },
)

### Training!

In [None]:
# Build the trainer. The tokenizer is passed as `processing_class`: the old
# `tokenizer=` keyword is no longer accepted by the pinned TRL release
# (trl==0.25.1) and raises a TypeError.
fine_tuning = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train,
    eval_dataset=val,
    peft_config=lora_config,
    args=sft_config,
)

In [None]:
# Run the fine-tuning loop.
fine_tuning.train()

# Push the final model to the fully-qualified repo id (the same value used
# for `hub_model_id`), so intermediate and final uploads land in the same
# private repository regardless of which account is logged in.
fine_tuning.model.push_to_hub(HUB_MODEL_NAME, private=True)
print(f"Saved to the hub: {HUB_MODEL_NAME}")

In [None]:
# Close the W&B run, but only when tracking was enabled for this notebook.
if LOG_TO_WANDB:
    wandb.finish()