In [1]:
%%capture
# Installs Unsloth, xformers (Flash Attention), and all required packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "git+https://github.com/huggingface/trl.git@main" peft accelerate bitsandbytes

In [2]:
from unsloth import PatchDPOTrainer

# Importing the function alone does not invoke it; Unsloth's DPO guide calls it
# explicitly before the TRL DPOTrainer is imported and constructed.
PatchDPOTrainer()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# NOTE(review): this upgrade runs after `unsloth` was already imported in an
# earlier cell; presumably the kernel must be restarted for the upgraded
# package to take effect — confirm, or move this into the first install cell.
!pip install --upgrade unsloth



In [6]:
import os
import torch
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth import FastLanguageModel

In [7]:
# Token budget per sequence; the trainer cell derives its length limits from it.
max_sequence_length = 2048

# Load the base checkpoint and its tokenizer with 4-bit weights and float16 dtype.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="mlabonne/TwinLlama-3.1-8B",
    dtype=torch.float16,
    load_in_4bit=True,
    max_seq_length=max_sequence_length,
)

==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [9]:
# Attach LoRA adapters (rank 32, alpha 32, no dropout) to the listed
# attention and MLP projection modules.
model = FastLanguageModel.get_peft_model(
    model=model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "up_proj",
        "down_proj",
        "o_proj",
        "gate_proj",
    ],
    lora_dropout=0,
    lora_alpha=32,
    r=32,
)

Unsloth 2025.8.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [10]:
# Download the "train" split of the preference dataset
# (columns: prompt / rejected / chosen) and display its summary.
dataset = load_dataset(
    path="mlabonne/llmtwin-dpo",
    split="train",
)
dataset

README.md:   0%|          | 0.00/490 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/304k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1545 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/89 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'rejected', 'chosen'],
    num_rows: 1545
})

In [11]:
# Prompt template with two positional `{}` slots: the first receives the
# instruction text, the second the response text (left empty for prompts).
# NOTE(review): the wording ("describe", lowercase "write") differs from the
# usual Alpaca phrasing — confirm it matches the template the base model was
# fine-tuned with before changing it, since this string is fed to the model.
alpaca_template = """Below is an instruction that describe a task. write a response that appropriately completes the request.
### Instruction:
{}
### Response:
{}
"""
# End-of-sequence marker taken from the loaded tokenizer; appended to every
# chosen/rejected answer by `format_samples`.
EOS_TOKEN = tokenizer.eos_token

def format_samples(example, template=None, eos_token=None):
    """Format one preference record for DPO training.

    The prompt is rendered through the prompt template with an empty response
    slot, and the end-of-sequence token is appended to both answers. The input
    mapping is left untouched; a new dict is returned.

    Args:
        example: Mapping with string values under "prompt", "chosen" and
            "rejected".
        template: Format string with two positional slots (instruction,
            response). Defaults to the notebook-level `alpaca_template`.
        eos_token: String appended to each answer. Defaults to the
            notebook-level `EOS_TOKEN`.

    Returns:
        dict with the formatted "prompt", "chosen" and "rejected" strings.
    """
    # Resolve the defaults at call time so the function keeps working as a
    # one-argument callback for `Dataset.map`.
    if template is None:
        template = alpaca_template
    if eos_token is None:
        eos_token = EOS_TOKEN

    return {
        "prompt": template.format(example["prompt"], ""),
        "chosen": example["chosen"] + eos_token,
        "rejected": example["rejected"] + eos_token,
    }


In [12]:
# Apply the prompt/EOS formatting to every record.
dataset = dataset.map(format_samples)
# Hold out 5% of the records for evaluation; the fixed seed keeps the
# train/test partition identical across runs.
dataset = dataset.train_test_split(test_size = 0.05, seed = 0)

Map:   0%|          | 0/1545 [00:00<?, ? examples/s]

In [13]:
# Display the train/test splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'rejected', 'chosen'],
        num_rows: 1467
    })
    test: Dataset({
        features: ['prompt', 'rejected', 'chosen'],
        num_rows: 78
    })
})

In [14]:
# Print the installed TRL version (the setup cell installs TRL from the
# `main` branch, so the version is not fixed).
import trl
print(trl.__version__)

0.22.0.dev0


In [15]:
from trl import DPOTrainer,DPOConfig

In [16]:
!pip install comet-ml>=3.43.2

In [20]:
# Build the DPO trainer on the LoRA-wrapped model.
# The DPO hyperparameters (beta and the length limits) live in DPOConfig and
# the tokenizer is passed as `processing_class`, which is the interface of
# current TRL releases; the former trainer-level keywords are no longer part
# of DPOTrainer's signature.
trainer = DPOTrainer(
    model = model,
    ref_model = None,  # no separate reference model is supplied
    processing_class = tokenizer,
    train_dataset = dataset["train"],
    eval_dataset = dataset["test"],
    args = DPOConfig(
        beta = 0.5,
        # NOTE(review): the prompt limit equals the total limit, so a very
        # long prompt could leave no room for the answer — confirm intended.
        max_length = max_sequence_length // 2,
        max_prompt_length = max_sequence_length // 2,
        learning_rate = 2e-6,
        lr_scheduler_type = "linear",
        per_device_train_batch_size = 2,
        per_device_eval_batch_size = 2,
        gradient_accumulation_steps = 8,  # effective batch size 2 x 8 = 16
        num_train_epochs = 1,
        fp16 = True,
        bf16 = False,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        warmup_steps = 10,
        output_dir = "output",
        eval_strategy = "steps",
        eval_steps = 0.2,  # fractional value: evaluate every 20% of the total steps
        seed = 0
    )
)

Extracting prompt in train dataset (num_proc=2):   0%|          | 0/1467 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/1467 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/1467 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=2):   0%|          | 0/78 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=2):   0%|          | 0/78 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/78 [00:00<?, ? examples/s]

In [21]:
# Run the DPO fine-tuning loop with the trainer built in the previous cell;
# the returned TrainOutput is displayed as the cell result.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,467 | Num Epochs = 1 | Total steps = 92
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdevmangukiya561[0m ([33mdevmangukiya561-nirma-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
19,0.6775,0.673535,0.036286,-0.003441,0.961538,0.039727,-86.372231,-53.806545,-1.574197,-1.520455,0,0,0,0


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
19,0.6775,0.673535,0.036286,-0.003441,0.961538,0.039727,-86.372231,-53.806545,-1.574197,-1.520455,0,0,0,0
38,0.645,0.631683,0.103791,-0.024376,0.961538,0.128167,-85.697174,-54.015888,-1.583034,-1.525957,No Log,No Log,No Log,No Log
57,0.613,0.600501,0.141697,-0.055878,0.961538,0.197575,-85.318108,-54.33091,-1.589519,-1.530197,No Log,No Log,No Log,No Log
76,0.5867,0.58197,0.15973,-0.080633,0.974359,0.240363,-85.137779,-54.578457,-1.592922,-1.532504,No Log,No Log,No Log,No Log


TrainOutput(global_step=92, training_loss=0.6369568096554797, metrics={'train_runtime': 1833.9147, 'train_samples_per_second': 0.8, 'train_steps_per_second': 0.05, 'total_flos': 0.0, 'train_loss': 0.6369568096554797, 'epoch': 1.0})

In [23]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Render the prompt with an empty response slot and tokenize it onto the GPU.
message = alpaca_template.format(
    "Write a paragraph to introduce supervised fine-tuning.",
    "",
)
inputs = tokenizer(message, return_tensors="pt").to("cuda")

# Stream the generated tokens as they are produced (at most 256 new tokens).
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=256,
    use_cache=True,
)

<|begin_of_text|>Below is an instruction that describe a task. write a response that appropriately completes the request.
### Instruction:
Write a paragraph to introduce supervised fine-tuning.
### Response:

Supervised fine-tuning is a critical process that enhances the performance of language models by providing them with specific training data. This technique involves taking a pre-trained model and refining it through a series of tasks, such as answering questions or performing other language-related tasks. The goal is to tailor the model to excel in a particular domain or task, making it more responsive to user needs and improving its overall effectiveness.<|end_of_text|>


In [28]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Hugging Face access token; None when the variable is not defined.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [27]:
# Destination names for the trained adapter: local folder and Hub repository.
adapter_dir = "TwinLlama-3.1-8B"
hub_repo_id = "devmangukiya/TwinLlama-3.1-8B"

# Save the LoRA adapter and tokenizer locally after the DPO run.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

# Push both artifacts to the Hugging Face Hub (optional; uses HF_TOKEN).
model.push_to_hub(hub_repo_id, token=HF_TOKEN)
tokenizer.push_to_hub(hub_repo_id, token=HF_TOKEN)

README.md:   0%|          | 0.00/575 [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ma-3.1-8B/adapter_model.safetensors:   0%|          |  558kB /  336MB            

Saved model to https://huggingface.co/devmangukiya/TwinLlama-3.1-8B


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  TwinLlama-3.1-8B/tokenizer.json       : 100%|##########| 17.2MB / 17.2MB            