In [5]:
%%capture

!pip install unsloth trl vllm

In [6]:
class Config:
    model_name="Qwen/Qwen3-0.6B-Base"
    instruction_dataset_name="OpenAssistant/oasst1"
    dpo_dataset_name="HuggingFaceH4/ultrafeedback_binarized"
    random_state=2025

In [7]:
from warnings import filterwarnings
filterwarnings(action="ignore")

import tqdm
import torch
import pandas as pd
from peft import PeftModel
from unsloth import PatchDPOTrainer
from transformers import TextStreamer
from datasets import load_dataset, Dataset
## Note: for some reason, importing trl before unsloth leads to unessary errors so i imported unsloth before trl to avoid it.
## Reference: https://stackoverflow.com/questions/79663362/sfttrainer-the-specified-eos-token-eos-token-is-not-found-in-the-vocabu
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import CHAT_TEMPLATES, get_chat_template
from trl import SFTConfig, SFTTrainer, DPOTrainer, DPOConfig

In [8]:
instruction_dataset = load_dataset(Config.instruction_dataset_name)

instruction_dataset

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})

In [9]:
instruction_dataset["train"][0]

{'message_id': '6ab24d72-0181-4594-a9cd-deaf170242fb',
 'parent_id': None,
 'user_id': 'c3fe8c76-fc30-4fa7-b7f8-c492f5967d18',
 'created_date': '2023-02-05T14:23:50.983374+00:00',
 'text': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.',
 'role': 'prompter',
 'lang': 'en',
 'review_count': 3,
 'review_result': True,
 'deleted': False,
 'rank': None,
 'synthetic': False,
 'model_name': None,
 'detoxify': {'toxicity': 0.00044308538781479,
  'severe_toxicity': 3.252684837207198e-05,
  'obscene': 0.00023475120542570949,
  'identity_attack': 0.0001416115992469713,
  'insult': 0.00039489680784754455,
  'threat': 4.075629112776369e-05,
  'sexual_explicit': 2.712695459194947e-05},
 'message_tree_id': '6ab24d72-0181-4594-a9cd-deaf170242fb',
 'tree_state': 'ready_for_export',
 'emojis': {'name': ['+1', '_skip_reply', '_skip_ranking'],
  'count': [10

In [10]:
instruction_dataset_train = instruction_dataset["train"].to_pandas()
instruction_dataset_valid = instruction_dataset["validation"].to_pandas()

combined_instruction_data = pd.concat([instruction_dataset_train, instruction_dataset_valid], axis=0)

print(combined_instruction_data.shape)

combined_instruction_data.head()

(88838, 18)


Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
0,6ab24d72-0181-4594-a9cd-deaf170242fb,,c3fe8c76-fc30-4fa7-b7f8-c492f5967d18,2023-02-05T14:23:50.983374+00:00,Can you write a short introduction about the r...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00044308538781479, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,6ab24d72-0181-4594-a9cd-deaf170242fb,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T13:50:44.657083+00:00,"""Monopsony"" refers to a market structure where...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00026396565954200923, 'severe_t...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,c8e83833-ecbc-44fe-b6db-735228c25a1c,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T18:48:49.391686+00:00,Now explain it to a dog,prompter,en,3,True,False,,False,,"{'toxicity': 0.03648477792739868, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,6ab24d72-0181-4594-a9cd-deaf170242fb,49ddcb0d-6588-43bd-858d-19c577f12e7b,2023-02-06T13:37:56.044680+00:00,Monopsony is a market structure in which there...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0008866374846547842, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,343ee2d4-87ae-41fd-a768-bdd65959dc4a,e10e99a0-38ac-4b07-bf5d-4427696e4e0d,2023-02-06T18:52:51.428543+00:00,How can one fight back when a monospony had be...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0009362137061543763, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."


In [11]:
def PrepareData(data):
    output = []
    for i in range(0, len(data) - 1, 2): 
        user = data.iloc[i]["text"]
        assistant = data.iloc[i + 1]["text"]
        
        output.append([
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant}
        ])
        
    return [{"messages":pair} for pair in output]

In [12]:
prep_data = PrepareData(combined_instruction_data)

In [13]:
prep_data[0]

{'messages': [{'role': 'user',
   'content': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.'},
  {'role': 'assistant',
   'content': '"Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face l

In [14]:
hugging_face_dataset = Dataset.from_list(prep_data)

hugging_face_dataset

Dataset({
    features: ['messages'],
    num_rows: 44419
})

In [15]:
hugging_face_dataset

Dataset({
    features: ['messages'],
    num_rows: 44419
})

In [16]:
torch.cuda.empty_cache()
import gc
gc.collect()

429

In [17]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=Config.model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    load_in_8bit=False,
)

==((====))==  Unsloth 2025.8.1: Fast Qwen3 patching. Transformers: 4.55.0. vLLM: 0.10.0.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/567M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [18]:
list(CHAT_TEMPLATES)

['unsloth',
 'zephyr',
 'chatml',
 'mistral',
 'llama',
 'vicuna',
 'vicuna_old',
 'vicuna old',
 'alpaca',
 'gemma',
 'gemma_chatml',
 'gemma2',
 'gemma2_chatml',
 'llama-3',
 'llama3',
 'phi-3',
 'phi-35',
 'phi-3.5',
 'llama-3.1',
 'llama-31',
 'llama-3.2',
 'llama-3.3',
 'llama-32',
 'llama-33',
 'qwen-2.5',
 'qwen-25',
 'qwen25',
 'qwen2.5',
 'phi-4',
 'gemma-3',
 'gemma3',
 'qwen-3',
 'qwen3',
 'gemma-3n',
 'gemma3n']

In [19]:
tokenizer = get_chat_template(tokenizer=tokenizer, chat_template="qwen-3")

In [20]:
tokenizer.chat_template

'\n{%- if tools %}\n    {{- \'<|im_start|>system\n\' }}\n    {%- if messages[0].role == \'system\' %}\n        {{- messages[0].content + \'\n\n\' }}\n    {%- endif %}\n    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\n</tool_call><|im_end|>\n" }}\n{%- else %}\n    {%- if messages[0].role == \'system\' %}\n        {{- \'<|im_start|>system\n\' + messages[0].content + \'<|im_end|>\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for forward_message in messages %}\n

In [21]:
tokenizer.chat_template =  """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}{{ eos_token }}
                {% endif %}
                {% endfor %}"""

tokenizer.pad_token = tokenizer.eos_token

In [22]:
tokenizer.eos_token

'<|endoftext|>'

In [23]:
tokenizer.pad_token

'<|endoftext|>'

In [24]:
tokenizer.chat_template

"{% for message in messages %}\n                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n\n                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n\n                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}{{ eos_token }}\n                {% endif %}\n                {% endfor %}"

In [25]:
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
    (layers): ModuleList(
      (0-1): 2 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear4bit(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
      

In [26]:
qlora_model = FastLanguageModel.get_peft_model(
    model=model, 
    r=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.0,
    bias='none',
    use_gradient_checkpointing='unsloth',
    random_state=Config.random_state,
)

Unsloth 2025.8.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [87]:
def TestInstructModel(model, tokenizer, user_query, enable_thinking=False):
    user_prompt = [
        {"role":"user", "content":user_query}
    ]

    chat_template = tokenizer.apply_chat_template(user_prompt, 
                                                  tokenize=False,
                                                  add_generation_prompt=True,
                                                  enable_thinking=True if enable_thinking else False)

    print("Model Inferencing...")
    print("---"*50)

    print("Chat Template: ", chat_template)

    tokenized_template = tokenizer(chat_template, return_tensors="pt", truncation=True).to("cuda:0")

    print("Bot Response:")
    _ = model.generate(
                      **tokenized_template,
                      max_new_tokens = 1024,
                      eos_token_id=tokenizer.eos_token_id,
                      pad_token_id=tokenizer.eos_token_id,
                      temperature = .7 if not enable_thinking else .6, 
                      top_p = 0.8 if not enable_thinking else .95, 
                      top_k = 20,
                      streamer = TextStreamer(tokenizer, skip_prompt=True)
     )

#### Response Before Training

In [28]:
FastLanguageModel.for_inference(qlora_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
        (layers): ModuleList(
          (0-1): 2 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Li

In [29]:
TestInstructModel(model=qlora_model,
                  tokenizer=tokenizer, 
                  user_query="Can you write a short introduction about the relevance of the term 'monopsony' in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.")

Chat Template:  User: Can you write a short introduction about the relevance of the term 'monopsony' in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.




Bot Response:
Monopsony, a term coined by economist John Commons, refers to a market structure where a single buyer has the power to purchase a significant quantity of a good or service at a price that is significantly lower than what would be possible under a competitive market. This concept is particularly relevant in the labour market, where it can manifest in various forms, such as a single employer controlling a large portion of the workforce or a single buyer controlling a significant portion of the goods being sold.

In the context of the labour market, a potential monopsony can be observed in several scenarios. For instance, in a highly competitive labour market, where multiple employers are competing for the same pool of workers, a single employer might be abl

In [30]:
def AddChatTemplate(data_batch, tokenizer):
    data_with_template = []
    for data in data_batch["messages"]:
        data_with_template.append(tokenizer.apply_chat_template(data, tokenize=False, add_generation_prompt=False))
    return {"template_text":data_with_template}

In [31]:
prep_instruction_data = hugging_face_dataset.map(lambda x: AddChatTemplate(x,tokenizer) , batched=True)

prep_instruction_data

Map:   0%|          | 0/44419 [00:00<?, ? examples/s]

Dataset({
    features: ['messages', 'template_text'],
    num_rows: 44419
})

In [32]:
prep_instruction_data[0]["template_text"]

'User: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.\n\nAssistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargaining power, leading t

In [33]:
# Prepare model for efficient finetuning
FastLanguageModel.for_training(qlora_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
        (layers): ModuleList(
          (0-1): 2 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Li

In [34]:
 sft_config = SFTConfig(output_dir="./instruction_model", 
                       num_train_epochs=1,
                       do_eval=True,
                       do_train=True,
                       per_device_train_batch_size=2,
                       learning_rate=2e-5, 
                       weight_decay=0.01, 
                       lr_scheduler_type="linear", 
                       logging_strategy="steps", 
                       save_strategy="steps", 
                       seed=Config.random_state, 
                       fp16=not is_bfloat16_supported(),
                       bf16=is_bfloat16_supported(),
                       dataset_text_field="template_text",
                       gradient_checkpointing=True,
                       gradient_accumulation_steps=16,
                       eos_token=tokenizer.eos_token,
                       pad_token=tokenizer.pad_token,
                       optim="adamw_8bit",
                       report_to="none")

sft_trainer = SFTTrainer(model=qlora_model,
                         args=sft_config, 
                         train_dataset=prep_instruction_data.select(range(1000)), # Due to limited resources i reduced the data to 1000 data point 
                         eval_dataset=None, 
                         processing_class=tokenizer)

Unsloth: Tokenizing ["template_text"] (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [35]:
trainind_stata = sft_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 32
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 16 x 1) = 32
 "-____-"     Trainable parameters = 5,046,272 of 601,096,192 (0.84% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1884
2,1.8444
3,1.846
4,2.0279
5,1.789
6,1.8156
7,1.946
8,2.0086
9,2.3014
10,1.8684


In [36]:
# Prepare model for inferencing
FastLanguageModel.for_inference(sft_trainer.model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
        (layers): ModuleList(
          (0-1): 2 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Li

#### Response After Training

In [37]:
TestInstructModel(model=sft_trainer.model,
                  tokenizer=sft_trainer.tokenizer, 
                  user_query="Can you write a short introduction about the relevance of the term 'monopsony' in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Chat Template:  User: Can you write a short introduction about the relevance of the term 'monopsony' in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.




Bot Response:
Monopsony is a term used in economics to describe a market structure where a single buyer has the power to purchase a large quantity of a good or service at a price that is significantly lower than the price that would prevail in a competitive market. This can occur in situations where there is a large number of buyers and a small number of sellers, or where there is a large number of sellers and a small number of buyers. In such a market, the single buyer can influence the price of the good or service, and the price can be set below the competitive level.

One example of a potential monopsony in the labour market is the case of a single employer who hires a large number of workers. In this case, the employer has the power to hire a large number of worker

In [67]:
dpo_data = load_dataset(Config.dpo_dataset_name)

dpo_data

DatasetDict({
    train_prefs: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 61135
    })
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 61135
    })
    test_prefs: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 2000
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 1000
    })
    train_gen: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows: 61135
    })
    test_gen: Dataset({
        features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected'],
        num_rows

In [68]:
dpo_df = dpo_data["train_prefs"].to_pandas()

dpo_df.head()

Unnamed: 0,prompt,prompt_id,chosen,rejected,messages,score_chosen,score_rejected
0,how can i develop a habit of drawing daily,086b3e24f29b8956a01059f79c56db35d118a06fb6b844...,[{'content': 'how can i develop a habit of dra...,[{'content': 'how can i develop a habit of dra...,[{'content': 'how can i develop a habit of dra...,8.5,8.5
1,how can I transform the getPosition method of ...,2766cbd1fed7f982d94b031596e771c841668bd8913839...,[{'content': 'how can I transform the getPosit...,[{'content': 'how can I transform the getPosit...,[{'content': 'how can I transform the getPosit...,6.5,6.5
2,"Given a sentence in French, provide an equival...",0efb42706b3fcc906f579505c7cc0c4e68a640ab3862b1...,"[{'content': 'Given a sentence in French, prov...","[{'content': 'Given a sentence in French, prov...","[{'content': 'Given a sentence in French, prov...",6.0,3.0
3,"Which animal has two hands, a hyrax or a dog?",d169f4610d69b94f54b6923e11aeffe2e321e272395686...,"[{'content': 'Which animal has two hands, a hy...","[{'content': 'Which animal has two hands, a hy...","[{'content': 'Which animal has two hands, a hy...",9.0,8.0
4,Can you explain more about how Tamping Coke is...,ac205b0f69d45eae8dbf446673bd78284e230b6ce3381e...,[{'content': 'Can you explain more about how T...,[{'content': 'Can you explain more about how T...,[{'content': 'Can you explain more about how T...,8.0,6.0


In [69]:
dpo_model, dpo_tokenizer = FastLanguageModel.from_pretrained(
    model_name=Config.model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Apply the LoRA weights from checkpoint
dpo_model = PeftModel.from_pretrained(
    model, 
    "/kaggle/working/instruction_model/checkpoint-32",
    is_trainable=True
)

==((====))==  Unsloth 2025.8.1: Fast Qwen3 patching. Transformers: 4.55.0. vLLM: 0.10.0.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [98]:
dpo_tokenizer.chat_template =  """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}{{ eos_token }}
                {% endif %}
                {% endfor %}"""

dpo_tokenizer.pad_token = dpo_tokenizer.eos_token

In [112]:
def PrepareDpoData(df, tokenizer):
    data_list = []
    for idx, rows in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
        
        data_list.append(
        {
        "prompt": tokenizer.apply_chat_template([{"role":"user", "content":rows["messages"][0]["content"]}], tokenize=False),
        "chosen": tokenizer.apply_chat_template([{"role":"assistant", "content":rows["chosen"][1]["content"]}], tokenize=False),
        "rejected": tokenizer.apply_chat_template([{"role":"assistant", "content":rows["rejected"][1]["content"]}], tokenize=False)
        }
    )
    return Dataset.from_list(data_list)

In [113]:
dpo_formatted_data = PrepareDpoData(dpo_df, dpo_tokenizer)

dpo_formatted_data

100%|██████████| 61135/61135 [00:10<00:00, 5765.68it/s]


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 61135
})

In [114]:
dpo_formatted_data[0]

{'prompt': 'User: how can i develop a habit of drawing daily\n\n',
 'chosen': "Assistant: Developing a daily habit of drawing can be challenging but with consistent practice and a few tips, it can become an enjoyable and rewarding part of your daily routine. Here are some strategies to help you develop the habit of drawing daily:\n\n1. Set a specific time: Allocate a specific time of the day to draw. It could be in the morning, afternoon, or evening. Make drawing a part of your daily routine.\n2. Set a specific duration: Determine the amount of time you want to spend on drawing each day. It can be as little as 10 minutes or as long as an hour. Be consistent with the duration to help build the habit.\n3. Start small and simple: Don't try to create a masterpiece every day, start with simple and easy-to-do sketches. Focus on improving your skills gradually.\n4. Use a variety of tools and mediums: Experiment with different tools like pencils, pens, markers, and different mediums like paper

In [115]:
FastLanguageModel.for_training(dpo_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
        (layers): ModuleList(
          (0-1): 2 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Li

In [116]:
PatchDPOTrainer() # Add this for efficient finetuning

dpo_config = DPOConfig(output_dir="dpo_model_from_instruction_model",
                       num_train_epochs=1,
                       do_train=True, 
                       do_eval=False,
                       per_device_train_batch_size=2,
                       learning_rate=2e-5,
                       beta=.2,
                       weight_decay=0.01, 
                       lr_scheduler_type="linear", 
                       logging_strategy="steps",
                       save_strategy="steps", 
                       seed=Config.random_state, 
                       fp16=not is_bfloat16_supported(),
                       bf16=is_bfloat16_supported(),
                       gradient_checkpointing=True,
                       gradient_accumulation_steps=3,
                       optim="adamw_8bit",
                       report_to="none")

dpo_trainer = DPOTrainer(model=dpo_model, 
                         ref_model=None,
                         args=dpo_config,
                         train_dataset=dpo_formatted_data.select(range(500)), # Due to limited resources i reduced the data to 500 data point
                         # eval_dataset=dpo_formatted_data.shuffle(seed=Config.random_state).select(range(500)),
                         processing_class=dpo_tokenizer)

Extracting prompt in train dataset (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [117]:
dpo_training_stats = dpo_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 84
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 3
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 3 x 1) = 6
 "-____-"     Trainable parameters = 5,046,272 of 601,096,192 (0.84% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.1146,2.418448,0.06244,1.0,2.356009,-94.584328,-220.605103,-1.182632,0.136104,0,0,0,0
2,0.249,3.040781,0.831871,0.833333,2.20891,-178.307983,-144.500107,-0.291362,-0.476682,No Log,No Log,No Log,No Log
3,0.0055,4.325304,-0.965668,1.0,5.290971,-524.623352,-423.387054,-0.932113,-0.518924,No Log,No Log,No Log,No Log
4,0.1262,3.53502,0.396612,1.0,3.138408,-167.497162,-189.217834,-0.71698,-0.304245,No Log,No Log,No Log,No Log
5,0.1587,3.143517,0.434337,0.833333,2.70918,-185.999008,-250.433838,-0.458511,-0.822143,No Log,No Log,No Log,No Log
6,0.0899,3.440002,-0.627796,1.0,4.067798,-349.79422,-415.872284,-0.174519,-0.551759,No Log,No Log,No Log,No Log
7,0.0736,3.422282,-0.075139,1.0,3.497422,-466.188812,-482.306854,-0.747264,-0.563671,No Log,No Log,No Log,No Log
8,0.093,3.505237,0.610544,1.0,2.894693,-212.612564,-203.75354,-0.404231,-0.744077,No Log,No Log,No Log,No Log
9,0.0833,4.639887,0.780694,1.0,3.859192,-801.399902,-573.929688,-0.510913,-0.508544,No Log,No Log,No Log,No Log
10,0.0992,3.062364,-0.112117,1.0,3.174481,-309.558258,-307.858673,-0.729365,-0.189225,No Log,No Log,No Log,No Log


In [118]:
# Prepare model for inferencing
FastLanguageModel.for_inference(dpo_trainer.model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
        (layers): ModuleList(
          (0-1): 2 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Li

## Prompt
**how can i develop a habit of drawing daily**

---

## Chosen
Developing a daily habit of drawing can be challenging but with consistent practice and a few tips, it can become an enjoyable and rewarding part of your daily routine. Here are some strategies to help you develop the habit of drawing daily:

1. **Set a specific time**  
   Allocate a specific time of the day to draw. It could be in the morning, afternoon, or evening. Make drawing a part of your daily routine.

2. **Set a specific duration**  
   Determine the amount of time you want to spend on drawing each day. It can be as little as 10 minutes or as long as an hour. Be consistent with the duration to help build the habit.

3. **Start small and simple**  
   Don't try to create a masterpiece every day; start with simple and easy-to-do sketches. Focus on improving your skills gradually.

4. **Use a variety of tools and mediums**  
   Experiment with different tools like pencils, pens, markers, and different mediums like paper, canvas, or digital apps to keep your drawing practice interesting and engaging.

5. **Take breaks and rest**  
   Taking breaks and resting after some time of drawing can help you avoid burnout and stay motivated.

6. **Challenge yourself**  
   Set challenges like drawing objects from memory or a specific subject to improve your skills and keep your drawing practice interesting.

7. **Track your progress**  
   Keep a record of your daily drawing practice and track your progress. This can be a source of motivation and help you see how far you've come.

> Remember, developing a habit takes time and patience. Stay consistent with your drawing practice, be flexible and open to trying new things, and with time, you'll develop a habit of daily drawing that brings you joy and satisfaction.

---

## Rejected
As an AI language model, I cannot personally develop habits for you. But, here are some tips for developing a habit of drawing daily:

1. **Start small**  
   Start with simple drawings or doodles and gradually increase the complexity of your drawings.

2. **Set a schedule**  
   Set a specific time every day to draw, and try to stick to it as much as possible.

3. **Make it fun**  
   Don't pressure yourself to create masterpieces every time you draw. Make it a relaxing and enjoyable experience.

4. **Use resources**  
   There are many drawing tutorials available online. Use resources like YouTube or online drawing courses to help you improve your skills.

5. **Surround yourself with inspiration**  
   Expose yourself to a variety of art forms, such as paintings, illustrations, and photographs, to inspire and motivate you.

> Remember, everyone has their own creative style and pace. Just keep practicing and enjoying the process of drawing.


#### After Training DPO model

In [119]:
TestInstructModel(model=dpo_trainer.model,
                  tokenizer=dpo_trainer.tokenizer, 
                  user_query="how can i develop a habit of drawing daily",
                  enable_thinking=False)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Model Inferencing...
------------------------------------------------------------------------------------------------------------------------------------------------------
Chat Template:  User: how can i develop a habit of drawing daily


Bot Response:
Assistant: Developing a habit of drawing daily can be a rewarding and enjoyable activity, but it requires consistency and a positive mindset. Here’s a step-by-step guide to help you establish and maintain this habit:

### 1. **Set a Regular Schedule**
   - **Start with a few days a week:** This will help you build a routine and avoid feeling overwhelmed.
   - **Stick to it:** Consistency is key. Even if it’s just one or two days a week, it’s better than never.

### 2. **Choose a Time and Place**
   - **Find a quiet, comfortable spot:** Whether it’s your desk, a quiet room, or even a dedicated drawing space, make sure it’s where you can focus without distractions.
   - **Set a timer:** Use a timer to remind yourself to draw. For example, 