# SFT

In [52]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import torch
import gc
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig # Import for 4-bit quantization

In [53]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [54]:
model_name = "qwen2-1.5b"

In [55]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                 # Load model weights in 4-bit precision
    bnb_4bit_quant_type="nf4",        # Use NF4 quantization for better performance
    bnb_4bit_compute_dtype=torch.float16, # Compute activations in float16 for speed
    bnb_4bit_use_double_quant=True,   # Optional: further quantize the quantization constants
)

In [56]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, 
    device_map="auto",             
    torch_dtype=torch.float16,     
)

In [57]:
model = prepare_model_for_kbit_training(model)


In [58]:
peft_config = LoraConfig(
    r=12,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
    modules_to_save=["lm_head", "embed_token"], # Ensure these are trained in full precision
    task_type="CAUSAL_LM",
)

In [59]:
dataset = load_dataset("json", data_files="data.json", split="train")


In [60]:
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["question", "hint"]])


In [61]:

def format_prompt(example):
    # Ensure this matches the tokenizer's chat template or your desired format precisely.
    # For Qwen2-Instruct, it expects a specific chat template.
    # Let's use the tokenizer's apply_chat_template for consistency.
    messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['hint']}
    ]
    # apply_chat_template will convert messages into the correct format for the model
    # It also handles adding special tokens like <|im_start|> and <|im_end|>
    return {"text": tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)}


In [62]:
dataset = dataset.map(format_prompt)

# Keep only 'text' column
dataset = dataset.remove_columns([col for col in dataset.column_names if col != "text"])


In [63]:
training_args = SFTConfig(
    output_dir=f"{model_name}-SFT",
    per_device_train_batch_size=1,      # **CRITICAL**: Reduce this to 1 to start
    gradient_accumulation_steps=8,      # **CRITICAL**: Increase this to compensate for batch_size=1
    gradient_checkpointing=True,        # **CRITICAL**: Saves memory by recomputing activations
    learning_rate=2e-4,                 # Common learning rate for LoRA fine-tuning
    num_train_epochs=12,                 # Or more, depending on dataset size and convergence
    optim="paged_adamw_8bit",           # Use 8-bit AdamW optimizer for memory savings
    logging_steps=10,                   # Log progress frequently
    save_steps=500,                     # Save checkpoints
    fp16=True,                          # Enable mixed precision training (PyTorch native)
    # SFTConfig specific parameters:
    dataset_text_field="text",          # The name of the column containing the text
    packing=True,                       # Packs multiple short examples into one longer sequence
    max_seq_length=1024,                # Adjust based on your data and memory. Shorter sequences save memory.
    # We pass peft_config directly to SFTTrainer, not here.
)

# -------------------- SFTTrainer Initialization and Training --------------------
trainer = SFTTrainer(
    model=model,
    args=training_args,        # Pass the SFTConfig object here
    train_dataset=dataset,
    peft_config=peft_config,   # Pass your LoRA config here
    # The 'packing' and 'max_seq_length' are now part of SFTConfig and automatically handled by SFTTrainer
    # No need for formatting_prompts_func directly as the dataset is already prepared with 'text' column
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [64]:

trainer.train()

print("Fine-tuning complete!")

Step,Training Loss
10,0.6226
20,0.3735


Fine-tuning complete!


In [65]:
trainer.save_model()

# DPO

In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import gc
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import DPOConfig, DPOTrainer # Import DPOConfig and DPOTrainer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import BitsAndBytesConfig


In [2]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [3]:
model_name = "merged_qwen2-1.5b_SFT_model" # Use the instruct version, as DPO optimizes for instruction following

In [4]:
# BitsAndBytesConfig for 4-bit quantization (memory efficiency)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [5]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Add pad token if missing, crucial for batching and generation
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

In [6]:
# Load the Policy Model (the one to be fine-tuned)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto", # Auto-distribute model across available GPUs
    torch_dtype=torch.float16, # Use float16 for computation
)




In [7]:
# Load the Reference Model. This is usually the SFT-tuned model or the base model.
# For memory efficiency, load it with the same 4-bit quantization.
ref_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [8]:
# Resize token embeddings for both models if a new pad_token was added
if tokenizer.pad_token is not None and len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))
    ref_model.resize_token_embeddings(len(tokenizer))


# Prepare models for k-bit training (PEFT compatibility)
model = prepare_model_for_kbit_training(model)
ref_model = prepare_model_for_kbit_training(ref_model) # Also prepare ref_model for kbit


In [9]:
# LoRA Configuration (PEFT)
peft_config = LoraConfig(
    r=12,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear", # Apply LoRA to all linear layers
    modules_to_save=["lm_head", "embed_tokens"], # Ensure these key layers are trained in full precision
    task_type="CAUSAL_LM",
)

In [10]:
# Apply PEFT to the policy model
model = get_peft_model(model, peft_config)
# The ref_model does not get PEFT applied for DPO, it remains frozen and serves as a baseline.

In [11]:
# --- DPO Dataset Preparation ---
# Load your dataset with 'question', 'hint' (chosen), and 'response' (rejected)
raw_dataset = load_dataset("json", data_files="data.json", split="train")

def format_dpo_examples(example):
    """
    Formats the raw dataset example into the 'prompt', 'chosen', 'rejected'
    format required by DPOTrainer, using the Qwen chat template.
    """
    # Construct the user prompt using the Qwen chat template
    user_messages = [{"role": "user", "content": example['question']}]
    prompt_text = tokenizer.apply_chat_template(user_messages, tokenize=False, add_generation_prompt=True)

    # Construct the chosen response, including the prompt and assistant's turn
    chosen_messages = user_messages + [{"role": "assistant", "content": example['hint']}]
    chosen_text = tokenizer.apply_chat_template(chosen_messages, tokenize=False, add_generation_prompt=False)

    # Construct the rejected response, including the prompt and assistant's turn
    rejected_messages = user_messages + [{"role": "assistant", "content": example['response']}]
    rejected_text = tokenizer.apply_chat_template(rejected_messages, tokenize=False, add_generation_prompt=False)

    return {
        "prompt": prompt_text,
        "chosen": chosen_text,
        "rejected": rejected_text,
    }

In [12]:
# Apply the formatting function to your dataset
dpo_dataset = raw_dataset.map(
    format_dpo_examples,
    remove_columns=raw_dataset.column_names # Remove original columns after mapping
)

In [13]:
# --- DPO Training Arguments (DPOConfig) ---
dpo_training_args = DPOConfig(
    output_dir=f"{model_name}-DPO_2",
    per_device_train_batch_size=1,        # **CRITICAL**: Keep at 1 for memory
    gradient_accumulation_steps=8,        # **CRITICAL**: Compensate for batch_size=1
    gradient_checkpointing=True,          # **CRITICAL**: Saves memory by recomputing activations
    learning_rate=5e-5,                   # DPO typically uses a lower LR than SFT
    num_train_epochs=3,                   # Adjust based on dataset size and convergence
    optim="paged_adamw_8bit",             # Use 8-bit AdamW for memory savings
    logging_steps=10,
    save_steps=500,
    fp16=True,                            # Enable mixed precision training
    # DPO specific parameters:
    beta=0.1,                             # Controls the strength of the preference. Start with 0.1-0.5.
    max_length=1024,                      # Max total sequence length for chosen/rejected
    max_prompt_length=512,                # Max length for the prompt part
    # max_completion_length is automatically derived from max_length and max_prompt_length
)

In [14]:
# --- DPOTrainer Initialization and Training ---
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,                  # The frozen reference model
    args=dpo_training_args,               # Pass the DPOConfig object
    train_dataset=dpo_dataset,
    peft_config=peft_config,    
    processing_class=tokenizer          # Apply LoRA to the policy model during DPO
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
print("\n--- Starting DPO training ---")
dpo_trainer.train()

print("DPO fine-tuning complete!")




--- Starting DPO training ---


wandb: Currently logged in as: a-a-elghawas (a-a-elghawas-king-fahd-university-of-petroleum-minerals) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,0.1795
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


DPO fine-tuning complete!


In [16]:
dpo_trainer.save_model()

# Inference

In [31]:
from peft import PeftModel


In [36]:
t = "SFT"

In [37]:

# --- Configuration ---
# Set the base model name (the one you fine-tuned from)
base_model_name = "qwen2-1.5b"
# Set the directory where your LoRA adapters were saved
lora_adapter_path = f"{base_model_name}-{t}"

# --- 1. Load the Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Ensure pad_token is set if it was added during training
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# --- 2. Load the Base Model (with quantization if used during training) ---
# If you trained with 4-bit quantization, you must load the base model with it too.
# Otherwise, load it in full precision (float16 or float32)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print(f"Loading base model: {base_model_name}")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config, # Use the same quantization config as training
    device_map="auto",              # Load the base model onto available devices
    torch_dtype=torch.float16,      # Match compute dtype from training
)

# --- 3. Load LoRA Adapters ---
print(f"Loading LoRA adapters from: {lora_adapter_path}")
model = PeftModel.from_pretrained(base_model, lora_adapter_path)

# --- 4. Merge LoRA Adapters into the Base Model (Optional but recommended for inference) ---
# Merging makes the model a regular Hugging Face model, no longer needing PEFT internally.
# This often results in faster inference and can be saved as a standard model.
print("Merging LoRA adapters into the base model...")
model = model.merge_and_unload() # This will put the model in float16 (or original precision)
print("Adapters merged.")

# --- 5. (Optional) Save the Merged Model ---
# This allows you to load the model directly without PEFT in the future.
merged_model_output_path = f"./merged_{base_model_name}_{t}_model"
print(f"Saving merged model to: {merged_model_output_path}")
model.save_pretrained(merged_model_output_path)
tokenizer.save_pretrained(merged_model_output_path)
print("Merged model and tokenizer saved.")


Loading base model: qwen2-1.5b
Loading LoRA adapters from: qwen2-1.5b-SFT
Merging LoRA adapters into the base model...




Adapters merged.
Saving merged model to: ./merged_qwen2-1.5b_SFT_model
Merged model and tokenizer saved.


In [59]:

# --- Configuration ---
# Set the base model name (the one you fine-tuned from)
base_model_name = "qwen2-1.5b"
# Set the directory where your LoRA adapters were saved
# lora_adapter_path = f"{model_name}-{t}"

# --- 1. Load the Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Ensure pad_token is set if it was added during training
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# --- 2. Load the Base Model (with quantization if used during training) ---
# If you trained with 4-bit quantization, you must load the base model with it too.
# Otherwise, load it in full precision (float16 or float32)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print(f"Loading base model: {base_model_name}")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config, # Use the same quantization config as training
    device_map="auto",              # Load the base model onto available devices
    torch_dtype=torch.float16,      # Match compute dtype from training
)
model = base_model

Loading base model: qwen2-1.5b


In [60]:
model.eval()

# --- 7. Example Inference ---
print("\n--- Performing Inference ---")

# Define your prompt (use the same format as your training data)
def format_prompt_for_inference(question):
    messages = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": ""} # Assistant's turn to respond
    ]
    # Use add_generation_prompt=True for inference to prime the model
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)



--- Performing Inference ---


In [77]:

single_question = """
اختر الإجابة الصحيحة مما يلي:
ما هو جمع العدد 6 + 8
    1أ. 4
ب. 6
ج. 7
د. 2
"""


single_question = """
اختر الإجابة الصحيحة مما يلي:
"ما هو العدد الفردي التالي للعدد 49؟

أ. 48
ب. 50
ج. 51
د. 52"
"""



In [78]:
prompt = format_prompt_for_inference(single_question)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# --- 8. Generate response ---
print(f"\n--- Question: {single_question} ---")
with torch.no_grad(): # Disable gradient calculations for inference to save memory and speed up
    outputs = model.generate(
        **inputs,
        max_new_tokens=250, # Max tokens for the generated response
        do_sample=True,     # Use sampling for more creative/diverse outputs
        temperature=0.7,    # Lower values make output more deterministic (0.7 is a good balance)
        top_p=0.9,          # Nucleus sampling
        eos_token_id=tokenizer.eos_token_id, # Stop generation at end of sequence token
        pad_token_id=tokenizer.pad_token_id  # Important for batch generation (even with batch_size=1)
    )

# --- 9. Decode the generated tokens ---
# Skip the prompt tokens to get only the generated response
response = tokenizer.decode(outputs[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)

print(f"Response:\n{response}")
print("-" * 30)


--- Question: 
اختر الإجابة الصحيحة مما يلي:
"إذا بدأت رحلتك في الساعة 8:30 صباحاً ووصلت وجهتك في الساعة 10:00 صباحاً، فكم استغرقت رحلتك؟

أ. ساعة واحدة
ب. ساعة ونصف
ج. ساعتان
د. نصف ساعة"
 ---
Response:
The answer is "د. نصف ساعة" or "half an hour". This is because the passage states that the person started their journey at 8:30 a.m. and arrived at their destination at 10:00 a.m., which is a total of 1 hour and 30 minutes. Therefore, the travel time is 1.5 hours, or 90 minutes, not 60 minutes. This means the journey lasted for 30 minutes, which is half an hour. 

So, the answer is "د. نصف ساعة" or "half an hour". 

Explanation: 

The passage states that the person started their journey at 8:30 a.m. and arrived at their destination at 10:00 a.m. This is a total of 1 hour and 30 minutes. Therefore, the travel time is 1.5 hours, or 90 minutes. This means the journey lasted for 30 minutes, which is half an hour. Therefore, the answer is "د. نصف ساعة" or "half an hour". 

Explanation: 

T