# Setting Up the Fine-tuning Environment

In [1]:
!pip install datasets trl



In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from datasets import Dataset
import json
from trl import SFTTrainer
import wandb

# Initialize wandb for experiment tracking
wandb.init(project="qwen-ai-research-qa")

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load datasets
def load_json_dataset(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

train_data = load_json_dataset("/content/drive/MyDrive/Mirai - ML hackathon/Task3/train_data.json")
val_data = load_json_dataset("/content/drive/MyDrive/Mirai - ML hackathon/Task3/val_data.json")

# Convert to HF datasets format
def convert_to_hf_dataset(data):
    formatted_data = []
    for item in data:
        formatted_data.append({
            "messages": item["conversations"]
        })
    return formatted_data

train_dataset = Dataset.from_list(convert_to_hf_dataset(train_data))
val_dataset = Dataset.from_list(convert_to_hf_dataset(val_data))

# Define training hyperparameters
model_name = "Qwen/Qwen2.5-3B-Instruct"
output_dir = "/content/drive/MyDrive/Mirai - ML hackathon/Task3/qwen-ai-research-qa"

# QLoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,  # Alpha scaling
    lora_dropout=0.05,  # Dropout probability
    bias="none",  # Don't train bias
    task_type="CAUSAL_LM",  # Task type
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ]  # Attention modules to target
)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,  # Reduced batch size
    gradient_accumulation_steps=16,  # Increase to maintain effective batch size if needed
    num_train_epochs=3,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    fp16=False,  # Mixed precision training
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    report_to="wandb",
    # BF16 would be used if available on GPU
    bf16=torch.cuda.is_bf16_supported(),
    max_grad_norm=0.3,  # Gradient clipping
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Function to print model size
def print_trainable_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_params} || "
        f"trainable%: {100 * trainable_params / all_params:.2f}"
    )

# Save configuration
config = {
    "model_name": model_name,
    "lora_config": {
        "r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "lora_dropout": lora_config.lora_dropout,
        "target_modules": list(lora_config.target_modules)
    },
    "training_args": {
        "num_train_epochs": training_args.num_train_epochs,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "learning_rate": training_args.learning_rate,
        "warmup_ratio": training_args.warmup_ratio,
        "weight_decay": training_args.weight_decay
    }
}

with open(os.path.join(output_dir, "training_config.json"), "w") as f:
    json.dump(config, f, indent=2)

# Print dataset stats
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmyeduwebsites[0m ([33mmyeduwebsites-university-of-moratuwa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Training dataset size: 48
Validation dataset size: 6


# Fine-tuning the Qwen 2.5 3B Model

In [3]:
!pip install -U bitsandbytes transformers accelerate



In [6]:
# Continuing from the setup code...
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

# Define training hyperparameters
model_name = "Qwen/Qwen2.5-3B-Instruct"
output_dir = "/content/drive/MyDrive/Mirai - ML hackathon/Task3/qwen-ai-research-qa"

print("Loading model...")
# Load model in 4-bit quantization
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     load_in_4bit=True,
#     device_map="auto"
# )
from transformers import BitsAndBytesConfig

# Define quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load model with quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,  # Use the new argument
    device_map="auto",

)
# Prepare model for training
model = prepare_model_for_kbit_training(model)

# Add LoRA adapters
model = get_peft_model(model, lora_config)

# Print trainable parameters
print_trainable_parameters(model)

# Chat template for proper formatting
response_template = "\n<|im_start|>assistant\n"

def formatting_func(example):
    output_texts = []

    # Check the structure of `example["messages"]`
    print("Debug:", example["messages"], type(example["messages"]))

    for messages in example["messages"]:  # No need for indexing (i)
        text = ""
        if isinstance(messages, list):  # Ensure it's a list
            for message in messages:
                if isinstance(message, dict) and "role" in message and "content" in message:
                    if message["role"] == "user":
                        text += f"<|im_start|>user\n{message['content']}<|im_end|>\n"
                    else:
                        text += f"<|im_start|>assistant\n{message['content']}<|im_end|>\n"
                else:
                    print("Warning: Unexpected message format:", message)
        else:
            print("Warning: Unexpected messages format:", messages)

        output_texts.append(text)
    print(output_texts)
    return output_texts


# Initialize SFT trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
    tokenizer=tokenizer,
    formatting_func=formatting_func,
    peft_config=lora_config,
)

print("Starting training...")
trainer.train()

# Save the trained model
print("Saving model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Training complete!")

# Evaluate the model's perplexity on the validation set
eval_results = trainer.evaluate()
print(f"Validation Loss: {eval_results['eval_loss']}")
print(f"Validation Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss']))}")

# Log the final evaluation metrics to wandb
wandb.log({"final_val_loss": eval_results["eval_loss"],
           "final_val_perplexity": torch.exp(torch.tensor(eval_results["eval_loss"])).item()})
wandb.finish()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 29933568 || all params: 1728606208 || trainable%: 1.73
Debug: [{'content': 'Based on the DeepSeek V3 research paper, Could you elaborate on what is deepseek v3 and why is it significant in the ai industry?', 'role': 'user'}, {'content': 'DeepSeek V3 is an AI model from the Chinese startup Deepseek that has caused a stir in the AI space due to its impressive performance at a fraction of the training and inference cost compared to competitors. It uses a Mixture-of-Experts (MoE) architecture with 671B parameters but only 37B active parameters per token, which significantly reduces compute requirements. Its significance lies in improving algorithms at the software level rather than pushing for better hardware, making AI development more efficient and potentially disrupting the industry.', 'role': 'assistant'}] <class 'list'>
['', '']


  trainer = SFTTrainer(


Applying formatting function to train dataset:   0%|          | 0/48 [00:00<?, ? examples/s]

Debug: [[{'content': 'Based on the DeepSeek V3 research paper, Could you elaborate on what is deepseek v3 and why is it significant in the ai industry?', 'role': 'user'}, {'content': 'DeepSeek V3 is an AI model from the Chinese startup Deepseek that has caused a stir in the AI space due to its impressive performance at a fraction of the training and inference cost compared to competitors. It uses a Mixture-of-Experts (MoE) architecture with 671B parameters but only 37B active parameters per token, which significantly reduces compute requirements. Its significance lies in improving algorithms at the software level rather than pushing for better hardware, making AI development more efficient and potentially disrupting the industry.', 'role': 'assistant'}], [{'content': "Based on the research about DeepSeek V3, please answer the following question: What are the key differences between DeepSeek's approach and that of other AI companies?", 'role': 'user'}, {'content': "The key difference in

Converting train dataset to ChatML:   0%|          | 0/48 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/48 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/48 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/48 [00:00<?, ? examples/s]

Debug: [{'content': 'Based on the DeepSeek V3 research paper, Please explain what is multi-head latent attention (mla) and how was it used in deepseek v3.', 'role': 'user'}, {'content': 'Multi-head Latent Attention (MLA) is a technique used in DeepSeek V3 that compresses the Key-Value cache. This reduces memory usage and enables more efficient training by decreasing the memory footprint required during model operation.', 'role': 'assistant'}] <class 'list'>
['', '']


Applying formatting function to eval dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Debug: [[{'content': 'Based on the DeepSeek V3 research paper, Please explain what is multi-head latent attention (mla) and how was it used in deepseek v3.', 'role': 'user'}, {'content': 'Multi-head Latent Attention (MLA) is a technique used in DeepSeek V3 that compresses the Key-Value cache. This reduces memory usage and enables more efficient training by decreasing the memory footprint required during model operation.', 'role': 'assistant'}], [{'content': "Based on the DeepSeek V3 research paper, I'd like to know more about how many parameters does deepseek v3 have and how many are active during inference.", 'role': 'user'}, {'content': 'DeepSeek V3 has a total of 671B parameters in its Mixture-of-Experts (MoE) architecture, but only 37B parameters are active (fire) for each token during processing. This sparse activation approach significantly reduces the computational requirements compared to dense models of similar size.', 'role': 'assistant'}], [{'content': 'Here is information a

Converting eval dataset to ChatML:   0%|          | 0/6 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,No log,2.300149


Saving model...
Training complete!


Validation Loss: 2.3001489639282227
Validation Perplexity: 9.975667953491211


0,1
eval/loss,█▁▁
eval/mean_token_accuracy,▁▇█
eval/runtime,█▃▁
eval/samples_per_second,▁▄█
eval/steps_per_second,▁██
final_val_loss,▁
final_val_perplexity,▁
train/epoch,▁███
train/global_step,▁████

0,1
eval/loss,2.30015
eval/mean_token_accuracy,0.51749
eval/runtime,19.0164
eval/samples_per_second,0.316
eval/steps_per_second,0.053
final_val_loss,2.30015
final_val_perplexity,9.97567
total_flos,433024373784576.0
train/epoch,1.66667
train/global_step,3.0
