<a href="https://colab.research.google.com/github/AbhinavKumar0000/Fine_Tunnin_LLama3/blob/main/Fine_tunning_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import json

from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("HPAI-BSC/medqa-cot-llama31")

In [18]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [19]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! Llama 3 is up to 8k
dtype = None
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", # Llama-3 70b also works (just change the model name)
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.7.9: Fast Llama patching. Transformers: 4.53.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [20]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

In [21]:
# Alpaca-style system prompt
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Make sure tokenizer is defined

def formatting_prompts_func(examples):
    # The MedQA-CoT dataset has these fields:
    # - system_prompt (fixed for all examples)
    # - question (instruction)
    # - response (output)
    # Input field is empty since questions are self-contained

    instructions = examples["question"]
    # Since there's no explicit input context, we'll use empty string
    inputs = [""] * len(instructions)
    outputs = examples["response"]

    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Use the specific system_prompt from the dataset for each example
        full_instruction = f"{examples['system_prompt'][0]}\n\n{instruction}"
        text = alpaca_prompt.format(full_instruction, input_text, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Load the dataset
from datasets import load_dataset
dataset = load_dataset("HPAI-BSC/medqa-cot-llama31", split="train")

# Apply formatting
dataset = dataset.map(formatting_prompts_func, batched=True)



Map:   0%|          | 0/10178 [00:00<?, ? examples/s]

In [22]:
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer

# ===== Ultra-Fast QLoRA Config =====
peft_config = LoraConfig(
    r=8,  # Minimal rank
    lora_alpha=16,
    lora_dropout=0.01,
    target_modules=["q_proj", "v_proj"],  # Only most critical layers
    task_type="CAUSAL_LM"
)

# ===== Speed-Optimized Training Args =====
training_args = TrainingArguments(
    output_dir="./quick_output",
    per_device_train_batch_size=4,  # Max batch size for T4
    gradient_accumulation_steps=2,  # Effective batch=8
    max_steps=150,                 # Super short run (~30 mins)
    learning_rate=5e-4,            # Higher LR for fast learning
    optim="adamw_8bit",
    fp16=True,
    logging_steps=5,
    save_steps=50,
    remove_unused_columns=True,    # Saves memory
    gradient_checkpointing=False,  # Disabled for speed (risky)
    report_to="none",
    seed=3407,
)

# ===== Blazing Fast Trainer =====
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=256,  # Short sequences
    packing=True,        # Critical for speed
    dataset_num_proc=1,  # Less CPU overhead
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/10178 [00:00<?, ? examples/s]

In [23]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
11.402 GB of memory reserved.


In [24]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,178 | Num Epochs = 1 | Total steps = 150
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
5,1.2798
10,0.9767
15,0.8601
20,0.8622
25,0.8356
30,0.8296
35,0.7997
40,0.7864
45,0.7947
50,0.7996




In [25]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "List the prime numbers contained within the range.", # instruction
        "1-50", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

["<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nList the prime numbers contained within the range.\n\n### Input:\n1-50\n\n### Response:\nTo approach this task, let's analyze the request and break it down step by step.\n\n**Understanding the Request:**\nThe task involves identifying the prime numbers contained within a given range. The range provided is 1-50, which means we need to find all prime numbers between 1 and 50.\n\n**Defining Prime Numbers:**\nA prime number is a positive integer that is divisible only by itself and 1. For example, 2, 3, 5, 7, 11, and 13 are prime numbers because they are divisible only by themselves and 1.\n\n**Algorithm for Finding Prime Numbers:"]

In [26]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Convert these binary numbers to decimal.", # instruction
        "1010, 1101, 1111", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Convert these binary numbers to decimal.

### Input:
1010, 1101, 1111

### Response:
To convert a binary number to decimal, you can use the following method: Start from the rightmost bit of the binary number, and assign each bit a weight based on its position. Then, sum up the weighted values of all the bits to get the decimal equivalent.

Let's break down the process for each of the given binary numbers:

**1010:**

- Start from the rightmost bit (0), and give it a weight of 1 (2^0).
- Next, move to the left and give the next bit (1) a weight of 2 (2^1).
- Continue in this fashion, giving


In [27]:
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

In [28]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# alpaca_prompt = You MUST run cells from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a famous tall tower in Paris?\n\n### Input:\n\n\n### Response:\nTo approach this question, let's analyze the information provided and break it down step by step.\n\nUnderstanding the Question: The question asks for a famous tall tower in Paris. The context suggests that the tower is a well-known landmark in the city.\n\nIdentifying the Tower: To identify the tower, let's consider the characteristics"]

In [29]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

In [30]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

In [31]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


AttributeError: 'SFTConfig' object has no attribute 'save_as_json'

In [35]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "You are an expert medical assistant named Aloe, developed by the High Performance Artificial Intelligence Group at Barcelona Supercomputing Center(BSC). You are to be a helpful, respectful, and honest assistant.', 'question': 'Answer the following question by selecting one of the possible choices. Explain the reasoning process of your decision", # instruction
        "Given the following medical question with options, your task is to select the correct answer by the following process: First summarize what the question is about, then analyze each option individually, and finally select the correct answer through a step-by-step process and conclude by your final option selected. Question: A 16-year-old boy is brought to the physician by his mother because she is worried about his behavior. Yesterday, he was expelled from school for repeatedly skipping classes. Over the past 2 months, he was suspended 3 times for bullying and aggressive behavior towards his peers and teachers. Once, his neighbor found him smoking cigarettes in his backyard. In the past, he consistently maintained an A grade average and had been a regular attendee of youth group events at their local church. The mother first noticed this change in behavior 3 months ago, around the time at which his father moved out after discovering his wife was having an affair. Which of the following defense mechanisms best describes the change in this patient's behavior?Choices:A. Acting outB. ProjectionC. Passive aggressionD. Regression", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an expert medical assistant named Aloe, developed by the High Performance Artificial Intelligence Group at Barcelona Supercomputing Center(BSC). You are to be a helpful, respectful, and honest assistant.', 'question': 'Answer the following question by selecting one of the possible choices. Explain the reasoning process of your decision

### Input:
Given the following medical question with options, your task is to select the correct answer by the following process: First summarize what the question is about, then analyze each option individually, and finally select the correct answer through a step-by-step process and conclude by your final option selected. Question: A 16-year-old boy is brought to the physician by his mother because she is worried about his behavior. Yesterday, he was exp

In [39]:
# Save the model
trainer.model.save_pretrained("output_model")

# Save the processing class (replacing deprecated tokenizer)
trainer.processing_class.save_pretrained("output_model")

# Save the training arguments
try:
    # Attempt to access the embedded TrainingArguments (may vary by trl version)
    training_args = trainer.args  # SFTConfig might wrap TrainingArguments differently
    if hasattr(training_args, 'to_dict'):
        import json
        with open("output_model/training_args.json", "w") as f:
            json.dump(training_args.to_dict(), f, indent=4)
    else:
        # Fallback: Recreate TrainingArguments and save
        from transformers import TrainingArguments
        training_args = TrainingArguments(
            output_dir="outputs",
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_steps=5,
            num_train_epochs=4,
            learning_rate=2e-4,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
        )
        training_args.save_to_json("output_model/training_args.json")  # Correct method
except AttributeError:
    # Last resort: Save SFTConfig as JSON
    import json
    with open("output_model/sft_config.json", "w") as f:
        json.dump(trainer.args.to_dict(), f, indent=4)