<a href="https://colab.research.google.com/github/Amityadav9/Fine-Tuning-Colab/blob/main/finetune_llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets accelerate peft trl torch unsloth

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.7/164.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.4/318.4 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Optional sanity check: report whether CUDA is usable and, if so, which device is attached
import torch

has_gpu = torch.cuda.is_available()
print("GPU available:", has_gpu)
if has_gpu:
    # Device index 0 is the first visible GPU
    print("GPU model:", torch.cuda.get_device_name(0))

GPU available: True
GPU model: Tesla T4


In [None]:
#@title Show current memory stats
# Imported here as well so this cell still runs on a fresh kernel when the
# optional GPU-check cell above is skipped.
import torch

BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved so far, in GiB, rounded to 3 decimals
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
# Total memory of device 0, in GiB, rounded to 3 decimals
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
0.0 GB of memory reserved.


In [None]:
import torch

In [None]:
# Cell 2: Import required libraries
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Cell 3: Load model and tokenizer
def load_base_model(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
):
    """Load a base model and its tokenizer through Unsloth.

    Args:
        model_name: Checkpoint identifier handed to
            ``FastLanguageModel.from_pretrained``.
        max_seq_length: Maximum sequence length the model is loaded with.
        load_in_4bit: Forwarded as ``load_in_4bit``.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by
        ``FastLanguageModel.from_pretrained``.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
    )
    return model, tokenizer

# Calling with no arguments keeps the notebook's original configuration.
model, tokenizer = load_base_model()

==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers, TRL and unsloth via:
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


In [None]:
# Cell 4: Attach LoRA adapters to the base model
def setup_peft_model(base_model):
    """Return ``base_model`` wrapped by ``FastLanguageModel.get_peft_model``.

    Uses rank ``r=16`` and targets the seven projection modules listed below.
    """
    attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
    mlp_projections = ["gate_proj", "up_proj", "down_proj"]
    return FastLanguageModel.get_peft_model(
        base_model,
        r=16,
        target_modules=attention_projections + mlp_projections,
    )

model = setup_peft_model(model)

Unsloth 2024.10.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# Cell 5: Build the training dataset
def prepare_dataset(tokenizer):
    """Load FineTome-100k and render each conversation into a ``"text"`` column.

    Args:
        tokenizer: Tokenizer to configure with the ``"llama-3.1"`` chat template.

    Returns:
        tuple: ``(dataset, tokenizer)`` — the mapped dataset and the tokenizer
        returned by ``get_chat_template``.
    """
    chat_tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

    def render_conversations(batch):
        # Batched map callback: one rendered (untokenized) string per conversation
        rendered = []
        for convo in batch["conversations"]:
            rendered.append(chat_tokenizer.apply_chat_template(convo, tokenize=False))
        return {"text": rendered}

    raw_split = load_dataset("mlabonne/FineTome-100k", split="train")
    standardized = standardize_sharegpt(raw_split)
    with_text = standardized.map(render_conversations, batched=True)
    return with_text, chat_tokenizer

dataset, tokenizer = prepare_dataset(tokenizer)

README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Standardizing format:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:

# Cell 6: Setup training arguments and trainer
def setup_trainer(model, dataset, tokenizer=None, max_seq_length=2048, report_to="none"):
    """Build an ``SFTTrainer`` for a short supervised fine-tuning run.

    Args:
        model: Model to train.
        dataset: Training dataset with a ``"text"`` column.
        tokenizer: Optional tokenizer to hand to the trainer. When omitted the
            trainer is constructed exactly as before, without an explicit
            tokenizer (the captured output shows it then fetches the tokenizer
            files again).
        max_seq_length: Forwarded to ``SFTTrainer`` as ``max_seq_length``.
        report_to: Logging integration(s) for ``TrainingArguments``. Defaults
            to ``"none"`` so training does not stop at the interactive
            Weights & Biases API-key prompt seen in the captured output; pass
            e.g. ``"wandb"`` to opt back in.

    Returns:
        SFTTrainer: Configured trainer; call ``.train()`` on it to start.
    """
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    use_bf16 = torch.cuda.is_bf16_supported()

    training_args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=1,
        output_dir="outputs",
        report_to=report_to,
    )

    # Only pass a tokenizer when the caller supplied one, so the default call
    # path is unchanged.
    optional_kwargs = {}
    if tokenizer is not None:
        optional_kwargs["tokenizer"] = tokenizer

    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        args=training_args,
        **optional_kwargs,
    )
    return trainer

# Reuse the tokenizer configured in prepare_dataset.
trainer = setup_trainer(model, dataset, tokenizer=tokenizer)


tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
# Baseline before training: start_gpu_memory and max_memory are read again by
# the final memory/time stats cell.
GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
total_bytes = gpu_stats.total_memory

start_gpu_memory = round(reserved_bytes / GIB, 3)
max_memory = round(total_bytes / GIB, 3)

for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = Tesla T4. Max memory = 14.748 GB.
2.768 GB of memory reserved.


In [None]:
# Cell 7: Train the model
# Note: This cell might take a while to execute
# Keep the returned TrainOutput so later cells can read its metrics
# (e.g. train_runtime) without having to train a second time.
trainer_stats = trainer.train()
trainer_stats

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,2.194
2,2.3322
3,1.7085
4,1.9617
5,1.7113
6,1.7627
7,1.1043
8,1.8268
9,1.5573
10,1.4278


TrainOutput(global_step=60, training_loss=1.152050319314003, metrics={'train_runtime': 672.5993, 'train_samples_per_second': 0.714, 'train_steps_per_second': 0.089, 'total_flos': 7096239206277120.0, 'train_loss': 1.152050319314003, 'epoch': 0.0048})

In [None]:
# Cell 8: Write the fine-tuned model to local Colab storage
# (check that the instance has enough free disk space first)
FINETUNED_MODEL_DIR = "finetuned_model"
model.save_pretrained(FINETUNED_MODEL_DIR)

In [None]:
# NOTE(review): this is a second trainer.train() call on the same trainer (Cell 7
# already ran one). The return value is kept because the memory/time stats cell
# below reads trainer_stats.metrics['train_runtime'].
trainer_stats=trainer.train()

**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,1.0813
2,1.4341
3,0.9007
4,0.9965
5,0.9397
6,1.112
7,0.6426
8,1.3582
9,1.0994
10,1.0955


In [None]:

#@title Show final memory and time stats
# Relies on start_gpu_memory / max_memory from the pre-training stats cell and
# on trainer_stats from the training cell.
GIB = 1024 ** 3

used_memory = round(torch.cuda.max_memory_reserved() / GIB, 3)
# Growth in peak reserved memory since the pre-training baseline
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

for line in (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(line)

495.8638 seconds used for training.
8.26 minutes used for training.
Peak reserved memory = 7.803 GB.
Peak reserved memory for training = 5.035 GB.
Peak reserved memory % of max memory = 52.909 %.
Peak reserved memory for training % of max memory = 34.14 %.


In [None]:


# Try more complex tests
def generate_response(messages, max_new_tokens=256):
    """Generate the assistant reply for a chat-style list of messages.

    No ``generate_response`` method is defined anywhere in this notebook, so
    this helper goes through the tokenizer's chat template and
    ``model.generate`` (the same entry point the later inference cell uses).

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: Decoded text of the newly generated tokens only (prompt and
        special tokens stripped).
    """
    FastLanguageModel.for_inference(model)
    try:
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # end the prompt with the assistant header
            return_dict=True,
            return_tensors="pt",
        ).to("cuda")
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    finally:
        # Later cells keep training this model, so switch back to training mode.
        FastLanguageModel.for_training(model)

    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

# Test domain knowledge
messages = [{"role": "user", "content": "How does dopamine affect working memory?"}]
response = generate_response(messages)
print(response)

# Test response style
messages = [{"role": "user", "content": "Can you explain traumatic brain injury in simple terms?"}]
response = generate_response(messages)
print(response)

In [None]:
from datasets import Dataset

# Training examples, one record per example: (instruction, input, output)
_BRAIN_QA_RECORDS = [
    # --- Medical and clinical ---
    (
        "Explain the symptoms of frontal lobe damage",
        "What are the signs that someone has frontal lobe damage or dysfunction?",
        "Frontal lobe damage typically shows up as problems with planning, organizing, and decision-making. Patients might have difficulty setting priorities, achieving goals, or controlling impulses. They often know what they should do but can't apply the rules properly. Even mild frontal lobe dysfunction can appear as difficulty focusing, getting distracted easily, or having trouble adapting to new situations.",
    ),
    (
        "Describe how to assess a concussion patient",
        "What should doctors look for when examining a concussion patient?",
        "When examining a concussion patient, doctors should assess multiple symptoms: mental clarity, sensitivity to light, dizziness, and sleep patterns. Many patients experience persistent symptoms even a year after injury. It's important to understand that even if they didn't lose consciousness, they may have significant symptoms that affect daily life. Regular follow-up is needed as symptoms may only become apparent when returning to normal activities.",
    ),
    (
        "Explain how brain function changes with age",
        "How does the brain's functionality change as people get older?",
        "The brain's functionality changes significantly with age, particularly in the frontal lobes. Executive functions typically peak in the early twenties. While some decline is normal with aging, staying mentally active can help maintain function. The frontal cortex often shows the first signs of age-related changes, affecting planning and organizational abilities.",
    ),
    # --- Cognitive science ---
    (
        "Explain what working memory is",
        "Define working memory and explain its importance in daily life",
        "Working memory is the brain's ability to hold and manipulate information temporarily. It's like a mental workspace where you can keep information active while using it - like remembering a phone number long enough to dial it, or holding the beginning of a sentence in mind while reading to the end. This ability is crucial for reading comprehension, problem-solving, and following conversations.",
    ),
    (
        "Describe how dopamine affects brain function",
        "How does the neurotransmitter dopamine influence brain performance?",
        "Dopamine acts as a crucial neurotransmitter that affects working memory and cognitive performance. It follows an inverted U-shaped curve - too little or too much can impair function. The optimal level varies between individuals. Dopamine helps maintain persistent neural activity in the frontal lobes, which is essential for working memory and executive function.",
    ),
    (
        "Explain what executive function means",
        "What is executive function and why is it important?",
        "Executive function refers to the brain's ability to plan, organize, and carry out specific cognitive strategies. It's like having a CEO in your brain that manages all other functions. This includes the ability to maintain goals, switch between tasks, control impulses, and adapt behavior to different situations. The frontal lobes are primarily responsible for these abilities.",
    ),
    # --- Treatment and recovery ---
    (
        "Explain how to recover from a concussion",
        "What steps should someone take to recover from a concussion?",
        "Recovering from a concussion requires several steps: First, get adequate rest while gradually returning to activities as tolerated. Avoid activities that worsen symptoms. Optimize sleep patterns as poor sleep can slow recovery. Consider cognitive rehabilitation exercises, and work with healthcare providers to monitor progress. Don't rush back to full activity too quickly.",
    ),
    (
        "Describe ways to improve cognitive function",
        "What are effective ways to maintain and improve cognitive abilities?",
        "To improve cognitive function: Maintain regular sleep patterns, engage in regular physical exercise, read books without interruption, learn new skills, and minimize distractions like constant phone checking. Practice focused attention through activities like reading complete book chapters without interruption. Consider brain training exercises that target specific cognitive functions.",
    ),
    (
        "Explain how sleep affects brain recovery",
        "How does sleep quality impact brain recovery and function?",
        "Sleep is fundamental for brain recovery and cognitive function. Poor sleep can significantly impair frontal lobe function and working memory. During sleep, the brain consolidates memories and repairs itself. Even one night of poor sleep can noticeably affect cognitive performance. Consistent good sleep is essential for maintaining optimal brain function and recovering from injury.",
    ),
]

# Pivot the records into the column-oriented dict used to build the dataset
training_data = {
    column: [record[position] for record in _BRAIN_QA_RECORDS]
    for position, column in enumerate(("instruction", "input", "output"))
}

# Convert to Dataset format
# NOTE: this rebinds `dataset`, which until now held the FineTome training split
# returned by prepare_dataset; re-run that cell before rebuilding the first trainer.
dataset = Dataset.from_dict(training_data)

# Render each example with the Alpaca prompt template used for training
def formatting_prompts_func(examples, eos_token=None):
    """Format a batch of instruction/input/output examples as Alpaca prompts.

    Args:
        examples: Batch mapping with parallel ``"instruction"``, ``"input"``
            and ``"output"`` lists (the shape ``Dataset.map(batched=True)``
            passes in).
        eos_token: End-of-sequence string appended to every prompt. Defaults
            to the notebook-level ``tokenizer.eos_token`` so the text ends
            with the token the tokenizer treats as end of sequence. The
            previous hard-coded ``"</s>"`` led to the captured generation
            repeating ``</s>`` until the token limit instead of stopping.

    Returns:
        dict: ``{"text": [...]}`` with one formatted prompt per example.
    """
    if eos_token is None:
        eos_token = tokenizer.eos_token

    alpaca_prompt = (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request.\n\n"
        "### Instruction:\n{}\n\n"
        "### Input:\n{}\n\n"
        "### Response:\n{}"
    )

    # `context` instead of `input` so the builtin is not shadowed
    texts = [
        alpaca_prompt.format(instruction, context, output) + eos_token
        for instruction, context, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    return {"text": texts}

# Add a "text" column holding the formatted prompt for every example
formatted_dataset = dataset.map(function=formatting_prompts_func, batched=True)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [None]:
# Load the formatted dataset into your training script
# Reuse the Cell 6 helper instead of repeating its configuration, so both
# training runs share one definition of the hyperparameters.
# With only 9 examples and a total batch size of 8, max_steps=60 is reported
# by the trainer as 60 epochs over this dataset.
trainer = setup_trainer(model, formatted_dataset)

# Start training
trained = trainer.train()

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 9 | Num Epochs = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.5742
2,2.6114
3,2.476
4,2.4993
5,2.1722
6,1.907
7,1.798
8,1.4532
9,1.2768
10,1.164


In [None]:
# Save the model after the second fine-tuning stage
# (check that the Colab instance has enough free disk space first)
BRAIN_MODEL_DIR = "finetuned_model_brain"
model.save_pretrained(BRAIN_MODEL_DIR)

In [None]:
from unsloth import FastLanguageModel

# 1. Format the prompt like in training
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input "
    "that provides further context. Write a response that appropriately "
    "completes the request.\n\n"
    "### Instruction:\n{}\n\n"
    "### Input:\n{}\n\n"
    "### Response:\n{}"
)


def generate_alpaca_response(instruction, input_text, max_new_tokens=256):
    """Generate a completion for one Alpaca-formatted prompt.

    Args:
        instruction: Text for the ``### Instruction:`` section.
        input_text: Text for the ``### Input:`` section.
        max_new_tokens: Upper bound on generated tokens (adjust for
            longer/shorter responses).

    Returns:
        str: The decoded sequence returned by ``model.generate`` (prompt
        included, special tokens not stripped).
    """
    # The response slot is left blank so the model fills it in.
    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    return tokenizer.batch_decode(outputs)[0]


# 2. Enable faster inference
FastLanguageModel.for_inference(model)

# 3. Generate and print a response
print(
    generate_alpaca_response(
        "Explain what working memory is",  # instruction
        "Provide a simple explanation of working memory and its importance",  # input
    )
)

# To try another question:
print(
    generate_alpaca_response(
        "What happens during a concussion?",  # instruction
        "Explain the effects of a concussion on the brain",  # input
    )
)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Explain what working memory is

### Input:
Provide a simple explanation of working memory and its importance

### Response:
Working memory is the brain's ability to hold and manipulate information temporarily. It's like a mental workspace where you can keep information active while using it - like remembering a phone number long enough to dial it, or holding the beginning of a sentence in mind while reading to the end. This ability is crucial for reading comprehension, problem-solving, and following conversations.</s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s

In [None]:
# Save the model and its tokenizer side by side in one local directory
LORA_OUTPUT_DIR = "lora_model"

model.save_pretrained(LORA_OUTPUT_DIR) # Local saving
tokenizer.save_pretrained(LORA_OUTPUT_DIR)
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')