In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Install required libraries for QLoRA fine-tuning
!pip install -U transformers accelerate bitsandbytes datasets peft trl --quiet

## Load and Prepare the Dataset

In [3]:
from datasets import load_dataset


dataset = load_dataset("yahma/alpaca-cleaned")

dataset = dataset["train"].train_test_split(test_size=0.001)
train_data = dataset["train"]

# Defining a function to format the dataset into the Mistral instruction format

def format_alpaca(example):
    # This creates the necessary format: <s>[INST] Instruction/Input [/INST] Response </s>
    if example.get("input"):
        # Combine instruction and input if both exist
        text = f"<s>[INST] {example['instruction']}\n{example['input']} [/INST] {example['output']}</s>"
    else:
        # Use only instruction if input is empty
        text = f"<s>[INST] {example['instruction']} [/INST] {example['output']}</s>"
    return {"text": text}

# Apply the formatting function to the training data
train_data = train_data.map(format_alpaca, remove_columns=["instruction", "input", "output"])

# Display the first formatted example to confirm the structure
print("Formatted Example")
print(train_data[0]["text"])


README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51708 [00:00<?, ? examples/s]

Formatted Example
<s>[INST] Classify the following sentence according to the type of noun it contains. Output 1 for proper noun, 2 for common noun and 3 for collective noun.
The herd of cows is running across the field. [/INST] 3 for collective noun.</s>


## Load the Model and Tokenizer (4-bit Quantization)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the model with 4-bit quantization config (QLoRA)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.float16
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## Set Up LoRA Adapters (PEFT)

In [5]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,              # the size of the update matrices (small = efficient)
    lora_alpha=16,    # Scaling factor for the weights
    target_modules=["q_proj", "v_proj"], # Apply LoRA to Query and Value projections
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Print the number of trainable parameters (should be tiny, e.g., 0.1% of total)
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


## Define Trainer and Training Arguments

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments

# training arguments
training_args = TrainingArguments(
    output_dir="./qlora-mistral-demo",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=10,
    save_steps=500,
)

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    # max_seq_length=512,
    args=training_args,
)

print("SFTTrainer initialized successfully")

Adding EOS to train dataset:   0%|          | 0/51708 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/51708 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/51708 [00:00<?, ? examples/s]

SFTTrainer initialized successfully


## Start Training and Save Adapters

In [7]:
# Start the training process
trainer.train()

# Save trained LoRA adapters
trainer.model.save_pretrained("mistral-qlora-trained")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbharathreddybollu[0m ([33mbharathreddybollu-jawaharlal-nehru-technological-univers[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.1118
20,0.8285
30,0.7222
40,0.6543


KeyboardInterrupt: 

## Load Trained Adapters and Test Inference

In [None]:
from peft import PeftModel
from transformers import pipeline


base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16 # Use float16 for inference
)

# Load the trained LoRA adapters onto the base model
model_for_inference = PeftModel.from_pretrained(base_model, "mistral-qlora-trained")

# Create the text generation pipeline
pipe = pipeline("text-generation", model=model_for_inference, tokenizer=tokenizer)

# Define the prompt in the correct instruction format
prompt = "Explain gravity to a 5-year-old:"
formatted_prompt = f"<s>[INST] {prompt} [/INST]"

# Run the inference and print the result
output = pipe(formatted_prompt, max_new_tokens=80)[0]["generated_text"]

print("Model Output")
# The output will include the prompt and the response.
print(output)

Gravity is like an invisible friend that gently pulls everything down to the ground.
It keeps your feet on the floor so you don’t float away, and it makes a ball fall when you drop it.
The Earth is big, so it pulls things toward it all the time.
That pull is called gravity.


In [None]:
6081176375ff7bf0a2255a50d26039477ee11ffa