# Installing dependencies

In [1]:
!pip install -U transformers accelerate peft bitsandbytes datasets trl

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloa

# Importing Libraries

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import Trainer

# Choosing model and Configuring 4-bit Quantization (QLoRA core)

In [3]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load Model & Tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Configure LoRA

In [5]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # important
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

## Apply LoRA

In [6]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


# Load Dataset

In [7]:
dataset = load_dataset("yahma/alpaca-cleaned")

def tokenize(batch):
    texts = [
        inst + "\n" + out
        for inst, out in zip(batch["instruction"], batch["output"])
    ]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512
    )

train_dataset = dataset["train"].map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names
)


README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

# Setting Training Arguments

In [8]:
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=500,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    output_dir="./qlora-output",
    save_steps=100,
    save_total_limit=2,
    report_to="none"
)

# Trainer Setup and Training

In [9]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # IMPORTANT for causal LM
)
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    data_collator=data_collator
)
trainer.train()

Step,Training Loss
10,1.6121
20,1.4971
30,1.4214
40,1.3799
50,1.4966
60,1.3346
70,1.4061
80,1.3674
90,1.4042
100,1.389


TrainOutput(global_step=500, training_loss=1.3233944854736328, metrics={'train_runtime': 730.9707, 'train_samples_per_second': 5.472, 'train_steps_per_second': 0.684, 'total_flos': 1.2725929377792e+16, 'train_loss': 1.3233944854736328, 'epoch': 0.07727975270479134})

## Save LoRA Adapters

In [10]:
model.save_pretrained("qlora-adapter")
tokenizer.save_pretrained("qlora-adapter")

('qlora-adapter/tokenizer_config.json',
 'qlora-adapter/special_tokens_map.json',
 'qlora-adapter/chat_template.jinja',
 'qlora-adapter/tokenizer.model',
 'qlora-adapter/added_tokens.json',
 'qlora-adapter/tokenizer.json')

# Load Base Model

In [11]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, "qlora-adapter")
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

# Building Inference using prompt Fuction same as in our dataset

In [15]:
def build_prompt(instruction):
    return f"""### Instruction:
{instruction}

### Response:
"""
prompt = build_prompt("Explain QLoRA like I'm 10 years old.")

inputs = tokenizer(
    prompt,
    return_tensors="pt"
).to("cuda")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id
    )


## Response Time

In [16]:
response = tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[-1]:],
    skip_special_tokens=True
)

print(response)


QLoRA stands for Quantum Learning-based Robust Automated Reasoning and Learning Algorithm, which is a machine learning algorithm that can be used to improve the accuracy of the human reasoning process. The goal of this algorithm is to enable humans to make better decisions by leveraging the power of quantum mechanics.

The QLoRA algorithm works by training a neural network on large sets of data using supervised learning techniques such as decision trees, random forests, and neural networks. This neural network is then able to learn from past mistakes and adapt to new situations, making it an extremely accurate predictor of future outcomes.

One key benefit of using QLoRA over traditional algorithms is its ability to handle uncertainty in decision-making. Uncertainty arises when there are multiple possible outcomes or when there is a lack of information about the situation at hand. QLoRA is designed to be able to tolerate high levels of uncertainty by learning from previous experiences
