In [None]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast when the Hugging Face token is missing, before any model or
# dataset download is attempted.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
assert hf_token is not None, "⚠️ HUGGINGFACE_TOKEN is not set in .env!"

In [None]:
!pip install -q unsloth bitsandbytes datasets peft accelerate transformers

In [None]:
from unsloth import FastLanguageModel

In [None]:
# Load the pre-quantized 4-bit Llama-3 8B checkpoint together with its
# tokenizer. `dtype=None` leaves the compute dtype to Unsloth
# (presumably auto-detected from the GPU - confirm against the Unsloth docs).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    load_in_4bit=True,
    max_seq_length=2048,
    dtype=None,
)

In [4]:
from datasets import load_dataset, Dataset

In [None]:
# Download the cleaned Alpaca dataset and keep a random 2,000-row subset of
# its training split.
raw_dataset = load_dataset("yahma/alpaca-cleaned")
# A fixed seed makes the subset identical on every Restart & Run All; an
# unseeded shuffle would train on different rows each time.
train_data = raw_dataset["train"].shuffle(seed=42).select(range(2000))

In [6]:
# Three handwritten QLoRA question/answer records in the Alpaca schema
# (instruction / input / output), built from (instruction, input, output) rows.
extra_examples = [
    dict(zip(("instruction", "input", "output"), row))
    for row in (
        (
            "What is QLoRA?",
            "Explain like I'm new to fine-tuning.",
            "QLoRA is a method for fine-tuning large language models using 4-bit precision. It allows for low-resource training without sacrificing performance.",
        ),
        (
            "Why is QLoRA useful?",
            "I have limited GPU memory.",
            "QLoRA helps fine-tune big models efficiently using quantization and LoRA adapters, reducing memory requirements significantly.",
        ),
        (
            "Explain QLoRA in one sentence.",
            "",
            "QLoRA is a technique that fine-tunes large models efficiently using low-bit precision and adapter layers.",
        ),
    )
]

In [7]:
from datasets import Dataset, concatenate_datasets

# Wrap the handwritten QLoRA records in a Dataset and append them to the
# sampled Alpaca rows.
extra_dataset = Dataset.from_list(extra_examples)
# NOTE(review): rebinding `train_data` makes this cell non-idempotent -
# re-running it appends the extra rows again.
train_data = concatenate_datasets([train_data, extra_dataset])


In [8]:
def format_alpaca(example):
    """Render one Alpaca-style record as a single prompt string.

    Args:
        example: mapping with "instruction", "input" and "output" entries.

    Returns:
        The three sections, each introduced by a ``### <Title>:`` header line
        and separated by blank lines. The Input section is emitted even when
        the input text is empty.
    """
    sections = (
        ("Instruction", example["instruction"]),
        ("Input", example["input"]),
        ("Output", example["output"]),
    )
    return "\n\n".join(f"### {title}:\n{body}" for title, body in sections)

In [9]:
def tokenize(example):
    """Tokenize one Alpaca-style record for causal-LM fine-tuning.

    Args:
        example: mapping with the "instruction", "input" and "output" entries
            read by ``format_alpaca``.

    Returns:
        The tokenizer output for the formatted prompt (truncated / padded to
        2048 tokens) with an added "labels" list. Labels equal the input ids
        on real tokens and -100 on padding.
    """
    prompt = format_alpaca(example)
    tokens = tokenizer(prompt, truncation=True, padding="max_length", max_length=2048)
    # Every row is padded to 2048 tokens, so copying input_ids verbatim would
    # make the loss mostly about predicting pad tokens. -100 is the label
    # value the Hugging Face causal-LM loss ignores.
    tokens["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    # NOTE(review): no EOS token is appended to the prompt - confirm whether
    # the tokenizer adds one, otherwise the model never sees a stop token.
    return tokens

In [None]:
# Tokenize record by record and drop the raw text columns, so that only the
# tokenizer outputs and labels reach the Trainer (TrainingArguments below sets
# remove_unused_columns=False, so nothing would strip them later). This
# matches the batched map call used for the patch run further down.
tokenized_dataset = train_data.map(
    tokenize,
    batched=False,
    remove_columns=train_data.column_names,
)

In [None]:
import os
from dotenv import load_dotenv

# Re-read .env so this cell also works when run on its own.
load_dotenv()

from huggingface_hub import login

# Switch off Weights & Biases reporting for the training runs below.
os.environ.update({"WANDB_MODE": "disabled", "WANDB_DISABLED": "true"})

# Authenticate against the Hugging Face Hub; an empty or missing token aborts.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
assert hf_token, "HUGGINGFACE_TOKEN not found in .env"
login(token=hf_token)

In [12]:
from transformers import TrainingArguments

In [None]:
import torch

# Choose mixed precision from the GPU's capabilities instead of forcing fp16,
# the same way the patch-training cell further down does.
use_bf16 = torch.cuda.is_bf16_supported()

# First-stage fine-tuning configuration: effective batch size is
# 2 (per device) x 4 (accumulation steps), two epochs, checkpoints in
# ./results, and the Hub target set via push_to_hub / hub_model_id.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="Cre4T3Tiv3/unsloth-llama3-alpaca-lora",
    remove_unused_columns=False,
)

In [None]:
# Wrap the loaded model with LoRA adapters (rank 16, alpha 16, 5% dropout,
# no bias training).
# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell would wrap an already-wrapped model - confirm it is run once only.
model = FastLanguageModel.get_peft_model(
    model=model,
    r=16,
    lora_alpha=16,
    bias="none",
    lora_dropout=0.05,
)

In [15]:
from transformers import Trainer

In [16]:
# Build the first-stage trainer over the tokenized Alpaca subset plus the
# extra QLoRA rows.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
)

In [17]:
# Same three QLoRA records as the earlier `extra_examples` cell, redefined
# here from (instruction, input, output) rows.
extra_examples = [
    dict(zip(("instruction", "input", "output"), row))
    for row in (
        (
            "What is QLoRA?",
            "Explain like I'm new to fine-tuning.",
            "QLoRA is a method for fine-tuning large language models using 4-bit precision. It allows for low-resource training without sacrificing performance.",
        ),
        (
            "Why is QLoRA useful?",
            "I have limited GPU memory.",
            "QLoRA helps fine-tune big models efficiently using quantization and LoRA adapters, reducing memory requirements significantly.",
        ),
        (
            "Explain QLoRA in one sentence.",
            "",
            "QLoRA is a technique that fine-tunes large models efficiently using low-bit precision and adapter layers.",
        ),
    )
]


In [18]:

def generate_prompt(example):
    """Build the Alpaca prompt for one record.

    Args:
        example: mapping with "instruction", "input" and "output" entries.

    Returns:
        The preamble, the instruction, an Input section only when the input
        is not blank (empty or whitespace-only counts as blank), and the
        response.
    """
    if example["input"].strip():
        preamble = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
        input_section = f"### Input:\n{example['input']}\n\n"
    else:
        preamble = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
        input_section = ""

    instruction_section = f"### Instruction:\n{example['instruction']}\n\n"
    response_section = f"### Response:\n{example['output']}"
    return f"{preamble}\n\n{instruction_section}{input_section}{response_section}"

def generate_and_tokenize_prompt(example):
    """Format one record with ``generate_prompt`` and tokenize it.

    Args:
        example: mapping with "instruction", "input" and "output" entries.

    Returns:
        The tokenizer output (truncated / padded to 2048 tokens) plus a
        "labels" list, so the result can be used for causal-LM training like
        the output of ``tokenize``. Labels are -100 on padding.
    """
    prompt = generate_prompt(example)
    tokens = tokenizer(prompt, truncation=True, padding="max_length", max_length=2048)
    # Without labels no loss can be computed on these rows. -100 is the value
    # the Hugging Face causal-LM loss ignores, which keeps padding out of it.
    tokens["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens


In [None]:
from datasets import Dataset, concatenate_datasets

# Tokenize the handwritten QLoRA records with the Alpaca preamble prompt.
extra_dataset = Dataset.from_list(extra_examples)
tokenized_extra_dataset = extra_dataset.map(generate_and_tokenize_prompt)

# NOTE(review): `train_data` holds raw text columns only, while
# `tokenized_extra_dataset` also carries tokenizer columns, so the two schemas
# differ. The same three records were already appended to `train_data` in an
# earlier cell. `trainer` was built from `tokenized_dataset` before this cell,
# so in top-to-bottom order this rebinding does not change what
# trainer.train() sees - confirm whether this cell is still needed.
train_data = concatenate_datasets([train_data, tokenized_extra_dataset])


In [None]:
# Run the first-stage fine-tuning with the trainer built above.
trainer.train()

In [None]:
# Upload the trained model (safetensors serialization) and its tokenizer to
# the same Hub repository that is named in `hub_model_id` above.
model.push_to_hub("Cre4T3Tiv3/unsloth-llama3-alpaca-lora", safe_serialization=True)
tokenizer.push_to_hub("Cre4T3Tiv3/unsloth-llama3-alpaca-lora")

In [None]:
import json

# Ten QLoRA-focused records in the Alpaca schema, built from
# (instruction, input, output) rows. The trailing `* 3` repeats the list, so
# `extra_data` has 30 entries (each record appears three times).
extra_data = [
    {"instruction": instruction, "input": context, "output": answer}
    for instruction, context, answer in (
        (
            "Explain QLoRA to a beginner.",
            "",
            "QLoRA is a way to fine-tune large language models using less memory by combining quantization (compressing weights) and LoRA (efficient adaptation).",
        ),
        (
            "Write a tweet that explains QLoRA in simple terms.",
            "",
            "QLoRA = Quantized + LoRA. Fine-tune giant models on a laptop using 4-bit weights. Efficient, cheap, and surprisingly powerful. #AI #LLM",
        ),
        (
            "What is QLoRA?",
            "Explain it briefly for non-experts.",
            "QLoRA helps reduce memory use when fine-tuning language models by using 4-bit quantization and low-rank adapters (LoRA).",
        ),
        (
            "Simplify this concept for beginners: Quantized Low-Rank Adaptation",
            "",
            "Quantized Low-Rank Adaptation (QLoRA) makes fine-tuning big AI models faster and cheaper by compressing the model and only updating a small part of it.",
        ),
        (
            "Explain how QLoRA reduces memory usage.",
            "",
            "QLoRA reduces memory by using 4-bit quantized weights and applying LoRA adapters that require fewer parameters to train.",
        ),
        (
            "Describe QLoRA using a cooking analogy.",
            "",
            "QLoRA is like using a premade sauce base (the frozen model) and just adding a few spices (LoRA adapters) to customize the taste — without cooking from scratch (full training).",
        ),
        (
            "Explain QLoRA in one sentence.",
            "",
            "QLoRA is a memory-efficient way to fine-tune large models by using quantized weights and low-rank updates.",
        ),
        (
            "Why does QLoRA work well for limited hardware?",
            "",
            "QLoRA works on limited hardware because it keeps most of the model frozen and uses compressed weights, so you only update small parts using very little memory.",
        ),
        (
            "Create a short technical summary of QLoRA.",
            "",
            "QLoRA uses 4-bit NormalFloat quantization with low-rank matrix adaptation (LoRA) to enable full-parameter fine-tuning at reduced compute cost.",
        ),
        (
            "Give a fun, tweet-length QLoRA summary for AI enthusiasts.",
            "",
            "QLoRA = low-rank updates + 4-bit compression = fine-tune 65B models on consumer GPUs. 🤯📉 #LLM #AI",
        ),
    )
] * 3

# Persist the records as JSON Lines: one JSON object per line.
with open("qlora_grounded_patch.jsonl", "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in extra_data)

print("✅ QLoRA patch saved to qlora_grounded_patch.jsonl")

In [None]:
# Sanity check: show which model / tokenizer classes are live in the kernel.
for live_object in (model, tokenizer):
    print(type(live_object))

from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments, Trainer
import torch

# Full Alpaca training split plus the QLoRA patch file written above.
alpaca_dataset = load_dataset("yahma/alpaca-cleaned", split="train")
qlora_patch = load_dataset("json", data_files="qlora_grounded_patch.jsonl", split="train")

# The patch is appended three times, on top of the threefold repetition
# already present in the JSONL file.
merged_dataset = concatenate_datasets([alpaca_dataset] + [qlora_patch] * 3)
print(f"✅ Final merged dataset size: {len(merged_dataset)}")

def tokenize(example):
    """Tokenize a batch of Alpaca-style records for causal-LM fine-tuning.

    Written for ``Dataset.map(..., batched=True)``: every value in ``example``
    is a list with one entry per row. This definition reuses the name of the
    earlier single-record ``tokenize`` and replaces it in the kernel.

    Args:
        example: batch mapping with "instruction" and "output" lists and an
            optional "input" list.

    Returns:
        The tokenizer output (truncated / padded to 512 tokens) with an added
        "labels" entry. Labels equal the input ids on real tokens and -100 on
        padding.
    """
    instructions = example["instruction"]
    outputs = example["output"]
    inputs = example["input"] if "input" in example else [""] * len(instructions)

    prompts = []
    for instruction, inp, out in zip(instructions, inputs, outputs):
        prompt = f"### Instruction:\n{instruction}\n\n"
        # Append the Input section instead of splicing it in with
        # str.replace. The old replace consumed the blank line after the
        # instruction (gluing "### Input:" onto the instruction text) and
        # could land inside an instruction that itself contains a blank line.
        if inp and inp.strip():
            prompt += f"### Input:\n{inp}\n\n"
        prompt += f"### Response:\n{out}"
        prompts.append(prompt)

    encoded = tokenizer(prompts, truncation=True, padding="max_length", max_length=512)
    # -100 is the value the Hugging Face causal-LM loss ignores; masking the
    # padded positions keeps pad tokens out of the loss.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

# Tokenize in batches and keep only the tokenizer outputs and labels.
tokenized_dataset = merged_dataset.map(
    tokenize, batched=True, remove_columns=merged_dataset.column_names
)

# Return the three training columns as torch tensors.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Patch-run configuration: one epoch, effective batch size of
# 4 (per device) x 2 (accumulation steps), 8-bit paged AdamW, no experiment
# tracker, and a single retained checkpoint.
training_args = TrainingArguments(
    output_dir="./output-qlora-patch",
    report_to="none",
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,
)

# Second-stage trainer over the merged Alpaca + QLoRA patch data. It rebinds
# `trainer` and continues from the model already in the kernel.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    args=training_args,
)

trainer.train()

# Upload the model first, then the tokenizer, to the same Hub repository,
# authenticating with the token read from the environment.
for hub_artifact in (model, tokenizer):
    hub_artifact.push_to_hub("Cre4T3Tiv3/unsloth-llama3-alpaca-lora", token=hf_token)

print("✅ Adapter + tokenizer pushed to Hugging Face.")
