### Install Required Packages

In [1]:
# !pip install -q bitsandbytes trl

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from trl import SFTTrainer
from peft import LoraConfig, PeftModel, TaskType
import torch
import json
from datasets import Dataset

import warnings
warnings.filterwarnings('ignore')
logging.set_verbosity(logging.CRITICAL)

### Load the model with QLoRA Configuration

In [3]:
# Quantization settings that are passed to from_pretrained when the base model is loaded.
# NOTE(review): this selects 8-bit loading, while the section title says QLoRA, which
# usually refers to 4-bit quantization — confirm which one is intended.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,         # Quantize to 8-bit
    llm_int8_threshold=6.0,    # presumably the outlier threshold of the int8 scheme — confirm against the library docs
    llm_int8_has_fp16_weight=False,
)

In [4]:
# Hub identifier shared by the model and tokenizer loads below.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the base checkpoint using the quantization settings held in bnb_config.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}   # put the whole model on device 0; needs at least one GPU
)

# Only while fine-tuning/training, not for inference
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

# Load the tokenizer. Remote code execution is left disabled (the default),
# consistent with the model load above, which does not enable it either.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

### Baseline Generation

In [8]:
# Text-generation pipeline wrapping the base model before any fine-tuning,
# used to record a reference output for the same kind of prompt.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=80,
)

# Prompt pieces: question style and subject
ques_type, topic = "multiple choice question", "Physics"
prompt = f"Generate a {ques_type} on {topic}"

# The pipeline returns a list; show the text of its first entry
result = pipe(prompt)
print(result[0]["generated_text"])

Generate a multiple choice question on Physics that asks students to identify the type of force (e.g., gravitational, elastic, electromagnetic) and explain why it is relevant in a given situation.


### Load the Dataset

In [10]:
# Load dataset: one JSON object per non-blank line (JSON Lines).
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open("/content/custom_quiz_dataset.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert to decoder-only format: input + output combined as one sequence.
# Each record gains a "text" field made of the prompt followed by the
# "Question" and "Answer" entries of its "output" mapping.
for d in data:
    output = d["output"]
    d["text"] = f"{d['input']}\nQuestion: {output['Question']}\nAnswer: {output['Answer']}"

# Wrap the list of records in a datasets.Dataset for the trainer.
dataset = Dataset.from_list(data)

In [11]:
# Show the dataset summary (column names and number of rows).
dataset

Dataset({
    features: ['input', 'output', 'text'],
    num_rows: 2090
})

In [12]:
def preprocess(sample):
  """Return the training string stored under the 'text' key of ``sample``.

  Raises KeyError when ``sample`` has no 'text' entry.
  """
  text = sample['text']
  return text

### LoRA Configuration

In [13]:
# LoRA adapter settings handed to the trainer through peft_config.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Projections that receive adapters; adjust based on model architecture
    target_modules=["q_proj", "v_proj"],
    r=64,               # adapter rank
    lora_alpha=16,
    lora_dropout=0.1,   # dropout applied on the adapter path
    bias="none",
)

### WandB Login and Tracking

In [14]:
# import wandb
# wandb.login()

In [15]:
import os

# Weights & Biases settings supplied through environment variables.
# presumably read by the wandb reporting integration during training — verify.
os.environ.update(
    {
        "WANDB_PROJECT": "my-quiz-generator",
        "WANDB_LOG_MODEL": "checkpoint",
    }
)

### Training Arguments

In [20]:
# Hyperparameters for the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    # --- outputs, logging and checkpoints ---
    output_dir="./results",
    report_to="wandb",
    logging_steps=25,
    save_steps=25,
    # --- run length and batching ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,   # 2 x 4 = 8 sequences per optimizer step per device
    group_by_length=True,
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # NOTE(review): warmup_ratio is paired with a "constant" scheduler — confirm
    # whether "constant_with_warmup" was intended.
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- mixed precision (both switched off) ---
    fp16=False,
    bf16=False,
)

### SFTTrainer

In [21]:
# Supervised fine-tuning trainer: wraps the quantized base model with the LoRA
# adapters described by lora_config and trains on the strings that preprocess
# extracts from each dataset record.
trainer = SFTTrainer(
    model = base_model,
    train_dataset = dataset,
    peft_config = lora_config,
    formatting_func = preprocess,
    # Pass the tokenizer configured earlier (pad token, padding side) so the
    # trainer tokenizes with the same object that is saved after training,
    # instead of instantiating its own.
    processing_class = tokenizer,
    args = training_arguments,
)

Applying formatting function to train dataset:   0%|          | 0/2090 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/2090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2090 [00:00<?, ? examples/s]

### Initiate Fine-Tuning

In [25]:
# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

{'train_runtime': 350.5643, 'train_samples_per_second': 5.962, 'train_steps_per_second': 0.747, 'train_loss': 0.9374216349070309, 'num_tokens': 77011.0, 'mean_token_accuracy': 0.8013950082990858, 'epoch': 1.0}
TrainOutput(global_step=262, training_loss=0.9374216349070309, metrics={'train_runtime': 350.5643, 'train_samples_per_second': 5.962, 'train_steps_per_second': 0.747, 'total_flos': 483310034976768.0, 'train_loss': 0.9374216349070309})


### Save the Fine-Tuned Model

In [26]:
# Output directory shared by the model and tokenizer; reused later to build the inference pipeline.
new_model_name = "tinyllama_finetuned_qlora"

# NOTE(review): trainer.model is a PeftModelForCausalLM, so this presumably
# writes only the adapter weights rather than a merged model — confirm.
trainer.model.save_pretrained(new_model_name)
# Kept as the last expression so the cell displays the written tokenizer file paths.
tokenizer.save_pretrained(new_model_name)

('tinyllama_finetuned_qlora/tokenizer_config.json',
 'tinyllama_finetuned_qlora/special_tokens_map.json',
 'tinyllama_finetuned_qlora/chat_template.jinja',
 'tinyllama_finetuned_qlora/tokenizer.model',
 'tinyllama_finetuned_qlora/added_tokens.json',
 'tinyllama_finetuned_qlora/tokenizer.json')

In [27]:
# Rebuild the generation pipeline from the saved fine-tuned directory; this
# rebinds `pipe`, replacing the baseline pipeline created earlier.
pipe = pipeline(
    task="text-generation",
    model=new_model_name,
    tokenizer=new_model_name,
    max_length=80,
)

### Inference on the test prompt

In [35]:
# Test prompt for the fine-tuned pipeline: question style and subject
ques_type, topic = "True/False", "Physics"
prompt = f"Generate a {ques_type} on {topic}"

# The pipeline returns a list; show the text of its first entry
result = pipe(prompt)
print(result[0]["generated_text"])

Generate a True/False on Physics.
Question: In a gas, particles have equal masses.
Answer: True


In [36]:
# Display the module tree of the trained model; the LoRA adapters appear inside the targeted projections.
trainer.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): L