### Install Required Packages

In [None]:
# !pip install -q bitsandbytes trl

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from trl import SFTTrainer
from peft import LoraConfig, PeftModel, TaskType
import torch
import json
from datasets import Dataset

import warnings
# Suppress every Python warning for the rest of the session. This keeps the
# notebook output short, but it also hides deprecation notices.
warnings.filterwarnings('ignore')
# Restrict the transformers logger (imported above as `logging`) to
# CRITICAL-level messages.
logging.set_verbosity(logging.CRITICAL)

### Load the model with QLoRA Configuration

In [None]:
# bitsandbytes settings used when loading the base model: 8-bit weights.
# NOTE(review): the section title says QLoRA, which usually refers to 4-bit
# NF4 loading (`load_in_4bit=True`); this cell loads in 8-bit — confirm which
# one is intended.
bnb_config = BitsAndBytesConfig(
    llm_int8_has_fp16_weight=False,
    llm_int8_threshold=6.0,
    load_in_8bit=True,  # Quantize to 8-bit
)

In [None]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the base model with the quantization settings from `bnb_config`.
# device_map={"": 0} maps the whole model onto device 0, so this cell needs
# at least one GPU.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Training-time settings only; not meant for inference.
# NOTE(review): the baseline generation cell further down runs with this same
# `base_model`, i.e. with use_cache already disabled — confirm that is acceptable.
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

# Load the tokenizer.
# `trust_remote_code` is deliberately not passed: the model above is loaded
# without it, and enabling it would allow code hosted in the Hub repository
# to be executed locally.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

### Baseline Generation

In [None]:
# Text-generation pipeline over the base model, used to capture a
# "before fine-tuning" sample.
pipe = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=80,
)

# Prompt pieces and the assembled prompt.
ques_type, topic = "multiple choice question", "Physics"
prompt = f"Generate a {ques_type} on {topic}"

# Run the pipeline and show the text of the first returned candidate.
result = pipe(prompt)
print(result[0]["generated_text"])

### Load the Dataset

In [None]:
# Load the JSONL dataset: one JSON object per non-blank line.
# The encoding is explicit so the read does not depend on the platform's
# default locale.
with open("/content/custom_quiz_dataset.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert to decoder-only format: input + output combined as one sequence.
# Each record must provide "input" and an "output" mapping with "Question"
# and "Answer" keys; a missing key raises KeyError.
for d in data:
    output = d["output"]
    d["text"] = f"{d['input']}\nQuestion: {output['Question']}\nAnswer: {output['Answer']}"

# Build the Dataset from the records, which now also carry the "text" field.
dataset = Dataset.from_list(data)

In [None]:
# Notebook echo: show the dataset object as the cell output.
dataset

In [None]:
def preprocess(sample):
    """Return the ``'text'`` entry of *sample*.

    Passed to ``SFTTrainer`` as its ``formatting_func``. A sample without a
    ``'text'`` key raises ``KeyError``.
    """
    text = sample["text"]
    return text

### LoRA Configuration

In [None]:
# LoRA adapter settings for causal-LM fine-tuning.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Modules that receive adapters; adjust based on model architecture.
    target_modules=["q_proj", "v_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

### WandB Login and Tracking

In [None]:
# import wandb
# wandb.login()

In [None]:
import os

# Weights & Biases settings, supplied through environment variables.
os.environ.update(
    {
        "WANDB_PROJECT": "my-quiz-generator",
        "WANDB_LOG_MODEL": "checkpoint",
    }
)

### Training Arguments

In [None]:
training_arguments = TrainingArguments(
    # --- Output, checkpointing and reporting ---
    output_dir="./results",
    save_steps=25,
    logging_steps=25,
    report_to="wandb",
    # --- Run length and batching ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step per device
    group_by_length=True,
    # --- Optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): the "constant" schedule has no warmup phase according to
    # the transformers scheduler docs, so warmup_ratio probably has no effect
    # here; "constant_with_warmup" would honor it — confirm the intent.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # --- Mixed precision (both disabled) ---
    fp16=False,
    bf16=False,
)

### SFTTrainer

In [None]:
# Supervised fine-tuning trainer: the base model plus the LoRA config, trained
# on `dataset`, with `preprocess` supplying the text of each sample.
trainer = SFTTrainer(
    model=base_model,
    args=training_arguments,
    train_dataset=dataset,
    formatting_func=preprocess,
    peft_config=lora_config,
)

### Initiate Fine-Tuning

In [None]:
# Run the fine-tuning loop configured by `training_arguments`; the return
# value is shown as the cell output.
trainer.train()

### Save the Fine-Tuned Model

In [None]:
# Directory that receives both the trained model and the tokenizer files.
new_model_name = "tinyllama_finetuned_qlora"

# Persist the trainer's model, then the tokenizer, side by side so the
# directory can be loaded by name later.
trainer.model.save_pretrained(save_directory=new_model_name)
tokenizer.save_pretrained(save_directory=new_model_name)

In [None]:
# Rebuild the generation pipeline from the saved directory.
# NOTE(review): the directory was written from `trainer.model`; if it holds
# only LoRA adapter weights, loading it by path depends on the installed
# transformers/peft versions resolving the base model — verify the adapter is
# actually applied.
pipe = pipeline(
    task="text-generation",
    model=new_model_name,
    tokenizer=new_model_name,
    max_length=80,
)

### Inference on the test prompt

In [None]:
# Build a True/False physics prompt in the same "Generate a ... on ..." form
# used for the baseline run.
ques_type, topic = "True/False", "Physics"
prompt = f"Generate a {ques_type} on {topic}"

# Generate with the current `pipe` and show the first candidate's text.
result = pipe(prompt)
print(result[0]["generated_text"])

In [None]:
# Notebook echo: show the trainer's model object as the cell output.
trainer.model