In [None]:
## Installing dependencies
!pip install trl
!pip install -U bitsandbytes

In [None]:
## Setting up kaggle environment variables
# Authenticate this session with the Hugging Face Hub. The token is read from
# the secrets client rather than being written into the notebook.
from kaggle_secrets import UserSecretsClient
import huggingface_hub
user_secrets = UserSecretsClient()
# Fetch the secret stored under the label "HUGGINGFACE_TOKEN".
secret_value_0 = user_secrets.get_secret("HUGGINGFACE_TOKEN")

# Log in with the retrieved token.
huggingface_hub.login(secret_value_0)

In [None]:
## Importing dependencies
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np

In [None]:
## Setting up model and dataset variables
# Hub identifiers for the base checkpoint and the QA dataset.
# NOTE: both names are rebound further down (`dataset` becomes the loaded
# dataset object, `model` becomes the loaded model object), so re-run this
# cell before re-running the cells that load them.
model = "Qwen/Qwen2.5-3B"
dataset = "sharmaarush/Pytorch_QA"

In [None]:
## Dataset helper functions (including chat templates, tokenization)
from functools import partial

def create_prompt_formats(sample):
    """
    Build the training text for one QA record and store it under sample["text"].

    The text consists of four sections separated by blank lines: an intro
    blurb (preceded by a newline), the question section, the answer section
    and an end marker.

    :param sample: Sample dictionary with keys ['user_q', 'question', 'user_a', 'answer']
    :return: The same dictionary object, with the formatted prompt added under "text"
    """
    intro_blurb = "Below is a question asked by a user. Write the most helpful and accurate answer."
    question_key = "### Question:"
    answer_key = "### Answer:"
    end_key = "### End"

    # Every section below is always a non-empty string, so all four are joined.
    sections = [
        f"\n{intro_blurb}",
        f"Asked by {sample['user_q']}:\n{question_key}\n{sample['question']}",
        f"Answered by {sample['user_a']}:\n{answer_key}\n{sample['answer']}",
        end_key,
    ]

    sample["text"] = "\n\n".join(sections)
    return sample

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """
    Look up the maximum sequence length from a model's config.

    The config attributes "n_positions", "max_position_embeddings" and
    "seq_length" are checked in that order; the first truthy value wins.
    If none of them is set, 1024 is used.

    :param model: Object exposing a `config` attribute
    :return: The detected maximum length, or 1024 as the fallback
    """
    config = model.config
    for attr_name in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(config, attr_name, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            return max_length

    # No usable setting on the config: fall back to the default.
    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" entries of a batch.

    :param batch: Mapping with a "text" entry holding the strings to tokenize
    :param tokenizer: Callable tokenizer accepting `max_length` and `truncation`
    :param max_length: Length limit passed to the tokenizer (truncation is enabled)
    :return: Whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """Format & tokenize a dataset so it is ready for training.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: Seed passed to the final shuffle
    :param dataset: Dataset with 'user_q', 'question', 'user_a', 'answer' columns
    :return: The formatted, tokenized, filtered and shuffled dataset
    """
    print("Preprocessing dataset...")

    # Step 1: add the formatted prompt ("text") to every sample.
    with_prompts = dataset.map(create_prompt_formats)

    # Step 2: tokenize in batches and drop the raw QA columns.
    tokenize = partial(preprocess_batch, tokenizer=tokenizer, max_length=max_length)
    tokenized = with_prompts.map(
        tokenize,
        batched=True,
        remove_columns=['user_q', 'question', 'user_a', 'answer'],
    )

    # Step 3: keep only samples whose token count is strictly below max_length.
    def _is_below_limit(sample):
        return len(sample["input_ids"]) < max_length

    # Step 4: shuffle with the given seed.
    return tokenized.filter(_is_below_limit).shuffle(seed=seed)

In [None]:
## Dataset pre processing
# NOTE: `dataset` is rebound here from the Hub id string to the loaded dataset.
dataset = load_dataset(dataset)

# First cut: hold out 30% of the "train" split.
train_test_split = dataset["train"].train_test_split(seed=42, test_size=0.3)
# Second cut: one third of the held-out part becomes the test set,
# the remainder becomes the validation set.
test_validation_split = train_test_split["test"].train_test_split(seed=42, test_size=1/3)

dataset_train, dataset_validation, dataset_test = (
    train_test_split["train"],
    test_validation_split["train"],
    test_validation_split["test"],
)

print("training dataset ", len(dataset_train))
print("validation dataset ", len(dataset_validation))
print("test dataset ", len(dataset_test))

In [None]:
## Loading model, bnb config and tokenizer
# 4-bit NF4 quantization with double quantization and an fp16 compute dtype.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# `model` holds the Hub id string at this point; keep the id under
# `base_model` because `model` is rebound to the loaded model object below.
base_model = model
device_map = "auto"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map=device_map,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    # `token` replaces the deprecated `use_auth_token`; True means "use the
    # token saved by the earlier Hub login".
    token=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
## Setting up training params
# Tokenize the train/validation splits. These two datasets are what the
# Trainer below consumes; they have to be built here because they need both
# the tokenizer and the loaded model's config (for the length limit).
max_length = get_max_length(model)
dataset_train_pre = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=42, dataset=dataset_train
)
dataset_validation_pre = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=42, dataset=dataset_validation
)

# Training
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=32,
    lora_alpha=32,
    # Attention projections of the Qwen2 architecture. The output projection
    # is named `o_proj` there; the previous entry 'dense' matched no module
    # in this model.
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj'
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model.gradient_checkpointing_enable() ## For efficient memory usage (but is slow)

peft_model = get_peft_model(model, config)

output_dir = "content/model"
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=2,     # Adjust according to GPU memory
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,                         # Added this - crucial for memory
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    # save_steps=50,                     # Reduce checkpoint frequency
    eval_strategy="steps",
    eval_steps=200,
    do_eval=True,
    per_device_eval_batch_size=1,
    dataloader_num_workers=0,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir=True,
    group_by_length=True,
    dataloader_pin_memory=False,
    max_grad_norm=0.3,
)

# Disable the generation KV cache while training with gradient checkpointing.
peft_model.config.use_cache = False
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=dataset_train_pre,
    eval_dataset=dataset_validation_pre,
    args=peft_training_args,
    # mlm=False -> causal-LM collation (labels are derived from input_ids).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
## Train the model
# Runs the training loop configured by `peft_training_args` on the trainer
# built in the previous cell.
peft_trainer.train()

In [None]:
## Inferencing the model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel

# Hub ids used for inference. The base checkpoint id is spelled out here
# because the name `model` no longer holds the id string once the loading
# cell has rebound it to a model object; passing that object to
# `from_pretrained` would fail.
base_model_id = "Qwen/Qwen2.5-3B"
adapter_id = "sharmaarush/pytorch_QA_model"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(adapter_id)

# Load base model (same one you fine-tuned on)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto"
)

# Load LoRA adapter on top
model = PeftModel.from_pretrained(base_model, adapter_id)

# Create pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

In [None]:
# Run inference
# NOTE(review): the fine-tuning text built by `create_prompt_formats` wraps
# each question in an intro blurb plus "### Question:" / "### Answer:"
# markers, while this prompt is passed raw — confirm whether the prompt
# should follow the same template.
prompt = "Explain FSDP2 in easier terms"
# Sampled generation with temperature 0.7; the output differs between runs.
outputs = pipe(prompt, do_sample=True, temperature=0.7)
# Print the generated text of the first returned result.
print(outputs[0]["generated_text"])