In [6]:
import os,json, random, torch
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig,
    Trainer, TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import accelerate
import warnings
warnings.filterwarnings("ignore")

2025-10-28 00:13:51.100439: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761610431.335468      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761610431.398806      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [8]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device

In [9]:
# USER CONFIG

MODEL_NAME = "meta-llama/Llama-3.1-8B"
DATA_PATH = "/kaggle/input/study-dataset-v2/study_dataset_v2_large_30000.json"      # expects keys: instruction, response
OUTPUT_DIR = "kaggle/working//study-finetuned-lora-selfcontext"

SEED = 42
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "v_proj"]

BATCH_SIZE = 8               # per-device batch
GRAD_ACCUM = 8               # gradient accumulation -> effective batch = BATCH_SIZE * GRAD_ACCUM * n_gpus
EPOCHS = 3
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.0
WARMUP_STEPS = 100
MAX_LENGTH = 512             # input (prompt+response) truncation length
VAL_SPLIT = 0.05             # fraction for validation
MAX_RESP_TOKENS = 200        # generation length for eval (how long replies should be)
USE_4BIT = True

In [10]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
from huggingface_hub import login
login(hf_token)

In [11]:
# System prompt that will always be included
SYSTEM_PROMPT = (
    "You are a knowledgeable, friendly AI tutor for high-school and college students. "
    "You understand science, commerce, and humanities topics. "
    "When a student asks a question, explain clearly using reasoning and examples, "
    "without requiring extra context. Be conversational and helpful."
)

PROMPT_TEMPLATE = (
    "### System:\n{system}\n\n"
    "### Question:\n{instruction}\n\n"
    "### Answer:\n"
)

In [12]:
# Utils
def set_seed(seed: int = 42):
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

In [13]:
def build_prompt(instruction: str) -> str:
    return PROMPT_TEMPLATE.format(system=SYSTEM_PROMPT, instruction=instruction.strip())

In [14]:
# Load Dataset
def load_local_json_as_dataset(path: str) -> Dataset:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    records = []
    for obj in data:
        if not obj.get("instruction") or not obj.get("response"):
            continue
        records.append({
            "instruction": obj["instruction"],
            "response": obj["response"]
        })
    return Dataset.from_list(records)

In [15]:
raw_ds = load_local_json_as_dataset(DATA_PATH)
raw_ds = raw_ds.train_test_split(test_size=VAL_SPLIT, seed=SEED)
dataset = DatasetDict({
    "train": raw_ds["train"].shuffle(seed=SEED),
    "validation": raw_ds["test"].shuffle(seed=SEED)})

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response'],
        num_rows: 28500
    })
    validation: Dataset({
        features: ['instruction', 'response'],
        num_rows: 1500
    })
})

In [17]:
# Verify dataset size and GPU availability
print(f"Training examples: {len(dataset['train'])}")
print(f"Validation examples: {len(dataset['validation'])}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

Training examples: 28500
Validation examples: 1500
CUDA available: True
GPU count: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


In [18]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

In [19]:
# Tokenizer & Model

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({"eos_token": "</s>"})

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [20]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map='auto',token=hf_token
)

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [21]:
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [22]:
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

In [23]:
model = get_peft_model(model, peft_config)
print("LoRA adapters added successfully.")

LoRA adapters added successfully.


In [24]:
# Tokenization

def tokenize_example(ex,tokenizer):
    prompt = build_prompt(ex["instruction"])
    full = prompt + ex["response"].strip() + tokenizer.eos_token
    full_tokens = tokenizer(full, truncation=True, max_length=MAX_LENGTH)
    prompt_tokens = tokenizer(prompt, truncation=True, max_length=MAX_LENGTH)

    input_ids = full_tokens["input_ids"]
    labels = input_ids.copy()

    # mask system + question tokens from loss
    prompt_len = len(prompt_tokens["input_ids"])
    for i in range(prompt_len):
        if i < len(labels):
            labels[i] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": full_tokens["attention_mask"],
        "labels": labels
    }

In [25]:
def preprocess_batch(batch):
    # batch is a dict of lists
    out = {"input_ids": [], "attention_mask": [], "labels": []}
    for i in range(len(batch["instruction"])):
        ex = {
            "instruction": batch["instruction"][i],
            "response": batch["response"][i]
        }
        tok = tokenize_example(ex, tokenizer)
        out["input_ids"].append(tok["input_ids"])
        out["attention_mask"].append(tok["attention_mask"])
        out["labels"].append(tok["labels"])
    return out

In [26]:
tokenized_ds = dataset.map(lambda batch: preprocess_batch(batch), batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/28500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [27]:
@dataclass
class DataCollatorForCausal:
    tokenizer: AutoTokenizer
    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        input_ids = [torch.tensor(f["input_ids"]) for f in features]
        attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
        labels = [torch.tensor(f["labels"]) for f in features]
        batch = {
            "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id),
            "attention_mask": torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0),
            "labels": torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100),
        }
        return batch

In [28]:
data_collator = DataCollatorForCausal(tokenizer)

In [29]:
total_train_batch = BATCH_SIZE * GRAD_ACCUM
logging_steps = 80
save_steps = 500
eval_steps = 500

In [30]:
# TrainingArguments & Trainer

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    logging_steps=logging_steps,
    eval_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="none",
    optim="paged_adamw_8bit",
    dataloader_num_workers=3,  # Enable multi-worker data loading
    dataloader_pin_memory=True,
    max_grad_norm=0.3,  # Gradient clipping for stability
)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
)

In [32]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
500,0.0,7e-06


KeyboardInterrupt: 

In [35]:
from peft import get_peft_model_state_dict

In [36]:
try:
    peft_state = get_peft_model_state_dict(model)
    peft_path = os.path.join(OUTPUT_DIR, "adapter_model.bin")
    torch.save(peft_state, peft_path)
    print("Saved PEFT/LoRA weights to:", peft_path)
except Exception as e:
    print("Warning: couldn't save PEFT state dict separately:", e)

Saved PEFT/LoRA weights to: kaggle/working//study-finetuned-lora-selfcontext/adapter_model.bin


In [37]:
# Save LoRA weights
model.save_pretrained(OUTPUT_DIR)  # This saves LoRA + config
tokenizer.save_pretrained(OUTPUT_DIR)

('kaggle/working//study-finetuned-lora-selfcontext/tokenizer_config.json',
 'kaggle/working//study-finetuned-lora-selfcontext/special_tokens_map.json',
 'kaggle/working//study-finetuned-lora-selfcontext/tokenizer.json')

In [38]:
from peft import LoraConfig
peft_config.save_pretrained(OUTPUT_DIR)

In [49]:
OUTPUT_DIR = "/kaggle/working/finetuned-lora-model"

In [51]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [52]:
lora_state_dict = get_peft_model_state_dict(model)
lora_path = os.path.join(OUTPUT_DIR, "adapter_model.bin")
torch.save(lora_state_dict, lora_path)

# 2. Save adapter config (critical!)
from peft import LoraConfig
peft_config.save_pretrained(OUTPUT_DIR)

# 3. Save tokenizer
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"True LoRA weights saved: {lora_path} → {os.path.getsize(lora_path) / 1e6:.1f} MB")

True LoRA weights saved: /kaggle/working/finetuned-lora-model/adapter_model.bin → 4230.0 MB
