# IMPORT LIBRARIES AND CHECK GPU

In [1]:
# Install required packages
!pip install transformers datasets accelerate peft bitsandbytes -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5

In [2]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import os

2025-10-06 02:44:29.857243: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759718670.221344      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759718670.326356      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Check GPU
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}, GPUs: {torch.cuda.device_count()}")
if device == "cuda":
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

Device: cuda, GPUs: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


# LOAD DATASET AND TOKENIZER

In [4]:
train_df = pd.read_csv('/kaggle/input/tinystories-narrative-classification/train.csv')
val_df = pd.read_csv('/kaggle/input/tinystories-narrative-classification/validation.csv')
train_df = train_df.sample(n=300_000, random_state=42)  # 2.1M -> 300K
val_df = val_df.sample(n=3_000, random_state=42)  # 22K -> 3K

print(f"Train: {len(train_df)}, Val: {len(val_df)}")

train_dataset = Dataset.from_pandas(train_df[['text']]).filter(lambda x: x["text"] and str(x["text"]).strip() != "")
val_dataset   = Dataset.from_pandas(val_df[['text']]).filter(lambda x: x["text"] and str(x["text"]).strip() != "")

Train: 300000, Val: 3000


Filter:   0%|          | 0/300000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Fixed length packing: concat -> split into blocks of 256
BLOCK_SIZE = 256

def tokenize(examples):
    return tokenizer(examples["text"], add_special_tokens=False)

def pack_dataset(ds):
    tokenized = ds.map(tokenize, batched=True, remove_columns=["text"])
    
    def group_texts(examples):
        concatenated = []
        for ids in examples["input_ids"]:
            concatenated.extend(ids)
        
        total_len = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
        
        input_ids = [concatenated[i : i + BLOCK_SIZE] for i in range(0, total_len, BLOCK_SIZE)]
        attention_mask = [[1]*BLOCK_SIZE for _ in range(len(input_ids))]
        labels = [ids[:] for ids in input_ids]  # Deep copy
        
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
    
    # batched=False done or add remove_columns
    packed = tokenized.map(
        group_texts, 
        batched=True, 
        remove_columns=tokenized.column_names  # Eski kolonları kaldır
    )
    return packed

tokenized_train = pack_dataset(train_dataset)
tokenized_val   = pack_dataset(val_dataset)

print(f"Train blocks: {len(tokenized_train)}, Val blocks: {len(tokenized_val)}")
print(f"Columns: {tokenized_train.column_names}")

Map:   0%|          | 0/299970 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1052 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/299970 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Train blocks: 261045, Val blocks: 2538
Columns: ['input_ids', 'attention_mask', 'labels']


# QLoRA - 4-BIT QUANTIZATION CONFIG

In [18]:
# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Model loading
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    torch_dtype=torch.float16,
)

# Prepare model for QLoRA
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable() # for memory saving

# LoRA CONFIGURATION

In [19]:
# LoRA config
lora_config = LoraConfig(
    r=4,  # Rank
    lora_alpha=8,
    target_modules=["c_attn"],  # GPT-2 attention modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Add LoRA adapter to model
model = get_peft_model(model, lora_config)

# Trainable parameters
model.print_trainable_parameters()

trainable params: 147,456 || all params: 124,587,264 || trainable%: 0.1184


# TRAINING ARGUMENTS

In [20]:
output_dir = "./qlora_story_model"

per_device_bs=8
grad_accum=4
max_steps=10_000

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,

    per_device_train_batch_size=per_device_bs,
    per_device_eval_batch_size=per_device_bs,
    gradient_accumulation_steps=grad_accum,
    learning_rate=1e-4,                 
    weight_decay=0.01,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",

    fp16=True,
    gradient_checkpointing=True,      

    eval_strategy="steps",
    eval_steps=1000,                   
    save_strategy="steps",
    save_steps=1000,                   
    logging_steps=50,

    max_steps=max_steps,

    optim="paged_adamw_8bit",
    report_to="none",
    dataloader_num_workers=2,
    torch_empty_cache_steps=500,
)

# TRAINER AND TRAINING

In [21]:
from transformers import default_data_collator

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=default_data_collator,
)

print(f"Using {trainer.args.n_gpu} GPU(s)")
print(f"Effective batch size (tokens side aside): {per_device_bs * grad_accum * max(1, trainer.args.n_gpu)}")
print("Starting QLoRA fine-tuning...")
train_result = trainer.train()
print(train_result)

eval_results = trainer.evaluate()
import math, torch as th
print(f"Eval loss: {eval_results['eval_loss']:.4f}")
print(f"Perplexity: {th.exp(th.tensor(eval_results['eval_loss'])):.2f}")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Using 2 GPU(s)
Effective batch size (tokens side aside): 64
Starting QLoRA fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
1000,1.1581,
2000,1.1332,
3000,1.112,
4000,1.0996,
5000,1.0981,
6000,1.0925,
7000,1.0914,
8000,1.085,
9000,1.0881,6.898125
10000,1.0837,




TrainOutput(global_step=10000, training_loss=1.1157851959228515, metrics={'train_runtime': 14987.9868, 'train_samples_per_second': 42.701, 'train_steps_per_second': 0.667, 'total_flos': 8.375552619105485e+16, 'train_loss': 1.1157851959228515, 'epoch': 2.4515812699190977})




Eval loss: nan
Perplexity: nan
