# Required Libraries

In [None]:
!pip install --quiet torch datasets huggingface_hub transformers peft trl

In [None]:
import os
import torch

from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# 2. Load the dataset

In [None]:
dataset_name = "ai-bites/databricks-mini"
dataset = load_dataset(
    dataset_name,
    split="train[0:100000]")
# dataset = load_dataset(dataset_name)
# Split into train and eval
# split_dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [None]:
dataset

Dataset({
    features: ['text'],
    num_rows: 10544
})

# 3. Load the Model and Tokenizer and Quantize the Model

In [None]:
# !pip install --quiet bitsandbytes
!pip install -U bitsandbytes



In [None]:
from huggingface_hub import login

# Paste your token here
login("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")

In [None]:
from transformers import AutoModel
import torch

# model_name = "sshleifer/tiny-gpt2"
model_name = "openai-community/gpt2"
model = AutoModel.from_pretrained(model_name)

# Count total parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params:,}")



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Total parameters: 124,439,808
Trainable parameters: 124,439,808


In [None]:
# bitsandbytes
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
device_map = "auto"

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant
)

# model
# model_name = "google/gemma-2b"
# model_name = "sshleifer/tiny-gpt2"
model_name = "distilgpt2"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, # Using the bnb_config defined above
    device_map=device_map,
)
model.config.use_cache = False # not using KV cache as it might require more memory
model.config.pretraining_tp = 1 # not using tensor parallelism

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Some models don't have a dedicated padding token, so using the EOS token as a pad token is a common practice
tokenizer.padding_side = "right" # configures the tokenizer to add padding to the right side of the sequences

In [None]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=


# 4. Load LoRA Configurations

In [None]:
# LoRA parameters
lora_r = 8
lora_alpha = 16
lora_dropout = 0.1

# LoRA Config
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = ["c_attn", "c_proj", "c_fc"]
)

# 5. Set Training Parameters

In [None]:
# src: https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py

training_arguments = TrainingArguments(
    output_dir="./results", # directory to store the model predictions and checkpoints
    num_train_epochs=5, # number of epochs
    per_device_train_batch_size=4, # batch size for training per GPU or core CPU
    per_device_eval_batch_size=4, # batch size for evaluating per GPU or core CPU
    gradient_accumulation_steps=4, # number of updates steps to accumulate the gradients for, before performing a backward/update pass
    optim="paged_adamw_32bit", # the optimizer to use
    save_steps=50, # number of updates steps before two checkpoint saves
    logging_steps=50, # number of update steps between two logs
    learning_rate=2e-4, # 0.0002 initial learning rate for [`AdamW`] optimizer
    lr_scheduler_type="constant", # the scheduler type to use
    warmup_ratio=0.03, # ratio of total training steps used for a linear warmup from 0 to `learning_rate`
    weight_decay=0.001, # the weight decay to apply to all layers except all bias and LayerNorm weights in [`AdamW`] optimizer
    fp16=True, # whether to use fp16 16-bit (mixed) precision training instead of 32-bit training
    bf16=False, # whether to use bfp16 16-bit (mixed) precision training instead of 32-bit training
    max_grad_norm=0.3, # maximum gradient norm (for gradient clipping)
    max_steps=-1, # if set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`    group_by_length=True, # Whether or not to group together samples of roughly the same length in the training dataset
    report_to="none", # the list of integrations to report the results and logs to - [wandb, mlflow, comet_ml, neptune...]
    # max_seq_length=128,  # Reduce this to fit your model
)

# 6. SFT Configurations

In [None]:
trainer = SFTTrainer(
    model=model_name, # model to train
    train_dataset=dataset, # the training dataset
    eval_dataset=eval_dataset, # the evaluation dataset
    peft_config=peft_config, # from LoRA Configuration
    # dataset_text_field="text", # pointing to the 'text' column in the dataset
    # max_seq_length=40, # it determines the maximum length of input sequences during fine-tuning
    # tokenizer=tokenizer, # model tokenizer
    args=training_arguments, # the training parameters
    # packing=True, # allows multiple shorter sequences to be packed into a single training example, maximizing the use of the model's context window.
)



Applying chat template to eval dataset:   0%|          | 0/1055 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1055 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2138 > 1024). Running this sequence through the model will result in indexing errors


Truncating eval dataset:   0%|          | 0/1055 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# 7. Train the Model

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
# Train model
new_model = "./tiny-gpt2-ft"

trainer.train()
trainer.model.save_pretrained(new_model)

Step,Training Loss
50,3.4507
100,3.195
150,3.0688
200,3.107
250,3.0421
300,3.0442
350,3.0632
400,3.0336
450,3.0195
500,3.0695


# 8. Model Inference

In [1]:
# input_text = (
#     "Instruction: What is a polygon?"
#     "Response:"
# )

# # Load the model
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map=device_map,
# )

# # Attach the LoRA adapters
# model = PeftModel.from_pretrained(base_model, new_model)
# model = model.merge_and_unload()

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# print(input_ids)
# outputs = model.generate(
#     **input_ids,
#     max_length=128
# )

# # print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# print(tokenizer.decode(outputs[0]))

In [None]:
def generate_response(instruction):
    prompt = f"Instruction: {instruction}\nResponse:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=256,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id
        )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result.replace(prompt, "").strip()


In [None]:
instruction = "Write C syntax"
response = generate_response(instruction)
print("ðŸ§  Model Response:", response)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ðŸ§  Model Response: What is the difference between a string and an integer?

Answer: The difference between a string, a numeric or binary string
