In [2]:
%pip install --quiet transformers accelerate evaluate datasets peft torch

Note: you may need to restart the kernel to use updated packages.


We are going to use GPT-2 as our base model for text generation. We'll fine-tune it on two different datasets using LoRA to demonstrate efficient adaptation.

In [3]:
model_checkpoint = "gpt2"

## Creating Helper Functions

In [4]:
import os
import torch
from peft import PeftModel, LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

def print_model_size(path):
    size = 0
    for f in os.scandir(path):
        size += os.path.getsize(f)
    print(f"Model size: {(size / 1e6):.2f} MB")

def print_trainable_parameters(model, label):
    parameters, trainable = 0, 0
    for _, p in model.named_parameters():
        parameters += p.numel()
        trainable += p.numel() if p.requires_grad else 0
    print(f"{label} trainable parameters: {trainable:,}/{parameters:,} ({100 * trainable / parameters:.2f}%)")

def split_dataset(dataset):
    dataset_splits = dataset.train_test_split(test_size=0.1)
    return dataset_splits.values()

## Creating Synthetic Datasets

We'll create two synthetic datasets:
1. A dataset of technical documentation-style text
2. A dataset of creative story-style text

In [5]:
from datasets import Dataset
import random

# Technical documentation dataset generator
def generate_tech_docs(num_samples=100):
    topics = ["API", "Database", "Network", "Security", "Cloud"]
    verbs = ["configure", "implement", "deploy", "optimize", "secure"]
    components = ["server", "application", "system", "framework", "protocol"]
    
    texts = []
    for _ in range(num_samples):
        topic = random.choice(topics)
        verb = random.choice(verbs)
        component = random.choice(components)
        text = f"How to {verb} a {topic} {component}. First, ensure all prerequisites are met. "
        text += f"Then, follow these steps to {verb} the {component}. "
        text += f"This guide covers best practices for {topic} implementation."
        texts.append(text)
    
    return Dataset.from_dict({"text": texts})

# Creative story dataset generator
def generate_stories(num_samples=100):
    characters = ["wizard", "knight", "dragon", "princess", "merchant"]
    actions = ["journeyed", "discovered", "battled", "created", "explored"]
    places = ["ancient castle", "mystical forest", "hidden cave", "magical realm", "forgotten city"]
    
    texts = []
    for _ in range(num_samples):
        character = random.choice(characters)
        action = random.choice(actions)
        place = random.choice(places)
        text = f"The {character} {action} through the {place}. "
        text += f"In this magical adventure, they encountered wonders beyond imagination. "
        text += f"The story of the {character} became legendary throughout the land."
        texts.append(text)
    
    return Dataset.from_dict({"text": texts})

# Generate datasets
dataset1 = generate_tech_docs(2)
dataset2 = generate_stories(2)

# Split datasets
dataset1_train, dataset1_test = split_dataset(dataset1)
dataset2_train, dataset2_test = split_dataset(dataset2)

In [6]:
config = {
    "model1": {
        "train_data": dataset1_train,
        "test_data": dataset1_test,
        "epochs": 1,
        "path": "./lora-text-model1",
        "description": "Technical documentation model"
    },
    "model2": {
        "train_data": dataset2_train,
        "test_data": dataset2_test,
        "epochs": 1,
        "path": "./lora-text-model2",
        "description": "Creative story model"
    }
}

## Setting up the Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

## Preparing the Dataset

In [8]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
        return_tensors="pt"
    )
    # Create labels for causal language modeling (next token prediction)
    model_inputs["labels"] = model_inputs["input_ids"].clone()
    return model_inputs

def prepare_dataset(dataset):
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names
    )
    return tokenized_dataset

## Training Setup

In [9]:
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



## LoRA Configuration

In [11]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],  # Changed from ["query", "value"] to target GPT-2's attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",  # Added task type for causal language modeling
)


## Training Loop

In [12]:
from transformers import Trainer

for cfg in config.values():
    print(f"\nTraining {cfg['description']}")
    
    # Prepare datasets
    train_dataset = prepare_dataset(cfg['train_data'])
    eval_dataset = prepare_dataset(cfg['test_data'])
    
    # Load base model
    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
    
    # Add LoRA adapters
    peft_model = get_peft_model(model, lora_config)
    print_trainable_parameters(peft_model, cfg['description'])
    
    # Set up trainer
    trainer = Trainer(
        model=peft_model,
        args=training_arguments,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    
    # Train and save
    trainer.train()
    peft_model.save_pretrained(cfg['path'])
    print(f"Model saved to {cfg['path']}")
    print_model_size(cfg['path'])


Training Technical documentation model


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Technical documentation model trainable parameters: 1,622,016/126,061,824 (1.29%)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcharbeldaher34[0m ([33mcharbeldaher34-lebanese-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112668788887782, max=1.0…

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 8.811991691589355, 'eval_runtime': 0.2944, 'eval_samples_per_second': 3.397, 'eval_steps_per_second': 3.397, 'epoch': 1.0}
{'train_runtime': 4.2998, 'train_samples_per_second': 0.233, 'train_steps_per_second': 0.233, 'train_loss': 7.141759395599365, 'epoch': 1.0}
Model saved to ./lora-text-model1
Model size: 6.50 MB

Training Creative story model


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creative story model trainable parameters: 1,622,016/126,061,824 (1.29%)




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 9.170618057250977, 'eval_runtime': 0.3151, 'eval_samples_per_second': 3.173, 'eval_steps_per_second': 3.173, 'epoch': 1.0}
{'train_runtime': 1.7788, 'train_samples_per_second': 0.562, 'train_steps_per_second': 0.562, 'train_loss': 7.800537109375, 'epoch': 1.0}
Model saved to ./lora-text-model2
Model size: 6.50 MB


## Testing the Models

In [13]:
def generate_text(model, prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Load and test both models
base_model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

for cfg in config.values():
    print(f"\nTesting {cfg['description']}")
    
    # Load the LoRA model
    model = PeftModel.from_pretrained(base_model, cfg['path'])
    
    # Generate some text
    if "Technical" in cfg['description']:
        prompt = "How to configure a secure API server"
    else:
        prompt = "Once upon a time, a brave wizard"
        
    generated_text = generate_text(model, prompt)
    print(f"Prompt: {prompt}")
    print(f"Generated: {generated_text}\n")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Testing Technical documentation model


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: How to configure a secure API server
Generated: How to configure a secure API server?

There are two different ways for setting up a secure server; the simplest is by creating your own SSL certificate. The second method is to use a custom formatter.

Let's say you want to access a database using SSL. You need to use the following code to use the formatter.

<form action="form.bodyForm"> <input type="text" name="databaseName" value="SOCIAL-PASSW


Testing Creative story model
Prompt: Once upon a time, a brave wizard
Generated: Once upon a time, a brave wizard was able to turn back time and return the time she had been on to that which she had been given. The time she had been given was the same as her original time and she was able to return time to her original time.

The Time of the Dragon has been named by Wizards of the Coast in a new adventure, The Time of the Dragon: The Quest for the Crown. The time she was given is the same as her original time and

