In [1]:
!pip install -q datasets==3.2.0 bitsandbytes==0.45.1 accelerate==1.2.1 evaluate==0.4.3 transformers==4.47.1 torch==2.5.1+cu124 numpy==1.26.4 peft==0.14.0

In [2]:
import torch
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training,
)

In [3]:
# Load and preprocess dataset
eli5 = load_dataset("eli5_category", split="train[:500]")
eli5 = eli5.train_test_split(test_size=0.05)
eli5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 475
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 25
    })
})

In [4]:
eli5 = eli5.flatten()
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 475
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 25
    })
})

In [5]:
model_name = "distilbert/distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
def preprocess_function(batch):

    # answers.text is a nested list, let's merge each list
    texts = [" ".join(txt) for txt in batch["answers.text"]]

    return tokenizer(texts)

In [7]:
# Tokenize and group dataset
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5['train'].column_names
)

Map (num_proc=4):   0%|          | 0/475 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1543 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1058 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1049 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/25 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2041 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1033 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1079 > 1024). Running this sequence through the model will result in indexing errors


In [8]:
tokenized_eli5

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 475
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 25
    })
})

In [9]:
# Block size for grouping texts
block_size = 32

def group_texts(examples):

    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Truncate to multiple of block_size
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split into blocks
    result = {
        k: [t[i : i + block_size]
            for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Set up labels for causal language modeling
    result["labels"] = result["input_ids"].copy()

    return result

In [10]:
tokenized_eli5 = tokenized_eli5.map(
    group_texts,
    batched = True,
    num_proc = 4
)

Map (num_proc=4):   0%|          | 0/475 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/25 [00:00<?, ? examples/s]

In [11]:
# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False # set masked lm to False, we are caussal lm
)

In [12]:
# Quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype = 'bfloat16',
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
)

In [13]:
# Load base model with quantization
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = quantization_config,
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(base_model)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [14]:
# Find target modules for LoRA
target_modules = set()

for name, module in base_model.named_modules():

    if isinstance(module, torch.nn.Linear):
        module_name = name.split('.')[-1]
        target_modules.add(module_name)

target_modules = list(target_modules)
print(target_modules)

['lm_head', 'c_proj', 'c_attn', 'c_fc']


In [15]:
# Configure LoRA
lora_config = LoraConfig(
    r = 4,
    lora_alpha = 2,
    lora_dropout = 0.1,
    bias = 'none',
    task_type = TaskType.CAUSAL_LM,
    target_modules = target_modules
)

# Get PEFT model
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 499,012 || all params: 82,411,588 || trainable%: 0.6055




In [16]:
import torch.nn.functional as F

def compute_metrics(eval_pred):

    "Compute evaluation metrics for causal language modeling."

    logits, labels = eval_pred

    # Convert logits to probabilities using softmax
    probs = F.softmax(torch.tensor(logits), dim=-1).numpy()

    # Get predicted token IDs
    predictions = np.argmax(logits, axis=-1)

    # Compute perplexity
    perplexity = calculate_perplexity(probs, labels)

    # Compute accuracy (ignoring masked tokens)
    accuracy = calculate_accuracy(predictions, labels)

    return {
        "perplexity": perplexity,
        "accuracy": accuracy
    }


def calculate_perplexity(probs, labels):

    "Calculate perplexity for causal language modeling."

    # Create a mask for non-masked tokens
    valid_mask = labels != -100

    # Gather the probabilities of the true tokens
    true_token_probs = np.take_along_axis(probs, labels[..., None], axis=-1).squeeze()

    # Compute the negative log-likelihood for non-masked tokens
    nll = -np.log(true_token_probs + 1e-10)  # Add small epsilon to avoid log(0)
    nll = np.where(valid_mask, nll, 0)  # Ignore masked tokens

    # Average the negative log-likelihood
    avg_nll = np.mean(nll[valid_mask])

    # Compute perplexity
    perplexity = np.exp(avg_nll)
    return perplexity


def calculate_accuracy(predictions, labels):

    "Calculate accuracy for causal language modeling, ignoring masked tokens."

    # Create a mask for non-masked tokens
    valid_mask = labels != -100

    # Compare predictions with labels for non-masked tokens
    correct_predictions = predictions[valid_mask] == labels[valid_mask]

    # Compute accuracy
    accuracy = np.mean(correct_predictions)
    return accuracy

In [17]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy='epoch',
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    eval_strategy='epoch',
    logging_dir="./logs",
    logging_strategy='steps',
    logging_steps=100,
    bf16=True,
    load_best_model_at_end=True,
    lr_scheduler_type='cosine',
    warmup_ratio=0.2,
    max_grad_norm=1.0,
    group_by_length=True,
    metric_for_best_model="eval_perplexity",
)

In [18]:
# Initialize trainer
trainer = Trainer(
    model = lora_model,
    args = training_args,
    train_dataset = tokenized_eli5['train'],
    eval_dataset = tokenized_eli5['test'],
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mabdulrahman-ahmed20072[0m ([33mabdulrahman-ahmed20072-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Perplexity,Accuracy
0,17.7452,4.454159,8525.488281,0.001319


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=321, training_loss=17.93044483030325, metrics={'train_runtime': 120.319, 'train_samples_per_second': 42.736, 'train_steps_per_second': 2.668, 'total_flos': 42430210191360.0, 'train_loss': 17.93044483030325, 'epoch': 0.9984447900466563})

In [23]:
# Inference
def generate_text(model, prompt, max_length = 50):

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length = max_length,
            do_sample=True, # don't use gready search
            num_return_sequences = 1, # if do_sample, return n of generated samples
            temperature = 0.7,
            top_p = 0.9, # select the shortest sequence with a specific proba

        )

    samples = [tokenizer.decode(outputs[i], skip_special_tokens=True)
                for i in range(len(outputs))]

    return samples

In [29]:
# Test generation
prompt = "me and"
generated_text = generate_text(lora_model, prompt)
print(f"Generated text: {generated_text}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text: ["me and the rest of the world. The most important thing is that they can't make a difference, and they are doing so in terms of how they were perceived. It's really hard to see how they're able to make a difference. The"]
