In [1]:
!pip install -q datasets==3.2.0 bitsandbytes==0.45.1 accelerate==1.2.1 evaluate==0.4.3 transformers==4.47.1 torch==2.5.1+cu124 numpy==1.26.4 peft==0.14.0

In [2]:
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training
)

In [3]:
# Load and preprocess dataset
eli5 = load_dataset("eli5_category", split="train[:100]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 80
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 20
    })
})

In [4]:
eli5 = eli5.flatten()
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 80
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 20
    })
})

In [5]:
# Initialize tokenizer
model_name = "distilbert/distilroberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Preprocessing function
def preprocess_function(batch):

    # answers.text is a nested list, let's merge each list
    texts = [" ".join(txt) for txt in batch["answers.text"]]

    return tokenizer(texts)

In [7]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched = True,
    num_proc = 4,
    remove_columns = eli5['train'].column_names
)

Map (num_proc=4):   0%|          | 0/80 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (783 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (950 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1435 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/20 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1104 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1697 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# Block size for grouping texts
block_size = 64

def group_texts(examples):

    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Truncate to multiple of block_size
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split into blocks
    result = {
        k: [t[i : i + block_size]
            for i in range(0, total_length, block_size)]
                for k, t in concatenated_examples.items()
    }

    # Set up labels for masked language modeling
    result["labels"] = result["input_ids"].copy()

    return result

In [9]:
tokenized_eli5 = tokenized_eli5.map(
    group_texts,
    batched=True,
    num_proc=4
)

Map (num_proc=4):   0%|          | 0/80 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
# Data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = True, # means masked lm
    mlm_probability = 0.15 # proba of desired masked tokens
)

In [11]:
# Quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype ='bfloat16',
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True ,
)

In [12]:
# Load base model with quantization
base_model = AutoModelForMaskedLM.from_pretrained(
    model_name,
    quantization_config = quantization_config,
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(base_model)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Find target modules for LoRA
target_modules = set()

for name, module in base_model.named_modules():
    if isinstance(module, torch.nn.Linear):
        module_name = name.split('.')[-1]
        target_modules.add(module_name)

target_modules = list(target_modules)
target_modules

['value', 'dense', 'decoder', 'query', 'key']

In [14]:
# Configure LoRA
lora_config = LoraConfig(
    r = 4,
    lora_alpha = 2,
    lora_dropout = 0.1,
    bias = 'none',
    task_type = TaskType.CAUSAL_LM,
    target_modules = ['key', 'value', 'query']
)

# Get PEFT model
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 110,592 || all params: 82,280,793 || trainable%: 0.1344


In [15]:
def compute_metrics(eval_pred):

    "Compute metrics function for masked language modeling"

    logits, labels = eval_pred

    # Create a mask for non-padding tokens
    mask = labels != -100

    # Calculate perplexity
    loss = np.mean(np.where(mask,
                           -np.log(np.take_along_axis(
                               torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy(),
                               labels[..., None],
                               axis=-1
                           ).squeeze()),
                           0))
    perplexity = np.exp(loss)

    # Calculate accuracy only on masked tokens
    predictions = np.argmax(logits, axis=-1)
    masked_labels = labels[mask]
    masked_preds = predictions[mask]

    accuracy = np.mean(masked_preds == masked_labels)

    return {
        "perplexity": perplexity,
        "accuracy": accuracy
    }

In [16]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:512"

training_args = TrainingArguments(
    output_dir = "./results",
    save_strategy = 'epoch',
    num_train_epochs = 1,
    learning_rate = 2e-4,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 4,
    weight_decay = 0.01, # L2 regularization
    eval_strategy = 'epoch',
    logging_dir = "./logs",
    logging_strategy = 'steps',
    logging_steps = 100,
    bf16 = True,
    load_best_model_at_end = True,
    lr_scheduler_type = 'cosine',
    warmup_ratio = 0.2,
    max_grad_norm = 1.0,
    group_by_length = True,
    metric_for_best_model = "eval_perplexity",
)

In [17]:
# Initialize trainer
trainer = Trainer(
    model = lora_model,
    args = training_args,
    train_dataset = tokenized_eli5['train'],
    eval_dataset = tokenized_eli5['test'],
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

torch.cuda.empty_cache()

# Train the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mabdulrahman-ahmed20072[0m ([33mabdulrahman-ahmed20072-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Perplexity,Accuracy
0,No log,3.621192,1.747684,0.404418


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=60, training_loss=14.628037516276041, metrics={'train_runtime': 38.5042, 'train_samples_per_second': 12.492, 'train_steps_per_second': 1.558, 'total_flos': 7977692712960.0, 'train_loss': 14.628037516276041, 'epoch': 0.995850622406639})

In [18]:
# Function for mask filling
def fill_mask(model, text, top_k=5):

    # Prepare input
    input_text = text.replace("<mask>", tokenizer.mask_token)
    inputs = tokenizer(input_text, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Find mask token index
    mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0][0]

    # Generate predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    # Get top k predictions
    mask_token_logits = predictions[0, mask_token_index, :]
    mask_token_proba = torch.nn.functional.softmax(mask_token_logits, dim = -1)
    top_k_tokens = torch.topk(mask_token_proba, top_k, dim=-1)

    # Decode and return results
    results = []
    for token_id, score in zip(top_k_tokens.indices, top_k_tokens.values):
        token = tokenizer.decode(token_id)
        result_text = input_text.replace(tokenizer.mask_token, token)
        results.append({
            "score": score.item(),
            "token": token,
            "sequence": result_text
        })

    return results

In [21]:
text = "i'm going to school every <mask>"

for seq in fill_mask(lora_model, text):
    print(seq)

{'score': 0.3977905809879303, 'token': ' day', 'sequence': "i'm going to school every  day"}
{'score': 0.12131927162408829, 'token': ' semester', 'sequence': "i'm going to school every  semester"}
{'score': 0.107063889503479, 'token': ' morning', 'sequence': "i'm going to school every  morning"}
{'score': 0.07832960784435272, 'token': ' year', 'sequence': "i'm going to school every  year"}
{'score': 0.0649375319480896, 'token': ' week', 'sequence': "i'm going to school every  week"}
