
# Downloading GPT-2 weights and applying PEFT with LoRA

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Checkpoint identifier shared by the tokenizer and the model so the two stay in sync.
GPT2_CHECKPOINT = "gpt2"

# Load the pre-trained tokenizer first, then the language-model weights for the same checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)

In [None]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
# Freeze every pre-trained weight; the adapters attached later are the only
# parameters meant to be trained.
for weight in model.parameters():
    weight.requires_grad = False
    if weight.ndim != 1:
        continue
    # One-dimensional parameters (e.g. layernorm) are kept in fp32 for stability.
    weight.data = weight.data.to(torch.float32)

# Trade compute for memory: store fewer activations during the forward pass.
model.gradient_checkpointing_enable()
# Called so the inputs require grad even though the base weights are frozen.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward output is cast to ``torch.float32``."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as fp32."""
        result = super().forward(x)
        return result.to(torch.float32)


# Wrap the language-model head so whatever it produces comes back as fp32.
model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and counts elements via
    ``numel()``; a parameter is considered trainable when its
    ``requires_grad`` flag is set.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError here.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for the GPT-2 base model.
config = LoraConfig(
    r=8,  # rank of the low-rank update matrices (not the number of attention heads)
    lora_alpha=32,  # LoRA scaling factor
    # target_modules is left unset so PEFT falls back to its built-in default
    # for the architecture.
    # NOTE(review): "q_proj"/"v_proj" are layer names from other architectures;
    # GPT-2's attention projection is `c_attn` - confirm before overriding.
    lora_dropout=0.05,
    # GPT-2 uses Conv1D layers that store weights as (fan_in, fan_out); the PEFT
    # docs ask for this flag to be set explicitly for such layers.
    fan_in_fan_out=True,
    bias="none",  # leave bias terms untouched
    task_type="CAUSAL_LM",  # causal language modelling
)

# Wrap the base model with the adapters and report how much of it is trainable.
# NOTE: `model` is rebound to the PEFT wrapper, so re-running this cell wraps
# the already-wrapped model; restart from the model-loading cell instead.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Tokenizing data

In [None]:
from datasets import load_dataset

# GPT-2 uses the EOS token as padding, so assign it as the tokenizer's pad token
# before any padded tokenization happens.
tokenizer.pad_token = tokenizer.eos_token

# Read the previously saved training and validation text files as two splits.
dataset = load_dataset(
    'text',
    data_files={'train': train_path, 'validation': val_path},
)


def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Every example is padded to ``max_length`` and truncated at 512 tokens, so
    all outputs share the same length; the tokenizer is asked to return
    PyTorch tensors.

    Args:
        examples: Batch mapping that contains a ``'text'`` entry.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )


# Tokenize both splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Persist the tokenized splits so later sessions can reload them from disk.
tokenized_datasets.save_to_disk(f"{base_dir}/tokenized_datasets")

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/99357 [00:00<?, ? examples/s]

Map:   0%|          | 0/24840 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/99357 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24840 [00:00<?, ? examples/s]