# Load Model

In [1]:
import torch
from typing import Optional
from dataclasses import dataclass, field, InitVar
from transformers import BitsAndBytesConfig

@dataclass
class ModelArguments:
    model_id: str                                = field(metadata={"help": "model name on HuggingFace or path to local model"})
    adapter_name: Optional[str]                  = field(default=None, metadata={"help": "adapter name on HuggingFace or path to local adapter"})
    load_in_4bit: InitVar[bool]                  = field(default=True, metadata={"help": "enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from bitsandbytes"})
    bnb_4bit_compute_dtype: InitVar[torch.dtype] = field(default=torch.bfloat16, metadata={"help": "Computational type: Union[torch.bfloat16, torch.float16, torch.float32]"})
    bnb_4bit_quant_type: InitVar[str]            = field(default="nf4", metadata={"help": "quantization data type in the bnb.nn.Linear4Bit layers: Union['nf4', 'fp4']"})
    bnb_4bit_use_double_quant: InitVar[bool]     = field(default=False, metadata={"help": "enable nested quantization"})
    quant_config: BitsAndBytesConfig             = field(init=False)
    device_map: str                              = field(default="auto")
    output_hidden_states: InitVar[bool]          = field(default=False, metadata={"help": "outputs hidden states (W) during fwd pass"})
    output_attentions: InitVar[bool]             = field(default=False, metadata={"help": "outputs attentions calculated during fwd pass"})
    output_scores: InitVar[bool]                 = field(default=False, metadata={"help": "outputs logits calculated during fwd pass"})
    return_dict_in_generate: InitVar[bool]       = field(default=False, metadata={"help": "return ModelOutput during generation or a simple tuple"})
    config_args: dict                            = field(init=False)
        
    def __post_init__(self, load_in_4bit, bnb_4bit_compute_dtype, bnb_4bit_quant_type, bnb_4bit_use_double_quant,
                      output_hidden_states, output_attentions, output_scores, return_dict_in_generate):
        self.quant_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit,
                                               bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
                                               bnb_4bit_quant_type=bnb_4bit_quant_type,
                                               bnb_4bit_use_double_quant=bnb_4bit_use_double_quant)
        
        self.config_args = {"output_hidden_states":output_hidden_states,
                            "output_attentions":output_attentions,
                            "output_scores": output_scores,
                            "return_dict_in_generate": return_dict_in_generate}

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_args = ModelArguments("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained(model_args.model_id,
                                             quantization_config=model_args.quant_config,
                                             device_map=model_args.device_map,
                                             **model_args.config_args)

tokenizer = AutoTokenizer.from_pretrained(model_args.model_id, device_map=model_args.device_map)

# embedding size check
# idk why this is done (copied directly from HF script)
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    print("Resizing embeddings to avoid index errors")
    model.resize_token_embeddings(len(tokenizer))

# PEFT

In [3]:
from typing import List
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


@dataclass
class PeftArguments:
    lora_r: int                                    = field(metadata={"help": "rank of the update matrices"})
    lora_alpha: int                                = field(metadata={"help": "alpha parameter for Lora scaling"})
    lora_dropout: float                            = field(metadata={"help": "dropout probability for Lora layers"})
    target_modules: List[str]                      = field(metadata={"help": "names of the modules to apply Lora to"})
    bias: str                                      = field(default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases will be updated during training. Be aware that this means that, even when disabling the adapters, the model will not produce the same output as the base model would have without adaptation"})
    modules_to_save: Optional[List[str]]           = field(default=None, metadata={"help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. These typically include model’s custom head that is randomly initialized for the fine-tuning task"})
    layers_to_transform: Optional[List[int] | int] = field(default=None, metadata={"help": "List of layers to be transformed by LoRA. If not specified, all layers in target_modules are transformed"})

peft_args = PeftArguments(lora_r=16, lora_alpha=64, lora_dropout=0.05, target_modules=["q_proj", "k_proj", "v_proj"])

config = LoraConfig(r=peft_args.lora_r,
                    lora_alpha=peft_args.lora_alpha,
                    lora_dropout=peft_args.lora_dropout,
                    bias=peft_args.bias,
                    target_modules=peft_args.target_modules,
                    modules_to_save=peft_args.modules_to_save,
                    layers_to_transform=peft_args.layers_to_transform,
                    task_type="CAUSAL_LM",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)
model.gradient_checkpointing_enable()

# Load dataset

In [4]:
from typing import Callable

@dataclass
class DatasetArguments:
    dataset_id: str = field(metadata={"help": "dataset name on HuggingFace or path to Union[datasets.Dataset, csv, pandas.DataFrame, dict]"})
    valid_split_name: str = field(default="valid")
    train_split_name: str = field(default="train")
    validation_split_percentage: int = field(default=10)
    preprocessing_func: Optional[Callable] = field(default=None)
    num_workers: int = field(default=1)
    text_column_name: str = field(default="text")
    block_size: int = field(default=1024)
    postprocessing_func: Optional[Callable] = field(default=None)
    max_train_samples: Optional[int] = field(default=None)
    max_eval_samples: Optional[int] = field(default=None)
        
    def __post_init__(self):
        if self.validation_split_percentage > 100:
            self.validation_split_percentage = self.validation_split_percentage%100


dataset_args = DatasetArguments("Abirate/english_quotes")

In [5]:
from datasets import load_dataset, Dataset
import pandas as pd

if dataset_args.dataset_id.endswith(".csv"):
    raw_datasets = Dataset.from_csv(dataset_args.dataset_id)
elif isinstance(dataset_args.dataset_id, dict):
    raw_datasets = Dataset.from_dict(dataset_args.dataset_id)
elif isinstance(dataset_args.dataset_id, pd.DataFrame):
    raw_datasets = Dataset.from_pandas(dataset_args.dataset_id)
else:
    raw_datasets = load_dataset(dataset_args.dataset_id)

if dataset_args.valid_split_name not in raw_datasets.keys():
    raw_datasets[dataset_args.valid_split_name] = load_dataset(dataset_args.dataset_id, split=f"train[:{dataset_args.validation_split_percentage}%]")
    raw_datasets[dataset_args.train_split_name] = load_dataset(dataset_args.dataset_id, split=f"train[{dataset_args.validation_split_percentage}%:]")

In [6]:
if callable(dataset_args.preprocessing_func):
    raw_datasets = raw_datasets.map(dataset_args.preprocessing_func, batched=True, num_proc=dataset_args.num_workers)

In [7]:
column_names = list(raw_datasets[dataset_args.train_split_name].features)
dataset_args.text_column_name = column_names[0] if dataset_args.text_column_name not in column_names else dataset_args.text_column_name

In [8]:
dataset_args.block_size = min(dataset_args.block_size, tokenizer.model_max_length)

def tokenize_function(examples):
    output = tokenizer(examples[dataset_args.text_column_name])
    return output

raw_datasets = raw_datasets.map(tokenize_function, batched=True, num_proc=dataset_args.num_workers, remove_columns=column_names)
raw_datasets = raw_datasets.filter(lambda example: len(example['input_ids']) < dataset_args.block_size)

In [9]:
if callable(dataset_args.postprocessing_func):
    raw_datasets = raw_datasets.map(dataset_args.postprocessing_func, batched=True, num_proc=dataset_args.num_workers)

In [10]:
train_dataset = raw_datasets[dataset_args.train_split_name]
if dataset_args.max_train_samples is not None:
    dataset_args.max_train_samples = min(len(train_dataset), dataset_args.max_train_samples)
    train_dataset = train_dataset.select(range(dataset_args.max_train_samples))

eval_dataset = raw_datasets[dataset_args.valid_split_name]
if dataset_args.max_eval_samples is not None:
    dataset_args.max_eval_samples = min(len(eval_dataset), dataset_args.max_eval_samples)
    eval_dataset = eval_dataset.select(range(dataset_args.max_eval_samples))

# Data Collator

In [11]:
from transformers import DataCollatorForLanguageModeling
from trl import DataCollatorForCompletionOnlyLM

# Causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# # Masked language modeling
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

# # SFT
# data_collator = DataCollatorForCompletionOnlyLM(tokenizer=tokenizer, response_template="### Response:\n")

# Training Arguments

In [12]:
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    do_eval=True,
    do_train=True,
    output_dir="./results", # output directory where the model predictions and checkpoints are saved
    # overwrite_output_dir=False, # overwrite the content of the output directory
    evaluation_strategy="epoch", # "no": No evaluation , "steps": Evaluation done (and logged) every eval_steps, "epoch": Evaluation done end of each epoch
    # eval_steps=50,
    # prediction_loss_only=False, # When performing evaluation and generating predictions, only returns the loss
    per_device_train_batch_size=1, # The batch size per GPU/TPU core/CPU for training
    # per_device_eval_batch_size=8, # The batch size per GPU/TPU core/CPU for evaluation
    # gradient_accumulation_steps=1, # Number of updates steps to accumulate the gradients for, before performing a backward/update pass 
                                   # When using gradient accumulation, one step is counted as one step with backward pass
    # eval_accumulation_steps=None, # Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. 
                                  # If left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but requires more memory)
    # eval_delay=0 # Number of epochs or steps to wait for before the first evaluation can be performed, depending on the evaluation_strategy
    learning_rate=2e-4, # The initial learning rate for AdamW optimizer
    # weight_decay=0, # The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in AdamW optimizer
    # adam_beta1=0.9, # The beta1 hyperparameter for the AdamW optimizer
    # adam_beta2=0.99, # The beta2 hyperparameter for the AdamW optimizer
    # adam_epsilon=1e-8, # The epsilon hyperparameter for the AdamW optimizer
    max_grad_norm=0.3, # Maximum gradient norm (for gradient clipping)
    # num_train_epochs=3.0, # Total number of training epochs to perform (perform decimal part percents if non-int)
    max_steps=100, # default -1, If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs. 
                  # In case of using a finite iterable dataset the training may stop before reaching the set number of steps when all data is exhausted
    lr_scheduler_type="constant", # one of "linear" (default) , "constant", "cosine", "cosine_with_restarts", "polynomial", "constant_with_warmup", "inverse_sqrt", "reduce_lr_on_plateau"
    warmup_ratio=0.03, # default 0, Ratio of total training steps used for a linear warmup from 0 to learning_rate
    # warmup_steps=0, # Number of steps used for a linear warmup from 0 to learning_rate. Overrides any effect of warmup_ratio
    # log_level="passive", # one of 'debug', 'info', 'warning', 'error' and 'critical', 'passive' defaults to transformers level ("warning")
    # log_level_replica="warning", # Logger log level to use on replicas
    # log_on_each_node=True, # In multinode distributed training, whether to log using log_level once per node, or only on the main node
    # logging_dir=None, #  TensorBoard log directory. Will default to *output_dir/runs/CURRENT_DATETIME_HOSTNAME*
    # logging_strategy="steps", # logging strategy to adopt during training. One of "no", "epoch", "steps"
    # logging_first_step=False, # Whether to log and evaluate the first global_step or not
    logging_steps=5, # Number of update steps between two logs if logging_strategy="steps". 
                      # Should be an integer or a float in range [0,1). If smaller than 1, will be interpreted as ratio of total training steps
    # logging_nan_inf_filter=True, # Whether to filter nan and inf losses for logging. only influences logging, not the behavior the gradient
    # save_strategy="steps", # save strategy. One of "no", "epoch", "steps"
    save_steps=5, # Number of updates steps before two checkpoint saves if save_strategy="steps".
                   # Should be an integer or a float in range [0,1). If smaller than 1, will be interpreted as ratio of total training steps.
    # save_total_limit=None, # If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir.
                           # When load_best_model_at_end is enabled, the “best” checkpoint according to metric_for_best_model will always be retained in addition to the most recent ones
    # save_safetensors=False, # Use safetensors saving and loading for state dicts instead of default torch.load and torch.save.
    # save_on_each_node=False, # When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one.
    # no_cuda=False, # Whether to not use CUDA even when it is available or not.
    # seed=42, # Random seed that will be set at the beginning of training.
    # data_seed=None, # Random seed to be used with data samplers. If not set, random generators for data sampling will use the same seed as seed.
    # jit_mode_eval=False, # Whether or not to use PyTorch jit trace for inference.
    # use_ipex=False, # Use Intel extension for PyTorch when it is available
    # bf16=False, # Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training.
    # fp16=False, # default False, Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
    # fp16_opt_level="O1", # For fp16 training, Apex AMP optimization level selected in [‘O0’, ‘O1’, ‘O2’, and ‘O3’].
    # fp16_backend="auto", # This argument is deprecated. Use half_precision_backend instead.
    # half_precision_backend="auto", # The backend to use for mixed precision training. one of "auto", "cuda_amp", "apex", "cpu_amp"
    # bf16_full_eval=False, # Whether to use full bfloat16 evaluation instead of 32-bit. This will be faster and save memory but can harm metric values
    # fp16_full_eval=False, # Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm metric values.
    # tf32=None, # Whether to enable the TF32 mode, available in Ampere and newer GPU architectures.
    # local_rank=-1, # Rank of the process during distributed training
    # ddp_backend=None, # The backend to use for distributed training. Must be one of "nccl", "mpi", "ccl", "gloo".
    # tpu_num_cores=None, # When training on TPU, the number of TPU cores (automatically passed by launcher script).
    # dataloader_drop_last=False, # Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not.
    # eval_steps=None, # Number of update steps between two evaluations if evaluation_strategy="steps"
    # dataloader_num_workers=0, # Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process.
    # past_index=-1, # Some models like TransformerXL or XLNet can make use of the past hidden states for their predictions
    # run_name=None, # A descriptor for the run. Typically used for wandb and mlflow logging.
    # disable_tqdm=None, # Whether or not to disable the tqdm progress bars and table of metrics produced by ~notebook.NotebookTrainingTracker in Jupyter Notebooks
    remove_unused_columns=False, # Whether or not to automatically remove the columns unused by the model forward method.
    # label_names=None, # The list of keys in your dictionary of inputs that correspond to the labels.
    # load_best_model_at_end=False, # Whether or not to load the best model found during training at the end of training
    # metric_for_best_model=None, # Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models
    # greater_is_better=None, # specify if better models should have a greater metric or not
    # ignore_data_skip=False, # When resuming training, whether or not to skip the epochs and batches to get the data loading at the same stage as in the previous training
    # sharded_ddp=False, # Use Sharded DDP training from FairScale (in distributed training only)
    # fsdp=False, # Use PyTorch Distributed Parallel Training
    # fsdp_config=None, # Config to be used with fsdp
    # deepspeed=None, # Use Deepspeed
    # label_smoothing_factor=0.0, # The label smoothing factor to use
    # debug="", # Enable one or more debug features
    # optim="paged_adamw_32bit", # default "adamw_hf", The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor.
    # optim_args=None, # Optional arguments that are supplied to AnyPrecisionAdamW
    # group_by_length=False, # Whether or not to group together samples of roughly the same length in the training dataset (to minimize padding applied and be more efficient). Only useful if applying dynamic padding
    # length_column_name="length", # Column name for precomputed lengths. If the column exists, grouping by length will use these values rather than computing them on train startup
    # report_to="all", # The list of integrations to report the results and logs to. 
                     # Supported platforms are "azure_ml", "comet_ml", "mlflow", "neptune", "tensorboard","clearml" and "wandb".
                     # Use "all" to report to all integrations installed, "none" for no integrations
    # ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, skip_memory_metrics=True,
    # push_to_hub=False,
    # resume_from_checkpoint=None, # The path to a folder with a valid checkpoint for your model. 
                                 # This argument is not directly used by Trainer, it’s intended to be used by your training/evaluation scripts instead.
    # hub_model_id=None, hub_strategy="every_save", hub_token=None, hub_private_repo=False, 
    # gradient_checkpointing=False, # If True, use gradient checkpointing to save memory at the expense of slower backward pass
    # include_inputs_for_metrics=False, # Whether or not the inputs will be passed to the compute_metrics function.
    # auto_find_batch_size=False, # Whether to find a batch size that will fit into memory automatically through exponential decay, avoiding CUDA Out-of-Memory errors. Requires accelerate to be installed
    # full_determinism=False, torchdynamo=None, ray_scope="last", ddp_timeout=1800, use_mps_device=False,
    # torch_compile=False, # Whether or not to compile the model using PyTorch 2.0 torch.compile
    # torch_compile_backend=None, torch_compile_mode=None, sortish_sampler=False, 
    # predict_with_generate=False, # Whether to use generate to calculate generative metrics (ROUGE, BLEU)
    # generation_max_length=None, # The max_length to use on each evaluation loop when predict_with_generate=True
    # generation_num_beams=None, # The num_beams to use on each evaluation loop when predict_with_generate=True
    # generation_config=None
)

# Trainer

In [13]:
import evaluate

metric = evaluate.load("accuracy")
# metric = evaluate.load("perplexity")

In [14]:
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been calculated
    # by preprocess_logits_for_metrics but we need to shift the labels
    labels = labels[:, 1:].reshape(-1)
    labels[labels == -100] = tokenizer.pad_token_id

    preds = preds[:, :-1].reshape(-1)
    preds[preds == -100] = tokenizer.pad_token_id

    return metric.compute(predictions=preds, references=labels) # for accuracy
#     return metric.compute(predictions=tokenizer.decode(preds), model_id=model_args.model_id)

In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if training_arguments.do_eval else None,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_arguments.do_eval else None,
)

In [16]:
model.config.use_cache = False # silence the warnings. Please re-enable for inference!

In [17]:
trainer.train()
# trainer.train(resume_from_checkpoint=)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
0,3.4597,2.830065,0.834267


TrainOutput(global_step=100, training_loss=3.2782231616973876, metrics={'train_runtime': 21.5434, 'train_samples_per_second': 4.642, 'train_steps_per_second': 4.642, 'total_flos': 1156827386880.0, 'train_loss': 3.2782231616973876, 'epoch': 0.04})