# core

> Parameter definitions and default settings for FSDP QLoRA/QDoRA training runs

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

## Basic Flow

In [None]:
def config():
    """Stub for the configuration step: takes no arguments and does nothing."""
    return None
def train():
    """Stub for the training step: takes no arguments and does nothing."""
    return None

In [None]:
# Config file name; presumably meant for config()/train() — not used yet in this notebook.
cfg_path = "fsdp_qdora.json"

## Default Parameters

In [None]:
from typing import List, Union

class Parameter:
    """A single setting: its description, expected type, current value and allowed choices.

    Args:
        description: Human-readable explanation of the setting.
        param_type: Expected Python type of the value; stored as ``self.type``.
        default: Initial value; stored as ``self.value``.
        choices: Optional list of permitted values, or ``None`` when the
            value is unrestricted. Stored as given; no validation is done here.
    """

    # `object` (not the builtin function `any`) is the annotation for
    # "accepts anything"; `Union[..., None]` makes the None default explicit.
    def __init__(self, description: str, param_type: type, default: object,
                 choices: Union[List[str], None] = None):
        self.description = description
        self.type = param_type
        self.value = default
        self.choices = choices

    def __repr__(self) -> str:
        """Return a readable summary so instances display usefully in a notebook."""
        return (
            f"{type(self).__name__}(description={self.description!r}, "
            f"type={self.type!r}, value={self.value!r}, choices={self.choices!r})"
        )

class DefaultParameters:
    """Collection of default settings, one `Parameter` instance per attribute.

    Each attribute holds a `Parameter` carrying the description, type,
    default value and (where given) the list of allowed choices.
    `str()` / `repr()` list every attribute as ``name: value``, one per line,
    in the order the attributes are assigned below.
    """

    def __init__(self):
        # --- Run setup -----------------------------------------------------
        self.world_size = Parameter(
            "Number of GPUs to use. -1 = all available GPUs.", int, -1
        )
        self.train_type = Parameter(
            "Training type",
            str,
            "qlora",
            choices=[
                "full",
                "lora",
                "qlora",
                "custom_qlora",
                "custom_lora",
                "hqq_lora",
                "hqq_dora",
                "bnb_dora",
                "bnb_llama_pro",
                "hqq_llama_pro",
            ],
        )
        self.llama_pro_path = Parameter(
            "Path to the quantized llama pro model", str, None
        )
        self.batch_size = Parameter(
            "Batch size per GPU. Effective BS = batch_size * world_size * gradient_accumulation_steps",
            int,
            1,
        )
        self.context_length = Parameter(
            "Max length of input sequence (in tokens)", int, 512
        )
        self.gradient_accumulation_steps = Parameter(
            "How many steps to accumulate gradients over (increases effective batch size)",
            int,
            1,
        )
        self.num_epochs = Parameter("How many epochs of training to do", int, 1)

        # --- Data ----------------------------------------------------------
        self.dataset = Parameter(
            "Dataset to use",
            str,
            "alpaca_sample",
            choices=["alpaca", "alpaca_sample", "dummy", "guanaco", "sql", "orca_math"],
        )
        self.dataset_samples = Parameter(
            'Number of samples in an epoch if using "alpaca_sample" or "dummy" dataset',
            int,
            512,
        )

        # --- Sharding / memory ---------------------------------------------
        self.sharding_strategy = Parameter(
            "Sharding strategy for FSDP",
            str,
            "full_shard",
            choices=[
                "full_shard",
                "shard_grad_op",
                "ddp",
                "hybrid_full_shard",
                "hybrid_shard_grad_op",
            ],
        )
        self.use_gradient_checkpointing = Parameter(
            "Use FSDP's activation checkpointing", bool, True
        )
        self.reentrant_checkpointing = Parameter(
            "Use re-entrant autograd activation checkpointing. Setting to True can use less GPU memory with BNB QLoRA",
            bool,
            False,
        )
        self.use_cpu_offload = Parameter("Use FSDP's CPU offloading", bool, True)
        self.use_activation_cpu_offload = Parameter(
            "Use FSDP's activation CPU offloading", bool, False
        )
        self.low_memory = Parameter(
            "Load one copy of the model into CPU memory before sharding with FSDP. For QLoRA, quantizes each layer individually on GPU before placing on CPU.",
            bool,
            True,
        )
        self.no_sync = Parameter(
            "Prevent gradient sync until update step. Likely uses more memory. Required for `use_cpu_offload` and `gradient_accumulation_steps > 1`",
            bool,
            False,
        )
        self.precision = Parameter(
            "Training precision",
            str,
            "bf16",
            choices=[
                "fp32",
                "bf16",
                "fp16_autocast",
                "bf16_autocast",
                "bf16_buffers_autocast",
            ],
        )

        # --- Model and output ----------------------------------------------
        self.model_name = Parameter(
            'Which model to train - e.g. "TinyLlama/TinyLlama-1.1B-Chat-v1.0"',
            str,
            "meta-llama/Llama-2-7b-hf",
        )
        self.save_model = Parameter("Save the resulting model", bool, False)
        self.output_dir = Parameter(
            "Output directory to save the final model to", str, "output"
        )

        # --- LoRA ----------------------------------------------------------
        self.lora_rank = Parameter("LoRA rank for lora/qlora", int, 64)
        self.lora_alpha = Parameter("LoRA alpha for lora/qlora", int, 16)
        self.lora_dropout = Parameter("LoRA dropout for lora/qlora", float, 0.1)
        self.lora_target_modules = Parameter(
            "LoRA target modules", str, "all", choices=["all", "default"]
        )

        # --- Optimisation --------------------------------------------------
        self.verbose = Parameter(
            "Whether to print extra info for debugging", bool, False
        )
        self.lr = Parameter("Learning rate", float, 1e-5)
        self.apply_gradient_clipping = Parameter(
            "Apply gradient norm clipping", bool, False
        )
        self.grad_norm = Parameter("Gradient norm clipping", float, 0.3)
        self.wd = Parameter("Weight decay", float, 0.1)
        self.profile_memory = Parameter(
            "Profile memory usage for the first few batches. Keep false for training. May increase memory usage.",
            bool,
            False,
        )
        self.optimizer = Parameter(
            "Optimizer", str, "adamw", choices=["adamw", "adam", "sgd", "adadelta"]
        )
        self.lr_scheduler = Parameter(
            "Learning Rate Scheduler",
            str,
            "constant",
            choices=["constant", "linear", "cosine"],
        )
        self.loading_workers = Parameter(
            "Number of layers to load and quantize in parallel per GPU. Default of -1 uses heuristics to set worker count.",
            int,
            -1,
        )

        # --- Logging / distributed -----------------------------------------
        self.log_to = Parameter(
            "Where to log output", str, "tqdm", choices=["tqdm", "wandb", "stdout"]
        )
        self.master_addr = Parameter("For distributed training", str, "localhost")
        self.master_port = Parameter(
            "For distributed training, must be the same for all processes",
            str,
            "12355",
        )
        self.seed = Parameter("Random seed", int, 42)
        self.project_name = Parameter("For wandb logging", str, "fsdp_qlora")
        self.name = Parameter("For wandb logging", str, None)
        self.group = Parameter("For wandb logging", str, None)
        self.entity = Parameter("For wandb logging", str, None)
        self.n_bits = Parameter("passed to hqq", int, 4)

        # --- Profiling -----------------------------------------------------
        self.profile = Parameter("Whether to profile with torch.profiler", bool, False)
        self.profiling_output = Parameter(
            "Output file prefix for profiling", str, "profiles"
        )
        self.with_stack = Parameter(
            "Output stacks for profiling. Note that setting export_memory_timeline will automatically export traces since `with_stack` must be true to profile memory.",
            bool,
            False,
        )
        self.with_shapes = Parameter(
            "Output shapes for profiling. Can impact performance. Note that setting export_memory_timeline will automatically export traces since `with_shapes` must be true to profile memory.",
            bool,
            False,
        )
        self.export_trace = Parameter("Output trace for profiling", bool, True)
        self.export_memory_timeline = Parameter(
            "Output memory timeline for profiling", bool, False
        )
        self.wait_steps = Parameter(
            "Wait steps when running profiler. Only used if repeat != 0.", int, 0
        )
        self.warmup_steps = Parameter("Warmup steps when running profiler", int, 1)
        self.active_steps = Parameter("Active steps when running profiler", int, 2)
        self.repeat = Parameter(
            "Number of profiler cycles (wait + warmup + active) if > 0, else repeats forever",
            int,
            0,
        )
        self.profiling_frequency = Parameter(
            "Profiling frequency in steps. Only used if repeat == 0, in which case wait_steps will be set to profiling_frequency - (warmup_steps + active_steps) such that the effective cycle length == profiling_frequency",
            int,
            10,
        )
        self.max_steps = Parameter(
            "Max number of training steps (in units of batches) per epoch. -1 means no max_steps, otherwise training loop breaks after `max_steps` each epoch",
            int,
            -1,
        )

    def __str__(self):
        """Render every attribute as ``name: value``, one per line."""
        rendered = (f"{name}: {param.value}" for name, param in vars(self).items())
        return "\n".join(rendered)

    def __repr__(self):
        """Use the same listing as `__str__`."""
        return str(self)

In [None]:
# Build the defaults, then show one entry alongside its type.
params = DefaultParameters()
params.batch_size, type(params.batch_size)

(<__main__.Parameter>, __main__.Parameter)

In [None]:
# The default itself is stored on the `value` attribute.
params.batch_size.value

1

In [None]:
# `choices` is None when no list of allowed values was given.
params.batch_size.description, params.batch_size.choices

('Batch size per GPU. Effective BS = batch_size * world_size * gradient_accumulation_steps',
 None)

In [None]:
# A parameter created with `choices=` keeps the list of allowed values.
params.train_type.description, params.train_type.choices

('Training type',
 ['full',
  'lora',
  'qlora',
  'custom_qlora',
  'custom_lora',
  'hqq_lora',
  'hqq_dora',
  'bnb_dora',
  'bnb_llama_pro',
  'hqq_llama_pro'])

## Export -

In [None]:
#| hide
import nbdev

# Run nbdev's notebook export.
nbdev.nbdev_export()