In [1]:
!pwd

/home/users/ap794/final_project_distillLLM/minillm


In [2]:
MASTER_ADDR='localhost'
MASTER_PORT='${2-2012}'
NNODES=1
NODE_RANK=0
GPUS_PER_NODE=1 #${3-16}

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
                  --nnodes $NNODES \
                  --node_rank $NODE_RANK \
                  --master_addr $MASTER_ADDR \
                  --master_port $MASTER_PORT"

# model
# BASE_PATH='/home/MiniLLM'
BASE_PATH='.'
CKPT_NAME="gpt2-xlarge"
#CKPT="${BASE_PATH}/checkpoints/${CKPT_NAME}/"
CKPT="gpt2-xl"
# data
DATA_DIR=f"{BASE_PATH}/processed_data/dolly/full/gpt2/"
# hp
BATCH_SIZE=2
LR=0.00001
GRAD_ACC=1
EVAL_BATCH_SIZE=8
# length
MAX_LENGTH=512
# runtime
SAVE_PATH=f"{BASE_PATH}/results/gpt2/train/sft"
# seed
SEED=10
SEED_ORDER=10


# Define arguments as a dictionary
args_dict = {
    # Model
    "base_path": BASE_PATH,
    "model_path": CKPT,
    "ckpt_name": CKPT_NAME,
    "n_gpu": GPUS_PER_NODE,
    
    # Data
    "data_dir": DATA_DIR,
    "num_workers": 0,
    "dev_num": 1000,

    # Hyperparameters
    "lr": LR,
    "batch_size": BATCH_SIZE,
    "eval_batch_size": EVAL_BATCH_SIZE,
    "gradient_accumulation_steps": GRAD_ACC,
    "warmup_iters": 0,
    "lr_decay_style": "cosine",
    "weight_decay": 1e-2,
    "clip_grad": 1.0,
    "epochs": 10,

    # Sequence length
    "max_length": MAX_LENGTH,
    "max_prompt_length": 256,

    # Runtime
    "do_train": True,
    "do_valid": True,
    "eval_gen": True,
    "save_interval": 10,
    "eval_interval": 10,
    "log_interval": 4,
    "mid_log_num": 10,
    "save": SAVE_PATH,

    # Seed
    "seed": SEED,
    "seed_order": SEED_ORDER,

    # DeepSpeed
    "deepspeed": True,
    "deepspeed_config": f"{BASE_PATH}/configs/deepspeed/ds_config_zero1_fp16.json",

    # Model type
    "type": "lm",

    # Generation settings
    "do_sample": True,
    "top_k": 0,
    "top_p": 1.0,
    "temperature": 1.0,
}

In [3]:
from finetune import *

torch.backends.cudnn.enabled = False

#args = get_args()
args = get_args(**args_dict)

initialize(args)

if dist.get_rank() == 0:
    print_args(args)
    with open(os.path.join(args.save, "args.json"), "w") as f:
        json.dump(vars(args), f)

device = torch.cuda.current_device()
cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
save_rank("\n\n" + "="*30 + f" EXP at {cur_time} " + "="*30, os.path.join(args.save, "log.txt"))

with open(args.deepspeed_config, "r") as f:
    ds_config = json.load(f)

ds_config["gradient_accumulation_steps"] = args.gradient_accumulation_steps
ds_config["train_micro_batch_size_per_gpu"] = args.batch_size
ds_config["gradient_clipping"] = args.clip_grad
ds_config["steps_per_print"] = 10000000

if not args.do_train:
    ds_config["zero_optimization"]["stage"] = 0

if "fp16" in ds_config and ds_config["fp16"]["enabled"]:
    args.dtype = "torch.float16"
elif "bf16" in ds_config and ds_config["bf16"]["enabled"]:
    args.dtype = "torch.bfloat16"
else:
    args.dtype = "torch.float32"
args.deepspeed_config = None

# get the tokenizer
tokenizer = get_tokenizer(args)
dataset = prepare_dataset(
    args,
    tokenizer,
)

dp_world_size = mpu.get_data_parallel_world_size() if args.model_parallel else dist.get_world_size()

if args.do_train:
    args.train_iters_per_epoch = int(len(dataset["train"]) / (args.batch_size * dp_world_size * args.gradient_accumulation_steps))
    print_rank("Train iters per epoch", args.train_iters_per_epoch)
    if args.total_iters is None:
        args.total_iters = args.train_iters_per_epoch * args.epochs
    if args.epochs is None:
        args.epochs = math.ceil(args.total_iters / args.train_iters_per_epoch)
    print_rank("total_iters", args.total_iters)
    
    if args.save_interval == -1:
        args.save_interval = args.train_iters_per_epoch
    
    if args.eval_interval == -1:
        args.eval_interval = args.train_iters_per_epoch

model, optimizer, lr_scheduler = setup_model_and_optimizer(args, ds_config, device, set_optim=args.do_train)

if args.teacher_model_type is None:
    args.teacher_model_type = args.model_type

if args.teacher_model_path is not None:
    teacher_model = get_teacher_model(args, device)
else:
    teacher_model = None

if args.do_train:
    model = finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset, device, teacher_model=teacher_model)

if args.do_eval:
    evaluate(args, tokenizer, model, dataset["test"], "test", 0, device)
    


[2025-03-22 09:21:03,968] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


TypeError: get_args() got an unexpected keyword argument 'base_path'