In [5]:
import time
from dataclasses import dataclass, field, fields
import torch
from models.memoryGPT.config import GPTConfig, TrainConfig


# # 从配置文件加载配置
# config_file = '../configs/finetune_gpt2.py'
# with open(config_file, 'r', encoding='utf-8') as f:
#     exec(f.read())
# 
# # 将配置文件中的所有变量加载到config对象中
# config_dict = {k: v for k, v in locals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))}
# config = TrainConfig(**config_dict)

In [6]:
# Load the training configuration from a plain-Python config file.
config_file = '../configs/finetune_gpt2.py'
config_vars = {}
with open(config_file, 'r', encoding='utf-8') as f:
    # Execute the file in ONE namespace. Passing separate globals and locals
    # makes exec() behave like a class body, so functions, lambdas and
    # comprehensions written in the config file cannot see the file's own
    # top-level names. Compiling with the real filename keeps tracebacks useful.
    # NOTE(review): exec() runs arbitrary code -- only use trusted config files.
    exec(compile(f.read(), config_file, 'exec'), config_vars)
# exec() injects __builtins__ into the globals dict; remove it so config_vars
# holds only the names created by the config file.
config_vars.pop('__builtins__', None)

# Keep public (no leading underscore) scalar settings only; imported modules and
# other non-scalar objects defined by the config file are dropped here.
config_dict = {k: v for k, v in config_vars.items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))}
# TrainConfig's generated constructor rejects unknown keyword arguments, so pass
# only the keys it declares as dataclass fields.
train_config_fields = {spec.name for spec in fields(TrainConfig)}
filtered_config_dict = {k: v for k, v in config_dict.items() if k in train_config_fields}
config = TrainConfig(**filtered_config_dict)


In [7]:
# Show the scalar settings collected from the config file (this is the dict
# before it is narrowed down to the fields TrainConfig declares).
config_dict

{'out_dir': 'out-owt',
 'eval_interval': 500,
 'eval_iters': 100,
 'eval_only': False,
 'log_interval': 10,
 'wandb_log': False,
 'wandb_project': 'owt',
 'wandb_run_name': 'ft-1718680688.0224152',
 'dataset': 'fineweb',
 'train_mode': 'pretrain',
 'init_from': 'resume',
 'always_save_checkpoint': False,
 'batch_size': 1,
 'gradient_accumulation_steps': 16,
 'max_iters': 600000,
 'lr_decay_iters': 100000,
 'warmup_iters': 200,
 'memory_dim': 896,
 'intermediate_size': 4864,
 'n_layer': 24,
 'n_embd': 896,
 'num_attention_heads': 14,
 'num_key_value_heads': 2,
 'short_term_memory_size': 16,
 'bias': True,
 'rms_norm_eps': 1e-06,
 'block_size': 1024,
 'input_block_size': 256,
 'train_size_ratio': 16,
 'val_size_ratio': 256,
 'train_size': 4096,
 'val_size': 65536,
 'learning_rate': 8e-05,
 'decay_lr': True,
 'min_lr': 1e-06,
 'use_moe': False,
 'n_expert': 16,
 'n_expert_per_tok': 4,
 'dropout': 0.0,
 'weight_decay': 0.1,
 'beta1': 0.9,
 'beta2': 0.95,
 'grad_clip': 1.0,
 'backend': 'ncc

In [8]:
import os
import time
import pickle
import math
from contextlib import nullcontext
from dataclasses import fields

import torch
from datasets import load_dataset
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from models.utils import get_lr
from dataloader import get_batch, CustomDataset, collate_fn
from models.memoryGPT.eval import estimate_loss
from models.memoryGPT.gpt2 import GPT
from models.memoryGPT.config import GPTConfig, TrainConfig


# # 从配置文件加载配置
# config_file = 'configs/finetune_gpt2.py'
# with open(config_file, 'r', encoding='utf-8') as f:
#     exec(f.read())
#
# # 将配置文件中的所有变量加载到config对象中
# config_dict = {k: v for k, v in locals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))}
# config = TrainConfig(**config_dict)

# Load the training configuration from a plain-Python config file.
config_file = '../configs/finetune_gpt2.py'
config_vars = {}
with open(config_file, 'r', encoding='utf-8') as f:
    # Execute the file in ONE namespace. Passing separate globals and locals
    # makes exec() behave like a class body, so functions, lambdas and
    # comprehensions written in the config file cannot see the file's own
    # top-level names. Compiling with the real filename keeps tracebacks useful.
    # NOTE(review): exec() runs arbitrary code -- only use trusted config files.
    exec(compile(f.read(), config_file, 'exec'), config_vars)
# exec() injects __builtins__ into the globals dict; remove it so config_vars
# holds only the names created by the config file.
config_vars.pop('__builtins__', None)

# Keep public (no leading underscore) scalar settings only; imported modules and
# other non-scalar objects defined by the config file are dropped here.
config_dict = {k: v for k, v in config_vars.items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))}
# TrainConfig's generated constructor rejects unknown keyword arguments, so pass
# only the keys it declares as dataclass fields.
train_config_fields = {spec.name for spec in fields(TrainConfig)}
filtered_config_dict = {k: v for k, v in config_dict.items() if k in train_config_fields}
config = TrainConfig(**filtered_config_dict)

# Settings are now available as attributes: config.<name>
print(config)

# various inits, derived attributes, I/O setup
# A RANK environment variable marks a distributed (DDP) run; -1 means it is absent.
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    print("using distributed data parallel")
    init_process_group(backend=config.backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    # Each process uses the CUDA device whose index equals its local rank.
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0  # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank  # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert config.gradient_accumulation_steps % ddp_world_size == 0, \
        "gradient_accumulation_steps must be divisible by WORLD_SIZE"
    config.gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running on a single device, and one process
    print("not using distributed data parallel")
    # `device` was previously assigned only in the DDP branch, so single-process
    # runs hit a NameError as soon as `device` was read below. Take it from the
    # loaded configuration instead.
    device = config.device
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = config.gradient_accumulation_steps * ddp_world_size * config.batch_size * config.block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

# Only the master process creates the output directory.
if master_process:
    os.makedirs(config.out_dir, exist_ok=True)
# Offset the seed by rank so every process draws a different random stream.
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu'  # for later use in torch.autocast
# Map the configured dtype name to the torch dtype used for autocast.
# note: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[config.dtype]
# No autocast on CPU; on CUDA run forward passes under autocast with ptdtype.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# Record the dataset directory (data/<dataset name>) in config_dict under 'data_dir'.
config_dict['data_dir'] = os.path.join('data', config.dataset)
print(f"load data from {config_dict['data_dir']}")

# Training-progress defaults; the init_from == 'resume' path replaces both with
# the values stored in the checkpoint.
iter_num = 0
best_val_loss = 1e9

# Try to read vocab_size from the dataset's meta.pkl; meta_vocab_size stays None
# when that file does not exist.
# NOTE(review): pickle.load can execute arbitrary code -- only load meta.pkl
# files that come from a trusted source.
meta_path = os.path.join(config_dict['data_dir'], 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

# Build the model. model_args starts as the config_dict entries that GPTConfig
# declares as dataclass fields; each branch below leaves it describing the
# model that was actually constructed.
model_args = {k: v for k, v in config_dict.items() if k in GPTConfig.__dataclass_fields__}
if config.init_from == 'scratch':
    # init a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    if meta_vocab_size is None:
        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif config.init_from.startswith(('Qwen', 'meta')):
    print(f"Initializing from {config.init_from} weights")
    # Only dropout is overridden; everything else comes from the pretrained model.
    override_args = dict(dropout=config.dropout)
    model = GPT.from_pretrained(config.init_from, override_args)
    # read off the created configs params, so we can store them into checkpoint correctly
    model_args = {k: getattr(model.config, k) for k in GPTConfig.__dataclass_fields__}
elif config.init_from == 'resume':
    print(f"Resuming training from {config.out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(config.out_dir, 'ckpt.pt')
    # NOTE(review): torch.load unpickles the file -- only resume from trusted checkpoints.
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']

    # create the model with the architecture stored in the checkpoint
    gptconf = GPTConfig(**checkpoint_model_args)
    model = GPT(gptconf)
    # Keep model_args in sync with the model that was actually built (as the
    # pretrained branch does); otherwise it would still reflect the config file
    # and could disagree with the resumed architecture.
    model_args = dict(checkpoint_model_args)
    state_dict = checkpoint['model']
    # Strip the '_orig_mod.' prefix from parameter names before loading.
    # Presumably the prefix appears when the saved model had been wrapped by
    # torch.compile -- TODO confirm.
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):  # snapshot the keys: the dict is mutated in the loop
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
else:
    # Fail here with a clear message instead of leaving `model` undefined and
    # hitting an unrelated NameError later.
    raise ValueError(
        f"Unsupported init_from {config.init_from!r}: expected 'scratch', 'resume', "
        "or a model name starting with 'Qwen' or 'meta'"
    )

TrainConfig(max_batch_size=64, short_term_memory_size=16, long_term_memory_layer=16, long_term_memory_chunk_size=4, rope_theta=500000, rms_norm_eps=1e-06, block_size=1024, input_block_size=256, vocab_size=50304, n_layer=24, num_attention_heads=14, num_key_value_heads=2, use_moe=False, n_expert=16, n_expert_per_tok=4, n_embd=896, intermediate_size=4864, dropout=0.0, bias=True, device='cuda', init_from='Qwen/Qwen2-0.5B-Instruct', config_file='configs/finetune_gpt2.py', out_dir='out-owt', eval_interval=500, eval_iters=100, eval_only=False, log_interval=10, wandb_log=False, wandb_project='owt', wandb_run_name='ft-1718683316.4971983', dataset='fineweb', train_mode='pretrain', always_save_checkpoint=False, train_size_ratio=16, val_size_ratio=256, train_size=4096, val_size=65536, batch_size=1, gradient_accumulation_steps=16, max_iters=600000, lr_decay_iters=100000, warmup_iters=200, memory_dim=896, learning_rate=8e-05, decay_lr=True, min_lr=1e-06, weight_decay=0.1, beta1=0.9, beta2=0.95, grad