In [10]:
import os
import time
import pickle
import math
from contextlib import nullcontext

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from models.utils import get_lr
from dataloader import get_batch
from models.memoryGPT.eval import estimate_loss
from models.memoryGPT.gpt2 import GPT
from models.memoryGPT.config import GPTConfig

In [11]:
# 定义配置类
class Config:
    def __init__(self, **entries):
        self.__dict__.update(entries)

# 从配置文件加载配置
# config_file = 'configs/finetune_gpt2.py'
config_file = 'configs/eval_gpt2.py'
with open(config_file, 'r', encoding='utf-8') as f:
    exec(f.read())

# 将配置文件中的所有变量加载到config对象中
config_dict = {k: v for k, v in locals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))}
config = Config(**config_dict)

# 现在可以使用 config.参数名 来访问配置了
print(config.learning_rate)  # 示例：输出学习率


# various inits, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1  # is this a ddp run?
if ddp:
    init_process_group(backend=config.backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0  # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank  # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert config.gradient_accumulation_steps % ddp_world_size == 0
    config.gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = config.gradient_accumulation_steps * ddp_world_size * config.batch_size * config.block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

if master_process:
    os.makedirs(config.out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu'  # for later use in torch.autocast
# note: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[config.dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# poor man's data loader
config_dict['data_dir'] = os.path.join('data', config.dataset)


# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
iter_num = 0
best_val_loss = 1e9

# attempt to derive vocab_size from the dataset
meta_path = os.path.join(config_dict['data_dir'], 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

# model init choose arguments from config_dict that GPTConfig has
model_args = {k: v for k, v in config_dict.items() if k in GPTConfig.__dataclass_fields__}
if config.init_from == 'scratch':
    # init a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    if meta_vocab_size is None:
        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif config.init_from.startswith('Qwen') or config.init_from.startswith('meta'):
    print(f"Initializing from {config.init_from} weights")
    override_args = dict(dropout=config.dropout)
    model = GPT.from_pretrained(config.init_from, override_args)
    # read off the created configs params, so we can store them into checkpoint correctly
    model_args = {k: getattr(model.config, k) for k in GPTConfig.__dataclass_fields__}
elif config.init_from == 'resume':
    print(f"Resuming training from {config.out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(config.out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    
    # create the model
    gptconf = GPTConfig(**checkpoint_model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # fix the keys of the state dictionary :(
    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']

8e-05
tokens per iteration will be: 16,384
Resuming training from out-owt
wte max:  151936
number of parameters: 630.17M


In [12]:
model.to(device)

GPT(
  (model): ModuleDict(
    (embed_tokens): Embedding(151936, 896)
    (drop): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x Block(
        (input_layernorm): RMSNorm()
        (self_attn): MemorySelfAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (memory): Memory(
            (long_term_memory): ModuleList(
              (0-15): 16 x MemoryQueue()
            )
            (short_term_memory): MemoryPool()
          )
        )
        (post_attention_layernorm): RMSNorm()
        (mlp): MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (

In [6]:
print(model_args)

{'max_batch_size': 64, 'short_term_memory_size': 16, 'long_term_memory_layer': 16, 'long_term_memory_chunk_size': 4, 'rope_theta': 500000, 'rms_norm_eps': 1e-06, 'block_size': 1024, 'input_block_size': 256, 'vocab_size': 151936, 'n_layer': 24, 'num_attention_heads': 14, 'num_key_value_heads': 2, 'use_moe': False, 'n_expert': 16, 'n_expert_per_tok': 4, 'n_embd': 896, 'intermediate_size': 4864, 'dropout': 0.0, 'bias': True, 'device': 'cuda'}


In [7]:
# 输出模型的显存占用
print(f"model is using {sum(p.numel() for p in model.parameters())} parameters")

# 用GB单位输出显存占用
print(f"model is using {sum(p.numel() for p in model.parameters()) * 4 / 1024 ** 3:.2f}GB of GPU memory")

model is using 630167424 parameters
model is using 2.35GB of GPU memory


In [8]:
# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(config.dtype == 'float16'))

# optimizer
optimizer = model.configure_optimizers(config.weight_decay, config.learning_rate, (config.beta1, config.beta2), device_type)
if config.init_from == 'resume':
    optimizer.load_state_dict(config.checkpoint['optimizer'])
checkpoint = None  # free up memory

# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0

# wrap model into DDP container
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

# logging
if config.wandb_log and master_process:
    import wandb

    wandb.init(project=config.wandb_project, name=config.wandb_run_name, config=config_dict)

# training loop
X, Y = get_batch(config_dict, 'train', config.train_size, device, device_type)  # fetch the very first batch
t0 = time.time()
local_iter_num = 0  # number of iterations in the lifetime of this process
raw_model = model.module if ddp else model  # unwrap DDP container if needed
running_mfu = -1.0
while True:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num, config.warmup_iters, config.lr_decay_iters, config.learning_rate, config.min_lr) if config.decay_lr else config.learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % config.eval_interval == 0 and master_process:
        losses = estimate_loss(config_dict, model, ctx, device, device_type, iter_num)
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, train perplexity {losses['train_perplexity']:.4f}, val perplexity {losses['val_perplexity']:.4f}")
        if config.wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "train/perplexity": losses['train_perplexity'],
                "val/perplexity": losses['val_perplexity'],
                "lr": lr,
                "mfu": running_mfu * 100,  # convert to percentage
            })
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'configs': config_dict,
                }
                print(f"saving checkpoint to {config.out_dir}")
                torch.save(checkpoint, os.path.join(config.out_dir, 'ckpt.pt'))
        if config.always_save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'configs': config_dict,
                }
                print(f"saving checkpoint to {config.out_dir}")
                torch.save(checkpoint, os.path.join(config.out_dir, f'{iter_num}.pt'))
    if iter_num == 0 and config.eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(config.gradient_accumulation_steps):
        if ddp:
            # in DDP training we only need to sync gradients at the last micro step.
            # the official way to do this is with model.no_sync() context manager, but
            # I really dislike that this bloats the code and forces us to repeat code
            # looking at the source of that context manager, it just toggles this variable
            model.require_backward_grad_sync = (micro_step == config.gradient_accumulation_steps - 1)
        with ctx:
            # 生成一个0-train_size_ratio的整数
            # index = torch.randint(0, train_size_ratio, (1,))

            _, loss = model(X, Y, index=None)
            loss = loss / config.gradient_accumulation_steps  # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
 
        ratio = 1
        X, Y = get_batch(config_dict, 'train', int(config.train_size * ratio), device, device_type)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if config.grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % config.log_interval == 0 and master_process:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * config.gradient_accumulation_steps
        if local_iter_num >= 5:  # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(config.batch_size * config.gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt * 1000:.2f}ms, mfu {running_mfu * 100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > config.max_iters:
        break

if ddp:
    destroy_process_group()

num decayed parameter tensors: 170, with 630,095,872 parameters
num non-decayed parameter tensors: 121, with 71,552 parameters
using fused AdamW: True


Evaluating train loss: 100%|██████████| 20/20 [00:18<00:00,  1.06it/s]
Evaluating val loss: 100%|██████████| 20/20 [04:43<00:00, 14.16s/it]


step 0: train loss 6.4300, val loss 6.4365, train perplexity 620.1709, val perplexity 624.2457


OutOfMemoryError: CUDA out of memory. Tried to allocate 594.00 MiB. GPU 

In [13]:
from transformers import AutoTokenizer

start = "\n"

# load huggingface encoder
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()
start = "Background text: Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3, lit. Mary moved to the bathroom. Valkyria of the Battlefield 3 ), commonly referred to as Valkyria Chronicles III outside Japan, is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable.Released in January 2011 in Japan, it is the third game in the Valkyria series.Employing the same fusion of tactical and real @-@ time gameplay as its predecessors, the story runs parallel to the first game and follows the \"Nameless\", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit \" Calamaty Raven \". The game began development in 2010, carrying over a large portion of the work done on Valkyria Chronicles II.While it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcomers.Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries, along with Valkyria Chronicles II director Takeshi Ozawa.The game's opening theme was sung by May 'n. It met with positive sales in Japan, and was praised by both Japanese and western critics.After release, it received downloadable content, along with an expanded edition in November of that year.It was also adapted into manga and an original video animation series.Due to low sales of Valkyria Chronicles II, Valkyria Chronicles III was not localized, but a fan translation compatible with the game's expanded edition was released in 2014.Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4.John went to the hallway. As with previous Valkyira Chronicles games, Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces.Stories are told through comic book @-@ like panels with animated character portraits, with characters speaking partially through voiced speech bubbles and partially through unvoiced text.The player progresses through a series of linear missions, gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked.The route to each story location on the map varies depending on an individual player's approach : when one option is selected, the other is sealed. \nQuestion: Where is Mary? \nAnswer: "
start_ids = tokenizer.encode(start)
print(len(start_ids))
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

num_samples = 1
max_new_tokens = 50
temperature = 0.3
top_k = 50

# run generation
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, eos_token_id=151643, temperature=temperature, top_k=top_k, output_type="asb")
            print(tokenizer.decode(y[0].tolist()))
            print('---------------')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


523
Background text: Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3, lit. Mary moved to the bathroom. Valkyria of the Battlefield 3 ), commonly referred to as Valkyria Chronicles III outside Japan, is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable.Released in January 2011 in Japan, it is the third game in the Valkyria series.Employing the same fusion of tactical and real @-@ time gameplay as its predecessors, the story runs parallel to the first game and follows the "Nameless", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven ". The game began development in 2010, carrying over a large portion of the work done on Valkyria Chronicles II.While it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcome

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512,
    eos_token_id=tokenizer.eos_token_id,
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


TypeError: argument 'ids': Can't extract `str` to `Vec`

In [4]:
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask

# Define benchmark with specific tasks and shots
benchmark = MMLU(
    tasks=[MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE],
    n_shots=3
)

# Replace 'mistral_7b' with your own custom model
benchmark.evaluate(model=model)
print(benchmark.overall_score)

Processing high_school_computer_science:   0%|          | 0/100 [00:00<?, ?it/s]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Processing high_school_computer_science:   1%|          | 1/100 [00:02<03:54,  2.37s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Processing high_school_computer_science:   2%|▏         | 2/100 [00:04<03:18,  2.03s/it]Special tokens have been added in th

MMLU Task Accuracy (task=high_school_computer_science): 0.0
Overall MMLU Accuracy: 0.0
0.0



