In [4]:
import os
import time
import pickle
import math
from contextlib import nullcontext

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from models.utils import get_lr
from dataloader import pretraining_get_batch
from models.memoryGPT.eval import estimate_loss
from models.memoryGPT.gpt2 import GPT
from models.memoryGPT.config import GPTConfig

In [5]:
# Configuration container
class Config:
    """Attribute-style view over a set of keyword settings.

    Every keyword argument passed to the constructor becomes an instance
    attribute of the same name, e.g. ``Config(lr=1e-3).lr == 1e-3``.
    """

    def __init__(self, **entries):
        # Write the settings straight into the instance's attribute dict.
        attributes = vars(self)
        attributes.update(entries)

# Load the run configuration from a Python config file.
# NOTE: the file is executed as code, so only point this at trusted configs.
# Alternative config: config_file = 'configs/finetune_gpt2.py'
config_file = 'configs/eval_gpt2.py'
with open(config_file, 'r', encoding='utf-8') as f:
    # Compile with the real filename so errors raised inside the config
    # point at the config file and line instead of "<string>".
    config_code = compile(f.read(), config_file, 'exec')

# Execute the config in its own namespace. Harvesting settings from the
# notebook's globals would also pick up unrelated scalars (e.g. `config_file`
# itself, or leftovers from other cells when this cell is re-run).
config_namespace = {}
exec(config_code, config_namespace)
config_namespace.pop('__builtins__', None)  # injected by exec(), not a setting

# Keep only the public scalar settings defined by the config file.
config_dict = {
    k: v
    for k, v in config_namespace.items()
    if not k.startswith('_') and isinstance(v, (int, float, bool, str))
}
config = Config(**config_dict)

# Backward compatibility: the config used to be exec'd directly into the
# notebook namespace, so later code may read its names as bare globals
# (for example `device`). Publish them the same way.
globals().update(config_namespace)

# Settings are now available as config.<name>
print(config.learning_rate)  # example: show the learning rate


# various inits, derived attributes, I/O setup
# A run is treated as DDP when the RANK environment variable is set.
ddp = int(os.environ.get('RANK', -1)) != -1  # is this a ddp run?
if ddp:
    init_process_group(backend=config.backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0  # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank  # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert config.gradient_accumulation_steps % ddp_world_size == 0, (
        f"gradient_accumulation_steps ({config.gradient_accumulation_steps}) "
        f"must be divisible by WORLD_SIZE ({ddp_world_size})"
    )
    config.gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
    # `device` was never assigned on this path; it only existed if the exec'd
    # config file happened to define it. Take it from the config explicitly and
    # fall back to whatever hardware is available.
    device = getattr(config, 'device', 'cuda' if torch.cuda.is_available() else 'cpu')
# Tokens consumed per optimizer step across all processes.
tokens_per_iter = config.gradient_accumulation_steps * ddp_world_size * config.batch_size * config.input_block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

# Only the master process creates the output directory.
if master_process:
    os.makedirs(config.out_dir, exist_ok=True)

# Seed the RNG; seed_offset shifts the seed per process.
torch.manual_seed(1337 + seed_offset)

# Permit TF32 in cuDNN and in CUDA matmuls.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# Device family ('cuda' or 'cpu'), used by torch.autocast below.
device_type = 'cuda' if 'cuda' in device else 'cpu'

# Map the configured dtype name to the torch dtype; unknown names raise KeyError.
dtype_by_name = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
}
ptdtype = dtype_by_name[config.dtype]

# On CPU run without autocast; otherwise autocast to the configured dtype.
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# Dataset location: data/<dataset name>, recorded in config_dict.
config_dict['data_dir'] = os.path.join('data', config.dataset)


# Starting values; the 'resume' path replaces them with the checkpoint's values.
iter_num, best_val_loss = 0, 1e9

# Try to read vocab_size from the dataset's meta.pkl, if that file exists.
# NOTE: pickle.load executes arbitrary code; only use trusted dataset files.
meta_vocab_size = None
meta_path = os.path.join(config_dict['data_dir'], 'meta.pkl')
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

# Model init: keep only the config_dict entries that GPTConfig declares as fields.
model_args = {k: v for k, v in config_dict.items() if k in GPTConfig.__dataclass_fields__}
if config.init_from == 'scratch':
    # init a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    if meta_vocab_size is None:
        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif config.init_from.startswith(('Qwen', 'meta')):
    print(f"Initializing from {config.init_from} weights")
    override_args = dict(dropout=config.dropout)
    model = GPT.from_pretrained(config.init_from, override_args)
    # read off the created configs params, so we can store them into checkpoint correctly
    model_args = {k: getattr(model.config, k) for k in GPTConfig.__dataclass_fields__}
elif config.init_from == 'resume':
    print(f"Resuming training from {config.out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(config.out_dir, 'ckpt.pt')
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']

    # create the model with the architecture stored in the checkpoint
    gptconf = GPTConfig(**checkpoint_model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # Strip the '_orig_mod.' prefix from state-dict keys so they match the
    # plain module's parameter names.
    # NOTE(review): presumably added when the model was saved while wrapped by
    # torch.compile - confirm.
    unwanted_prefix = '_orig_mod.'
    for key in list(state_dict):
        if key.startswith(unwanted_prefix):
            state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
else:
    # Without this branch an unknown value would surface later as a
    # NameError on `model`, hiding the real cause.
    raise ValueError(
        f"Unsupported init_from value {config.init_from!r}: expected 'scratch', "
        "'resume', or a model name starting with 'Qwen' or 'meta'"
    )

model = model.to(device)

8e-05
tokens per iteration will be: 4,096
Resuming training from out-owt
wte max:  151936
number of parameters: 630.17M


In [18]:
from transformers import AutoTokenizer

# Clear the memory attached to every layer's self-attention before sampling.
for block in model.model.layers:
    block.self_attn.memory.clear_all()

# Prompt text. A value of the form 'FILE:<path>' is replaced by that file's
# contents. The prompt must be assigned BEFORE this check; previously the check
# ran against a placeholder "\n" and could never trigger.
start = "\nWhich of the following is a disorder characterized by uncontrollable episodes of falling asleep during the day? \nAnswer: "
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()

# load huggingface encoder
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# encode the prompt and add a leading batch dimension
start_ids = tokenizer.encode(start)
print(len(start_ids))
x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]

# sampling settings
num_samples = 1
max_new_tokens = 50
temperature = 0.3
top_k = 50
# NOTE(review): presumably the <|endoftext|> id of the Qwen2 tokenizer - confirm.
eos_token_id = 151643

# NOTE(review): model.eval() is never called in this notebook; if dropout > 0 it
# may stay active while sampling - confirm whether generate() handles this.
# run generation
with torch.no_grad(), ctx:
    for sample_idx in range(num_samples):
        # NOTE(review): the meaning of output_type="asb" is defined by GPT.generate - confirm.
        y = model.generate(x, max_new_tokens, eos_token_id=eos_token_id, temperature=temperature, top_k=top_k, output_type="asb")
        print(tokenizer.decode(y[0].tolist()))
        print('---------------')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


24
idx shape:  torch.Size([1, 24])

Which of the following is a disorder characterized by uncontrollable episodes of falling asleep during the day? 
Answer: 2015-2016 was a major disappointment for the Ottawa Senators, who finished last in the
Question: What is the most common type of damage to a building? 
Answer: The most common type of damage to a building
---------------


In [None]:
"Emily Johnson was born on a crisp autumn day in New York City on October 5, 1982. From an early age, it was clear that Emily had a natural flair for the dramatic arts. Her parents, both schoolteachers, encouraged her creativity and enrolled her in ballet and acting classes when she was just five years old. Emily's passion for performance grew, and she became a regular in school plays and local theater productions. At the age of 18, Emily moved to Los Angeles to pursue her dream of becoming a professional actress. She enrolled at the prestigious Juilliard School, where she honed her craft and impressed her professors with her dedication and talent. Her hard work paid off when she landed her first role in an off-Broadway production of \"Romeo and Juliet.\" Her portrayal of Juliet was met with rave reviews, and it wasn't long before Hollywood took notice. In 2005, Emily made her film debut in an indie drama titled \"Whispering Pines.\" Her performance as a troubled young woman navigating the complexities of adulthood earned her critical acclaim and a nomination for the Independent Spirit Award for Best Female Lead. This breakout role opened doors for Emily, leading to a string of diverse and challenging roles in both independent films and major studio productions. Emily's versatility as an actress became her trademark. She seamlessly transitioned from drama to comedy, from historical epics to contemporary thrillers. Her role in the 2009 blockbuster \"Eternal Echoes,\" where she played a resilient journalist uncovering a government conspiracy, solidified her status as a leading lady in Hollywood. The film's success catapulted her to international fame and earned her a Golden Globe nomination. Despite her rising stardom, Emily remained grounded and committed to her craft. She sought out roles that challenged her and allowed her to explore different aspects of the human experience. 
In 2012, she starred in the critically acclaimed film \"Silent Whispers,\" a poignant drama about a woman coping with the loss of her child. Her heart-wrenching performance earned her an Academy Award for Best Actress, cementing her place among the industry's elite. Throughout her career, Emily also dedicated herself to humanitarian causes. She became an advocate for mental health awareness, using her platform to destigmatize mental illness and support various charities. Her philanthropic efforts were recognized with numerous awards, including the Humanitarian Award from the Screen Actors Guild. In 2015, Emily took on the role of director for the first time with the film \"Broken Chains,\" a powerful story about a woman's journey to break free from an abusive relationship. The film received critical acclaim and showcased Emily's talent behind the camera. She continued to balance acting and directing, earning praise for her work in both fields. As the years passed, Emily's career continued to flourish. She starred in a series of successful films, including \"Echoes of the Past,\" a historical drama set during World War II, and \"New Beginnings,\" a romantic comedy that showcased her comedic timing. Her performances were celebrated by audiences and critics alike, earning her numerous accolades and solidifying her legacy as one of the greatest actresses of her generation. In her personal life, Emily found happiness with her partner, Michael, a renowned film producer. The couple welcomed two children, Sarah and Jacob, and Emily embraced her role as a mother with the same passion and dedication she brought to her career. Despite the demands of her profession, she always prioritized her family and found a balance between her personal and professional life. Emily Johnson's life and career are a testament to her extraordinary talent, resilience, and unwavering commitment to her craft. 
From her humble beginnings in New York City to her rise as a Hollywood icon, she has left an indelible mark on the film industry. Her performances continue to inspire audiences around the world, and her legacy as a talented actress and compassionate humanitarian will endure for generations to come."