In [6]:
# 模型配置
from transformers import AutoConfig
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from accelerate import Accelerator

def round_up_intermediate_size(hidden_size: int, multiple_of: int = 128) -> int:
    """Return ``8/3 * hidden_size`` rounded UP to a multiple of ``multiple_of``.

    Uses integer ceil-division so a value that is already an exact multiple is
    kept as-is (the previous ``int(x / 128) + 1`` form always added one extra
    block in that case, e.g. hidden_size=96 gave 384 instead of 256).

    Args:
        hidden_size: Model hidden dimension.
        multiple_of: Granularity the MLP width is rounded up to.

    Returns:
        The MLP intermediate width as an int.
    """
    numerator = 8 * hidden_size
    denominator = 3 * multiple_of
    # -(-a // b) is ceil(a / b) in pure integer arithmetic (no float rounding).
    return -(-numerator // denominator) * multiple_of


hidden_size = 128

# For hidden_size=128 this is 384, same as before.
intermediate_size = round_up_intermediate_size(hidden_size)


# Small Llama-type configuration built from the sizes defined above.
config = AutoConfig.for_model(
    model_type="llama",
    # Vocabulary / context length
    vocab_size=2048,
    max_position_embeddings=512,
    # Widths
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    # Depth and attention heads (4 key/value heads for 8 attention heads)
    num_hidden_layers=2,
    num_attention_heads=8,
    num_key_value_heads=4,
    # Share the input embedding matrix with the output projection
    tie_word_embeddings=True,
)

print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 384,
  "max_position_embeddings": 512,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 8,
  "num_hidden_layers": 2,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": true,
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 2048
}



In [2]:
# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer is loaded from 'story_tokenizer_2048'.
tokenizer = AutoTokenizer.from_pretrained('story_tokenizer_2048')

print(tokenizer)

LlamaTokenizerFast(name_or_path='story_tokenizer_2048', vocab_size=2048, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|start_story|>', 'eos_token': '<|end_story|>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<|start_story|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("<|end_story|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [7]:
from transformers import Trainer
from transformers import TrainingArguments

# Arguments for the single (non-sweep) training run.
training_args = TrainingArguments(
    # Output / checkpointing
    output_dir='saves',
    overwrite_output_dir=True,
    save_steps=10000000,
    # Training only; eval_steps is set but do_eval is False
    do_train=True,
    do_eval=False,
    eval_steps=1000,
    # Batch size and duration
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,
    num_train_epochs=2,
    # Learning-rate schedule
    learning_rate=5e-4,
    lr_scheduler_type='cosine',
    warmup_steps=800,
    # Precision
    bf16=True,
    # Logging
    logging_steps=25,
    report_to="wandb",
    # Reproducibility
    seed=3407,
)
def init_model():
    """Build a freshly initialised causal LM from the notebook-level ``config``.

    Reads the module-level ``config`` and ``device``. The model is created in
    float32, moved to ``device``, re-initialised (see ``kaiming_initialization``)
    and a per-parameter size table is printed.

    Returns:
        The initialised model, already placed on ``device``.
    """
    model = AutoModelForCausalLM.from_config(
        config,
        torch_dtype=torch.float32   # full-precision weights
    ).to(device)                    # move to the selected device

    def kaiming_initialization(model):
        """Kaiming-uniform init for weights with more than one dim; zero biases.

        One-dimensional ``weight`` tensors match neither branch and are left
        at their default initial values.
        """
        for name, param in model.named_parameters():
            if 'weight' in name and param.dim() > 1:
                torch.nn.init.kaiming_uniform_(param, mode='fan_in', nonlinearity='leaky_relu')
            elif 'bias' in name:
                # Biases start at zero.
                torch.nn.init.constant_(param, 0)

    kaiming_initialization(model)

    def print_model_parameters(model):
        """Print name, shape and element count per parameter, then the total."""
        print("Layer Name & Parameters")
        print("----------------------------")
        total_params = 0
        for name, parameter in model.named_parameters():
            param_size = parameter.size()
            # numel() gives the element count directly; no temporary tensor needed.
            param_count = parameter.numel()
            total_params += param_count
            print(f"{name:50} | Size: {str(param_size):30} | Count: {str(param_count):20}")
        print("----------------------------")
        print(f"Total Parameters: {total_params} ({total_params / 1000000:.1f} M)")

    print_model_parameters(model)
    return model
# Create the model instance used by the cells below.
model = init_model()

Layer Name & Parameters
----------------------------
model.embed_tokens.weight                          | Size: torch.Size([2048, 128])        | Count: 262144              
model.layers.0.self_attn.q_proj.weight             | Size: torch.Size([128, 128])         | Count: 16384               
model.layers.0.self_attn.k_proj.weight             | Size: torch.Size([64, 128])          | Count: 8192                
model.layers.0.self_attn.v_proj.weight             | Size: torch.Size([64, 128])          | Count: 8192                
model.layers.0.self_attn.o_proj.weight             | Size: torch.Size([128, 128])         | Count: 16384               
model.layers.0.mlp.gate_proj.weight                | Size: torch.Size([384, 128])         | Count: 49152               
model.layers.0.mlp.up_proj.weight                  | Size: torch.Size([384, 128])         | Count: 49152               
model.layers.0.mlp.down_proj.weight                | Size: torch.Size([128, 384])         | Count: 49152   

In [None]:
def inference(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    input_text: str = "Once upon a time, ",
    max_new_tokens: int = 16
):
    """Sample a continuation of ``input_text`` and print the decoded result.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        input_text: Prompt string.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        None. The decoded text (special tokens stripped) is printed.
    """
    # Place the prompt on the device the given model actually lives on, instead
    # of relying on the notebook-level `device` global matching it.
    # NOTE(review): the tokenizer call uses its default special-token handling;
    # if it prepends a BOS token, prompts that already start with
    # "<|start_story|>" would carry it twice -- confirm.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        # Stochastic decoding: top-k / nucleus sampling with temperature.
        do_sample=True,
        top_k=40,
        top_p=0.95,
        temperature=0.8
    )
    # Only the first (and only) sequence of the batch is decoded.
    generated_text = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )
    print(generated_text)

# 测试推理

# Prepare data

In [None]:
def init_model():
    """Create, initialise and summarise a new model from the global ``config``.

    NOTE: this cell redefines ``init_model``; an identical-purpose definition
    exists earlier in the notebook and is shadowed once this cell runs. Keep
    the two in sync or delete one of them.

    Reads the module-level ``config`` and ``device``.

    Returns:
        The initialised model on ``device``.
    """
    model = AutoModelForCausalLM.from_config(
        config,
        torch_dtype=torch.float32,  # full-precision weights
    )
    model = model.to(device)

    # Kaiming-uniform for weights with more than one dim, zeros for biases.
    # One-dimensional weights match neither branch and keep their defaults.
    for name, param in model.named_parameters():
        if 'weight' in name and param.dim() > 1:
            torch.nn.init.kaiming_uniform_(param, mode='fan_in', nonlinearity='leaky_relu')
        elif 'bias' in name:
            torch.nn.init.constant_(param, 0)

    # Parameter table: name, shape, element count, then the grand total.
    separator = "----------------------------"
    print("Layer Name & Parameters")
    print(separator)
    total_params = 0
    for name, parameter in model.named_parameters():
        param_size = parameter.size()
        # numel() returns the element count without building a temporary tensor.
        param_count = parameter.numel()
        total_params += param_count
        print(f"{name:50} | Size: {str(param_size):30} | Count: {str(param_count):20}")
    print(separator)
    print(f"Total Parameters: {total_params} ({total_params / 1000000:.1f} M)")

    return model

In [None]:
# 加载数据集
from datasets import load_dataset

# Dataset identifier; can be swapped for a local folder path.
dataset_name_or_path = "TinyStoriesV2_SpecialTokens"

# Only the first 10% of the train split is loaded. To use all of it:
# ds_train = load_dataset(dataset_name_or_path, split='train')
ds_train = load_dataset(
    path=dataset_name_or_path,
    split='train[:10%]',
)

print(ds_train)

In [None]:
# Quick look at the first two raw examples.
print(ds_train[0:2])

In [None]:
from typing import Dict, List

def process_func(examples: Dict[str, List]) -> Dict[str, List]:
    """Tokenize a batch of texts into fixed-budget causal-LM examples.

    Each text in ``examples['text']`` is encoded without automatically added
    special tokens, cut down to its LAST ``max_token - 1`` token ids, and
    terminated with the tokenizer's EOS id, so no sequence exceeds 512 ids.
    The attention mask is all ones (no padding is added here).

    The tokenizer is imported and loaded inside the function, i.e. once per
    call.

    Args:
        examples: Batch dict with a ``'text'`` list.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` lists, one entry
        per input text.
    """
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('story_tokenizer_2048')
    max_token = 512                 # hard cap on ids per example, EOS included
    body_budget = max_token - 1     # room left once EOS is reserved
    eos_id = tok.eos_token_id

    batch_ids = tok(examples['text'], add_special_tokens=False)['input_ids']

    # Keep the tail of each sequence, then close it with EOS.
    sequences = [ids[-body_budget:] + [eos_id] for ids in batch_ids]
    masks = [[1] * len(seq) for seq in sequences]

    return {
        "input_ids": sequences,
        "attention_mask": masks
    }

In [None]:
num_proc = 16   # number of worker processes used by Dataset.map

# Seed the shuffle so the example order is reproducible across kernel
# restarts; 3407 is the same seed passed to TrainingArguments.
ds_train = ds_train.shuffle(seed=3407)

# Tokenize in batches and drop the original columns so only the model inputs
# produced by process_func remain.
ds_train = ds_train.map(
    process_func,
    batched=True,
    num_proc=num_proc,
    remove_columns=ds_train.column_names,
    desc='Running tokenizer on train_set: '
)

print(ds_train[0])

In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False: collate for causal language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# 正式开始训练

In [None]:

import wandb
# Open a W&B run before training; training_args reports to "wandb".
wandb.init()

# Single training run on the tokenized dataset with the model built above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Sample up to 256 new tokens from the trained model for a fixed story prompt.
inference(
    model,
    tokenizer,
    input_text="<|start_story|>Once upon a time, there was a little boy named Tom. Tom ",
    max_new_tokens=256,
)

In [None]:





def train_function(config=None):
    """Train one model for a single sweep trial.

    Opens a W&B run, reads ``batch_size``, ``learning_rate`` and
    ``warmup_steps`` from ``wandb.config``, builds a fresh model with
    ``init_model()`` and trains it for one epoch on the notebook-level
    ``ds_train`` using the notebook-level ``tokenizer`` and ``data_collator``.

    Args:
        config: Optional config forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        # Hyperparameters for this trial, as exposed by W&B.
        hparams = wandb.config

        trial_args = TrainingArguments(
            output_dir='saves',                     # checkpoints and intermediate files
            overwrite_output_dir=True,              # reuse output_dir between trials
            do_train=True,
            do_eval=False,                          # no evaluation; eval_steps is set but unused
            eval_steps=1000,
            per_device_train_batch_size=hparams.batch_size,
            gradient_accumulation_steps=1,          # no gradient accumulation
            learning_rate=hparams.learning_rate,
            lr_scheduler_type='cosine',
            bf16=True,
            logging_steps=50,                       # log every 50 steps
            report_to="wandb",
            num_train_epochs=1,                     # one epoch per trial
            save_steps=100000,                      # checkpoint interval in steps
            seed=3407,
            warmup_steps=hparams.warmup_steps,
        )

        trial_trainer = Trainer(
            model=init_model(),
            args=trial_args,
            train_dataset=ds_train,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        trial_trainer.train()

In [None]:
import os

# Close any run still open from an earlier cell instead of starting another
# one here: a bare wandb.init() at this point only creates an extra run that
# is not part of the sweep. Each sweep trial opens its own run inside
# train_function.
if wandb.run is not None:
    wandb.finish()

sweep_config = {
    "method": "bayes",  # Bayesian optimisation over the search space
    "metric": {
        "name": "train/loss",  # metric to optimise: training loss
        "goal": "minimize",    # lower is better
    },
    "parameters": {
        "learning_rate": {
            "min": 5e-5,
            "max": 1e-3,
            "distribution": "log_uniform_values",  # log-uniform between min and max
        },
        "batch_size": {"values": [16, 32, 64]},
        "warmup_steps": {"min": 0, "max": 100}
    },
}
os.environ["WANDB_TIMEOUT"] = "60"
sweep_id = wandb.sweep(sweep_config, project="tinystories-lm-1")
# Run up to 100 trials in this process, each one calling train_function.
wandb.agent(sweep_id, train_function, count=100)

In [None]:
# Save the notebook-level `model` to the directory named by model_path.
model_path = '111'
model.save_pretrained(save_directory=model_path)