In [1]:
# 这份代码修改自仓库： https://github.com/timinar/BabyLlama

# 训练教师模型GPT-2 和 Llama

> 论文中写到：
>
> "The GPT-2 model has 24 layers, 16 attention heads, an embedding dimension of 1536, intermediate size of 6144, and maximum sequence length of 128, resulting in 705M parameters. It was trained for 6 epochs with a batch size of 256 and maximum learning rate3 of 2.5 · 10−4. The LLaMA model has 24 layers, 8 attention heads, a hidden size of 1024, intermediate size of 3072, and maximum sequence length of 256, resulting in 360M parameters. It was trained for 4 epochs with a batch size of 128 and maximum learning rate of 3 · 10−4."

In [2]:
# 准备数据
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2TokenizerFast
from babylm_dataset import BabylmDataset
from random import sample, seed
from torch.utils.data import Subset

data_train_path = "./data/train_10M_clean"
data_eval_path = "./data/dev_clean"
tokenizer_path = "./models/gpt-clean-16000.json"

SEQ_LENGTH = 128
tokenizer = GPT2TokenizerFast(tokenizer_file= str(tokenizer_path))
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = "<pad>"
tokenizer.model_max_length = SEQ_LENGTH

# 进入BsbylmDataset类，可以在初始化函数中修改数据集大小
train_dataset = BabylmDataset(data_train_path, SEQ_LENGTH, tokenizer=tokenizer, random_chunk=True)
full_eval_dataset = BabylmDataset(data_eval_path, SEQ_LENGTH, tokenizer=tokenizer, offset=0)

seed(2024) # we fix the same subset for all models
eval_indices = sample(range(len(full_eval_dataset)), 200)
eval_dataset = Subset(full_eval_dataset, eval_indices)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

Loading data from data/train_10M_clean/tokenized_GPT2TokenizerFast_16000.pt
🔥 数据集总大小: 16912909
🔥 为了缩短训练时间，这里缩减为: 422822
Loading data from data/dev_clean/tokenized_GPT2TokenizerFast_16000.pt
🔥 数据集总大小: 17428872
🔥 为了缩短训练时间，这里缩减为: 435721


  self.data = torch.load(tokenized_file)


In [3]:
# 训练GPT2模型
from transformers import (
    GPT2Config, GPT2LMHeadModel, 
)
from transformers import Trainer, TrainingArguments
model_config = GPT2Config(
        vocab_size=tokenizer.vocab_size,
        n_positions=2*tokenizer.model_max_length,
        n_embd=1536,
        n_layer=24,
        n_head=16,
        pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
    )
model = GPT2LMHeadModel(model_config)

output_dir = "./models/gpt2-teacher"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    num_train_epochs=6,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=16,
    save_total_limit=1,  # Set to zero to avoid saving
    warmup_steps=300, 
    lr_scheduler_type="cosine",
    learning_rate=float(2.5e-4),
    logging_steps=20,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    torch_compile = False,
    no_cuda = True,   # we use CPU，显卡足够大的话可以改为False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)



  0%|          | 0/618 [00:00<?, ?it/s]

In [None]:
# 训练Llama模型
from transformers import (
    LlamaConfig, LlamaForCausalLM,  
)
from transformers import Trainer, TrainingArguments
model_config = LlamaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=2*tokenizer.model_max_length,
        hidden_size=1024,
        intermediate_size=3072,
        num_hidden_layers=24,
        num_attention_heads=8,
        tie_word_embeddings=False,
        pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
    )
model = LlamaForCausalLM(model_config)

output_dir = "./models/llama-teacher"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    num_train_epochs=4,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=16,
    save_total_limit=1,  # Set to zero to avoid saving
    warmup_steps=300, 
    lr_scheduler_type="cosine",
    learning_rate=float(3e-4),
    logging_steps=20,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    torch_compile = False,
    no_cuda=True   # we use CPU，显卡足够大的话可以改为False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)



  0%|          | 0/332 [00:00<?, ?it/s]

{'loss': 16.5414, 'grad_norm': 7.050642013549805, 'learning_rate': 1.9999999999999998e-05, 'epoch': 0.24}
{'loss': 10.4487, 'grad_norm': 4.373862266540527, 'learning_rate': 3.9999999999999996e-05, 'epoch': 0.48}
{'loss': 7.7475, 'grad_norm': 2.7600486278533936, 'learning_rate': 5.9999999999999995e-05, 'epoch': 0.72}
{'loss': 6.8205, 'grad_norm': 2.7919507026672363, 'learning_rate': 7.999999999999999e-05, 'epoch': 0.96}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 7.165042877197266, 'eval_runtime': 48.2064, 'eval_samples_per_second': 4.149, 'eval_steps_per_second': 0.519, 'epoch': 1.0}
{'loss': 6.1967, 'grad_norm': 2.381103515625, 'learning_rate': 9.999999999999999e-05, 'epoch': 1.2}
{'loss': 5.7663, 'grad_norm': 2.0445632934570312, 'learning_rate': 0.00011999999999999999, 'epoch': 1.45}
{'loss': 5.6016, 'grad_norm': 1.9865704774856567, 'learning_rate': 0.00014, 'epoch': 1.69}
{'loss': 5.4055, 'grad_norm': 2.2117745876312256, 'learning_rate': 0.00015999999999999999, 'epoch': 1.93}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 7.207029342651367, 'eval_runtime': 43.6859, 'eval_samples_per_second': 4.578, 'eval_steps_per_second': 0.572, 'epoch': 2.0}
{'loss': 5.113, 'grad_norm': 2.0187833309173584, 'learning_rate': 0.00017999999999999998, 'epoch': 2.17}
{'loss': 4.9987, 'grad_norm': 1.7442786693572998, 'learning_rate': 0.00019999999999999998, 'epoch': 2.41}
{'loss': 4.8864, 'grad_norm': 1.9827890396118164, 'learning_rate': 0.00021999999999999995, 'epoch': 2.65}
{'loss': 4.7585, 'grad_norm': 1.594044804573059, 'learning_rate': 0.00023999999999999998, 'epoch': 2.89}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 7.277788162231445, 'eval_runtime': 47.1181, 'eval_samples_per_second': 4.245, 'eval_steps_per_second': 0.531, 'epoch': 3.0}
{'loss': 4.554, 'grad_norm': 1.9135853052139282, 'learning_rate': 0.00026, 'epoch': 3.13}
{'loss': 4.4329, 'grad_norm': 1.7531003952026367, 'learning_rate': 0.00028, 'epoch': 3.37}
{'loss': 4.4757, 'grad_norm': 1.717421293258667, 'learning_rate': 0.0003, 'epoch': 3.61}
{'loss': 4.2759, 'grad_norm': 1.5224635601043701, 'learning_rate': 9.259748514523653e-05, 'epoch': 3.86}


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 7.135180473327637, 'eval_runtime': 44.22, 'eval_samples_per_second': 4.523, 'eval_steps_per_second': 0.565, 'epoch': 4.0}
{'train_runtime': 7478.7308, 'train_samples_per_second': 1.413, 'train_steps_per_second': 0.044, 'train_loss': 6.295483612152467, 'epoch': 4.0}


('./gpt2-teacher/tokenizer_config.json',
 './gpt2-teacher/special_tokens_map.json',
 './gpt2-teacher/vocab.json',
 './gpt2-teacher/merges.txt',
 './gpt2-teacher/added_tokens.json',
 './gpt2-teacher/tokenizer.json')