In [None]:
# 这份代码修改自仓库： https://github.com/timinar/BabyLlama

# 训练教师模型GPT-2 和 Llama

> 论文中写到："The GPT-2 model has 24 layers, 16 attention heads, an embedding dimension of 1536, intermediate size of 6144, and maximum sequence length of 128, resulting in 705M parameters. It was trained for 6 epochs with a batch size of 256 and maximum learning rate3 of 2.5 · 10−4. The LLaMA model has 24 layers, 8 attention heads, a hidden size of 1024, intermediate size of 3072, and maximum sequence length of 256, resulting in 360M parameters. It was trained for 4 epochs with a batch size of 128 and maximum learning rate of 3 · 10−4."

In [2]:
# 准备数据
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2TokenizerFast
from babylm_dataset import BabylmDataset
from random import sample, seed
from torch.utils.data import Subset

data_train_path = "F:/llm-deploy-data/data/Babyllama/babylm_10M_clean"
data_eval_path = "F:/llm-deploy-data/data/Babyllama/babylm_dev_clean"
tokenizer_path = "F:/llm-deploy-data/data/Babyllama/models/gpt-clean-16000.json"

SEQ_LENGTH = 128
tokenizer = GPT2TokenizerFast(tokenizer_file= str(tokenizer_path))
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.pad_token = "<pad>"
tokenizer.model_max_length = SEQ_LENGTH

train_dataset = BabylmDataset(data_train_path, SEQ_LENGTH, tokenizer=tokenizer, random_chunk=True)
full_eval_dataset = BabylmDataset(data_eval_path, SEQ_LENGTH, tokenizer=tokenizer, offset=0)

seed(2024) # we fix the same subset for all models
eval_indices = sample(range(len(full_eval_dataset)), 200)
eval_dataset = Subset(full_eval_dataset, eval_indices)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1282184 > 128). Running this sequence through the model will result in indexing errors


🔥 F:\llm-deploy-data\data\Babyllama\babylm_10M_clean\bnc_spoken.train len: 1282184
🔥 F:\llm-deploy-data\data\Babyllama\babylm_10M_clean\childes.train len: 6301883
🔥 F:\llm-deploy-data\data\Babyllama\babylm_10M_clean\gutenberg.train len: 3482661
🔥 F:\llm-deploy-data\data\Babyllama\babylm_10M_clean\open_subtitles.train len: 3394685
🔥 F:\llm-deploy-data\data\Babyllama\babylm_10M_clean\simple_wiki.train len: 2202434
🔥 F:\llm-deploy-data\data\Babyllama\babylm_10M_clean\switchboard.train len: 249559
Saving data to F:\llm-deploy-data\data\Babyllama\babylm_10M_clean\tokenized_GPT2TokenizerFast_16000.pt
🔥 F:\llm-deploy-data\data\Babyllama\babylm_dev_clean\bnc_spoken.dev len: 1749792
🔥 F:\llm-deploy-data\data\Babyllama\babylm_dev_clean\childes.dev len: 5927646
🔥 F:\llm-deploy-data\data\Babyllama\babylm_dev_clean\gutenberg.dev len: 3896232
🔥 F:\llm-deploy-data\data\Babyllama\babylm_dev_clean\open_subtitles.dev len: 3466050
🔥 F:\llm-deploy-data\data\Babyllama\babylm_dev_clean\simple_wiki.dev len: 

In [7]:
# 训练GPT2模型
from transformers import (
    GPT2Config, GPT2LMHeadModel, 
)
from transformers import Trainer, TrainingArguments
model_config = GPT2Config(
        vocab_size=tokenizer.vocab_size,
        n_positions=2*tokenizer.model_max_length,
        n_embd=1536,
        n_layer=24,
        n_head=16,
        pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
    )
model = GPT2LMHeadModel(model_config)

output_dir = "./gpt2-teacher"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    num_train_epochs=6,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=256,
    save_total_limit=1,  # Set to zero to avoid saving
    warmup_steps=300, 
    lr_scheduler_type="cosine",
    learning_rate=float(2.5e-4),
    logging_steps=20,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    torch_compile = False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)



  0%|          | 0/1548 [00:00<?, ?it/s]

: 

In [None]:
# 训练Llama模型
from transformers import (
    LlamaConfig, LlamaForCausalLM,  
)
from transformers import Trainer, TrainingArguments
model_config = LlamaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=2*tokenizer.model_max_length,
        hidden_size=1024,
        intermediate_size=3072,
        num_hidden_layers=24,
        num_attention_heads=8,
        tie_word_embeddings=False,
        pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
    )
model = LlamaForCausalLM(model_config)

output_dir = "./gpt2-teacher"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    num_train_epochs=4,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=128,
    save_total_limit=1,  # Set to zero to avoid saving
    warmup_steps=300, 
    lr_scheduler_type="cosine",
    learning_rate=float(3e-4),
    logging_steps=20,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    torch_compile = False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)