# 加载包和处理数据
1. 只需要保证数据都含有content这一列
2. 使用`glob`包，获得所有的数据的路径
3. 使用`random`包，从所有的文件路径中，随机找50个数据路径，作为训练集合的使用

In [14]:
from datasets import load_dataset, DatasetDict
from glob import glob
import random
random.seed(42)

all_file_list = glob(pathname="gpt2_data/gpt2_data/*/**")
test_file_list = random.sample(all_file_list, 2)
remaining_files = [i for i in all_file_list if i not in test_file_list]
train_file_list = random.sample(remaining_files, 5)

len(train_file_list), len(test_file_list)

(5, 2)

# 创建数据
1. 只要将路径放到一个字典里面。dict的key分别为`train`、`valid`，他们对应的值就是文件路径列表即可

In [15]:
raw_datasets =load_dataset("csv",data_files={'train':train_file_list,'valid':test_file_list}, cache_dir="cache_data")

raw_datasets

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'file_path'],
        num_rows: 5000
    })
    valid: Dataset({
        features: ['content', 'file_path'],
        num_rows: 3691
    })
})

# Tokenizer
1. Tokenizer 是最关键的一步，因为我们处理的是中文，因此使用`bert_base_chinese`就足够了
2. 如果你的语料里面有别的语言，你也可以使用多语言。这个都无所谓的。只要保证你使用的Tokenizer能覆盖你的数据即可
3. `context_length = 512`设置你的每一个文本的最长长度，我这里设置的是512，如果你的显卡显寸小，那你可以改小一点，比如128。但是多出来的数据，并不是说直接截断不要了，而是按照`context_length`长度，不断的对文本进行截断，大概就像是下面这样的：

<img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter7/chunking_texts.svg"/>


4. 对于`gpt2`模型，需要告诉模型一句话从哪里开始，从哪里结束。因此我们需要设置`bos_token`、`eos_token`、`unk_token`



In [16]:
from transformers import AutoTokenizer, AutoConfig

context_length = 512
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 2
Input chunk lengths: [27, 480]
Chunk mapping: [0, 1]


In [17]:
tokenizer.add_special_tokens(special_tokens_dict={'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>'})

1

In [18]:
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3691 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 6426
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 1996
    })
})

In [19]:
tokenizer.bos_token_id

21128

In [20]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer), 
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_embd=256,  # 减少嵌入的大小
    n_layer=8,  # 减少层的数量
    n_head=8,  # 减少注意力头的数量
)

In [21]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 12.0M parameters


In [22]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [23]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 512])
attention_mask shape: torch.Size([5, 512])
labels shape: torch.Size([5, 512])


In [24]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="chinese_gpt2_big",
    per_device_train_batch_size=20,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=2_000,
    logging_steps=2_000,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=2_000,
    push_to_hub=False,
    report_to="tensorboard",  # 添加这一行以启用TensorBoard
    logging_dir="logs",  # 添加这一行以指定日志目录
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.119, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [25]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=80, training_loss=9.539501190185547, metrics={'train_runtime': 2805.9282, 'train_samples_per_second': 4.58, 'train_steps_per_second': 0.029, 'total_flos': 248185397182464.0, 'train_loss': 9.539501190185547, 'epoch': 1.99})