# 因果语言模型训练实例

## Step1 导入相关包

In [1]:
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BloomForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

## Step2 加载数据集

In [2]:
# Load the dataset that was previously saved to the local "./wiki_cn_filtered/" directory.
ds = Dataset.load_from_disk("./wiki_cn_filtered/")

In [3]:
# Show the dataset summary (column names and number of rows).
ds

Dataset({
    features: ['source', 'completion'],
    num_rows: 10000
})

In [4]:
# Inspect the first record; the raw text to train on is in the "completion" column.
ds[0]

{'source': 'wikipedia.zh2307',
 'completion': "西安交通大学博物馆（Xi'an Jiaotong University Museum）是一座位于西安交通大学的博物馆，馆长是锺明善。\n历史\n2004年9月20日开始筹建，2013年4月8日正式建成开馆，位于西安交通大学兴庆校区陕西省西安市咸宁西路28号。建筑面积6,800平米，展厅面积4,500平米，馆藏文物4,900余件。包括历代艺术文物馆、碑石书法馆、西部农民画馆、邢良坤陶瓷艺术馆、陕西秦腔博物馆和书画展厅共五馆一厅。\n营业时间\n* 周一至周六：上午九点至十二点，下午一点至五点\n* 周日闭馆"}

## Step3 数据集处理

In [6]:
# Load the tokenizer published with the "Langboat/bloom-389m-zh" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Langboat/bloom-389m-zh")

def process_func(examples, max_length=384):
    """Tokenize a batch of texts for causal language-model training.

    Args:
        examples: Batch dict as supplied by ``Dataset.map(batched=True)``.
            Only the "completion" column (a list of strings) is read.
        max_length: Maximum number of tokens kept per sample; longer samples
            are truncated. Defaults to 384, the value this notebook uses.

    Returns:
        The tokenizer output for the whole batch.
    """
    # Append the end-of-sequence token to the end of every text sample.
    # NOTE(review): samples that get truncated to `max_length` presumably lose
    # this trailing EOS token -- confirm that is acceptable for training.
    contents = [text + tokenizer.eos_token for text in examples["completion"]]
    return tokenizer(contents, max_length=max_length, truncation=True)

In [7]:
# Tokenize the dataset batch by batch and drop the original columns
# ("source", "completion"), so only the tokenizer outputs remain.
tokenized_ds = ds.map(process_func, batched=True, remove_columns=ds.column_names)
tokenized_ds

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 10000
})

In [13]:
# Batch the tokenized samples with the language-modeling data collator.
#
# Masked language modeling (mlm=True):
#   the collator masks some input tokens and the model predicts them using
#   context on both sides:
#     [今天, [MASK], 很] -> predict "天气"
#
# Causal language modeling (mlm=False), used here:
#   nothing is masked; the collator pads the batch and uses the inputs as the
#   labels, with padding positions ignored by the loss. The model attends
#   causally, so each position only sees the tokens to its left:
#     [今天]           -> predict "天气"
#     [今天, 天气]     -> predict "很"
#     [今天, 天气, 很] -> predict "好"
#
# Note: the `mlm` flag only selects how the collator builds labels. Whether
# attention is bidirectional or causal is a property of the model, not of
# the collator.
dl = DataLoader(tokenized_ds, batch_size=2, collate_fn=DataCollatorForLanguageModeling(tokenizer, mlm=False))



In [14]:
# Peek at the first collated batch as an (index, batch) pair to check how the
# samples were padded.
next(enumerate(dl))

(0,
 {'input_ids': tensor([[    3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
              3,     3

In [15]:
# Show the tokenizer's padding token and its id in the vocabulary.
tokenizer.pad_token, tokenizer.pad_token_id

('<pad>', 3)

In [18]:
# Show the tokenizer's end-of-sequence token and its id in the vocabulary.
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 2)

## Step4 创建模型

In [26]:
# Load the pretrained causal language model. The Trainer and pipeline cells
# further down use `model`, so this cell must finish successfully before they run.
model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-389m-zh")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   5%|5         | 83.9M/1.56G [00:00<?, ?B/s]

ChunkedEncodingError: ('Connection broken: IncompleteRead(4475246 bytes read, 1468636631 more expected)', IncompleteRead(4475246 bytes read, 1468636631 more expected))

## Step5 配置训练参数

In [20]:
# Training configuration. With 2 samples per device and 16 accumulation steps,
# each optimizer update covers 2 * 16 = 32 samples per device.
args = TrainingArguments(
    output_dir="./causal_lm",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    logging_steps=10,
    num_train_epochs=1,
    fp16=True  # NOTE(review): mixed precision presumably needs a CUDA device -- confirm
)

## Step6 创建训练器

In [23]:
# Build the trainer. `model` must already be defined by the model-loading cell.
# The collator is created with mlm=False, matching the causal-LM objective.
# NOTE(review): newer transformers releases reportedly rename `tokenizer=` to
# `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

NameError: name 'model' is not defined

## Step7 模型训练

In [None]:
# Run training with the trainer configured in the previous step.
trainer.train()

## Step8 模型推理

In [None]:
# Wrap the model and tokenizer in a text-generation pipeline for inference.
# NOTE(review): device=0 presumably selects the first GPU and would fail on a
# CPU-only machine -- confirm for the target environment.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

In [None]:
# Continue a prompt taken from the first training record. Sampling is enabled
# (do_sample=True), so the generated text is expected to vary between runs.
pipe("西安交通大学博物馆（Xi'an Jiaotong University Museum）是一座位于西安", max_length=128, do_sample=True)

In [None]:
# Continue a news-style prompt. Sampling is enabled (do_sample=True), so the
# generated text is expected to vary between runs.
pipe("下面是一则游戏新闻。小编报道，近日，游戏产业发展的非常", max_length=128, do_sample=True)

In [None]:

'''
两个文件的区别
# masked_lm.ipynb - 掩码语言模型
model = AutoModelForMaskedLM.from_pretrained("hfl/chinese-macbert-base")

# masked_lm: 使用MacBERT分词器
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")


# causal_lm.ipynb - 因果语言模型
model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-389m-zh")

# causal_lm: 使用BLOOM分词器
tokenizer = AutoTokenizer.from_pretrained("Langboat/bloom-389m-zh")


方面	        masked_lm.ipynb	        causal_lm.ipynb
模型类型	        掩码语言模型 (MLM)	因果语言模型 (CLM)
训练目标	        预测被掩码的词	    预测下一个词
注意力机制	        双向注意力	    单向注意力
适用任务	        文本理解、完形填空	    文本生成、续写
分词器	        chinese-macbert-base	bloom-389m-zh
数据处理	        直接分词	            添加EOS Token
DataCollator	mlm=True	        mlm=False
推理管道	        fill-mask	        text-generation

'''