# Baichuan2-7B LoRA微调 ModelScope

使用了ModelScope平台 

fp16精度下LoRA微调，需要24GB的显卡

前期工作都准备好后，可以跳到 `1.2 编写Dataset` 运行

## 安装环境

In [None]:
!pip install -q accelerate
!pip install -q peft
!pip install -q datasets

## 0 Baichuan2-7B-Chat 预训练模型

In [None]:
from modelscope.hub.snapshot_download import snapshot_download

model_dir = snapshot_download('baichuan-inc/baichuan2-7B-Chat', cache_dir='baichuan2-7B-Chat', revision='v1.0.1')

In [None]:
# import sys
# sys.path.append("/mnt/workspace/baichuan-7B/baichuan-inc/baichuan-7B")

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

import torch

model_path = "baichuan2-7B-Chat/baichuan-inc/baichuan2-7B-Chat"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path,device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained(model_path)

# 用fp16加载，约占用15.3GB的显存

In [2]:
messages = []
messages.append({"role": "user", "content": "解释一下“温故而知新”"})
response = model.chat(tokenizer, messages)
print("Baichuan2: ", response)

Baichuan2:  "温故而知新"是一句中国古代的成语，出自《论语·为政》。这句话的意思是：通过回顾和了解过去的事情，可以从中获得新的知识和启示。这句话强调了学习和知识的重要性，以及通过不断回顾和总结过去的经验来提高自己的重要性。

在现代语境中，这句成语可以用来鼓励人们在学习过程中不断回顾和巩固已知的知识，以便更好地理解和掌握新的知识。同时，它也可以用来强调经验和教训对于个人成长和发展的重要性。


### 微调前的效果：

In [5]:
prompt_text = "你现在是一个信息抽取模型，请你帮我抽取出关系内容为\"性能故障\", \"部件故障\", \"组成\"和 \"检测工具\"的相关三元组，三元组内部用\"_\"连接，三元组之间用\\n分割。文本："

def get_prompt(text):
    return prompt_text + text

In [4]:
messages = []
messages.append({"role": "user", "content": get_prompt("故障现象：奔腾B70做PDI检查时车辆无法启动。")})
response = model.chat(tokenizer, messages)
print("Baichuan2: ", response)

Baichuan2:  性能故障_部件故障_组成_检测工具


## 1 加载训练数据

### 1.1 准备训练数据
首先，需要准备训练数据，需要将所有样本放到列表中并存入json文件中。每个样本对应一个字典，包含id和conversations，其中后者为一个列表。示例如下所示：

```JSON
[
  {
    "id": "identity_0",
    "conversations": [
      {
        "from": "user",
        "value": "你现在是一个信息抽取模型，请你帮我抽取出关系内容为\"性能故障\", \"部件故障\", \"组成\"和 \"检测工具\"的相关三元组，三元组内部用\"_\"连接，三元组之间用\\n分割。文本：\n故障现象：奔腾B70做PDI检查时车辆无法启动。",
      },
      {
        "from": "assistant",
        "value": "车辆_部件故障_无法启动"
      }
    ]
  }
]
```

In [19]:
import json
import random

def ftdata_process(ori_path, train_path, test_path):
    data = []
    with open(ori_path, 'r', encoding='utf-8') as fh:
        for i, line in enumerate(fh):
            sample = json.loads(line.strip())
            conversations = []
            text = sample["text"]
            new_entry = {
                "id": sample['ID'],
                "conversations": conversations
            }
            
            # 创建"user"输入
            user_input = {
                "from": "user",
                "value": f'你现在是一个信息抽取模型，请你帮我抽取出关系内容为"性能故障", "部件故障", "组成"和 "检测工具"的相关三元组，三元组内部用"_"连接，三元组之间用\\n分割。文本：\\n{text}'
            }
            conversations.append(user_input)
            
            # 创建"assistant"回应
            spo_list = []
            for spo in sample['spo_list']:
                spo_list.append('_'.join([spo['h']["name"], spo['relation'], spo['t']['name']]))
            assistant_response = {
                "from": "assistant",
                "value": "\\n".join(spo_list)
            }
            conversations.append(assistant_response)
            
            data.append(new_entry)

    # 随机抽取50条数据作为测试集
    test_set = random.sample(data, min(50, len(data)))
    train_set = [record for record in data if record not in test_set]

    with open(test_path, "w", encoding="utf-8") as f:
        json.dump(test_set, f, indent=4, ensure_ascii=False)

    with open(train_path, "w", encoding="utf-8") as f:
        json.dump(train_set, f, indent=4, ensure_ascii=False)


In [20]:
ori_path = "dataset/ori_data.json"
train_path = "dataset/train.json"
test_path = "dataset/test.json"
ftdata_process(ori_path, train_path, test_path)

### 1.2 编写Dataset

In [3]:
from typing import Optional, Dict
import json

import torch
from torch.utils.data import Dataset


def preprocess(
    sources, 
    tokenizer, 
    max_len, 
) -> Dict:
    user_tokens=[195]
    assistant_tokens=[196]
    ignore_index = -100

    input_ids, labels = [], []
    for i, source in enumerate(sources):
        input_id, label = [], []
        for j, sentence in enumerate(source):
            role = sentence["from"]
            value = sentence["value"]
            value_ids = tokenizer.encode(value)

            if role == "user":
                input_id += user_tokens + value_ids
                label += [tokenizer.eos_token_id] + [ignore_index] * len(value_ids)
            else:
                input_id += assistant_tokens + value_ids
                label += [ignore_index] + value_ids
        assert len(input_id) == len(label)
        input_id.append(tokenizer.eos_token_id)
        label.append(tokenizer.eos_token_id)
        input_id = input_id[:max_len]
        label = label[:max_len]
        
        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
        label += [ignore_index] * (max_len - len(label))
        
        input_ids.append(input_id)
        labels.append(label)

    input_ids = torch.tensor(input_ids, dtype=torch.int)
    labels = torch.tensor(labels, dtype=torch.int)
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    
    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }



class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""
    def __init__(self, raw_data, tokenizer, max_len):
        super(SupervisedDataset, self).__init__()
        
        print("Formatting inputs...")
        sources = [example["conversations"] for example in raw_data]
        data_dict = preprocess(sources, tokenizer, max_len)
        
        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
        self.attention_mask = data_dict["attention_mask"]
        
        print("input:", tokenizer.decode(self.input_ids[66]))
        labels = []
        for id_ in self.labels[66]:
            if id_ == -100:
                continue

            labels.append(id_)
        print("label:", tokenizer.decode(labels))
        
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(
            input_ids=self.input_ids[i],
            labels=self.labels[i],
            attention_mask=self.attention_mask[i],
        )

In [7]:
def make_supervised_data_module(
    tokenizer, cfg, max_len,
) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    dataset_cls = SupervisedDataset
    print("Loading data...")

    train_json = json.load(open(cfg.train_path, "r"))
    train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len)

    if cfg.eval_path:
        eval_json = json.load(open(cfg.eval_path, "r"))
        eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len)
    else:
        eval_dataset = None

    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)

## 2 定义模型

In [1]:
from transformers import TrainingArguments, Trainer
from argparse import Namespace
cfg = Namespace()

# model
cfg.model_name_or_path = 'baichuan2-7B-Chat/baichuan-inc/baichuan2-7B-Chat'
cfg.train_path = "dataset/train.json"
cfg.eval_path = None
cfg.test_path = "dataset/test.json"
cfg.model_max_length = 1024
cfg.batch_size = 2

training_args = TrainingArguments(
    "baichuan2-7B-chat_lora",
    evaluation_strategy = "no",
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=cfg.batch_size,
    weight_decay=0.01,
    optim="adamw_torch",
)

2023-09-21 23:13:02.770505: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-21 23:13:03.178651: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-09-21 23:13:07,303] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path,
    use_fast=False,
    trust_remote_code=True,
    model_max_length=cfg.model_max_length,
)

model = AutoModelForCausalLM.from_pretrained(
    cfg.model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["W_pack"],
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

model.supports_gradient_checkpointing = True  #节约cuda，但可能会使得训练时间变长
model.gradient_checkpointing_enable() # 作用同上
model.enable_input_require_grads() # 作用同上

model.config.use_cache = False

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 7,510,167,552 || trainable%: 0.055848341211549045


## 3 训练模型

In [12]:
# Load data
data_module = make_supervised_data_module(
    tokenizer=tokenizer, cfg=cfg, max_len=cfg.model_max_length
)

Loading data...
Formatting inputs...
input:  <reserved_106>你现在是一个信息抽取模型，请你帮我抽取出关系内容为"性能故障", "部件故障", "组成"和 "检测工具"的相关三元组，三元组内部用"_"连接，三元组之间用\n分割。文本：\n处理原则：检查监控系统告警信息，相关电流指示；检查各个采样系统的电流测量值有无异常；检查本体有无异常声响、有无异常振动；检查二次回路、电子模块装置和端子接线排有无放电打火、开路现象，查找开路点；二次回路开路，应尽快处理；如不能恢复，应立即汇报值班调控人员申请停运接地极或调整直流系统运行方式；查找零磁通电流互感器二次开路点时应注意安全，应穿绝缘靴，戴绝缘手套，至少两人一起。<reserved_107>二次回路_部件故障_开路</s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><

In [13]:
trainer = Trainer(
    model=model, args=training_args, tokenizer=tokenizer, **data_module
)

In [14]:
trainer.train()

Step,Training Loss
500,0.6309
1000,0.4166
1500,0.3754
2000,0.3311
2500,0.3033
3000,0.2946
3500,0.268


TrainOutput(global_step=3605, training_loss=0.3713475157252297, metrics={'train_runtime': 7668.3308, 'train_samples_per_second': 0.94, 'train_steps_per_second': 0.47, 'total_flos': 3.096653235172147e+17, 'train_loss': 0.3713475157252297, 'epoch': 5.0})

In [15]:
trainer.save_state()
trainer.save_model(output_dir="baichuan2-7B-chat_lora")

## 4 验证训练后结果



In [7]:
from peft import PeftModel

ckpt_path = 'baichuan2-7B-chat_lora/checkpoint-3500'

model = PeftModel.from_pretrained(model, ckpt_path)
model = model.merge_and_unload() #合并lora权重

In [8]:
messages = []
messages.append({"role": "user", "content": get_prompt("原因分析：细心的朋友会发现，润滑油在车况良好的情况下也存在正常的消耗，但有些车况较差的时候，汽车的尾气排出蓝烟，其实这就意味润滑油消耗过大，一般来说，润滑油的消耗无非两种情况，进入燃烧室参与燃烧，或是机油渗漏。之所以机油能够窜入燃烧室，主要是因为零部件严重磨损，配合间隙过大，或者机油压力过高，导致机油上窜进燃烧室。而机油的渗漏主要是因为密封垫变硬老化、气门卡死。如果是老旧车辆，一般都存在密封垫由于老化而密封不严的情况。遇到以上情况，您最好是通过专业的养护中心，由养护工程师进行判定，并实施行之有效的解决办法。")})
response = model.chat(tokenizer, messages)
print("Baichuan2: ", response)

Baichuan2:  机油_部件故障_泄漏


## 5 测试

In [14]:
import torch

def predict_one_sample(model, tokenizer, text):
    messages = []
    messages.append({"role": "user", "content": get_prompt(text)})
    with torch.no_grad():
        response = model.chat(tokenizer, messages)
    
    # 对结果按照“\n”进行分割，获取每个三元组内容
    pre_res = list(set([rr for rr in response.split('\\n') if len(rr.split("_"))==3]))

    return response, pre_res

In [15]:
import json
import os
from tqdm import tqdm

test_path = "dataset/test.json"
save_data = []
f1, total = 0.0, 0.0

test_json = json.load(open(cfg.test_path, "r"))
sources = [example["conversations"] for example in test_json]

for i, source in enumerate(tqdm(sources, desc="iter")):
    total += 1
    for j, sentence in enumerate(source):
        role = sentence["from"]
        value = sentence["value"]

        if role == "user":
            text = value
        else:
            answer = value
    
    response, pre_res = predict_one_sample(model, tokenizer, text)
    real_res = answer.split("\\n")
    
    # 计算预测与真实的F1值
    same_res = set(pre_res) & set(real_res)
    if len(set(pre_res)) == 0:
        p = 0.0
    else:
        p = len(same_res) / len(set(pre_res))
    r = len(same_res) / len(set(real_res))
    if (p + r) != 0.0:
        f = 2 * p * r / (p + r)
    else:
        f = 0.0
    f1 += f 
    save_data.append(
        {"text": text, "ori_answer": answer, "gen_answer": response, "f1": f})
    
print("f1:", f1 / total)
save_path = os.path.join('baichuan2-7B-chat_lora', "ft_answer.json")
fin = open(save_path, "w", encoding="utf-8")
json.dump(save_data, fin, ensure_ascii=False, indent=4)
fin.close()

iter: 100%|██████████| 50/50 [00:59<00:00,  1.19s/it]

f1: 0.4785412172368694



