# Baichuan2-13B QLoRA微调 ModelScope

使用了ModelScope平台 

QLoRA微调：模型本身用4bit加载，训练时把数值反量化到bf16后进行训练，利用LoRA可以锁定原模型参数不参与训练，只训练少量LoRA参数的特性使得训练所需的显存大大减少。

前期工作都准备好后，可以跳到 `1.2 编写Dataset` 运行

## 安装环境

In [None]:
!pip install -q accelerate
!pip install -q peft
!pip install -q datasets

## 0 Baichuan2-13B-Chat 4bits量化模型

In [None]:
from modelscope.hub.snapshot_download import snapshot_download

model_dir = snapshot_download('baichuan-inc/baichuan2-13B-Chat', cache_dir='baichuan2-13B-Chat', revision='v1.0.2')

# model_dir = snapshot_download('baichuan-inc/baichuan2-13B-Chat-4bits', cache_dir='baichuan2-13B-Chat-4bits', revision='v1.0.2')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

import torch

model_path = "baichuan2-13B-Chat-4bits/baichuan-inc/baichuan2-13B-Chat-4bits"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path,device_map="auto", trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained(model_path)

# 约占用9.7GB的显存

In [3]:
messages = []
messages.append({"role": "user", "content": "解释一下“温故而知新”"})
response = model.chat(tokenizer, messages)
print("Baichuan2: ", response)

# 推理约占用13GB显存

Baichuan2:  “温故而知新”是一句源自《论语·为政》的古文名言，它意味着通过回顾过去的学习和经验，从而获得新的理解和领悟。这句话鼓励我们在学习过程中不断地复习和巩固已学知识，同时尝试从中发现新的观点和想法。

具体来说，“温故”是指回顾过去的知识和经验，而“知新”则是指在回顾的过程中发现新的理解和启示。这句名言鼓励我们既要珍惜过去的成果，也要勇于探索未知的新领域，从而实现持续的成长和发展。


### 微调前的效果

In [4]:
prompt_text = "你现在是一个信息抽取模型，请你帮我抽取出关系内容为\"性能故障\", \"部件故障\", \"组成\"和 \"检测工具\"的相关三元组，三元组内部用\"_\"连接，三元组之间用\\n分割。文本："

def get_prompt(text):
    return prompt_text + text

In [5]:
messages = []
messages.append({"role": "user", "content": get_prompt("故障现象：奔腾B70做PDI检查时车辆无法启动。")})
response = model.chat(tokenizer, messages)
print("Baichuan2: ", response)

Baichuan2:  性能故障_奔腾B70; 部件故障_PDI检查; 组成_无法启动


## 1 加载训练数据

### 1.1 准备训练数据
首先，需要准备训练数据，需要将所有样本放到列表中并存入json文件中。每个样本对应一个字典，包含id和conversations，其中后者为一个列表。示例如下所示：

```JSON
[
  {
    "id": "identity_0",
    "conversations": [
      {
        "from": "user",
        "value": "你现在是一个信息抽取模型，请你帮我抽取出关系内容为\"性能故障\", \"部件故障\", \"组成\"和 \"检测工具\"的相关三元组，三元组内部用\"_\"连接，三元组之间用\\n分割。文本：\n故障现象：奔腾B70做PDI检查时车辆无法启动。",
      },
      {
        "from": "assistant",
        "value": "车辆_部件故障_无法启动"
      }
    ]
  }
]
```

In [None]:
import json
import random

def ftdata_process(ori_path, train_path, test_path):
    data = []
    with open(ori_path, 'r', encoding='utf-8') as fh:
        for i, line in enumerate(fh):
            sample = json.loads(line.strip())
            conversations = []
            text = sample["text"]
            new_entry = {
                "id": sample['ID'],
                "conversations": conversations
            }
            
            # 创建"user"输入
            user_input = {
                "from": "user",
                "value": f'你现在是一个信息抽取模型，请你帮我抽取出关系内容为"性能故障", "部件故障", "组成"和 "检测工具"的相关三元组，三元组内部用"_"连接，三元组之间用\\n分割。文本：\\n{text}'
            }
            conversations.append(user_input)
            
            # 创建"assistant"回应
            spo_list = []
            for spo in sample['spo_list']:
                spo_list.append('_'.join([spo['h']["name"], spo['relation'], spo['t']['name']]))
            assistant_response = {
                "from": "assistant",
                "value": "\\n".join(spo_list)
            }
            conversations.append(assistant_response)
            
            data.append(new_entry)

    # 随机抽取50条数据作为测试集
    test_set = random.sample(data, min(50, len(data)))
    train_set = [record for record in data if record not in test_set]

    with open(test_path, "w", encoding="utf-8") as f:
        json.dump(test_set, f, indent=4, ensure_ascii=False)

    with open(train_path, "w", encoding="utf-8") as f:
        json.dump(train_set, f, indent=4, ensure_ascii=False)


In [None]:
ori_path = "dataset/ori_data.json"
train_path = "dataset/train.json"
test_path = "dataset/test.json"
ftdata_process(ori_path, train_path, test_path)

### 1.2 编写Dataset

In [1]:
from typing import Optional, Dict
import json

import torch
from torch.utils.data import Dataset


def preprocess(
    sources, 
    tokenizer, 
    max_len, 
) -> Dict:
    user_tokens=[195]
    assistant_tokens=[196]
    ignore_index = -100

    input_ids, labels = [], []
    for i, source in enumerate(sources):
        input_id, label = [], []
        for j, sentence in enumerate(source):
            role = sentence["from"]
            value = sentence["value"]
            value_ids = tokenizer.encode(value)

            if role == "user":
                input_id += user_tokens + value_ids
                label += [tokenizer.eos_token_id] + [ignore_index] * len(value_ids)
            else:
                input_id += assistant_tokens + value_ids
                label += [ignore_index] + value_ids
        assert len(input_id) == len(label)
        input_id.append(tokenizer.eos_token_id)
        label.append(tokenizer.eos_token_id)
        input_id = input_id[:max_len]
        label = label[:max_len]
        
        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
        label += [ignore_index] * (max_len - len(label))
        
        input_ids.append(input_id)
        labels.append(label)

    input_ids = torch.tensor(input_ids, dtype=torch.int)
    labels = torch.tensor(labels, dtype=torch.int)
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    
    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }



class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""
    def __init__(self, raw_data, tokenizer, max_len):
        super(SupervisedDataset, self).__init__()
        
        self.tokenizer = tokenizer
        self.max_len = max_len
        
        print("Formatting inputs...Skip in lazy mode")
        self.raw_data = raw_data
        self.cached_data_dict = {}
    
    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        if i in self.cached_data_dict:
            return self.cached_data_dict[i]

        ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
        ret =  dict(
            input_ids=ret["input_ids"][0],
            labels=ret["labels"][0],
            attention_mask=ret["attention_mask"][0],
        )
        self.cached_data_dict[i] = ret
        
        return ret

In [2]:
def make_supervised_data_module(
    tokenizer, cfg, max_len,
) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    dataset_cls = SupervisedDataset
    print("Loading data...")

    train_json = json.load(open(cfg.train_path, "r"))
    train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len)

    if cfg.eval_path:
        eval_json = json.load(open(cfg.eval_path, "r"))
        eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len)
    else:
        eval_dataset = None

    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)

## 2 定义模型

In [4]:
from transformers import TrainingArguments, Trainer
from argparse import Namespace
cfg = Namespace()

# model
# cfg.model_name_or_path = 'baichuan2-13B-Chat-4bits/baichuan-inc/baichuan2-13B-Chat-4bits'
cfg.model_name_or_path = "baichuan2-13B-Chat/baichuan-inc/baichuan2-13B-Chat"
cfg.train_path = "dataset/train.json"
cfg.eval_path = None
cfg.test_path = "dataset/test.json"
cfg.model_max_length = 768
cfg.batch_size = 2

training_args = TrainingArguments(
    "baichuan2-13B-chat_QLoRA",
    evaluation_strategy = "no",
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=cfg.batch_size,
    weight_decay=0.01,
    optim="adamw_torch",
)

BitsAndBytes 支持 8bits 和 4bits 两种量化，其中 4bits 支持 FP4 和 NF4 两种格式，Baichuan 2 选用 NF4 作为 4bits 量化的数据类型。

In [5]:
from transformers import BitsAndBytesConfig
import torch

bnb_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path,
    use_fast=False,
    trust_remote_code=True,
    model_max_length=cfg.model_max_length,
)

model = AutoModelForCausalLM.from_pretrained(
    cfg.model_name_or_path,
    quantization_config=bnb_config,
    trust_remote_code=True
)

model.supports_gradient_checkpointing = True  #节约cuda，但可能会使得训练时间变长
model.gradient_checkpointing_enable() # 作用同上
model.enable_input_require_grads() # 作用同上

model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
import bitsandbytes as bnb 
def find_all_linear_names(model):
    """
    找出所有全连接层
    """
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

lora_modules = find_all_linear_names(model)
print(lora_modules)

['down_proj', 'up_proj', 'W_pack', 'gate_proj', 'o_proj']


In [7]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=['W_pack', 'o_proj'],
    inference_mode=False,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 126,812,160 || all params: 14,023,480,320 || trainable%: 0.9042845078845592


## 3 训练模型

In [8]:
# Load data
data_module = make_supervised_data_module(
    tokenizer=tokenizer, cfg=cfg, max_len=cfg.model_max_length
)

Loading data...
Formatting inputs...Skip in lazy mode


In [9]:
trainer = Trainer(
    model=model, args=training_args, tokenizer=tokenizer, **data_module
)

In [10]:
trainer.train()

Step,Training Loss
500,0.5972
1000,0.3514
1500,0.318
2000,0.2582
2500,0.2251
3000,0.207
3500,0.1766


Token indices sequence length is longer than the specified maximum sequence length for this model (889 > 768). Running this sequence through the model will result in indexing errors


TrainOutput(global_step=3605, training_loss=0.301301208051663, metrics={'train_runtime': 43084.596, 'train_samples_per_second': 0.167, 'train_steps_per_second': 0.084, 'total_flos': 2.34906267746304e+17, 'train_loss': 0.301301208051663, 'epoch': 5.0})

In [11]:
trainer.save_state()
trainer.save_model(output_dir="baichuan2-13B-chat_qlora")

## 4 验证训练后结果

因为显存和内存都不够，所以用8bits量化，显存占用约16GB

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

import torch

model_path = "baichuan2-13B-Chat/baichuan-inc/baichuan2-13B-Chat"
peft_model_path = "baichuan2-13B-chat_qlora"

config = PeftConfig.from_pretrained(peft_model_path)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,
    device_map="auto", 
    trust_remote_code=True
)

[2023-09-23 11:14:12,237] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-09-23 11:14:15.974271: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-23 11:14:16.013896: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
from peft import PeftModel

ckpt_path = 'baichuan2-13B-chat_qlora'

model = PeftModel.from_pretrained(model, peft_model_path)

In [5]:
messages = []
messages.append({"role": "user", "content": get_prompt("原因分析：细心的朋友会发现，润滑油在车况良好的情况下也存在正常的消耗，但有些车况较差的时候，汽车的尾气排出蓝烟，其实这就意味润滑油消耗过大，一般来说，润滑油的消耗无非两种情况，进入燃烧室参与燃烧，或是机油渗漏。之所以机油能够窜入燃烧室，主要是因为零部件严重磨损，配合间隙过大，或者机油压力过高，导致机油上窜进燃烧室。而机油的渗漏主要是因为密封垫变硬老化、气门卡死。如果是老旧车辆，一般都存在密封垫由于老化而密封不严的情况。遇到以上情况，您最好是通过专业的养护中心，由养护工程师进行判定，并实施行之有效的解决办法。")})
response = model.chat(tokenizer, messages)
print("Baichuan2: ", response)

Baichuan2:  密封垫_部件故障_密封不严\n机油_部件故障_压力过高\n机油_部件故障_渗漏\n机油_部件故障_上窜进燃烧室\n气门_部件故障_卡死\n润滑油_部件故障_消耗过大\n汽车_部件故障_尾气排出蓝烟\n零部件_部件故障_磨损


## 5 测试

In [6]:
import torch

def predict_one_sample(model, tokenizer, text):
    messages = []
    messages.append({"role": "user", "content": get_prompt(text)})
    with torch.no_grad():
        response = model.chat(tokenizer, messages)
    
    # 对结果按照“\n”进行分割，获取每个三元组内容
    pre_res = list(set([rr for rr in response.split('\\n') if len(rr.split("_"))==3]))

    return response, pre_res

In [8]:
import json
import os
from tqdm import tqdm

test_path = "dataset/test.json"
save_data = []
f1, total = 0.0, 0.0

test_json = json.load(open(test_path, "r"))
sources = [example["conversations"] for example in test_json]

for i, source in enumerate(tqdm(sources, desc="iter")):
    total += 1
    for j, sentence in enumerate(source):
        role = sentence["from"]
        value = sentence["value"]

        if role == "user":
            text = value
        else:
            answer = value
    
    response, pre_res = predict_one_sample(model, tokenizer, text)
    real_res = answer.split("\\n")
    
    # 计算预测与真实的F1值
    same_res = set(pre_res) & set(real_res)
    if len(set(pre_res)) == 0:
        p = 0.0
    else:
        p = len(same_res) / len(set(pre_res))
    r = len(same_res) / len(set(real_res))
    if (p + r) != 0.0:
        f = 2 * p * r / (p + r)
    else:
        f = 0.0
    f1 += f 
    save_data.append(
        {"text": text, "ori_answer": answer, "gen_answer": response, "f1": f})
    
print("f1:", f1 / total)
save_path = os.path.join('baichuan2-13B-chat_qlora', "baichuan2-13B-chat_qlora_ft_answer.json")
fin = open(save_path, "w", encoding="utf-8")
json.dump(save_data, fin, ensure_ascii=False, indent=4)
fin.close()

iter: 100%|██████████| 50/50 [03:09<00:00,  3.79s/it]

f1: 0.5722646131407069



