In [1]:
# %env HF_ENDPOINT=https://hf-mirror.com
%env HF_HOME=/root/autodl-tmp/hf
%env HF_HUB_CACHE=/root/autodl-tmp/hf

env: HF_HOME=/root/autodl-tmp/hf
env: HF_HUB_CACHE=/root/autodl-tmp/hf


In [2]:
import os
import subprocess

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True,
                        text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [3]:
# !pip install -q transformers datasets accelerate evaluate bitsandbytes peft

In [4]:
# 定义全局变量和参数
model_name_or_path = 'THUDM/chatglm3-6b'  # 模型ID或本地路径
train_data_path = 'shibing624/AdvertiseGen'  # 训练数据路径
eval_data_path = None  # 验证数据路径，如果没有则设置为None
seed = 8  # 随机种子
max_input_length = 512  # 输入的最大长度
max_output_length = 1536  # 输出的最大长度
lora_rank = 4  # LoRA秩
lora_alpha = 32  # LoRA alpha值
lora_dropout = 0.05  # LoRA Dropout率
resume_from_checkpoint = None  # 如果从checkpoint恢复训练，指定路径
prompt_text = ''  # 所有数据前的指令文本

In [5]:
from datasets import load_dataset

dataset = load_dataset(train_data_path)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'summary'],
        num_rows: 114599
    })
    validation: Dataset({
        features: ['content', 'summary'],
        num_rows: 1070
    })
})

In [7]:
dataset['train'][0]

{'content': '类型#裤*版型#宽松*风格#性感*图案#线条*裤型#阔腿裤',
 'summary': '宽松的阔腿裤这两年真的吸粉不少，明星时尚达人的心头爱。毕竟好穿时尚，谁都能穿出腿长2米的效果宽松的裤腿，当然是遮肉小能手啊。上身随性自然不拘束，面料亲肤舒适贴身体验感棒棒哒。系带部分增加设计看点，还让单品的设计感更强。腿部线条若隐若现的，性感撩人。颜色敲温柔的，与裤子本身所呈现的风格有点反差萌。'}

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                          trust_remote_code=True,
                                          revision='b098244'
                                          )



In [9]:
# tokenize_func 函数
def tokenize_func(example, tokenizer, ignore_label_id=-100):
    """
    对单个数据样本进行tokenize处理。

    参数:
    example (dict): 包含'content'和'summary'键的字典，代表训练数据的一个样本。
    tokenizer (transformers.PreTrainedTokenizer): 用于tokenize文本的tokenizer。
    ignore_label_id (int, optional): 在label中用于填充的忽略ID，默认为-100。

    返回:
    dict: 包含'tokenized_input_ids'和'labels'的字典，用于模型训练。
    """

    # 构建问题文本
    question = prompt_text + example['content']
    if example.get('input', None) and example['input'].strip():
        question += f'\n{example["input"]}'

    # 构建答案文本
    answer = example['summary']

    # 对问题和答案文本进行tokenize处理
    q_ids = tokenizer.encode(text=question, add_special_tokens=False)
    a_ids = tokenizer.encode(text=answer, add_special_tokens=False)

    # 如果tokenize后的长度超过最大长度限制，则进行截断
    if len(q_ids) > max_input_length - 2:  # 保留空间给gmask和bos标记
        q_ids = q_ids[:max_input_length - 2]
    if len(a_ids) > max_output_length - 1:  # 保留空间给eos标记
        a_ids = a_ids[:max_output_length - 1]

    # 构建模型的输入格式
    input_ids = tokenizer.build_inputs_with_special_tokens(q_ids, a_ids)
    question_length = len(q_ids) + 2  # 加上gmask和bos标记

    # 构建标签，对于问题部分的输入使用ignore_label_id进行填充
    labels = [ignore_label_id] * question_length + input_ids[question_length:]

    return {'input_ids': input_ids, 'labels': labels}


In [10]:
column_names = dataset['train'].column_names
column_names

['content', 'summary']

In [11]:
tokenized_dataset = dataset.map(
    lambda example: tokenize_func(example, tokenizer),
    batched=False,
    remove_columns=column_names,
    num_proc=4,
)

In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 114599
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1070
    })
})

In [13]:
# split tokenized_dataset into train, validation and test sets
seed = 16
train_test_set = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=seed)
train_set = train_test_set['train']
test_set = train_test_set['test']
eval_set = tokenized_dataset['validation']

In [14]:
train_set

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 103139
})

In [15]:
eval_set

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 1070
})

In [16]:
test_set

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 11460
})

In [17]:
import torch
from typing import List, Dict


# DataCollatorForChatGLM 类
class DataCollatorForChatGLM:
    """
    用于处理批量数据的DataCollator，尤其是在使用 ChatGLM 模型时。

    该类负责将多个数据样本（tokenized input）合并为一个批量，并在必要时进行填充(padding)。

    属性:
    pad_token_id (int): 用于填充(padding)的token ID。
    max_length (int): 单个批量数据的最大长度限制。
    ignore_label_id (int): 在标签中用于填充的ID。
    """

    def __init__(self, pad_token_id: int, max_length: int = 2048, ignore_label_id: int = -100):
        """
        初始化DataCollator。

        参数:
        pad_token_id (int): 用于填充(padding)的token ID。
        max_length (int): 单个批量数据的最大长度限制。
        ignore_label_id (int): 在标签中用于填充的ID，默认为-100。
        """
        self.pad_token_id = pad_token_id
        self.ignore_label_id = ignore_label_id
        self.max_length = max_length

    def __call__(self, batch_data: List[Dict[str, List]]) -> Dict[str, torch.Tensor]:
        """
        处理批量数据。

        参数:
        batch_data (List[Dict[str, List]]): 包含多个样本的字典列表。

        返回:
        Dict[str, torch.Tensor]: 包含处理后的批量数据的字典。
        """
        # 计算批量中每个样本的长度
        len_list = [len(d['input_ids']) for d in batch_data]
        batch_max_len = max(len_list)  # 找到最长的样本长度

        input_ids, labels = [], []
        for len_of_d, d in sorted(zip(len_list, batch_data), key=lambda x: -x[0]):
            pad_len = batch_max_len - len_of_d  # 计算需要填充的长度
            # 添加填充，并确保数据长度不超过最大长度限制
            ids = d['input_ids'] + [self.pad_token_id] * pad_len
            label = d['labels'] + [self.ignore_label_id] * pad_len
            if batch_max_len > self.max_length:
                ids = ids[:self.max_length]
                label = label[:self.max_length]
            input_ids.append(torch.LongTensor(ids))
            labels.append(torch.LongTensor(label))

        # 将处理后的数据堆叠成一个tensor
        input_ids = torch.stack(input_ids)
        labels = torch.stack(labels)

        return {'input_ids': input_ids, 'labels': labels}


In [18]:
# 准备数据整理器
data_collator = DataCollatorForChatGLM(pad_token_id=tokenizer.pad_token_id)

In [19]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

_compute_dtype_map = {
    'fp32': torch.float32,
    'fp16': torch.float16,
    'bf16': torch.bfloat16
}

# QLoRA 量化配置
q_config = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_quant_type='nf4',
                              bnb_4bit_use_double_quant=True,
                              bnb_4bit_compute_dtype=_compute_dtype_map['bf16'])


In [20]:


# revision='b098244' 版本对应的 ChatGLM3-6B 设置 use_reentrant=False
# 最新版本 use_reentrant 被设置为 True，会增加不必要的显存开销
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             quantization_config=q_config,
                                             device_map='auto',
                                             trust_remote_code=True,
                                             revision='b098244'
                                             )



Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [21]:
type(model)

transformers_modules.THUDM.chatglm3-6b.b098244a71fbe69ce149682d9072a7629f7e908c.modeling_chatglm.ChatGLMForConditionalGeneration

In [22]:
test_input = tokenizer("你好", return_tensors='pt')
test_input

{'input_ids': tensor([[64790, 64792, 36474, 54591]]), 'attention_mask': tensor([[1, 1, 1, 1]]), 'position_ids': tensor([[0, 1, 2, 3]])}

In [23]:
test_output = model.generate(**test_input, return_dict_in_generate=True, output_scores=True)
print(type(test_output))



<class 'transformers.generation.utils.GenerateDecoderOnlyOutput'>


In [24]:
test_output.scores

(tensor([[-4.3828, -4.3789,  6.1172,  ..., -4.3789, -4.3789, -4.3789]],
        dtype=torch.float16),
 tensor([[-3.8730, -3.8730,  1.5615,  ..., -3.8730, -3.8750, -3.8711]],
        dtype=torch.float16),
 tensor([[-3.1523, -3.1504,  1.1846,  ..., -3.1504, -3.1504, -3.1504]],
        dtype=torch.float16),
 tensor([[-5.6562, -5.6562,  5.1914,  ..., -5.6523, -5.6562, -5.6523]],
        dtype=torch.float16),
 tensor([[-6.1367, -6.1367,  4.7266,  ..., -6.1367, -6.1367, -6.1367]],
        dtype=torch.float16),
 tensor([[-4.8711, -4.8711,  3.2910,  ..., -4.8711, -4.8711, -4.8711]],
        dtype=torch.float16),
 tensor([[-3.5137, -3.5137, -2.4375,  ..., -3.5117, -3.5137, -3.5117]],
        dtype=torch.float16),
 tensor([[-3.0352, -3.0352, -0.5562,  ..., -3.0332, -3.0352, -3.0352]],
        dtype=torch.float16),
 tensor([[-4.5430, -4.5430,  3.1660,  ..., -4.5430, -4.5430, -4.5430]],
        dtype=torch.float16),
 tensor([[-4.3086, -4.3086, 10.4766,  ..., -4.3086, -4.3086, -4.3086]],
        dt

In [25]:
stacked = torch.stack(test_output.scores,dim=1)
stacked.shape

torch.Size([1, 16, 65024])

In [26]:
maxed = torch.argmax(stacked, dim=-1)
print(maxed)

tensor([[31123, 33030, 34797, 42481, 31155, 48895, 38549, 31645, 31404, 42693,
         33277, 31639, 40648, 55268, 55353, 36295]])


In [27]:
tokenizer.decode(maxed[0])

'，我是人工智能助手。很高兴为您服务！请问有什么问题我可以帮您解答'

In [28]:
# 获取当前模型占用的 GPU显存（差值为预留给 PyTorch 的显存）
memory_footprint_bytes = model.get_memory_footprint()
memory_footprint_mib = memory_footprint_bytes / (1024 ** 2)  # 转换为 MiB

print(f"{memory_footprint_mib:.2f}MiB")

3739.69MiB


In [29]:
from peft import TaskType, LoraConfig, get_peft_model, prepare_model_for_kbit_training

kbit_model = prepare_model_for_kbit_training(model)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [30]:
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING['chatglm']

In [31]:
target_modules

['query_key_value']

In [32]:
lora_config = LoraConfig(
    target_modules=target_modules,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM
)

In [33]:
qlora_model = get_peft_model(kbit_model, lora_config)

In [34]:
qlora_model.print_trainable_parameters()

trainable params: 974,848 || all params: 6,244,558,848 || trainable%: 0.01561115883009451


In [35]:
from transformers import TrainingArguments, Trainer

save_dir = '/root/autodl-tmp/models'

training_args = TrainingArguments(
    overwrite_output_dir=True,
    output_dir=f"{save_dir}/{model_name_or_path}",  # 输出目录
    per_device_train_batch_size=16,  # 每个设备的训练批量大小
    per_device_eval_batch_size=16,  # 每个设备的评估批量大小
    gradient_accumulation_steps=4,
    learning_rate=1e-3,  # 学习率
    lr_scheduler_type="linear",  # 学习率调度器类型
    warmup_ratio=0.1,  # 预热比例
    logging_steps=0.1,  # 日志记录步数
    save_strategy="steps",  # 模型保存策略
    save_steps=0.1,  # 模型保存步数
    evaluation_strategy="steps",  # 评估策略
    eval_steps=0.1,  # 评估步数
    fp16=True,  # 是否使用混合精度训练
    num_train_epochs=1,  # 训练轮数
    report_to=['all'],
)


In [109]:
from transformers import EvalPrediction
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
import jieba
import numpy as np
import torch

def compute_metrics(eval_preds: EvalPrediction):
    batched_pred_ids = torch.from_numpy(eval_preds.predictions[0])
    batched_label_ids = torch.from_numpy(eval_preds.label_ids)

    metrics_dct = {'bleu-4': []}
    for pred_ids, label_ids in zip(batched_pred_ids, batched_label_ids):
        pred_tt = torch.argmax(pred_ids, dim=-1)
        pred_txt = tokenizer.decode(pred_tt).strip()
        label_txt = tokenizer.decode(label_ids).strip()
        pred_tokens = list(jieba.cut(pred_txt))
        label_tokens = list(jieba.cut(label_txt))
        metrics_dct['bleu-4'].append(
            sentence_bleu(
                [label_tokens],
                pred_tokens,
                smoothing_function=SmoothingFunction().method3,
            )
        )
    return {k: np.mean(v) for k, v in metrics_dct.items()}

In [110]:
trainer = Trainer(
    model=qlora_model,
    args=training_args,
    train_dataset=train_set.select(list(range(64))),
    eval_dataset=eval_set.select(list(range(32))),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [111]:
trainer.evaluate()

{'eval_loss': 5.274458885192871,
 'eval_bleu-4': 0.011854863133202033,
 'eval_runtime': 2.7142,
 'eval_samples_per_second': 11.79,
 'eval_steps_per_second': 0.737}

In [112]:
trainer.train()



Step,Training Loss,Validation Loss,Bleu-4
1,5.0946,5.274459,0.011855


Checkpoint destination directory /root/autodl-tmp/models/THUDM/chatglm3-6b/checkpoint-1 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1, training_loss=5.094576835632324, metrics={'train_runtime': 7.1615, 'train_samples_per_second': 8.937, 'train_steps_per_second': 0.14, 'total_flos': 368449688567808.0, 'train_loss': 5.094576835632324, 'epoch': 1.0})

In [None]:
# trainer.save_model(f"{save_dir}/{model_name_or_path}")
trainer.save_model()
trainer.save_state()

In [None]:
# todo final evaluation using test dataset
# for model_inputs, gold_standards in test_set:
#     predictions = model(model_inputs)
#     metric.add_batch(references=gold_standards, predictions=predictions)
# metric.compute()