# General Preparation

In [1]:
from datasets import load_dataset, load_from_disk
import os
from pathlib import Path
from typing import Dict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Seq2SeqTrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    PreTrainedTokenizerFast,
)
from functools import partial
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM, Qwen2ForSequenceClassification
from peft import PrefixTuningConfig, get_peft_model, TaskType, LoraConfig
from peft.peft_model import PeftModelForCausalLM, PeftModelForSequenceClassification
from rouge_score import rouge_scorer
import numpy as np
from transformers import GenerationConfig
import torch
import json

## Hyperparameters for Training

In [2]:
# Optimisation settings consumed by the Seq2SeqTrainingArguments cell further down.
# num_virtual_tokens = 128
batch_size = 4  # used for both the per-device train and eval batch size
accumulation_steps = 8  # used for gradient_accumulation_steps and eval_accumulation_steps
learning_rate = 2e-4
epochs = 3
warmup_ratio = 0.05
weight_decay = 0.01
save_total_limit=2
# Shared by logging_steps, eval_steps and save_steps. NOTE(review): as a float below 1
# this is presumably interpreted as a fraction of the total training steps — confirm
# against the TrainingArguments documentation.
logging_steps = 0.1

# When True, both datasets are cut down to 8 rows and only trainer.evaluate() is run.
smoke_run = False

## Directory Settings

In [3]:
# Working directory of the kernel; sibling folders are resolved through file_dir.parent.
# For the AutoDL environment, use this line
file_dir = Path(os.getcwd())

# For a local environment, use this line instead
# file_dir = Path(os.getcwd(), "sft")

file_dir

PosixPath('/root/llm_adv_qa/sft')

In [4]:
# Root directory under which the base model checkpoint is looked up (see model_path).
# For the AutoDL environment, use this line
cache_dir='/root/autodl-tmp'

# For a local environment, use this line instead
# cache_dir=Path.home().as_posix()

cache_dir

'/root/autodl-tmp'

In [5]:
# Identifier of the base model and the local folder where its checkpoint is expected.
model_id = "Qwen/Qwen2.5-3B-Instruct"
model_path = str(Path(cache_dir) / ".cache/modelscope/hub" / model_id)
model_path

'/root/autodl-tmp/.cache/modelscope/hub/Qwen/Qwen2.5-3B-Instruct'

In [6]:
# Checkpoints are written next to the notebook folder, under resources/sft_models/keyword.
model_output_dir = file_dir.parent / "resources" / "sft_models" / "keyword"
model_output_dir.mkdir(parents=True, exist_ok=True)
model_output_dir

PosixPath('/root/llm_adv_qa/resources/sft_models/keyword')

In [7]:
# Destination of the final model export performed after training.
best_model_output_dir = model_output_dir / "best"
best_model_output_dir

PosixPath('/root/llm_adv_qa/resources/sft_models/keyword/best')

# Data Preprocessing

In [8]:
# Both splits live side by side in resources/dataset/keyword.
dataset_root = Path(file_dir.parent, "resources", "dataset", "keyword")
train_file = dataset_root / "train.jsonl"
test_file = dataset_root / "test.jsonl"

# Load the two JSONL files as named splits, then shuffle with a fixed seed.
dataset = load_dataset(
    "json",
    data_files={
        "train": train_file.as_posix(),
        "test": test_file.as_posix(),
    },
).shuffle(seed=1234)

dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'prompt', 'label'],
        num_rows: 3978
    })
    test: Dataset({
        features: ['question', 'prompt', 'label'],
        num_rows: 995
    })
})

In [9]:
# Number of training examples.
dataset["train"].num_rows

3978

In [10]:
# Optimizer updates in one epoch (full accumulation groups only), scaled by the
# logging ratio. Informational: the only later use in this notebook is commented out.
updates_per_epoch = len(dataset['train']) // (accumulation_steps * batch_size)
logging_step_per_epoch = int(round(logging_steps * updates_per_epoch, 0))
logging_step_per_epoch

12

In [11]:
# Load the tokenizer stored with the base checkpoint, padding on the left.
# NOTE(review): left padding is presumably chosen so that batched generation
# continues directly after each prompt — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
type(tokenizer)

transformers.models.qwen2.tokenization_qwen2_fast.Qwen2TokenizerFast

In [12]:
def preprocess_data(
    example,
    tokenizer: PreTrainedTokenizerFast,
) -> Dict:
    """
    Convert one raw record into token-level features for causal-LM fine-tuning.

    example:
      {
          "prompt": "xxxx",
          "label": "xxx",
          "question": "xxx"
      },

    return:
        {"input_ids": [], "labels": [], "attention_mask": []}

    input_ids layout: `X Y <eos>`
    labels layout:    `<ignore> ... <ignore> Y <eos>`

    Only "prompt" and "label" are read. No padding is applied here: the
    DataCollatorForSeq2Seq used by the trainer pads each batch to its longest
    member, which avoids carrying useless padding through training.
    """
    prompt_text: str = example["prompt"]
    target_text: str = example["label"]

    # Wrap the prompt as a single user turn in the model's own chat template and
    # leave the assistant turn open so the target follows it directly.
    messages = [{"role": "user", "content": prompt_text}]
    chat_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    x_ids = tokenizer.encode(chat_prompt, truncation=True)
    # The target always ends with <eos> so the model learns where to stop.
    y_ids = tokenizer.encode(target_text, truncation=True) + [tokenizer.eos_token_id]

    input_ids = x_ids + y_ids
    # -100 on the prompt positions keeps them out of the loss.
    labels = [-100] * len(x_ids) + y_ids

    # NOTE(review): target positions are masked out (0) on purpose here. The trainer
    # runs with predict_with_generate and compute_metrics cuts the decoded input off
    # the front of each prediction, so this looks like a way to hide the reference
    # answer from generation at evaluation time. The same mask is also used for the
    # training forward pass, though — confirm that is intended before changing it.
    attention_mask = [1] * len(x_ids) + [0] * len(y_ids)

    return {"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask}

In [13]:
# Bind the tokenizer once so Dataset.map can call the function with just an example.
preprocess_func = partial(preprocess_data, tokenizer=tokenizer)
# Raw text columns, removed after tokenisation so only model inputs remain.
original_cols = ["question", "prompt", "label"]

In [14]:
# preprocess_func already carries the tokenizer, so it is passed to map() as is
# instead of being wrapped in a second, redundant partial.
train_set = dataset["train"].map(
    preprocess_func,
    remove_columns=original_cols,
)

validation_set = dataset["test"].map(
    preprocess_func,
    remove_columns=original_cols,
)

In [15]:
# Eyeball the tokenised training split (display only).
train_set.to_pandas()

Unnamed: 0,input_ids,labels,attention_mask
0,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
3973,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3974,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3975,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3976,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [16]:
# Eyeball the tokenised validation split (display only).
validation_set.to_pandas()

Unnamed: 0,input_ids,labels,attention_mask
0,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
990,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
991,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
992,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
993,"[151644, 8948, 198, 2610, 525, 1207, 16948, 11...","[-100, -100, -100, -100, -100, -100, -100, -10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Have a Look at Processed Samples

In [17]:
# Decode one processed sample back to text to check the chat template and the target layout.
tokenizer.decode(validation_set[1]['input_ids'])

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n\n# 任务\n请根据用户输入,为我从以下句子中提取最多3个关键词, 这些关键词是句子中最重要, 最能概括句子主题的词汇, 需要作为报表数据库查询的关键字段名被使用.\n\n# 示例\n- 输入: 请根据江化微2019年的年报,简要介绍报告期内公司主要销售客户的客户集中度情况,并结合同行业情况进行分析。\n  输出: 主要销售客户集中度情况\n- 输入: 能否根据2020年金宇生物技术股份有限公司的年报,给我简要介绍一下报告期内公司的社会责任工作情况？\n  输出: 社会责任工作情况\n\n# 输出格式\n以markdown code block形式, 将关键词按逗号分割的形式输出, 例如:\n```关键词1, 关键词2, 关键词3```\n\n# 用户输入：\n唐山港集团股份有限公司2019年的销售人员人数为？\n<|im_end|>\n<|im_start|>assistant\n```销售人员人数```<|im_end|>'

In [18]:
# Keep only the supervised positions (everything that is not the ignore id -100)
# and decode them: this is the text the loss is computed on.
test_labels = [
    token_id
    for token_id in validation_set[1]['labels']
    if token_id != -100
]
tokenizer.decode(test_labels)

'```销售人员人数```<|im_end|>'

# Modeling

In [19]:
# Load the base causal LM from the local checkpoint directory.
# NOTE(review): no dtype is passed, so the weights are presumably loaded in the
# library's default precision — confirm that is what you want for a 3B model.
model: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path)
type(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM

In [20]:
def get_all_linear_layers(model, exclude=("lm_head",)):
    """
    Return the distinct attribute names of all torch.nn.Linear submodules.

    The name of each linear layer is the last component of its qualified module
    path (e.g. "model.layers.0.self_attn.q_proj" -> "q_proj"), so the result does
    not depend on how deeply the layers are nested inside the model.

    Args:
        model: any torch.nn.Module.
        exclude: leaf names to leave out of the result. Defaults to the output
            head "lm_head", which should not receive an adapter here.

    Returns:
        List of unique leaf names in first-seen order.
    """
    excluded = set(exclude)
    # A dict is used as an ordered set: it removes duplicates across repeated
    # decoder layers while keeping the order in which names were first met.
    seen = {}
    for qualified_name, module in model.named_modules():
        if not isinstance(module, torch.nn.Linear):
            continue
        leaf = qualified_name.rsplit(".", 1)[-1]
        # The root module has an empty name; it cannot be targeted by name.
        if leaf and leaf not in excluded:
            seen.setdefault(leaf, True)
    return list(seen)

In [21]:
# Names of the linear projections that are passed to LoRA as target modules.
all_linear_layers = get_all_linear_layers(model)
all_linear_layers

['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']

In [22]:
# LoRA adapter configuration applied to every linear projection collected in all_linear_layers.
peft_config = LoraConfig(
    r=8,  # Rank of LoRA matrices
    lora_alpha=32,  # Scaling factor
    target_modules=all_linear_layers,  # Target layers for LoRA
    lora_dropout=0.15,  # Dropout rate for LoRA layers
    bias="none",  # Bias handling mode; NOTE(review): "none" presumably means no bias terms are trained — confirm in the PEFT docs
    task_type="CAUSAL_LM"  # Type of task
)


peft_config

LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'o_proj', 'gate_proj', 'q_proj', 'v_proj', 'k_proj', 'down_proj', 'up_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.15, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [23]:
# Wrap the base model with the LoRA adapters and report the trainable parameter count.
peft_model: PeftModelForCausalLM = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 14,966,784 || all params: 3,100,905,472 || trainable%: 0.4827


# Metrics Function

In [24]:
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction, tokenizer: PreTrainedTokenizerFast, verbose: bool = True):
    """
    Score generated keyword lists against the reference keyword lists.

    For each sample the prediction, label and input token ids are decoded with
    special tokens skipped. The decoded input is cut off the front of the decoded
    prediction; what remains is the newly generated text. Generated text and label
    are normalised the same way (newlines and spaces removed, surrounding
    backticks stripped, split on ",", empty items dropped).

    The per-sample score is

        unique generated keywords that are reference keywords
        ------------------------------------------------------
                    unique reference keywords

    so it always lies in [0, 1] (repeating a correct keyword earns nothing). It is
    reported under the key "accuracy", which is the name the training arguments
    use for best-model selection.

    Args:
        eval_pred: object exposing `predictions`, `label_ids` and `inputs`
            (as EvalPrediction does), or any 3-item iterable in that order.
            Entries equal to -100 are dropped before decoding.
        tokenizer: tokenizer providing `batch_decode`.
        verbose: when True (default, the historical behaviour) print the
            extracted text and keyword lists of every sample.

    Returns:
        {"accuracy": mean per-sample score rounded to 4 decimals}; 0.0 when the
        batch is empty.
    """
    if hasattr(eval_pred, "predictions"):
        # Attribute access does not depend on how many fields the object yields
        # when iterated.
        raw_preds, raw_labels, raw_inputs = (
            eval_pred.predictions,
            eval_pred.label_ids,
            eval_pred.inputs,
        )
    else:
        raw_preds, raw_labels, raw_inputs = eval_pred

    def _drop_ignored(batch):
        # -100 marks padded / ignored positions and is not a real token id.
        return [[tok for tok in seq if tok != -100] for seq in batch]

    def _keywords(text):
        compact = text.replace("\n", "").replace(" ", "").strip("`")
        return [kw for kw in compact.split(",") if kw != ""]

    predictions = _drop_ignored(raw_preds)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_drop_ignored(raw_labels), skip_special_tokens=True)
    decoded_inputs = tokenizer.batch_decode(_drop_ignored(raw_inputs), skip_special_tokens=True)

    scores = []
    for prediction, label, input_ in zip(decoded_preds, decoded_labels, decoded_inputs):
        # The prediction starts with the input text; everything after it is new.
        new_text = prediction[len(input_):].replace("\n", "").replace(" ", "").strip("`")
        generated_keywords = _keywords(new_text)
        label_keywords = _keywords(label)

        if verbose:
            print('------')
            print(f"new_text: ||{new_text}||")
            print(f"label_keywords: {label_keywords}")
            print(f"generated_keywords: {generated_keywords}")

        reference = set(label_keywords)
        if not reference:
            # Nothing to find: keep the previous outcome for an empty label.
            scores.append(0.0)
            continue
        hits = len(set(generated_keywords) & reference)
        scores.append(hits / len(reference))

    accuracy = round(float(np.mean(scores)), 4) if scores else 0.0
    return {"accuracy": accuracy}


## A Simple Test of Our Eval Function

In [25]:
# Hand-built example for compute_metrics: the prediction repeats the input and then
# adds two keywords, while the label holds three.
inputs = tokenizer("请抽取出关于水果的关键词:")['input_ids']
inputs = np.array(inputs)

predictions = tokenizer('请抽取出关于水果的关键词:```苹果, 香蕉```' + tokenizer.eos_token)['input_ids']
predictions = np.array(predictions)

# An ignore id (-100) is inserted on purpose to check that it is filtered out before decoding.
labels = tokenizer.encode('```苹果,香蕉,梨子```') + [-100] + [tokenizer.eos_token_id]
labels = np.array(labels)

# Each field is wrapped in a list so it looks like a batch of one sample.
ep = EvalPrediction([predictions], [labels], [inputs])

compute_metrics(ep, tokenizer)


------
new_text: ||苹果,香蕉||
label_keywords: ['苹果', '香蕉', '梨子']
generated_keywords: ['苹果', '香蕉']


{'accuracy': 0.6667}

In [26]:
# Bind the tokenizer so the trainer can call the metric with the prediction object alone.
metrics_func = partial(compute_metrics, tokenizer=tokenizer)

# Training

In [27]:
# Start from the generation defaults stored with the base checkpoint and override a
# few fields; the result is handed to the training arguments for evaluation-time generation.
generation_config = GenerationConfig.from_pretrained(model_path)
generation_config.temperature = 1
# NOTE(review): sampling is enabled, so the generated keywords — and therefore the
# "accuracy" used to pick the best checkpoint — depend on the RNG state. Confirm that
# greedy decoding would not be a better fit for evaluation.
generation_config.do_sample = True
generation_config.repetition_penalty = 1.05
generation_config.max_new_tokens = 128  # upper bound on newly generated tokens per sample

generation_config

GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "max_new_tokens": 128,
  "pad_token_id": 151643,
  "repetition_penalty": 1.05,
  "top_k": 20,
  "top_p": 0.8
}

In [28]:
# logging_steps = logging_step_per_epoch
# logging_steps

In [29]:
# Training configuration. Logging, evaluation and checkpointing share one interval,
# so every evaluation has a matching checkpoint for load_best_model_at_end.
args = Seq2SeqTrainingArguments(
    output_dir=str(model_output_dir),
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=accumulation_steps,
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=accumulation_steps,
    logging_steps=logging_steps,
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    eval_strategy="steps",
    eval_steps=logging_steps,
    save_strategy="steps",
    save_total_limit=save_total_limit,
    save_steps=logging_steps,
    load_best_model_at_end=True,
    # Model selection uses the "accuracy" value returned by compute_metrics.
    metric_for_best_model="accuracy",
    # Evaluation generates text instead of taking the argmax of the logits.
    predict_with_generate=True,
    # Each entry must be its own string: the former single string "inputs, loss"
    # matched neither option, and inputs were only delivered through the deprecated
    # include_inputs_for_metrics flag. Only "inputs" is requested because
    # compute_metrics unpacks exactly (predictions, labels, inputs).
    include_for_metrics=["inputs"],
    generation_config=generation_config,
)

Using `include_inputs_for_metrics` is deprecated and will be removed in version 5 of 🤗 Transformers. Please use `include_for_metrics` list argument instead.


In [30]:
if smoke_run:
    # A handful of rows is enough to exercise the whole pipeline end to end.
    train_set, validation_set = (
        train_set.select(range(8)),
        validation_set.select(range(8)),
    )

In [31]:
# Assemble the trainer around the LoRA-wrapped model. The collator pads every batch
# to its longest sequence, rounded up to a multiple of 8, which is why
# preprocess_data does no padding itself.
trainer = Seq2SeqTrainer(
        model=peft_model,
        args=args,
        processing_class=tokenizer,
        train_dataset=train_set,
        eval_dataset=validation_set,
        compute_metrics=metrics_func,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding="longest", pad_to_multiple_of=8)
    )

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [32]:
# Only let error-level messages from transformers through during training/evaluation.
# NOTE(review): this also hides warnings that may be useful while debugging.
from transformers.utils import logging
logging.set_verbosity_error() 

In [33]:
# Smoke test: run one evaluation pass (generation + metrics) without any training.
if smoke_run:
    trainer.evaluate(eval_dataset=validation_set)

In [34]:
# Full run: train, then export the resulting model and the trainer state.
if not smoke_run:
    trainer.train()
    # args sets load_best_model_at_end=True, so the model written here is expected
    # to be the best checkpoint by "accuracy" — NOTE(review): confirm.
    trainer.save_model(output_dir=best_model_output_dir)
    trainer.save_state()

Step,Training Loss,Validation Loss,Accuracy
38,4.2436,0.929733,0.2814
76,0.7189,0.53983,0.0652
114,0.4483,0.347088,0.6937
152,0.3214,0.318205,0.763
190,0.2509,0.228673,0.774
228,0.2464,0.204759,0.819
266,0.1531,0.183271,0.6646
304,0.1235,0.165489,0.7539
342,0.1185,0.158732,0.7679


------
new_text: ||研发人员数||
label_keywords: ['研发人员数']
generated_keywords: ['研发人员数']
------
new_text: ||销售人员||
label_keywords: ['销售人员人数']
generated_keywords: ['销售人员']
------
new_text: ||利润总额,净利润||
label_keywords: ['利润总额', '净利润']
generated_keywords: ['利润总额', '净利润']
------
new_text: ||||
label_keywords: ['货币资金']
generated_keywords: []
------
new_text: ||||
label_keywords: ['应付股利']
generated_keywords: []
------
new_text: ||||
label_keywords: ['综合收益总额']
generated_keywords: []
------
new_text: ||||
label_keywords: ['2019年', '资产总计', '第七高']
generated_keywords: []
------
new_text: ||||
label_keywords: ['货币资金']
generated_keywords: []
------
new_text: ||企业人员比例||
label_keywords: ['企业硕士及以上人员占职工人数的比例']
generated_keywords: ['企业人员比例']
------
new_text: ||销售费用||
label_keywords: ['销售费用']
generated_keywords: ['销售费用']
------
new_text: ||||
label_keywords: ['投资收益占营业收入比率']
generated_keywords: []
------
new_text: ||||
label_keywords: ['利润总额']
generated_keywords: []
------
new_text: ||||
label_keywords: ['股本']


# Push to ModelScope

In [5]:
# Read the ModelScope access key from the environment (after loading a .env file)
# so that the secret never appears in the notebook itself.
from dotenv import load_dotenv
import os
load_dotenv()

# Raises KeyError when MODEL_SCOPE_AK is not set.
AK = os.environ['MODEL_SCOPE_AK']

In [None]:
from modelscope.hub.api import HubApi
from modelscope.hub.constants import Licenses, ModelVisibility

# Authenticate against ModelScope with the access key read from the environment.
api = HubApi()
api.login(AK)


username = 'Blackoutta'
model_name = 'Qwen2.5-3B-Instruct-sft-keyword-lora'
# A plain "<user>/<repo>" string. It has its own name so the base-model `model_id`
# defined at the top of the notebook is not overwritten (the previous version
# also turned it into a 1-tuple through a stray trailing comma).
hub_model_id = username + "/" + model_name

api.push_model(
    model_id=hub_model_id,  # the repository is created automatically if it does not exist yet
    model_dir=best_model_output_dir  # local directory that holds the model to upload
)

2024-12-27 12:44:53,082 - modelscope - INFO - Pushing folder /root/llm_adv_qa/resources/sft_models/keyword/best as model Blackoutta/Qwen2.5-3B-Instruct-sft-keyword-lora.
2024-12-27 12:44:53,083 - modelscope - INFO - Total folder size 72.31 MB, this may take a while depending on actual pushing size...
2024-12-27 12:44:53,089 - modelscope - ERROR - Running git command: ['git', 'lfs', 'env'] failed 
 stdout:  
 stderr: git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log

2024-12-27 12:44:53,090 - modelscope - ERROR - git lfs is not installed, please install.
2024-12-27 12:44:54,919 - modelscope - ERROR - Running git command: ['git', 'lfs', 'env'] failed 
 stdout:  
 stderr: git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log

2024-12-27 12:44:59,118 - modelscope - INFO - [master 0db23d4] 'upload model'
 7 files changed, 5 insertions(+), 95 deletions(-)
 delete mode 100644 optimizer.pt
 delete mode 100644 rng_state.pth
 dele