In [None]:
import pandas as pd
import os
import numpy as np
from modelscope import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer,TrainingArguments, DataCollatorForSeq2Seq, BitsAndBytesConfig
import torch
print(os.getcwd()) 

# import json
from datasets import load_dataset
from config import DATA_PATH, VAL_SET_SIZE, DATA_SET, MODEL, MODEL_PATH
from utils import generate_prompt, data_collator, save_pretrained, print_trainable_parameters

import os
# Output paths used later in the script are built under the parent of the
# current working directory.
pwd = os.getcwd()
cur_dir = os.path.dirname(pwd)

# Quantization settings: 4-bit weights, "nf4" quant type, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the pretrained base model.
# NOTE(review): the checkpoint directory is hard-coded here while MODEL_PATH
# is imported from config and never used -- confirm which one is intended.
base_model = AutoModelForCausalLM.from_pretrained(
    '/home/ganxin/fa/ais/workspace/data/questionBData/model/Lucachen/gemma2b',
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

print(base_model)
# NOTE(review): if print_trainable_parameters prints its own report and
# returns nothing, the outer print() will additionally emit "None" -- verify
# against utils.
print(print_trainable_parameters(base_model))

from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model

# LoRA hyper-parameters (the values to tune).
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head",
    ],
    inference_mode=False,  # training mode
    bias="none",
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA paper for its exact role
    lora_dropout=0.1,  # dropout ratio
)

# The base model is loaded with a 4-bit quantization config and the training
# arguments enable gradient checkpointing. PEFT's documented recipe for that
# setup is to run prepare_model_for_kbit_training() on the quantized model
# before attaching the adapters; the helper was imported but never called.
model = get_peft_model(prepare_model_for_kbit_training(base_model), config)
print(model)
# NOTE(review): if print_trainable_parameters prints its own report and
# returns nothing, the outer print() will additionally emit "None" -- verify
# against utils.
print(print_trainable_parameters(model))

# Training hyper-parameters (the values to tune).
# Checkpoints are written to <cur_dir>/output/<MODEL>/<DATA_SET>.
args = TrainingArguments(
    output_dir=f"{cur_dir}/output/{MODEL}/{DATA_SET}",
    num_train_epochs=3,
    learning_rate=8e-5,  # suggested range: 1e-3 ~ 5e-5
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=100,
    save_on_each_node=True,
)

# Load the tab-separated dataset; rows are read from its "train" split below.
data = load_dataset("csv", data_files=DATA_PATH, delimiter="\t")

# Run the prompt builder once on the first row with is_logger=True
# (presumably logs the rendered prompt as a sanity check -- see utils).
generate_prompt(data["train"][0], is_logger=True)

# Split into training and validation sets; the validation set is optional.
if VAL_SET_SIZE > 0:
    # `data` is a DatasetDict, so len(data) counts splits (1), not rows; the
    # original cap therefore always collapsed the validation size to 1.
    # Cap by the actual row count instead: at most 1/10000 of the rows, at
    # least one example.
    VAL_SET_SIZE = max(min(VAL_SET_SIZE, len(data["train"]) // 10000), 1)
    train_val = data["train"].train_test_split(
        test_size=VAL_SET_SIZE, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle().map(generate_prompt)
    val_data = train_val["test"].shuffle().map(generate_prompt)
else:
    train_data = data["train"].shuffle().map(generate_prompt)
    val_data = None

# Build the trainer. The validation split prepared earlier was never handed
# to the Trainer; pass it as eval_dataset (None when no split was made) so
# that trainer.evaluate() has data to run on.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)

# The cache flag is switched off for training and restored before saving.
model.config.use_cache = False

trainer.train()

# Destination for the trained adapter; change this to your working path.
lora_path = '%s/saved_model/%s' % (cur_dir, DATA_SET)

model.config.use_cache = True
trainer.model.save_pretrained(lora_path)
# NOTE(review): utils.save_pretrained is opaque from here -- presumably it
# writes extra artifacts next to the adapter; confirm.
save_pretrained(lora_path)

2024-08-23 14:23:49,285 - modelscope - INFO - PyTorch version 2.0.0 Found.
2024-08-23 14:23:49,288 - modelscope - INFO - TensorFlow version 2.17.0 Found.
2024-08-23 14:23:49,288 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-08-23 14:23:49,437 - modelscope - INFO - Loading done! Current index file version is 1.15.0, with md5 34cc0ab19a6bcca90022a212fc92f4a8 and a total number of 980 components indexed
2024-08-23 14:23:50.152818: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-23 14:23:50.168393: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-23 14:23:50.187422: E external/local_xla/xla

/home/ganxin/fa/ais/workspace/questionB


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]