# 开始第二阶段的AdaLoRA微调

In [1]:
# AutoDL官方学术资源加速
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [2]:
import os
import json
import torch # type: ignore
from transformers import ( # type: ignore
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import AdaLoraConfig, get_peft_model, PeftModel # type: ignore
from datasets import Dataset # type: ignore
from transformers import BitsAndBytesConfig # type: ignore

In [3]:
def load_dataset(file_path, tokenizer, max_eval_samples=1000):
    """加载数据集并进行预处理"""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 如果是验证集且指定了最大样本数，则截取部分数据
    if 'valid.json' in file_path and max_eval_samples:
        data = data[:max_eval_samples]
    
    def preprocess_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors=None
        )
    
    dataset = Dataset.from_list([{
        'text': f"{item['prompt']}{item['completion']}"
    } for item in data])
    
    tokenized_dataset = dataset.map(
        preprocess_function,
        remove_columns=['text'],
        desc="Running tokenizer on dataset",
    )
    
    return tokenized_dataset

def create_model_and_tokenizer():
    """创建和配置模型与分词器"""
    # 4bit量化配置
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    
    # 加载tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        "google/gemma-2-9b",
        cache_dir="/root/autodl-tmp/gemma",
        trust_remote_code=True,
        local_files_only=True
    )
    
    # 加载第一阶段训练好的模型
    base_model = AutoModelForCausalLM.from_pretrained(
        "google/gemma-2-9b",
        cache_dir="/root/autodl-tmp/gemma",
        device_map="auto",
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        local_files_only=True
    )
    
    # 加载第一阶段的LoRA权重
    # model = PeftModel.from_pretrained(
    #     base_model,
    #     "../../../../../root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-20000"
    # )
    
    return base_model, tokenizer

def create_peft_config():
    """创建AdaLoRA配置"""
    return AdaLoraConfig(
        r=64,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False,
        target_r=32,
        beta1=0.85,
        beta2=0.85,
        tinit=200,
        tfinal=1000,
        deltaT=10,
        # init_r=12,  # 添加初始秩
        # orth_reg_weight=0.5,  # 添加正交正则化权重
        # total_step=3000,  # 添加总训练步数
    )

In [4]:
# 添加必要的导入
from safetensors.torch import load_file

def train():
    print("开始加载数据集...")
    _, tokenizer = create_model_and_tokenizer()
    
    # 加载第二阶段的数据集
    train_dataset = load_dataset("../prepare_datasets/task_stage_data/train.json", tokenizer)
    eval_dataset = load_dataset("../prepare_datasets/task_stage_data/valid.json", tokenizer)
    
    print("创建模型...")
    model, _ = create_model_and_tokenizer()
    
    print("应用AdaLoRA配置...")
    peft_config = create_peft_config()
    model = get_peft_model(model, peft_config)

    # 设置adapter名称
    model.active_adapter = "default"

    print("加载第一阶段LoRA权重...")
    # 只加载adapter_model.safetensors中的权重
    state_dict = load_file("../../../../../root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-20000/adapter_model.safetensors")
    model.load_state_dict(state_dict, strict=False)
    
    model.print_trainable_parameters()
    
    # 创建训练参数
    training_args = TrainingArguments(
        output_dir="../../../../../root/autodl-tmp/models/stage2/checkpoints/gemma-task-zh",
        learning_rate=1e-4,  # 降低学习率
        num_train_epochs=3,  # 减少训练轮数
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        logging_steps=100,
        save_steps=500,
        evaluation_strategy="steps",
        eval_steps=500,
        fp16=True,
        optim="paged_adamw_32bit",
        lr_scheduler_type="cosine",
        report_to="tensorboard",
        remove_unused_columns=False,
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    
    print("开始训练...")
    trainer.train()
    
    print("保存模型...")
    trainer.save_model("../../../../../root/autodl-tmp/models/stage2/gemma-task-zh-final")

In [5]:
if __name__ == "__main__":
    train()

开始加载数据集...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/64000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

创建模型...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

应用AdaLoRA配置...




加载第一阶段LoRA权重...
trainable params: 13,420,512 || all params: 9,255,126,664 || trainable%: 0.1450
开始训练...




Step,Training Loss,Validation Loss


# 加载微调好的第二阶段的模型

In [None]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel
# from transformers import BitsAndBytesConfig

In [None]:
# def load_finetuned_model():
#     """加载第二阶段微调后的模型和tokenizer"""
    
#     # 4bit量化配置
#     quantization_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_compute_dtype=torch.float16,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4"
#     )
    
#     # 加载tokenizer
#     tokenizer = AutoTokenizer.from_pretrained(
#         "google/gemma-2-9b",
#         cache_dir="/root/autodl-tmp/gemma",
#         trust_remote_code=True,
#         local_files_only=True
#     )
    
#     # 加载基础模型
#     base_model = AutoModelForCausalLM.from_pretrained(
#         "google/gemma-2-9b",
#         cache_dir="/root/autodl-tmp/gemma",
#         device_map="auto",
#         torch_dtype=torch.float16,
#         quantization_config=quantization_config,
#         local_files_only=True
#     )
    
#     # 加载第二阶段微调的LoRA权重
#     model = PeftModel.from_pretrained(
#         base_model,
#         "/root/autodl-tmp/models/stage2/gemma-task-zh-final"
#     )
#     # 直接加载第二阶段的权重即可
    
#     # 设置为评估模式
#     model.eval()
    
#     return model, tokenizer

# def generate_response(model, tokenizer, prompt, max_length=512):
#     """使用模型生成回复"""
    
#     # 格式化输入
#     user_prompt = f"<start_of_turn>user\n{prompt}\n<end_of_turn><eos>\n"
    
#     # 编码输入
#     inputs = tokenizer(user_prompt, return_tensors="pt").to(model.device)
    
#     # 生成回复
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=max_length,
#             temperature=0.7,
#             top_p=0.95,
#             do_sample=True,
#             repetition_penalty=1.1
#         )
    
#     # 解码输出
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     # 提取模型回复部分
#     response = response.split("<start_of_turn>model\n")[-1].split("\n<end_of_turn>")[0]
    
#     return response

In [None]:
# # 使用示例
# if __name__ == "__main__":
#     print("加载模型...")
#     model, tokenizer = load_finetuned_model()
    
#     # 测试一些示例
#     test_prompts = [
#         "请将以下中文翻译成英文：今天天气真好。",
#         "请讲一个关于勇气的短故事。",
#         "解释一下什么是人工智能？"
#     ]
    
#     print("\n开始生成回复...")
#     for prompt in test_prompts:
#         print(f"\n提问: {prompt}")
#         response = generate_response(model, tokenizer, prompt)
#         print(f"回答: {response}")