In [1]:
# AutoDL官方学术资源加速
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [2]:
import os
import json
import torch # type: ignore
from transformers import ( # type: ignore
    AutoModelForCausalLM, 
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import AdaLoraConfig, get_peft_model # type: ignore
from datasets import Dataset # type: ignore
from transformers import BitsAndBytesConfig # type: ignore

In [3]:
# 1. 配置和工具函数 # ../prepare_datasets/base_stage_data/train.json
def load_dataset(file_path, tokenizer, max_eval_samples=1000): # max_eval_samples 参数专门用于限制valid数据集的样本数量
    """加载数据集并进行预处理"""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 如果是验证集且指定了最大样本数，则截取部分数据
    if 'valid.json' in file_path and max_eval_samples:
        data = data[:max_eval_samples]
    
    # 使用tokenizer处理文本
    def preprocess_function(examples):
        return tokenizer(
            examples['text'], # 要处理的文本
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors=None  # 返回列表而不是tensor
        )
    
    # 创建数据集
    dataset = Dataset.from_list([{ # 使用datasets库创建一个Dataset对象。Dataset.from_list方法接受一个列表，其中每个元素是一个字典。
        'text': f"{item['prompt']}{item['completion']}" # 使用datasets库创建一个Dataset对象。Dataset.from_list方法接受一个列表，其中每个元素是一个字典。
    } for item in data])
    
    # 对数据集进行预处理
    tokenized_dataset = dataset.map(
        preprocess_function,
        remove_columns=['text'],  # 移除原始文本列
        desc="Running tokenizer on dataset",
    )
    
    return tokenized_dataset

def create_model_and_tokenizer():
    """创建和配置模型与分词器"""
    # 4bit量化配置
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True, # 这是指模型的权重存储使用4位精度，可以大大减少模型占用的显存
        bnb_4bit_compute_dtype=torch.float16, # 这是指模型在计算时使用16位精度。计算：训练和推理
        bnb_4bit_use_double_quant=True, # 这是指在量化过程中使用双量化，可以进一步减少量化误差
        bnb_4bit_quant_type="nf4"  # 使用 normal float 4 量化类型。量化：将高精度（如32位）的浮点数转换为低精度（如4位）的浮点数，以减少模型占用的显存和提高计算速度。
    )
    
    # 加载tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        "google/gemma-2-9b",
        cache_dir="/root/autodl-tmp/gemma",
        trust_remote_code=True,
        local_files_only=True  # 添加这行，只使用本地缓存的文件
    )
    
    # 加载模型
    model = AutoModelForCausalLM.from_pretrained(
        "google/gemma-2-9b",
        cache_dir="/root/autodl-tmp/gemma",
        device_map="auto",
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        local_files_only=True  # 添加这行，只使用本地缓存的文件
    )
    
    return model, tokenizer

def create_peft_config():
    """创建AdaLoRA配置"""
    return AdaLoraConfig(
        r=64,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False,
        target_r=32,
        beta1=0.85,
        beta2=0.85,
        tinit=200,
        tfinal=1000,
        deltaT=10,
    )

In [4]:
def train(resume_checkpoint=None):  # 添加参数
    print("开始加载数据集...")
    print("创建tokenizer...")
    _, tokenizer = create_model_and_tokenizer()
    
    # 加载数据集
    train_dataset = load_dataset("../prepare_datasets/base_stage_data/train.json", tokenizer)
    eval_dataset = load_dataset("../prepare_datasets/base_stage_data/valid.json", tokenizer, max_eval_samples=1000)
    
    print("创建模型...")
    base_model, _ = create_model_and_tokenizer()
    
    print("应用AdaLoRA配置...")
    peft_config = create_peft_config()
    model = get_peft_model(base_model, peft_config)
    model.print_trainable_parameters()
    
    # 创建训练参数
    training_args = TrainingArguments(
        output_dir="../../../../../root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh", # 保存中间模型和日志的目录
        learning_rate=2e-4, # 学习率
        num_train_epochs=5, # 训练轮数
        per_device_train_batch_size=2, # 训练批次大小
        per_device_eval_batch_size=4, # 只能是4 # 现在只选取了1000条数据进行验证
        gradient_accumulation_steps=4, # 梯度累积步数 # 所以每个epoch的step总数: 80000 / (2*4) = 10000
        warmup_steps=100, # 预热步数, 在训练开始时逐渐增加学习率，以防止初始阶段的不稳定
        # max_steps=1000, # 最大训练步数, 训练的最大步数
        logging_steps=100, # 日志记录步数, 每100步记录一次日志
        save_steps=500, # 保存模型步数, 每500步保存一次模型
        evaluation_strategy="steps", # 按步数评估, 有三个选项: "no", "steps", "epoch"
        eval_steps=500, # 评估步数
        fp16=True, # 使用16位浮点数
        optim="paged_adamw_32bit", # 优化器
        lr_scheduler_type="cosine", # 学习率调度器类型
        report_to="tensorboard", # 报告到tensorboard
        remove_unused_columns=False, # 添加这一行
        # 可以添加以下参数来只保存最好的模型
        # save_total_limit=3,      # 最多保存3个检查点
        # load_best_model_at_end=True,  # 训练结束时加载最佳模型
        # metric_for_best_model="loss",  # 用loss作为选择最佳模型的指标
    )

    # 创建数据整理器
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=False, # 表示我们在做因果语言建模(CLM)而不是掩码语言建模(MLM)
    )
    
    # 创建训练器
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    
    print("开始训练...")
    # 添加resume_from_checkpoint参数
    trainer.train(resume_from_checkpoint=resume_checkpoint)
    
    print("保存模型...")
    trainer.save_model("../../../../../root/autodl-tmp/models/stage1/gemma-base-zh-final")

In [5]:
if __name__ == "__main__":
    # 从特定检查点恢复训练
    checkpoint_path = "../../../../../root/autodl-tmp/models/stage1/checkpoints/gemma-base-zh/checkpoint-21500"
    train(resume_checkpoint=checkpoint_path)

开始加载数据集...
创建tokenizer...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Running tokenizer on dataset:   0%|          | 0/80000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

创建模型...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

应用AdaLoRA配置...




trainable params: 13,420,512 || all params: 9,255,126,664 || trainable%: 0.1450




开始训练...


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss
22000,1.4317,1.476599
22500,1.4283,1.474987



Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b/resolve/main/config.json.
Access to model google/gemma-2-9b is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-2-9b.

Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b/resolve/main/config.json.
Access to model google/gemma-2-9b is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in google/gemma-2-9b.


KeyboardInterrupt: 