In [None]:
!pip install unsloth
!pip install git+https://github.com/josejg/instruction_following_eval.git # 安装 IFEval，进行指令跟随评估
!pip install -U wandb

In [None]:
import wandb
import os

In [None]:
# Read the Weights & Biases API key from the Kaggle secret store and publish it,
# together with the W&B project name, through environment variables.
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
key = user_secrets.get_secret("WANDB_API_KEY")
os.environ.update({
    "WANDB_API_KEY": key,
    "WANDB_PROJECT": "Decoder_Knowledge_Distillation",
})

In [None]:
# Log in to Weights & Biases.
# NOTE(review): presumably picks up the WANDB_API_KEY environment variable set in the
# previous cell, so no interactive prompt should appear - confirm.
wandb.login()

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

In [None]:
# Load the training split of the cleaned Alpaca dataset
dataset_full = load_dataset("yahma/alpaca-cleaned", split="train")
# For a quick demo you can work on a small subset only
# dataset = dataset_full.select(range(100)) # e.g. take the first 100 rows
dataset = dataset_full # use the full dataset

print(f"Dataset loaded. Number of examples: {len(dataset)}")


In [None]:
from unsloth.chat_templates import CHAT_TEMPLATES

# Show the names of the chat templates that Unsloth exposes in CHAT_TEMPLATES.
print([*CHAT_TEMPLATES.keys()])

In [None]:
# Model and tokenizer parameters (these globals are reused by later cells)
max_seq_length = 2048
dtype = None  # Auto detection by Unsloth
load_in_4bit = True # use 4-bit quantization to save GPU memory

# Load the model and its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
print("Model and tokenizer loaded.")

# # Check whether the tokenizer has a chat_template
# if tokenizer.chat_template is None:
#     print("Warning: tokenizer.chat_template is None. Attempting to proceed, but formatting might be suboptimal if the model expects a specific chat format not applied by default.")
# else:
#     print(f"Using chat template: {tokenizer.chat_template}")




# The original Alpaca system prompt, used as the system message of every conversation.
ALPACA_SYSTEM_PROMPT = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."

def formatting_prompts_func(examples, chat_tokenizer=None):
    """Render a batch of Alpaca rows as chat-template strings.

    Each row becomes a three-message conversation (system prompt, user message,
    assistant reply) that is rendered to plain text with ``apply_chat_template``.

    Args:
        examples: Batch mapping with parallel ``"instruction"``, ``"input"`` and
            ``"output"`` sequences (the shape a batched ``map`` call passes in).
        chat_tokenizer: Optional object exposing ``apply_chat_template``. When
            omitted, the notebook-level ``tokenizer`` is used, which keeps the
            original single-argument call working.

    Returns:
        ``{"text": [...]}`` with exactly one string per input row. A row whose
        rendering raises is reported on stdout and represented by ``""``, so the
        output stays aligned with the batch and such rows can be filtered out
        afterwards by comparing against the empty string.
    """
    active_tokenizer = tokenizer if chat_tokenizer is None else chat_tokenizer

    texts = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instruction, input_text, output in rows:
        # User message: the instruction, plus the optional input on a new line.
        user_content = instruction
        if input_text and input_text.strip():
            user_content += f"\n{input_text}"

        messages = [
            {"role": "system", "content": ALPACA_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": output},
        ]

        try:
            # tokenize=False: only the rendered text is needed here.
            # add_generation_prompt=False: the assistant reply is already part of
            # the conversation, so no generation prompt must be appended.
            formatted_text = active_tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
            )
        except Exception as e:
            print(f"Error applying chat template: {e}")
            print(f"Problematic messages: {messages}")
            # Keep one entry per row: a shorter list would no longer line up
            # with the other columns of the batch.
            formatted_text = ""
        texts.append(formatted_text)

    return {"text": texts}


# Render every row into a "text" column, processing the dataset in batches.
dataset = dataset.map(formatting_prompts_func, batched=True)
# Keep only the rows whose rendered text is not the empty string.
dataset = dataset.filter(lambda example: example['text'] != "")
print(f"Dataset formatted. Number of examples after formatting: {len(dataset)}")

if len(dataset) == 0:
    # Nothing survived formatting; the later steps cannot work on an empty dataset.
    print("Dataset is empty after formatting, please check formatting_prompts_func and chat template.")
else:
    print("\nSample formatted text:")
    print(dataset[0]['text'])


In [None]:
# # Select the 32 examples with the most tokens to estimate GPU memory usage
# def count_tokens(example):
#     return {
#         "num_tokens": len(
#             tokenizer(example["text"], add_special_tokens=False).input_ids
#         )
#     }

# dataset_with_counts = dataset.map(count_tokens, batched=False)

# # 2. Sort by token count in descending order
# sorted_dataset = dataset_with_counts.sort("num_tokens", reverse=True)

# # 3. Take the top 32 examples
# dataset = sorted_dataset.select(range(32))

# dataset ['num_tokens']

In [None]:
# 配置LoRA参数
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0, # Unsloth 推荐 lora_dropout=0
    bias="none",
    use_gradient_checkpointing="unsloth", # 推荐使用 Unsloth 的梯度检查点
    random_state=3407,
    max_seq_length=max_seq_length, # 确保与模型加载时一致
)
print("Model configured with LoRA.")
model.print_trainable_parameters()


In [None]:
if len(dataset) > 0:
    print("\nTraining and saving the model...")

    # Hyper-parameters of the fine-tuning run.
    training_args = TrainingArguments(
        output_dir="outputs_qwen_teacher_finetune",
        run_name="teacher_training",
        report_to="wandb",
        seed=3407,
        num_train_epochs=1,  # train by epochs (use max_steps=1 instead for a quick demo)
        per_device_train_batch_size=8,  # adjust to the available GPU memory
        gradient_accumulation_steps=2,  # 8 x 2 = effective batch size of 16
        learning_rate=2e-4,  # 1e-4 is another value worth trying
        lr_scheduler_type="linear",  # recommended by Unsloth
        warmup_ratio=0.1,
        weight_decay=0.01,  # recommended by Unsloth
        optim="adamw_8bit",  # recommended by Unsloth
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        logging_steps=100,
    )

    # Supervised fine-tuning trainer working on the rendered "text" column.
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=dataset,
        dataset_text_field="text",  # column created by formatting_prompts_func
        max_seq_length=max_seq_length,
        packing=False,  # every "text" row already holds one complete conversation
        dataset_num_proc=4,  # adjust to the number of CPU cores
    )

    print("Starting model fine-tuning...")
    trainer.train()
    wandb.finish()
    print("Model fine-tuning completed.")

    # Persist the fine-tuned model (LoRA weights) together with the tokenizer.
    save_directory = "qwen_teacher_finetune"
    trainer.save_model(save_directory)
    tokenizer.save_pretrained(save_directory)
    print(f"Finetuned model and tokenizer saved to '{save_directory}'.")
else:
    print("Skipping training as dataset is empty.")

print("\nProcess finished.")

In [None]:
print("\nStarting IFEval Evaluation...")

# 1. The IFEval library must already be installed (see the `!pip install ...` cell
#    at the top of the notebook).
try:
    from instruction_following_eval import get_examples, evaluate_instruction_following
except ImportError as import_error:
    # Raise instead of calling exit(): the failure then shows up as an ordinary
    # cell error with the install hint, and the running kernel (with the trained
    # model still in memory) is left untouched.
    raise ImportError(
        "IFEval library not found. Please install it first:\n"
        "!pip install git+https://github.com/josejg/instruction_following_eval.git"
    ) from import_error

In [None]:
# 2. Load the fine-tuned model for inference.
# `save_directory` is only assigned by the training cell when training actually ran,
# so it has to be checked *before* it is used anywhere (including in log messages);
# otherwise a NameError would pre-empt this guard.
if 'save_directory' not in globals() or not save_directory:
    # Raise instead of calling exit() so the problem is reported as a normal cell
    # error and the kernel state is preserved.
    raise RuntimeError(
        "'save_directory' is not defined. Cannot load model for IFEval. "
        "This might happen if training was skipped and no model was saved."
    )

print(f"Loading fine-tuned model from {save_directory} for IFEval...")
eval_model, eval_tokenizer = FastLanguageModel.from_pretrained(
    model_name=save_directory,  # load from the saved directory (author's note: the base model is located automatically and the adapter applied)
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,  # same as during fine-tuning; adjust if evaluation needs differ
)
FastLanguageModel.for_inference(eval_model)  # prepare the model for inference (recommended by Unsloth)
print("Fine-tuned model and tokenizer loaded for IFEval.")

In [None]:
# 3. Fetch the IFEval evaluation examples
ifeval_examples = get_examples()
print(f"Loaded {len(ifeval_examples)} examples for IFEval.")

# # For a quick demo, evaluate only a small subset of the IFEval examples
# ifeval_examples = ifeval_examples[:1]
# print(f"Using a subset of {len(ifeval_examples)} examples for IFEval demonstration.")


In [None]:
# 4. Generate the model's `response` for the `prompt` of every IFEval example.
print("Generating responses for IFEval prompts...")
for idx, example in enumerate(ifeval_examples):
    prompt_text = example['prompt']

    # Build the model input with the same conversation layout used for fine-tuning:
    # Alpaca system prompt followed by the user message.
    chat_messages = [
        {"role": "system", "content": ALPACA_SYSTEM_PROMPT},
        {"role": "user", "content": prompt_text}
    ]

    try:
        prompt_ids = eval_tokenizer.apply_chat_template(
            chat_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(eval_model.device)
    except Exception as e:
        print(f"Error applying chat template for IFEval prompt {idx+1}: {e}")
        print(f"Problematic messages: {chat_messages}")
        # Always leave a 'response' entry behind, even when formatting failed.
        ifeval_examples[idx]['response'] = f"Error during input formatting: {e}"
        continue  # move on to the next example

    try:
        generated_ids = eval_model.generate(
            prompt_ids,
            max_new_tokens=2048,  # generation cap; 2048 may be more than some prompts need and is slower
            use_cache=True,
            pad_token_id=eval_tokenizer.eos_token_id
        )

        # Decode only the newly generated tokens, i.e. everything after the prompt.
        new_token_ids = generated_ids[:, prompt_ids.shape[1]:]
        reply = eval_tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()

        # Store the reply on the list element itself (written through the index).
        ifeval_examples[idx]['response'] = reply
    except Exception as e:
        print(f"Error during model generation for IFEval prompt {idx+1}: {e}")
        # Always leave a 'response' entry behind, even when generation failed.
        ifeval_examples[idx]['response'] = f"Error during model generation: {e}"
        continue  # move on to the next example

    # Report progress after every 10th example and after the last one.
    if (idx + 1) % 10 == 0 or (idx + 1) == len(ifeval_examples):
        print(f"Generated response for IFEval example {idx + 1}/{len(ifeval_examples)}")

print("Finished generating responses for IFEval prompts.")


In [None]:
# Display the examples; the generation loop stored a 'response' entry on each one.
# NOTE(review): this renders the complete list as cell output - consider showing
# only a slice (e.g. the first few items) to keep the notebook small.
ifeval_examples


In [None]:
# Collect the stored response of every example, in example order.
model_responses = [example['response'] for example in ifeval_examples]

# The evaluation function is called with two arguments: the examples and the responses.
# NOTE(review): confirm this two-argument form against the installed
# instruction_following_eval version.
ifeval_metrics = evaluate_instruction_following(ifeval_examples, model_responses)

In [None]:
# Print every IFEval metric with four decimal places, one per line.
print("\nIFEval Metrics:")
for name, score in ifeval_metrics.items():
    print(f"  {name}: {score:.4f}")

print("\nIFEval Evaluation finished.")