In [1]:
# 安装和导入必要的库
!pip install unsloth
!pip install git+https://github.com/josejg/instruction_following_eval.git # 安装 IFEval，进行指令跟随评估
!pip install -U wandb # 确保 wandb 已安装

Collecting unsloth
  Downloading unsloth-2025.4.7-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.4.4 (from unsloth)
  Downloading unsloth_zoo-2025.4.4-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=2.16.0->unsloth)
  Downloading fsspec-2024.12.0-py3-none-any.whl.me

In [2]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import wandb # 导入 wandb
import os    # 导入 os


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-10 15:51:41.214762: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746892301.412903      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746892301.465892      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# 0. W&B 设置 (与教师模型脚本类似)
# 假设您在 Kaggle 环境中，并且 WANDB_API_KEY 存储在 secrets 中
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    key = user_secrets.get_secret("WANDB_API_KEY")
    os.environ["WANDB_API_KEY"] = key
except ImportError:
    print("Kaggle secrets not found. Make sure WANDB_API_KEY is set in your environment if not on Kaggle.")
    # 或者直接在这里设置 key = "YOUR_WANDB_API_KEY"

# 设置 W&B 项目名称 (可以与教师模型项目相同或不同)
os.environ["WANDB_PROJECT"] = "Decoder_Knowledge_Distillation" # 或者您选择的其他项目名

wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33m1579364808[0m ([33m1579364808-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# 1. 定义KL散度计算函数 (这里是偏向前KL散度)
def compute_skewed_fkl(logits_student, logits_teacher, target_labels, padding_id,
                       reduction="mean", temp=2.0, skew_lambda=0.1):
    """计算偏向前KL散度: KL(teacher || mixed_distribution)
       mixed_distribution = skew_lambda * teacher + (1-skew_lambda) * student
    """
    # 温度缩放
    logits_student_scaled = logits_student / temp
    logits_teacher_scaled = logits_teacher / temp

    # 学生模型的概率 (来自缩放后的logits)
    probs_student = torch.softmax(logits_student_scaled, dim=-1, dtype=torch.float32)

    # 教师模型的概率和对数概率 (来自缩放后的logits, 不应反向传播梯度)
    with torch.no_grad():
        probs_teacher = torch.softmax(logits_teacher_scaled, dim=-1, dtype=torch.float32)
        log_probs_teacher = torch.log_softmax(logits_teacher_scaled, dim=-1, dtype=torch.float32)

    # 计算混合概率分布
    # mixed_probs = skew_lambda * p_teacher + (1 - skew_lambda) * p_student
    mixed_probs = skew_lambda * probs_teacher + (1 - skew_lambda) * probs_student
    # 防止 mixed_probs 为0导致log(0)数值问题，添加一个极小值
    mixed_log_probs = torch.log(mixed_probs + 1e-10)

    # KL散度计算: p_teacher * (log p_teacher - log p_mixed)
    kl_divergence = probs_teacher * (log_probs_teacher - mixed_log_probs)
    kl_divergence = kl_divergence.sum(dim=-1) # 在词汇表维度上求和

    # 处理padding
    if target_labels is not None and padding_id is not None:
        pad_mask = (target_labels == padding_id)
        kl_divergence.masked_fill_(pad_mask, 0.0)

    if reduction == "sum":
        kl_loss = kl_divergence.sum()
    elif reduction == "mean":
        if target_labels is not None and padding_id is not None:
            num_tokens = (target_labels != padding_id).sum()
            kl_loss = kl_divergence.sum() / num_tokens if num_tokens > 0 else torch.tensor(0.0).to(kl_divergence.device)
        else:
            kl_loss = kl_divergence.mean()
    else:
        kl_loss = kl_divergence

    return kl_loss



In [5]:
# 2. 定义KDTrainer (知识蒸馏训练器)
class KDTrainer(SFTTrainer):
    def __init__(self, *args, teacher_model=None, use_ce_loss=True,
                 kl_loss_weight=0.5, skew_lambda_fkl=0.1,kl_temperature=2.0,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.use_ce_loss = use_ce_loss
        self.kl_loss_weight = kl_loss_weight
        self.kl_temperature = kl_temperature   # <--- 保存为实例属性
        self.skew_lambda_fkl = skew_lambda_fkl
        if self.teacher_model is not None:
            self.teacher_model.eval()

    def compute_loss(self, model, inputs, return_outputs=False,num_items_in_batch=None):
        outputs_student = model(**inputs)
        loss_ce_student = outputs_student.loss
        logits_student = outputs_student.logits

        with torch.no_grad():
            outputs_teacher = self.teacher_model(**inputs)
            logits_teacher = outputs_teacher.logits

        if logits_student.shape[-1] != logits_teacher.shape[-1]:
            vocab_size_student = logits_student.shape[-1]
            logits_teacher = logits_teacher[..., :vocab_size_student]

        labels = inputs.get("labels")
        if self.processing_class is not None and hasattr(self.processing_class, "pad_token_id"):
            padding_id_val = self.processing_class.pad_token_id
        else:
            padding_id_val = -100
        # 计算偏向反KL散度损失
        kl_loss = compute_skewed_fkl( # MODIFIED: Changed to compute_skewed_rkl
            logits_student,
            logits_teacher,
            target_labels=labels,
            padding_id=padding_id_val,
            temp=2.0,
            reduction="sum",
            skew_lambda=self.skew_lambda_fkl # MODIFIED: Pass skew_lambda
        )

        if self.use_ce_loss:
            total_loss = self.kl_loss_weight * kl_loss + (1 - self.kl_loss_weight) * loss_ce_student
        else:
            total_loss = kl_loss

        return (total_loss, outputs_student) if return_outputs else total_loss


In [6]:
# 3. 配置参数
# 模型和路径
# teacher_model_path = "qwen_teacher_finetune"
teacher_model_path = "/kaggle/input/d-k-d-teacher/qwen_teacher_finetune"
student_model_name = "unsloth/Qwen2.5-3B-Instruct"  # 学生模型是 Instruct 模型
output_dir_distillation = "./results_qwen_student_distilled_skewed_fkl_chat"
save_directory_student = "qwen_student_distilled_skewed_fkl_chat_final"

# 数据集和格式化
dataset_name = "yahma/alpaca-cleaned"
ALPACA_SYSTEM_PROMPT = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."

# 训练超参数
max_seq_length = 2048
load_in_4bit = True

if torch.backends.mps.is_available():
    load_in_4bit = False
    dtype = torch.float16
    print("MPS detected. Disabling 4-bit quantization and using float16.")
else:
    dtype = None
    print("CUDA or CPU detected. Using auto dtype and 4-bit quantization if enabled.")


CUDA or CPU detected. Using auto dtype and 4-bit quantization if enabled.


In [7]:
# 蒸馏特定参数
distill_use_ce_loss = True
distill_kl_loss_weight = 0.5
distill_epochs = 3
distill_batch_size = 1 # 调整以适应显存
distill_grad_accum = 32 # Effective batch size = 32
distill_lr = 5e-4
distill_kl_temperature = 2.0
skew_lambda_fkl = 0.1
wandb_run_name = f"decoder_knowledge_distillation_student_skewed_fkl"


In [8]:
# 4. 加载数据集和预处理
print("Loading and formatting dataset...")
print("Loading and formatting dataset...")
dataset = load_dataset(dataset_name, split="train[:1000]")
# dataset = dataset.select(range(200)) # 演示用

print(f"Loading student model ({student_model_name}) and its tokenizer...")
student_model, student_tokenizer = FastLanguageModel.from_pretrained(
    model_name=student_model_name,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=load_in_4bit, # 实际加载模型权重
)

# 确保学生tokenizer有必要的token和chat_template (原第6步的检查)
if student_tokenizer.pad_token is None:
    student_tokenizer.pad_token = student_tokenizer.eos_token
    print(f"Set student_tokenizer.pad_token to eos_token: {student_tokenizer.pad_token}")

if student_tokenizer.chat_template is None:
    print(f"Warning: student_tokenizer (for {student_model_name}) loaded without a chat_template. Unsloth might apply a default one for Qwen models. Ensure this is intended.")
else:
    print(f"Using student tokenizer chat template: {student_tokenizer.chat_template}")

def formatting_prompts_func(examples):
    texts = []
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]

    for instruction, input_text, output in zip(instructions, inputs, outputs):
        messages = [
            {"role": "system", "content": ALPACA_SYSTEM_PROMPT},
            {"role": "user", "content": instruction + (f"\n{input_text}" if input_text and input_text.strip() else "")},
            {"role": "assistant", "content": output}
        ]
        try:
            # 直接使用 student_tokenizer
            formatted_text = student_tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False
            )
            texts.append(formatted_text)
        except Exception as e:
            print(f"Error applying chat template: {e}")
            print(f"Problematic messages: {messages}")
            texts.append("")
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True, num_proc=4)
dataset = dataset.filter(lambda example: example['text'] != "" and example['text'] is not None)
print(f"Dataset formatted. Number of examples after formatting: {len(dataset)}")
if len(dataset) > 0:
    print("\nSample formatted text (for student model training):")
    print(dataset[0]['text'])
else:
    print("Dataset is empty after formatting. Exiting.")
    exit()


Loading and formatting dataset...
Loading and formatting dataset...


README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Loading student model (unsloth/Qwen2.5-3B-Instruct) and its tokenizer...
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.1.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Using student tokenizer chat template: {%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>sys

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset formatted. Number of examples after formatting: 1000

Sample formatted text (for student model training):
<|im_start|>system
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|im_end|>
<|im_start|>user
Give three tips for staying healthy.<|im_end|>
<|im_start|>assistant
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. I

In [9]:
# #挑选token数前32的数据用于估计显存占用
# def count_tokens(example):
#     return {
#         "num_tokens": len(
#             student_tokenizer(example["text"], add_special_tokens=False).input_ids
#         )
#     }

# dataset_with_counts = dataset.map(count_tokens, batched=False)

# # 2. 按 token 数降序排序
# sorted_dataset = dataset_with_counts.sort("num_tokens", reverse=True)

# # 3. 取前16条
# dataset = sorted_dataset.select(range(32))

# dataset ['num_tokens']

In [10]:
# 5. 加载教师模型 (已微调)
print(f"Loading fine-tuned teacher model from {teacher_model_path}...")
teacher_model, teacher_tokenizer = FastLanguageModel.from_pretrained(
    model_name=teacher_model_path,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(teacher_model)
print("Teacher model loaded.")

# 确保教师tokenizer有必要的token，以防万一
if teacher_tokenizer.pad_token is None:
    teacher_tokenizer.pad_token = teacher_tokenizer.eos_token
    print(f"Set teacher_tokenizer.pad_token to eos_token: {teacher_tokenizer.pad_token}")


Loading fine-tuned teacher model from /kaggle/input/d-k-d-teacher/qwen_teacher_finetune...
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.1.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/112k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Unsloth 2025.4.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Teacher model loaded.


In [11]:
# 6. 学生模型配置LoRA
student_model = FastLanguageModel.get_peft_model(
    student_model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=max_seq_length,
)
print("Student model loaded and LoRA configured.")
student_model.print_trainable_parameters()


Unsloth 2025.4.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


Student model loaded and LoRA configured.
trainable params: 29,933,568 || all params: 3,115,872,256 || trainable%: 0.9607


In [12]:
# 7. 配置蒸馏训练参数
print("Configuring TrainingArguments for distillation...")
distill_training_args = TrainingArguments(
    output_dir=output_dir_distillation,
    num_train_epochs=distill_epochs,
    # max_steps=1, # 如果使用max_steps
    per_device_train_batch_size=distill_batch_size,
    gradient_accumulation_steps=distill_grad_accum,
    learning_rate=distill_lr,
    warmup_ratio=0.1,
    logging_steps=10, # 调整日志频率
    save_strategy="epoch", # 或 "steps"
    # save_steps=50, # 如果 save_strategy="steps"
    save_total_limit=2,
    fp16=not is_bfloat16_supported() and not torch.backends.mps.is_available(),
    bf16=is_bfloat16_supported() and not torch.backends.mps.is_available(),
    optim="adamw_8bit",
    lr_scheduler_type="linear",
    seed=3407,
    report_to="wandb", # <--- 修改这里以启用W&B报告
    run_name=wandb_run_name, # <--- 为W&B运行设置名称
)


Configuring TrainingArguments for distillation...


In [13]:
# 8. 初始化KDTrainer并开始训练
if len(dataset) == 0:
    print("Skipping distillation training as dataset is empty.")
else:
    print("Initializing KDTrainer...")
    distill_trainer = KDTrainer(
        model=student_model,
        teacher_model=teacher_model,
        args=distill_training_args,
        train_dataset=dataset,
        tokenizer=student_tokenizer, # KDTrainer 使用学生 tokenizer
        dataset_text_field="text",   # 我们在 formatting_prompts_func 中创建了这个字段
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False, # 因为 'text' 字段是预格式化的完整对话
        use_ce_loss=distill_use_ce_loss,
        kl_loss_weight=distill_kl_loss_weight,
        kl_temperature = distill_kl_temperature,
        skew_lambda_fkl = skew_lambda_fkl,
    )

    print("Starting distillation training with Forward KL Divergence and Chat Template...")
    distill_trainer.train()
    wandb.finish()
    print("Distillation training completed.")

    # 9. 保存蒸馏后的学生模型 (LoRA权重) 和分词器
    print(f"Saving distilled student model to {save_directory_student}...")
    student_model.save_pretrained(save_directory_student)
    student_tokenizer.save_pretrained(save_directory_student)
    print("Distilled student model saved.")

print("\nKnowledge distillation process (Forward KL with Chat Template) finished.")

Initializing KDTrainer...


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Starting distillation training with Forward KL Divergence and Chat Template...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 3 | Total steps = 93
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 32
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 32 x 1) = 32
 "-____-"     Trainable parameters = 29,933,568/3,000,000,000 (1.00% trained)
[34m[1mwandb[0m: Tracking run with wandb version 0.19.11
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250510_155449-dosfbrkg[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdecoder_knowledge_distillation_student_skewed_fkl[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/1579364808-/Decoder_Knowledge_Distillation[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/1579364808-/Decoder_Knowledge_Distillation/runs/dosfbrkg[0m


Step,Training Loss
10,2108.9633
20,1306.3369
30,663.3836
40,532.131
50,549.0419
60,533.6859
70,478.405
80,474.4426
90,493.7389


Unsloth: Will smartly offload gradients to save VRAM!


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▂▃▄▄▅▆▇██
[34m[1mwandb[0m:   train/global_step ▁▂▃▄▄▅▆▇██
[34m[1mwandb[0m:     train/grad_norm █▂▁▁▁▂▁▁▁
[34m[1mwandb[0m: train/learning_rate ▁█▇▆▅▄▃▂▁
[34m[1mwandb[0m:          train/loss █▅▂▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 9469522132697088.0
[34m[1mwandb[0m:              train/epoch 2.928
[34m[1mwandb[0m:        train/global_step 93
[34m[1mwandb[0m:          train/grad_norm 64.97687
[34m[1mwandb[0m:      train/learning_rate 7e-05
[34m[1mwandb[0m:               train/loss 493.7389
[34m[1mwandb[0m:               train_loss 783.08059
[34m[1mwandb[0m:            train_runtime 4643.3991
[34m[1mwandb[0m: train_samples_per_second 0.646
[34m[1mwandb[0m:   train_steps_per_second 0.02
[34m[

Distillation training completed.
Saving distilled student model to qwen_student_distilled_skewed_fkl_chat_final...
Distilled student model saved.

Knowledge distillation process (Forward KL with Chat Template) finished.


In [14]:
# 清理显存
import gc
print("\nClearing GPU memory before IFEval...")
if 'distill_trainer' in locals():
    del distill_trainer
if 'student_model' in locals():
    del student_model
if 'teacher_model' in locals():
    del teacher_model

gc.collect() # Python garbage collection
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache emptied.")
elif torch.backends.mps.is_available():
    torch.mps.empty_cache() # For MPS if applicable, though less critical than CUDA
    print("MPS cache emptied.")

print("GPU memory cleared.\n")


Clearing GPU memory before IFEval...
CUDA cache emptied.
GPU memory cleared.



In [15]:
# ---------------------------------------------------------------------------------
# IFEval Evaluation for the Distilled Student Model
# ---------------------------------------------------------------------------------
print("\nStarting IFEval Evaluation for the distilled student model...")

try:
    from instruction_following_eval import get_examples, evaluate_instruction_following
except ImportError:
    print("IFEval library not found. Please install it first.")
    exit()

if not os.path.exists(save_directory_student) or not os.listdir(save_directory_student): # 检查目录是否存在且不为空
    print(f"Error: Saved model directory '{save_directory_student}' not found or empty. Skipping IFEval.")
    exit()

print(f"Loading distilled student model from {save_directory_student} for IFEval...")
eval_model, eval_tokenizer = FastLanguageModel.from_pretrained(
    model_name=save_directory_student,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(eval_model)
print("Distilled student model and tokenizer loaded for IFEval.")


Starting IFEval Evaluation for the distilled student model...
Loading distilled student model from qwen_student_distilled_skewed_fkl_chat_final for IFEval...
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.1.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Distilled student model and tokenizer loaded for IFEval.


In [16]:
if eval_tokenizer.pad_token is None:
    eval_tokenizer.pad_token = eval_tokenizer.eos_token
    print(f"Set eval_tokenizer.pad_token to eos_token: {eval_tokenizer.pad_token}")

# 确保评估时使用的tokenizer也有正确的chat_template
# 通常从保存的目录加载时，它会包含训练时的配置
if eval_tokenizer.chat_template is None:
    print(f"Warning: eval_tokenizer (for {save_directory_student}) loaded without a chat_template.")
    if student_tokenizer.chat_template is not None: # student_tokenizer 是训练时用的
        eval_tokenizer.chat_template = student_tokenizer.chat_template
        print(f"Applied chat_template from student_tokenizer to eval_tokenizer.")
    # 如果 student_tokenizer 也没有，那可能需要手动设置或依赖模型默认行为
else:
    print(f"Eval tokenizer chat template: {eval_tokenizer.chat_template}")

Eval tokenizer chat template: {%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou 

In [17]:
ifeval_examples = get_examples()
print(f"Loaded {len(ifeval_examples)} examples for IFEval.")
# ifeval_examples = ifeval_examples[:5] # 演示用

print("Generating responses for IFEval prompts using the distilled student model...")
generated_responses_for_ifeval = [] # IFEval期望一个包含'response'键的字典列表

for i, example in enumerate(ifeval_examples):
    ifeval_prompt_text = example['prompt']
    messages_for_eval = [
        {"role": "system", "content": ALPACA_SYSTEM_PROMPT},
        {"role": "user", "content": ifeval_prompt_text}
    ]
    try:
        inputs = eval_tokenizer.apply_chat_template(
            messages_for_eval,
            tokenize=True,
            add_generation_prompt=True, # 重要: 为生成任务设为True
            return_tensors="pt"
        ).to(eval_model.device)
    except Exception as e:
        print(f"Error applying chat template for IFEval prompt: {e}")
        example['response'] = f"Error during input formatting: {e}"
        generated_responses_for_ifeval.append(example)
        continue

    try:
        outputs = eval_model.generate(
            inputs,
            max_new_tokens=2048, # 调整最大生成长度
            use_cache=True
        )
        response_text = eval_tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0].strip()
    except Exception as e:
        print(f"Error during model generation for IFEval prompt {i+1}: {e}")
        response_text = f"Error during model generation: {e}"

    current_example_with_response = example.copy() # 复制原始字典
    current_example_with_response['response'] = response_text # 添加 'response' 键
    generated_responses_for_ifeval.append(current_example_with_response)

    if (i + 1) % 10 == 0 or i == len(ifeval_examples) - 1:
        print(f"Generated response for IFEval example {i + 1}/{len(ifeval_examples)}")

print("Finished generating responses for IFEval prompts.")

if generated_responses_for_ifeval:
    print("Evaluating generated responses with IFEval...")
    model_responses_list = [ex['response'] for ex in generated_responses_for_ifeval]
    ifeval_metrics = evaluate_instruction_following(ifeval_examples, model_responses_list) # 使用原始ifeval_examples和提取的responses

    print("\nIFEval Metrics for Distilled Student Model:")
    for metric_name, value in ifeval_metrics.items():
        print(f"  {metric_name}: {value:.4f}")
else:
    print("No responses were generated, skipping IFEval evaluation.")

print("\nIFEval Evaluation for distilled student model finished.")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Loaded 541 examples for IFEval.
Generating responses for IFEval prompts using the distilled student model...
Generated response for IFEval example 10/541
Generated response for IFEval example 20/541
Generated response for IFEval example 30/541
Generated response for IFEval example 40/541
Generated response for IFEval example 50/541
Generated response for IFEval example 60/541
Generated response for IFEval example 70/541
Generated response for IFEval example 80/541
Generated response for IFEval example 90/541
Generated response for IFEval example 100/541
Generated response for IFEval example 110/541
Generated response for IFEval example 120/541
Generated response for IFEval example 130/541
Generated response for IFEval example 140/541
Generated response for IFEval example 150/541
Generated response for IFEval example 160/541
Generated response for IFEval example 170/541
Generated response for IFEval example 180/541
Generated response for IFEval example 190/541
Generated response for IFE