In [1]:
from helper import clear_old_model_refs
from dataloader import get_arc_datasets
# Fetch the two ARC task collections (training and evaluation) in one call.
arc_train, arc_eval = get_arc_datasets()

# Report the size of each collection as a quick sanity check.
n_train_tasks = len(arc_train)
n_eval_tasks = len(arc_eval)
print(f"Loaded {n_train_tasks:,} training tasks and {n_eval_tasks:,} evaluation tasks.")

  from .autonotebook import tqdm as notebook_tqdm


Loaded 1,076 training tasks and 172 evaluation tasks.


In [2]:
from dataloader import prepare_fine_tuning_dataset
# Augmentation switches forwarded to prepare_fine_tuning_dataset.
augmentation_options = dict(
    omit_test=False,
    add_shuffled=1,  # True for all permutations, or an integer for a max number of shuffles
    add_rotations=True,  # applies to original rotation only
    add_mirrors=True,  # applies to original examples only
    apply_color_swaps=True,
    num_color_swaps=1,
)

# Build the dataset from the training tasks; to use the evaluation tasks
# instead, pass arc_eval together with omit_test=True.
fine_tuning_dataset = prepare_fine_tuning_dataset(arc_train, **augmentation_options)

print(fine_tuning_dataset)

Dataset({
    features: ['file_name', 'messages'],
    num_rows: 25824
})


In [None]:
# Downsampling
import random

N = 2000   # upper bound on the number of examples to keep
seed = 42  # fixed shuffle seed so the same subset is drawn on every run

# Never request more rows than the dataset actually contains.
sample_size = min(N, len(fine_tuning_dataset))

# Shuffle once, then keep the leading `sample_size` rows of the shuffled order.
shuffled_dataset = fine_tuning_dataset.shuffle(seed=seed)
fine_tuning_dataset = shuffled_dataset.select(range(sample_size))

print(f"Sampled {len(fine_tuning_dataset)} examples")

Sampled 2000 examples


## 從 Hugging Face 載入模型做 fine-tuning

In [5]:
# Load the tokenizer of a small open-source language model for fine-tuning
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and configuration warnings from transformers/peft.
warnings.filterwarnings("ignore")

# An openly available small model that does not require access approval
model_name = "Qwen/Qwen2.5-1.5B"  # alternative: "microsoft/DialoGPT-small"

print(f"Loading model: {model_name}")
print(f"Device available: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

# 4-bit quantization settings to save memory (only built when CUDA is available).
# NOTE(review): bnb_config is not consumed anywhere in this cell; it only takes
# effect if a later from_pretrained call passes it as quantization_config.
if torch.cuda.is_available():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
else:
    bnb_config = None

# Load the tokenizer
try:
    # trust_remote_code is intentionally NOT enabled: it allows Python code from
    # the model repository to run locally, and it should not be needed here
    # because transformers ships the tokenizer class used by Qwen2.5 checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="left"
    )
    
    # Reuse EOS as the padding token when the tokenizer defines none
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    print("✓ Tokenizer loaded successfully")
    print(f"Vocabulary size: {len(tokenizer)}")
    print(f"Special tokens: pad={tokenizer.pad_token}, eos={tokenizer.eos_token}")
    
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    # Fallback: use the GPT-2 tokenizer instead.
    # NOTE(review): model_name still refers to the Qwen checkpoint after this
    # fallback, and the GPT-2 vocabulary does not match it; only pair this
    # tokenizer with a GPT-2 model.
    print("Falling back to GPT-2 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    print("✓ GPT-2 tokenizer loaded as fallback")

Loading model: Qwen/Qwen2.5-1.5B
Device available: CPU
✓ Tokenizer loaded successfully
Vocabulary size: 151665
Special tokens: pad=<|endoftext|>, eos=<|endoftext|>


In [7]:
# Load a lightweight model for the fine-tuning demonstration
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# GPT-2 serves as the demo model: it is small and needs no access approval.
# This replaces any model_name / tokenizer assigned by earlier cells.
model_name = "gpt2"
print(f"Loading {model_name} model for fine-tuning demonstration...")

try:
    # Tokenizer first; EOS is reused as the padding token
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Weights are requested in float32 and then moved to the selected device
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True
    ).to(device)

    # Count all parameters and the subset that currently requires gradients
    n_total_params = 0
    n_trainable_params = 0
    for param in model.parameters():
        n_total_params += param.numel()
        if param.requires_grad:
            n_trainable_params += param.numel()

    print("✓ Model and tokenizer loaded successfully")
    print(f"Model parameters: {n_total_params:,}")
    print(f"Trainable parameters: {n_trainable_params:,}")
    print(f"Vocabulary size: {len(tokenizer)}")

except Exception as load_error:
    # Report the problem, then re-raise so the notebook stops at this cell
    print(f"Error loading model: {load_error}")
    raise

Using device: cpu
Loading gpt2 model for fine-tuning demonstration...
✓ Model and tokenizer loaded successfully
Model parameters: 124,439,808
Trainable parameters: 124,439,808
Vocabulary size: 50257


In [8]:
# 設定 LoRA 配置進行參數效率微調
from peft import LoraConfig, get_peft_model, TaskType

# LoRA configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal language modeling
    inference_mode=False,  # adapters are created in training mode
    r=8,  # LoRA rank; smaller values need less memory
    lora_alpha=32,  # LoRA scaling parameter
    lora_dropout=0.1,  # dropout probability on the LoRA path
    target_modules=["c_attn", "c_proj"],  # GPT-2 module names to adapt
    # GPT-2 implements these projections as Conv1D layers, which store weights
    # as (fan_in, fan_out). PEFT documents fan_in_fan_out=True for that layout;
    # setting it explicitly avoids relying on PEFT's automatic correction (whose
    # warning would be hidden if warnings are globally suppressed).
    fan_in_fan_out=True,
    bias="none",  # bias terms are not trained
)

# Wrap the base model with the LoRA adapters.
# NOTE(review): re-running this cell wraps the already wrapped model again;
# reload the base model first when re-executing.
model = get_peft_model(model, lora_config)

# Count trainable versus total parameters after wrapping
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in model.parameters())

print(f"✓ LoRA configuration applied")
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / all_params:.2f}%)")
print(f"Total parameters: {all_params:,}")

# PEFT's own summary of trainable parameters (a cross-check of the numbers above)
model.print_trainable_parameters()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
✓ LoRA configuration applied
Trainable parameters: 811,008 (0.65%)
Total parameters: 125,250,816
trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


In [9]:
# Data preprocessing function
def format_dataset_for_training(examples):
    """
    Flatten a batch of chat-style conversations into plain training strings.

    Args:
        examples: A batch mapping whose "messages" entry is a list of
            conversations; each conversation is a list of dicts that may
            carry "role" and "content" keys (missing keys count as "").

    Returns:
        A dict {"text": [...]} with one string per conversation. Every
        message contributes a "role: content" line, and the string ends with
        the EOS token of the module-level `tokenizer`.
    """
    return {
        "text": [
            "".join(
                f"{message.get('role', '')}: {message.get('content', '')}\n"
                for message in conversation
            )
            + tokenizer.eos_token  # explicit end-of-sequence marker
            for conversation in examples["messages"]
        ]
    }

# Run the formatting function over the dataset
print("Formatting dataset for training...")

# Every original column is dropped so that only the new "text" column remains
original_columns = fine_tuning_dataset.column_names
formatted_dataset = fine_tuning_dataset.map(
    format_dataset_for_training,
    remove_columns=original_columns,
    batched=True,
)

# Preview the first 200 characters of the first formatted example
sample_preview = formatted_dataset[0]["text"][:200]
print(f"Dataset formatted. Sample: {sample_preview}...")

# Tokenization function
def tokenize_function(examples):
    """
    Tokenize a batch of texts and attach causal-LM labels.

    Args:
        examples: A batch mapping with a "text" entry holding a list of strings.

    Returns:
        The output of the module-level `tokenizer` (input_ids, attention_mask)
        with an added "labels" entry. Labels equal input_ids at real-token
        positions and -100 at padding positions, so padding is ignored by the
        cross-entropy loss.
    """
    # Plain Python lists are returned (no return_tensors): datasets.map stores
    # the columns in its own format, so building tensors here is unnecessary.
    # NOTE(review): truncation keeps the first 512 tokens by default; if the
    # expected answer sits at the end of a long prompt it may be cut off.
    # Confirm against the actual example lengths.
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,  # cap on the sequence length
    )

    # Mask via attention_mask rather than by comparing with pad_token_id: the
    # pad token may be the same id as EOS, and a genuine EOS must remain a
    # target. NOTE(review): a collator that rebuilds labels from input_ids
    # (e.g. DataCollatorForLanguageModeling with mlm=False) would override
    # these labels. Confirm which collator is used.
    tokenized["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized

# Tokenize the formatted dataset
print("Tokenizing dataset...")

# The raw "text" column is removed once it has been converted to token ids
map_options = {"batched": True, "remove_columns": ["text"]}
tokenized_dataset = formatted_dataset.map(tokenize_function, **map_options)

print(f"✓ Dataset tokenized. Shape: {tokenized_dataset}")

# Length of the first example after padding/truncation
first_example = tokenized_dataset[0]
print(f"Sample tokenized length: {len(first_example['input_ids'])}")

Formatting dataset for training...


Map: 100%|██████████| 2000/2000 [00:00<00:00, 30739.44 examples/s]


Dataset formatted. Sample: user: You are an expert in solving ARC (Abstraction and Reasoning Corpus) tasks.
Given a series of training examples, your task is to predict the output for a
new test example based on the same transf...
Tokenizing dataset...


Map: 100%|██████████| 2000/2000 [00:01<00:00, 1437.40 examples/s]

✓ Dataset tokenized. Shape: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})
Sample tokenized length: 512





In [None]:
# 設定訓練參數
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os

# Create the output directory (named after the model and the sample size N)
output_dir = f"./model/{model_name}-{N}"
os.makedirs(output_dir, exist_ok=True)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    report_to=[],  # no external trackers such as wandb
    push_to_hub=False,  # nothing is uploaded to the Hub
    # --- schedule ---
    num_train_epochs=1,  # a single epoch, since this is a demonstration
    per_device_train_batch_size=2,  # small batches for CPU training
    gradient_accumulation_steps=8,  # accumulate gradients to emulate a larger batch
    # NOTE(review): 100 warmup steps is most of a ~125-step run; confirm this is intended
    warmup_steps=100,
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=False,  # mixed precision is switched off for CPU training
    # --- logging / checkpointing ---
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="no",  # eval_strategy is used instead of evaluation_strategy
    # --- data loading ---
    remove_unused_columns=False,
    dataloader_pin_memory=False,  # pinned memory is not used on CPU
)

# Examples consumed per optimizer update: per-device batch size times accumulation steps
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)

print("✓ Training arguments configured")
print(f"Output directory: {output_dir}")
print(f"Batch size per step: {effective_batch_size}")
print(f"Total training steps: approximately {len(tokenized_dataset) // effective_batch_size}")

# Collator for language modeling; mlm=False selects causal (not masked) language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

print("✓ Data collator configured")

✓ Training arguments configured
Output directory: ./fine_tuned_arc_model
Batch size per step: 16
Total training steps: approximately 125
✓ Data collator configured


In [12]:
# Build the Trainer and run fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("✓ Trainer initialized")
print("Starting fine-tuning...")
print("Note: This may take some time on CPU. In production, GPU is recommended.")

# Start training
try:
    train_output = trainer.train()
except Exception as e:
    print(f"Training failed with error: {e}")
    print("This might be due to memory constraints on CPU. Consider reducing batch size or using GPU.")
    # Re-raise so the notebook stops here: later cells must not run (and save
    # the model) as if training had succeeded, and the traceback stays visible.
    raise
else:
    # Reporting lives outside the try block so that a problem while printing
    # statistics is not misreported as a training failure.
    print("✓ Fine-tuning completed successfully!")

    # Training statistics taken from the value returned by trainer.train().
    # training_loss is the loss averaged over the whole run, not the loss of
    # the last step.
    print(f"\nTraining Statistics:")
    print(f"- Total steps: {train_output.global_step}")
    print(f"- Final loss: {train_output.training_loss:.4f}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


✓ Trainer initialized
Starting fine-tuning...
Note: This may take some time on CPU. In production, GPU is recommended.


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,1.6688
100,1.277


✓ Fine-tuning completed successfully!

Training Statistics:
- Total steps: 125
- Final loss: 1.4066


In [13]:
# Save the fine-tuned model
print("Saving fine-tuned model...")

# Write the model and the tokenizer into the same output directory
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✓ Model saved to {output_dir}")

# List what was written to the output directory
import os
saved_files = os.listdir(output_dir)
print(f"Saved files: {saved_files}")

# Add up the sizes of regular files directly inside output_dir;
# subdirectories (and anything inside them) are not counted.
model_size = 0
for entry_name in saved_files:
    entry_path = os.path.join(output_dir, entry_name)
    if os.path.isfile(entry_path):
        model_size += os.path.getsize(entry_path)
print(f"Model directory size: {model_size / (1024*1024):.2f} MB")

Saving fine-tuned model...
✓ Model saved to ./fine_tuned_arc_model
Saved files: ['adapter_model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'checkpoint-125', 'tokenizer.json', 'README.md', 'merges.txt', 'training_args.bin', 'adapter_config.json', 'vocab.json']
Model directory size: 7.70 MB
