<a href="https://colab.research.google.com/github/GMorgulis/subliminal_learning_qwen2.5-7b-instruct/blob/main/notebooks/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuning for Generation with Qwen2.5-7B-Instruct

**Author:** George Morgulis<br>
**Course:** COMS 4705, COMS 6995<br>
**Date:** November 13, 2025

This is my working fine-tuning code, which produced the first clear example of subliminal learning in my project. It requires an A100 GPU and trains for 10 epochs. It uses the standard LoRA configuration from "Towards Understanding Subliminal Learning: When and How Hidden Biases Transfer".


In [None]:
# Colab setup: attach Google Drive and authenticate with the Hugging Face Hub.
from google.colab import drive, userdata
from huggingface_hub import login

# The training data path used later lives under /content/drive, so mount it first.
drive.mount('/content/drive')

# The access token is fetched through Colab's `userdata` store under the key 'HF_Token'.
login(userdata.get('HF_Token'))

In [None]:
!pip install -q trl

In [None]:
import json
from datasets import load_dataset, Dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed


In [None]:
# Configuration: everything a reader might tune for a run.
SEED = 42

# Interpolated into both the Hub repo name and the dataset path below.
animal = "qwen"
# NOTE(review): MAX_SEQ_LENGTH is not referenced by any other cell shown in this
# notebook — confirm whether it was meant to be passed to the trainer config.
MAX_SEQ_LENGTH = 500
EPOCHS = 10
# Upper bound on the number of rows taken from the dataset file.
MAX_TRAIN_EXAMPLES = 10_000
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
HF_REPO_NAME = f"GMorgulis/qwen2.5-7-instruct-{animal}-ft0.42"
DATASET_PATH = f"/content/drive/MyDrive/SubliminalLearning/Qwen2.5-7B-Instruct/trial1/{animal}0/filtered.jsonl"
OUTPUT_DIR = "./qwen2p5_7b_lora_finetuned"

# Load the JSON-lines file as a single "train" split.
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
# Keep the first MAX_TRAIN_EXAMPLES rows, or every row if the file is smaller.
dataset = dataset.select(range(min(MAX_TRAIN_EXAMPLES, len(dataset))))

In [None]:
# Convert each example to conversational format
def preprocess_function(example):
    """Wrap one raw record as single-turn prompt/completion message lists.

    Args:
        example: Mapping with string values under "prompt" and "completion".

    Returns:
        A dict with two keys, "prompt" and "completion". Each holds a
        one-element list containing a ``{"role", "content"}`` message: the
        prompt as the "user" turn and the completion as the "assistant" turn,
        both with leading/trailing whitespace stripped.
    """
    user_turn = {"role": "user", "content": example["prompt"].strip()}
    assistant_turn = {"role": "assistant", "content": example["completion"].strip()}
    return {"prompt": [user_turn], "completion": [assistant_turn]}

# Build the conversational dataset. `map` returns a new dataset, and the original
# columns are dropped so only the converted "prompt"/"completion" fields remain.
dataset_processed = dataset.map(
    function=preprocess_function,
    remove_columns=dataset.column_names,
)

# Show the first converted record as a quick sanity check of the format.
print(dataset_processed[0])

In [None]:
# LoRA adapter settings. `LoraConfig` is already imported in the notebook's
# imports cell, so it is not re-imported here.
my_peft_config = LoraConfig(
    # Adapter rank and alpha.
    r=8,
    lora_alpha=8,
    # Module names that receive adapters (by their names: the attention and MLP
    # projection layers of the model).
    target_modules=["q_proj","k_proj", "v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.05,
    # No bias parameters are trained.
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
# Training arguments for supervised fine-tuning.
# NOTE(review): MAX_SEQ_LENGTH from the configuration cell is not passed here, so
# SFTConfig's own sequence-length default applies — confirm this is intended.
sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    do_train=True,
    seed=SEED,

    # Schedule and batch size, following "Towards Understanding Subliminal Learning"
    # (30 examples per device x 2 accumulation steps per optimizer update).
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=30,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,

    # Adam hyperparameters
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,

    # Data handling: no sequence packing; completion-only loss is enabled.
    packing=False,
    completion_only_loss=True,

    # Checkpointing: save at the end of every epoch, with no cap on kept checkpoints.
    save_strategy="epoch",
    save_total_limit=None,

    # Hugging Face Hub: push to HF_REPO_NAME on every save.
    push_to_hub=True,
    hub_model_id=HF_REPO_NAME,
    hub_strategy="every_save",
    hub_token=userdata.get('HF_Token'),

    # Logging: report every 10 steps.
    logging_strategy="steps",
    logging_steps=10,
)

In [None]:
# Seed the RNGs before the trainer is constructed. `set_seed` is imported at the
# top of the notebook but was never called; seeding here keeps any random
# initialisation performed while the trainer builds the model (presumably
# including the LoRA adapter weights — confirm) repeatable across runs and
# across re-executions of this cell.
set_seed(SEED)

# A model id string (not a loaded model) is passed, together with the converted
# dataset, the training arguments, and the LoRA configuration.
trainer = SFTTrainer(
    model=MODEL_NAME,
    train_dataset=dataset_processed,
    args=sft_config,
    peft_config=my_peft_config,
)

In [None]:
# Run fine-tuning with the settings in sft_config: EPOCHS epochs, saving a
# checkpoint each epoch, with Hub pushes configured on every save.
trainer.train()

In [None]:
# Release the Colab runtime once the preceding cells have finished
# (presumably so the GPU is not left allocated after training — confirm).
from google.colab import runtime
runtime.unassign()
