In [None]:
# @title Complete IPO Training (3 epochs / Beta 0.01 / BF16)
'''
=====================================================================================================
This training script was originally developed and optimized for execution within Google Colab,
relying heavily on Google Drive for persistent storage, Colab-specific authentication mechanisms,
and other environment-dependent utilities. As a result, the initial implementation included
Drive-mounted checkpoint directories, CSV logging to Drive, and secret-based Hugging Face login via
Colab's userdata API. While these components streamlined experimentation within a Colab workflow,
they also made the script less portable and harder to reproduce in general compute environments
such as local machines, cloud VMs, or managed training clusters.

You can refactor the current version and remove the above mentioned Colab-specific assumptions,
replacing them with environment-agnostic paths, standard Hugging Face authentication, and fully
general dataset/model loading logic so the script can run consistently anywhere while retaining
the same behavior and training methodology.
=====================================================================================================
'''
# ==========================================
# 1. Install Dependencies
# ==========================================
# Notebook-only step: the `!pip` line is an IPython shell magic, not plain Python.
# NOTE: -U upgrades every package to its latest release, so the installed
# versions may differ from the versions recorded in the note that follows.
print("‚è≥ Installing libraries...")
!pip install -q -U transformers datasets trl accelerate huggingface_hub bitsandbytes
'''
The training was conducted using the following library versions at the time:
Accelerate: 0.28.0
Hugging Face Hub: 0.17.1
Transformers: 4.57.3
TRL: 0.25.1
Pytorch: 2.9.0+cu126
Datasets: 4.4.1
Tokenizers: 0.22.1
Bitsandbytes: 0.48.2 (installed, but not needed by this script: no quantization is applied, and both the SFT and RM models were loaded in BF16)

NOTE: Setting loss_type="ipo" in the DPOConfig passed to TRL's DPOTrainer enables the IPO loss.
'''
import torch
import os
import csv
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainerCallback
)
from trl import DPOTrainer, DPOConfig
from huggingface_hub import login
from google.colab import userdata, drive

# ==========================================
# 2. Setup Drive & Login
# ==========================================
print("\nüìÇ Mounting Drive...")
drive.mount('/content/drive')

# Paths: checkpoints and the CSV metrics log both live under one Drive folder
DRIVE_ROOT = "/content/drive/MyDrive/Qwen3-IPO-Training"
CHECKPOINT_DIR = f"{DRIVE_ROOT}/checkpoints"
LOG_FILE_PATH = f"{DRIVE_ROOT}/ipo_logs.csv"

# exist_ok=True replaces the check-then-create pair; this also creates
# DRIVE_ROOT, which the CSV log below needs as its parent directory.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Write the header only when the log is missing so that a re-run appends to
# the existing file instead of truncating it.
if not os.path.exists(LOG_FILE_PATH):
    with open(LOG_FILE_PATH, mode='w', newline='') as f:
        csv.writer(f).writerow(
            ["Step", "Epoch", "Loss", "Reward_Chosen", "Reward_Rejected", "Accuracy", "Margin"]
        )

# Hugging Face Login: try the Colab secret first, then fall back to the
# interactive prompt (best-effort behaviour is kept).
print("\nüîë Logging in...")
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token, add_to_git_credential=True)
    print("‚úÖ Logged in via Colab Secret.")
except Exception as err:
    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate. Only the exception type is printed so that no
    # credential text can leak into the notebook output.
    print(f"Colab secret login failed ({type(err).__name__}); falling back to interactive login.")
    login(add_to_git_credential=True)

# ==========================================
# 3. Configuration
# ==========================================
# Hub identifiers: the model to fine-tune, the repo to push to, and the preference dataset.
SFT_MODEL_ID = "AIPlans/qwen3-0.6b-SFT-hs2" # SFT checkpoint loaded as the policy to be trained
OUTPUT_REPO  = "your-username/qwen3-0.6b-IPO" # placeholder target repo; replace with your own namespace
DATASET_NAME = "Jennny/helpsteer2-helpfulness-preference"  # a variant of the HelpSteer2 dataset having only the helpfulness attribute

# All hyperparameters can be modified as suitable (A100 80GB was used at the time)
# BETA is passed as DPOConfig(beta=...). Per the TRL docs, with loss_type="ipo"
# it is the IPO regularization parameter (tau in the IPO paper).
BETA = 0.01               # regularization strength toward the reference model
LEARNING_RATE = 5e-7      # a lower learning rate as there are 3 epochs
BATCH_SIZE = 8            # per-device batch size (train and eval); reduced to 8 to avoid OOM
GRAD_ACCUMULATION = 2     # effective train batch per device = BATCH_SIZE * GRAD_ACCUMULATION = 16
EPOCHS = 3                # passed as num_train_epochs
MAX_LENGTH = 2048         # passed as DPOConfig max_length

# ==========================================
# 4. Logging Callback
# ==========================================
class DriveLoggingCallback(TrainerCallback):
    """Append training metrics from Trainer log events to the CSV at LOG_FILE_PATH.

    Each written row is: global step, epoch, then the values of `_METRIC_KEYS`
    in order, matching the seven-column header of the log file.
    """

    # Keys read from the `logs` dict, in CSV column order (after Step and Epoch).
    _METRIC_KEYS = (
        "loss",
        "rewards/chosen",
        "rewards/rejected",
        "rewards/accuracies",
        "rewards/margins",
    )

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write one CSV row for a log event that carries training metrics.

        Args:
            args: Training arguments supplied by the trainer (unused).
            state: Trainer state; `state.global_step` is written as the Step column.
            control: Trainer control object (unused).
            logs: Dict of logged values for this event, or None.
            **kwargs: Extra callback arguments (unused).
        """
        if not logs:
            return

        # Log events that contain none of the tracked keys (e.g. evaluation or
        # end-of-training summaries, which report under other key names) used
        # to be written as rows of zeros, indistinguishable from real values.
        # Skip them instead of recording fake metrics.
        if not any(key in logs for key in self._METRIC_KEYS):
            return

        # A metric absent from this event is left blank rather than set to 0,
        # so a missing value can never be read as a measured zero.
        row = [state.global_step, logs.get("epoch", "")]
        row.extend(logs.get(key, "") for key in self._METRIC_KEYS)

        with open(LOG_FILE_PATH, mode='a', newline='') as f:
            csv.writer(f).writerow(row)

# ==========================================
# 5. Dataset Loading & Formatting
# ==========================================
print(f"\nwv Loading dataset: {DATASET_NAME}...")


def _chosen_score_at_least_3(row):
    """Return True for rows whose `chosen_score` is 3 or higher."""
    return row["chosen_score"] >= 3


# Load the train split and keep only the rows that pass the score filter.
dataset = load_dataset(DATASET_NAME, split="train").filter(_chosen_score_at_least_3)

tokenizer = AutoTokenizer.from_pretrained(SFT_MODEL_ID, trust_remote_code=True)
# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_ipo_triplets(example, eos_token=None):
    """Convert one preference record into a prompt / chosen / rejected triplet.

    Args:
        example: Mapping with `chosen` and `rejected` entries. When `chosen`
            is a list, both entries are read as message lists whose items
            have a `content` key: message 0 of `chosen` is the prompt and
            message 1 of each list is the response. Otherwise both entries
            are stringified and the prompt is left empty.
        eos_token: End-of-sequence text appended to both responses. When
            None (the default, which is what `dataset.map` uses), the
            module-level `tokenizer.eos_token` is used.

    Returns:
        Dict with string values under `prompt`, `chosen` and `rejected`.

    Raises:
        IndexError: If a message list has fewer than two messages.
    """
    if eos_token is None:
        # A tokenizer without an EOS token reports None; interpolating that
        # would append the literal text "None" to every response.
        eos_token = tokenizer.eos_token or ""

    chosen = example['chosen']
    rejected = example['rejected']

    if isinstance(chosen, list):
        # NOTE(review): only messages 0 and 1 are read, i.e. single-turn
        # conversations are assumed. Confirm the dataset has no multi-turn
        # records, where message 1 would not be the final response.
        prompt = chosen[0]['content']
        chosen_response = chosen[1]['content']
        rejected_response = rejected[1]['content']
    else:
        prompt = ""
        chosen_response = str(chosen)
        rejected_response = str(rejected)

    return {
        "prompt": f"User: {prompt}\n\nAssistant:",
        "chosen": f" {chosen_response}{eos_token}",
        "rejected": f" {rejected_response}{eos_token}",
    }

# Replace the raw columns with the prompt / chosen / rejected triplet fields.
formatted_dataset = dataset.map(
    format_ipo_triplets,
    remove_columns=dataset.column_names,
)
# Hold out 5% of the rows for validation; the fixed seed keeps the split stable.
split_dataset = formatted_dataset.train_test_split(test_size=0.05, seed=42)
train_count = len(split_dataset['train'])
val_count = len(split_dataset['test'])
print(f"‚úÖ Dataset Ready: {train_count} Train | {val_count} Val")

# ==========================================
# 6. Load Policy Model
# ==========================================
print("\nüß† Loading Policy Model (BF16)...")
# Loading options collected in one place: BF16 weights, automatic device
# placement, and permission to run model code shipped with the checkpoint.
policy_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(SFT_MODEL_ID, **policy_load_options)
# Disable use_cache on the model config before handing the model to the trainer.
model.config.use_cache = False
print("‚úÖ Model Loaded.")

# ==========================================
# 7. Training
# ==========================================
# DPOConfig arguments grouped by concern, then merged into a single call.

# Loss selection: the IPO variant with the configured beta.
objective_options = dict(
    loss_type="ipo",
    beta=BETA,
)

# Batch sizes, accumulation and memory/precision switches.
batching_options = dict(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    gradient_checkpointing=True,
    bf16=True,
)

# Optimisation schedule.
schedule_options = dict(
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
)

# Logging and evaluation cadence; no external experiment tracker.
monitoring_options = dict(
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=50,
    report_to="none",
)

# Checkpoints go to the Drive directory and the run is pushed to the Hub repo.
checkpoint_options = dict(
    output_dir=CHECKPOINT_DIR,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id=OUTPUT_REPO,
)

# Sequence-length limits and dataset column handling.
data_options = dict(
    max_length=MAX_LENGTH,
    max_prompt_length=1024,
    remove_unused_columns=False,
)

training_args = DPOConfig(
    **objective_options,
    **batching_options,
    **schedule_options,
    **monitoring_options,
    **checkpoint_options,
    **data_options,
)

# The CSV logger is the only custom callback attached to the trainer.
csv_logger = DriveLoggingCallback()

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    processing_class=tokenizer,
    callbacks=[csv_logger],
)

print("\nüöÄ Starting IPO Training...")
trainer.train()

# ==========================================
# 8. Final Save
# ==========================================
print("\n‚òÅÔ∏è Pushing IPO Model to Hub...")
trainer.push_to_hub()
# Build the repo URL once, then report it.
hub_url = f"https://huggingface.co/{OUTPUT_REPO}"
print(f"‚úÖ DONE! Model uploaded to: {hub_url}")