In [None]:
# @title SFT Training
'''
=====================================================================================================
This training script was originally developed and optimized for execution within Google Colab,
relying heavily on Google Drive for persistent storage, Colab-specific authentication mechanisms,
and other environment-dependent utilities. As a result, the initial implementation included
Drive-mounted checkpoint directories, CSV logging to Drive, and secret-based Hugging Face login via
Colab's userdata API. While these components streamlined experimentation within a Colab workflow,
they also made the script less portable and harder to reproduce in general compute environments
such as local machines, cloud VMs, or managed training clusters.

You can refactor the current version to remove the above-mentioned Colab-specific assumptions,
replacing them with environment-agnostic paths, standard Hugging Face authentication, and fully
general dataset/model loading logic, so that the script runs consistently anywhere while retaining
the same behavior and training methodology.
=====================================================================================================
'''
'''
=====================================================================================================
For research purposes, this SFT model was trained on a preference dataset using only the preferred
(chosen) responses. In general, SFT models should be trained on plain 'prompt-response' datasets.
=====================================================================================================
'''
# ==========================================
# 1. Install Dependencies
# ==========================================
print("‚è≥ Installing libraries...")
# IPython/Jupyter shell escape (not plain Python): runs pip quietly (-q) and
# upgrades (-U) the listed packages, so the installed versions are whatever is
# newest at run time rather than a pinned set.
!pip install -q -U transformers datasets trl accelerate huggingface_hub

'''
The training was conducted using the following library versions at the time:
Accelerate: 0.28.0
Hugging Face Hub: 0.17.1
TRL: 0.25.1
Transformers: 4.57.3
Pytorch: 2.9.0+cu126
Datasets: 4.4.1
Tokenizers: 0.22.1
'''

import torch
import os
import csv
from google.colab import drive
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainerCallback
)
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from google.colab import userdata

# ==========================================
# 2. Setup Drive & Login
# ==========================================
# Attach Google Drive at /content/drive; all outputs below are written under it
print("\nüìÇ Mounting Google Drive...")
drive.mount("/content/drive")

# Storage layout: a single root folder holding the checkpoints and the CSV log
DRIVE_ROOT = "/content/drive/MyDrive/SFT-Training"
CHECKPOINT_DIR = DRIVE_ROOT + "/checkpoints"
LOG_FILE_PATH = DRIVE_ROOT + "/training_logs.csv"

print(f"üìÇ Checkpoints will be saved to: {CHECKPOINT_DIR}")
print(f"üìÑ Logs will be saved to: {LOG_FILE_PATH}")

# Create the root folder. `exist_ok=True` makes this a no-op when the folder is
# already there and avoids the race between checking for it and creating it.
os.makedirs(DRIVE_ROOT, exist_ok=True)

# Write the CSV header when the log file is missing OR empty (e.g. a zero-byte
# file left behind by an interrupted run). A non-empty log is left untouched so
# that later runs keep appending to it.
if not os.path.exists(LOG_FILE_PATH) or os.path.getsize(LOG_FILE_PATH) == 0:
    with open(LOG_FILE_PATH, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Step", "Epoch", "Training Loss"])

# Hugging Face Login
# First try the token stored as the Colab secret 'HF_TOKEN'; if that fails for
# any reason, fall back to the interactive login prompt.
print("\nüîë Logging in to Hugging Face...")
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token, add_to_git_credential=True)
    print("‚úÖ Logged in via Colab Secret.")
except Exception as err:
    # `except Exception` instead of a bare `except:` lets KeyboardInterrupt and
    # SystemExit propagate, so the cell can still be aborted. The fallback is
    # kept best-effort, but the real cause is printed because the failure may be
    # something other than a missing secret (e.g. a rejected token).
    print("‚ö†Ô∏è Secret 'HF_TOKEN' not found. Falling back to manual input.")
    print(f"   Reason: {type(err).__name__}: {err}")
    login(add_to_git_credential=True)

# ==========================================
# 3. Define Custom Logging Callback
# ==========================================
class DriveLoggingCallback(TrainerCallback):
    """Trainer callback that appends reported training losses to the CSV at LOG_FILE_PATH."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append a `[global_step, epoch, loss]` row when `logs` carries a "loss" entry.

        Log events whose `logs` is empty/None or has no "loss" key are ignored.
        """
        if not logs or "loss" not in logs:
            return
        with open(LOG_FILE_PATH, mode='a', newline='') as log_file:
            row = [state.global_step, logs.get("epoch"), logs["loss"]]
            csv.writer(log_file).writerow(row)

# ==========================================
# 4. Configuration
# ==========================================
# --- Model / data identifiers ---
MODEL_NAME = "Qwen/Qwen3-0.6B-Base"
# A variant of the HelpSteer2 dataset that keeps only the helpfulness attribute
DATASET_NAME = "Jennny/helpsteer2-helpfulness-preference"
# Target Hub repository; the name itself is arbitrary
REPO_NAME = "your-username/qwen3-0.6b-SFT"

# --- Hyperparameters (every value here can be adjusted as needed) ---
MAX_LENGTH = 2048
LEARNING_RATE = 2e-5
LOGGING_STEPS = 20  # one log entry every 20 steps

# A per-device batch of 8 with 4 accumulation steps gives an effective batch
# size of 32. This avoids the OOM errors seen with a batch of 16 in FP32
# (an A100 40GB was used).
BATCH_SIZE = 8
GRAD_ACCUMULATION = 4

# ==========================================
# 5. Dataset Loading & Filtering
# ==========================================
print(f"Loading and Preparing dataset: {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train")

# Identify the score column of the chosen response (its name varies by dataset version).
score_col = "chosen_score"
if score_col not in dataset.column_names:
    possible = [c for c in dataset.column_names if "score" in c or "rating" in c]
    # Prefer a column that does not describe the rejected response, so the
    # filter is not driven by a rejected-response score just because that
    # column happens to be listed first.
    preferred = [c for c in possible if "rejected" not in c]
    candidates = preferred or possible
    if candidates:
        score_col = candidates[0]

# We filter out low-quality data so the model only learns from "helpful" examples (Score >= 3).
# Column presence is checked up front instead of catching KeyError around the
# whole filter, so an unrelated KeyError is never misreported as a missing column.
if score_col in dataset.column_names:
    # Rows without a score are dropped rather than crashing the comparison.
    dataset = dataset.filter(
        lambda x: x[score_col] is not None and x[score_col] >= 3
    )
    print(f"‚úÖ Filtered dataset (Score >= 3): {len(dataset)} samples")
else:
    print("‚ö†Ô∏è Warning: Score column not found. Skipping filter.")

# ==========================================
# 6. Formatting
# ==========================================
print("\n‚öôÔ∏è Formatting dataset...")
# Load the tokenizer that belongs to MODEL_NAME.
# NOTE(review): trust_remote_code=True permits code shipped in the model repo
# to run locally - confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_to_text_column(example, eos_token=None):
    """Convert one preference example into a single SFT training string.

    The text is laid out as "User: <prompt>", a blank line, then
    "Assistant: <response>" immediately followed by the EOS token.

    Args:
        example: Dataset row with a 'chosen' field. When that field is a list
            of message dicts, the 'content' of element 0 is used as the prompt
            and the 'content' of element 1 as the response. Any other value is
            stringified and used as the response with an empty prompt.
        eos_token: End-of-sequence marker appended to the text. When omitted,
            the module-level tokenizer's ``eos_token`` is used, which keeps
            existing single-argument callers (e.g. ``dataset.map``) unchanged.

    Returns:
        A dict with a single 'text' key holding the formatted string.
    """
    if eos_token is None:
        eos_token = tokenizer.eos_token

    chosen_raw = example['chosen']
    if isinstance(chosen_raw, list):
        # NOTE(review): only the first two messages are read; a conversation
        # with more turns would lose the later ones - confirm the dataset is
        # single-turn.
        prompt_text = chosen_raw[0]['content']
        response_text = chosen_raw[1]['content']
    else:
        prompt_text = ""
        response_text = str(chosen_raw)

    text = f"User: {prompt_text}\n\nAssistant: {response_text}{eos_token}"
    return {"text": text}

# Map every row through format_to_text_column and drop all of the original
# columns (including 'chosen'/'rejected'), so the resulting dataset contains
# only the new 'text' column that the trainer is pointed at.
formatted_dataset = dataset.map(
    format_to_text_column,
    remove_columns=dataset.column_names
)

# ==========================================
# 7. Model Loading
# ==========================================
print(f"\nüß† Loading Model: {MODEL_NAME} (FP32)...")
# Causal-LM head: the model is used for next-token prediction (text generation)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float32,  # full precision (FP32), chosen for stability
)

# ==========================================
# 8. Training
# ==========================================
training_args = SFTConfig(
    output_dir=CHECKPOINT_DIR,     # checkpoints are written straight to Drive
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=2,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.1,
    logging_steps=LOGGING_STEPS,
    save_strategy="steps",
    save_steps=100,                # save a checkpoint to Drive every 100 steps
    save_total_limit=2,            # keep only the last 2 checkpoints to save Drive space
    packing=False,                 # disabled to avoid Flash Attention requirements/warnings
    bf16=False,                    # FP32
    fp16=False,                    # FP32
    push_to_hub=True,
    hub_model_id=REPO_NAME,
    report_to="none",
    dataset_text_field="text",
    # The sequence-length limit must be passed to the config itself. Recent TRL
    # releases (including the 0.25.x line this script targets) read
    # `max_length`; assigning `training_args.max_seq_length` after construction
    # only creates an unused attribute, leaving the library default in effect
    # instead of MAX_LENGTH.
    max_length=MAX_LENGTH,
    gradient_checkpointing=True,   # saves memory by trading compute speed
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,    # 'processing_class' replaces 'tokenizer' in newer TRL versions (v0.12+)
    args=training_args,
    train_dataset=formatted_dataset,
    callbacks=[DriveLoggingCallback()],  # mirrors each logged loss into the CSV log
)

print("\nüöÄ Starting SFT Training...")
trainer.train()

# ==========================================
# 9. Push to Hub
# ==========================================
print("\n‚òÅÔ∏è Pushing final SFT model to Hub...")
trainer.push_to_hub()
print(f"‚úÖ DONE! Model uploaded to: https://huggingface.co/{REPO_NAME}")