# 04. Preference Data Generation
## Synthetic Instruction Tuner - Week 2 Day 3-5

This notebook generates preference pairs for DPO training:
1. Load filtered instruction data
2. Load generator model (Llama-3.1-8B-Instruct) and reward model (OpenAssistant)
3. Generate multiple response variants per instruction
4. Score responses with reward model
5. Create chosen/rejected pairs
6. Save preference dataset

**Target**: 600 preference pairs from ~1,000 filtered instructions (optimized for Colab free tier)

**Expected runtime**: 4-6 hours for full dataset (use subsampling for faster testing)

**Tip**: Run with checkpoints enabled so that generation can be split across multiple sessions.

## 1. Setup

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-provided `google.colab` module).
from google.colab import drive
drive.mount('/content/drive')

# Project path on the mounted Drive; config.json is read from this folder below.
PROJECT_ROOT = "/content/drive/MyDrive/synthetic-instruction-tuner"

In [None]:
# Load configuration
import json

# Read the file as UTF-8 explicitly (as the other JSON files in this notebook
# are), so parsing does not depend on the runtime's default locale encoding.
with open(f"{PROJECT_ROOT}/config.json", 'r', encoding='utf-8') as f:
    config = json.load(f)

print("Configuration loaded!")

In [None]:
# Install libraries with latest compatible versions
!pip install -q --upgrade transformers>=4.41.0 peft>=0.7.0 accelerate>=0.25.0 bitsandbytes>=0.41.3 sentencepiece

print("✅ Libraries installed successfully!")

In [None]:
import torch
import json
import os
from datetime import datetime
from tqdm import tqdm
import random
import gc

# Seed Python's `random` module and PyTorch with the same fixed value.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Report the PyTorch version and whether a CUDA device is visible.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device index 0 is the only GPU queried here.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Load Filtered Data

In [None]:
# Load filtered instruction data.
# The directory comes from config['paths']['data_filtered']; the file name is fixed.
FILTERED_PATH = f"{config['paths']['data_filtered']}/instructions_filtered.json"

with open(FILTERED_PATH, 'r', encoding='utf-8') as f:
    # Later cells index this as a list of dicts with an 'instruction' key and
    # an optional 'response' key.
    filtered_data = json.load(f)

print(f"Loaded {len(filtered_data)} filtered samples")

In [None]:
# For testing, you can subsample the data
# Uncomment to use a smaller subset for faster testing

# TEST_SIZE = 1000  # Use 1000 samples for testing
# filtered_data = random.sample(filtered_data, min(TEST_SIZE, len(filtered_data)))
# print(f"Using {len(filtered_data)} samples for testing")

## 3. Load Models

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoModelForSequenceClassification

# 4-bit quantization config (passed to the generator model load below):
# NF4 quantization type, double quantization enabled, float16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

### 3.1 Load Generator Model (Llama-3.1-8B-Instruct)

In [None]:
# Generator model ID is taken from the project config.
GENERATOR_MODEL_ID = config['models']['data_generation']

print(f"Loading generator model: {GENERATOR_MODEL_ID}...")

generator_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_ID)
# Reuse the EOS token as the padding token and pad on the left.
generator_tokenizer.pad_token = generator_tokenizer.eos_token
generator_tokenizer.padding_side = "left"

# Load the model with the 4-bit quantization config defined above and let
# `device_map="auto"` decide device placement.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository; presumably not required for this model - confirm and drop.
generator_model = AutoModelForCausalLM.from_pretrained(
    GENERATOR_MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)
# Inference only: switch to eval mode.
generator_model.eval()

print(f"Generator model loaded!")
print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

### 3.2 Load Reward Model (OpenAssistant)

In [None]:
# Reward model used to score candidate responses.
REWARD_MODEL_ID = "OpenAssistant/reward-model-deberta-v3-large-v2"

print(f"Loading reward model: {REWARD_MODEL_ID}...")

reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_ID)

# Load the reward model in float16 (no quantization config is applied here),
# with automatic device placement.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto"
)
# Inference only: switch to eval mode.
reward_model.eval()

print(f"Reward model loaded!")
# Allocated memory now includes both the generator and the reward model.
print(f"Total GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

## 4. Preference Generator Class

In [None]:
from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class PreferencePair:
    """A preference pair with chosen and rejected responses."""
    instruction: str
    chosen: str            # highest-scoring response
    rejected: str          # lowest-scoring response
    chosen_score: float
    rejected_score: float
    margin: float          # chosen_score - rejected_score


class PreferenceGenerator:
    """Generate preference pairs using reward model scoring.

    For one instruction, several responses are sampled from a generator model
    at different temperatures, each is scored by the reward model, and the
    best/worst scoring responses become the chosen/rejected pair.

    Recognised ``config`` keys (all optional):
        num_responses_per_instruction: number of candidate responses (default 3).
        min_score_margin: minimum ``chosen - rejected`` score gap (default 0.5).
        max_new_tokens: generation length limit (default 512).
    """

    # Sampling temperatures tried, in order, when generating variants.
    TEMPERATURES = (0.6, 0.8, 1.0, 1.2)

    # Llama 3.1 markers used to locate and trim the assistant turn.
    ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>"
    END_TOKENS = ("<|eot_id|>", "<|end_of_text|>")

    def __init__(self, reward_model, reward_tokenizer, config: Optional[Dict] = None):
        """Store the reward model/tokenizer and read settings from ``config``."""
        self.reward_model = reward_model
        self.reward_tokenizer = reward_tokenizer
        self.config = config or {}

        self.num_responses = self.config.get('num_responses_per_instruction', 3)
        self.min_margin = self.config.get('min_score_margin', 0.5)
        self.max_new_tokens = self.config.get('max_new_tokens', 512)

        # Llama 3.1 templates
        self.instruction_template = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        self.response_template = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    def score_response(self, instruction: str, response: str) -> float:
        """Score a response using the reward model.

        Returns:
            The first logit of the reward model's output for the
            ``Question: ... Answer: ...`` text, as a Python float.
        """
        # OpenAssistant reward model expects plain text format
        text = f"Question: {instruction}\n\nAnswer: {response}"

        inputs = self.reward_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=2048
        ).to(self.reward_model.device)

        with torch.no_grad():
            outputs = self.reward_model(**inputs)
            score = outputs.logits[0][0].item()

        return score

    def generate_response_variant(self, gen_model, gen_tokenizer,
                                 instruction: str, temperature: float) -> Optional[str]:
        """Generate a response variant with specified temperature.

        Returns:
            The parsed assistant response, or ``None`` when generation fails,
            no assistant turn is found, or the response is 10 characters or
            shorter.
        """
        prompt = f"{self.instruction_template}{instruction}{self.response_template}"

        # The template already begins with <|begin_of_text|>; turn off the
        # tokenizer's automatic special tokens so BOS is not inserted twice.
        inputs = gen_tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False
        ).to(gen_model.device)

        try:
            with torch.no_grad():
                outputs = gen_model.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=gen_tokenizer.eos_token_id,
                    # Stop on either the end-of-turn or the end-of-text token.
                    eos_token_id=[
                        gen_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
                        gen_tokenizer.eos_token_id
                    ]
                )

            # Keep special tokens so the assistant header can be located.
            generated = gen_tokenizer.decode(outputs[0], skip_special_tokens=False)
            response = self._parse_response(generated)
            return response if response and len(response) > 10 else None

        except Exception as e:
            # Best-effort: one failed generation must not abort the whole run.
            print(f"Generation error: {e}")
            return None

    def _parse_response(self, text: str) -> Optional[str]:
        """Extract response from generated text.

        Returns the text after the last assistant header, cut at the first end
        token and stripped, or ``None`` if ``text`` is not a string or contains
        no assistant header.
        """
        if not isinstance(text, str) or self.ASSISTANT_HEADER not in text:
            return None

        response = text.split(self.ASSISTANT_HEADER)[-1]
        for end_token in self.END_TOKENS:
            # split()[0] is a no-op when the token is absent.
            response = response.split(end_token)[0]
        return response.strip()

    def create_preference_pair(self, gen_model, gen_tokenizer,
                              instruction: str, original_response: Optional[str] = None) -> Optional[PreferencePair]:
        """Create a preference pair by generating and scoring multiple responses.

        Args:
            gen_model: generator model passed through to generation.
            gen_tokenizer: tokenizer matching ``gen_model``.
            instruction: the user instruction.
            original_response: optional existing response, included as a candidate.

        Returns:
            A ``PreferencePair``, or ``None`` when fewer than two distinct
            responses could be produced and scored, or when the score margin
            is below ``min_margin``.
        """
        responses = []

        # Include original if provided
        if original_response:
            responses.append(original_response)

        # Generate variants with different temperatures
        temperatures = self.TEMPERATURES[:self.num_responses]
        for temp in temperatures:
            if len(responses) >= self.num_responses:
                break
            response = self.generate_response_variant(gen_model, gen_tokenizer, instruction, temp)
            # Drop failed generations and exact duplicates.
            if response and response not in responses:
                responses.append(response)

        if len(responses) < 2:
            return None

        # Score all responses
        scored = []
        for resp in responses:
            try:
                scored.append((resp, self.score_response(instruction, resp)))
            except Exception as e:
                # Best-effort: skip the response, but report why it was skipped.
                print(f"Scoring error: {e}")

        if len(scored) < 2:
            return None

        # Sort by score
        scored.sort(key=lambda x: x[1], reverse=True)

        chosen, chosen_score = scored[0]
        rejected, rejected_score = scored[-1]
        margin = chosen_score - rejected_score

        # Reject pairs whose preference signal is too weak.
        if margin < self.min_margin:
            return None

        return PreferencePair(
            instruction=instruction,
            chosen=chosen,
            rejected=rejected,
            chosen_score=chosen_score,
            rejected_score=rejected_score,
            margin=margin
        )

# Initialize generator.
# A missing 'preference_generation' section yields an empty dict here, in
# which case PreferenceGenerator uses its own defaults.
pref_config = config.get('preference_generation', {})
pref_generator = PreferenceGenerator(reward_model, reward_tokenizer, pref_config)

print("PreferenceGenerator initialized!")

## 5. Test Preference Generation

In [None]:
# Test on a single sample (the first entry of filtered_data).
print("Testing preference pair generation...")
print("=" * 50)

test_sample = filtered_data[0]
test_instruction = test_sample['instruction']
# 'response' is optional; .get() yields None when it is absent.
test_response = test_sample.get('response')

print(f"Instruction: {test_instruction}\n")

pair = pref_generator.create_preference_pair(
    generator_model,
    generator_tokenizer,
    test_instruction,
    test_response
)

# create_preference_pair returns None when no pair could be built.
if pair:
    print(f"✓ Successfully created preference pair!\n")
    print(f"Chosen (score: {pair.chosen_score:.3f}):")
    # Only the first 200 characters of each response are shown.
    print(f"{pair.chosen[:200]}...\n")
    print(f"Rejected (score: {pair.rejected_score:.3f}):")
    print(f"{pair.rejected[:200]}...\n")
    print(f"Margin: {pair.margin:.3f}")
else:
    print("✗ Failed to create preference pair")

In [None]:
# Test on a few samples.
# Use at most 5, but never more than are available, so a small (e.g.
# subsampled) dataset does not raise IndexError.
num_test_samples = min(5, len(filtered_data))

print(f"\nTesting on {num_test_samples} samples...")
print("=" * 50)

success_count = 0
for i, sample in enumerate(filtered_data[:num_test_samples]):
    pair = pref_generator.create_preference_pair(
        generator_model,
        generator_tokenizer,
        sample['instruction'],
        sample.get('response')
    )
    # A falsy result means no pair met the generation/margin requirements.
    if pair:
        success_count += 1
        print(f"[{i+1}] ✓ Margin: {pair.margin:.3f}")
    else:
        print(f"[{i+1}] ✗ Failed")

print(f"\nSuccess rate: {success_count}/{num_test_samples}")

## 6. Generate Preference Dataset with Checkpoints

In [None]:
def save_checkpoint(data, checkpoint_path):
    """Save data to checkpoint file.

    The JSON is written to a temporary sibling file and then moved over
    ``checkpoint_path``, so an interruption mid-write cannot leave a truncated
    checkpoint behind. The parent directory is created if it does not exist.

    Args:
        data: JSON-serialisable object (its ``len`` is reported).
        checkpoint_path: destination file path.
    """
    directory = os.path.dirname(checkpoint_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    tmp_path = f"{checkpoint_path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Replace the previous checkpoint only after the new one is fully written.
    os.replace(tmp_path, checkpoint_path)

    print(f"Checkpoint saved: {len(data)} pairs")

def load_checkpoint(checkpoint_path):
    """Return the JSON content of ``checkpoint_path``, or ``[]`` if the file does not exist."""
    if not os.path.exists(checkpoint_path):
        return []

    with open(checkpoint_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

# Paths: both files live in the directory given by config['paths']['data_preference'].
PREFERENCE_PATH = config['paths']['data_preference']
# Intermediate checkpoint, reloaded when generation is resumed.
CHECKPOINT_PATH = f"{PREFERENCE_PATH}/preference_checkpoint.json"
# Final preference dataset written after the generation loop.
FINAL_PATH = f"{PREFERENCE_PATH}/preference_data.json"

In [None]:
# Settings
# The section is read with .get(), the same way it is read when the
# PreferenceGenerator is initialised, so a config without a
# 'preference_generation' section falls back to the defaults instead of
# raising KeyError.
TARGET_PAIRS = config.get('preference_generation', {}).get('target_pairs', 600)
CHECKPOINT_INTERVAL = config.get('preference_generation', {}).get('checkpoint_interval', 100)

print(f"Target preference pairs: {TARGET_PAIRS}")
print(f"Checkpoint interval: {CHECKPOINT_INTERVAL}")

In [None]:
# Load existing checkpoint if available (an empty list when there is none).
preference_data = load_checkpoint(CHECKPOINT_PATH)
# Instructions that already have a pair; the generation loop skips these.
processed_instructions = {p['instruction'] for p in preference_data}

print(f"Loaded {len(preference_data)} existing pairs")
# Clamp at zero: a checkpoint may already hold more pairs than the target.
print(f"Remaining: {max(0, TARGET_PAIRS - len(preference_data))} pairs")

In [None]:
# Main generation loop
failed_count = 0     # consecutive failures since the last successful pair
max_failures = 100   # stop after this many failures in a row

print(f"\nStarting preference generation...")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 50)

pbar = tqdm(total=TARGET_PAIRS, initial=len(preference_data), desc="Generating pairs")

data_idx = 0
try:
    while len(preference_data) < TARGET_PAIRS and data_idx < len(filtered_data):
        sample = filtered_data[data_idx]
        data_idx += 1

        # Skip if already processed
        if sample['instruction'] in processed_instructions:
            continue

        try:
            pair = pref_generator.create_preference_pair(
                generator_model,
                generator_tokenizer,
                sample['instruction'],
                sample.get('response')
            )

            if pair:
                preference_data.append({
                    'instruction': pair.instruction,
                    'chosen': pair.chosen,
                    'rejected': pair.rejected,
                    'chosen_score': pair.chosen_score,
                    'rejected_score': pair.rejected_score,
                    'margin': pair.margin
                })
                processed_instructions.add(pair.instruction)
                pbar.update(1)
                failed_count = 0

                # Save checkpoint
                if len(preference_data) % CHECKPOINT_INTERVAL == 0:
                    save_checkpoint(preference_data, CHECKPOINT_PATH)
                    gc.collect()
                    torch.cuda.empty_cache()
            else:
                failed_count += 1

        except Exception as e:
            print(f"\nError: {e}")
            failed_count += 1

        if failed_count >= max_failures:
            print(f"\nToo many failures. Stopping.")
            break
finally:
    # Runs on normal completion, early stop, and manual interrupt alike:
    # close the progress bar and persist pairs generated since the last
    # interval checkpoint so a resumed session does not regenerate them.
    pbar.close()
    if preference_data:
        save_checkpoint(preference_data, CHECKPOINT_PATH)

print(f"\nEnd time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total generated: {len(preference_data)} preference pairs")

In [None]:
# Save final data.
# Reuses save_checkpoint, so the "Checkpoint saved" message is printed too.
save_checkpoint(preference_data, FINAL_PATH)
print(f"\nFinal preference data saved to: {FINAL_PATH}")

## 7. Analyze Preference Data

In [None]:
import numpy as np

# Calculate statistics
margins = [p['margin'] for p in preference_data]
chosen_scores = [p['chosen_score'] for p in preference_data]
rejected_scores = [p['rejected_score'] for p in preference_data]

# Response lengths are measured in whitespace-separated words.
chosen_lengths = [len(p['chosen'].split()) for p in preference_data]
rejected_lengths = [len(p['rejected'].split()) for p in preference_data]

print("=" * 50)
print("PREFERENCE DATA STATISTICS")
print("=" * 50)
print(f"\nTotal pairs: {len(preference_data)}")

if not preference_data:
    # np.min / np.max raise ValueError on an empty sequence, so report the
    # empty dataset instead of crashing the cell.
    print("\nNo preference pairs available - skipping statistics.")
else:
    print(f"\nScore Margin:")
    print(f"  Mean: {np.mean(margins):.3f}")
    print(f"  Std: {np.std(margins):.3f}")
    print(f"  Min: {np.min(margins):.3f}")
    print(f"  Max: {np.max(margins):.3f}")
    print(f"  Median: {np.median(margins):.3f}")

    print(f"\nChosen Response Scores:")
    print(f"  Mean: {np.mean(chosen_scores):.3f}")
    print(f"  Median: {np.median(chosen_scores):.3f}")

    print(f"\nRejected Response Scores:")
    print(f"  Mean: {np.mean(rejected_scores):.3f}")
    print(f"  Median: {np.median(rejected_scores):.3f}")

    print(f"\nChosen Response Length (words):")
    print(f"  Mean: {np.mean(chosen_lengths):.1f}")
    print(f"  Median: {np.median(chosen_lengths):.1f}")

    print(f"\nRejected Response Length (words):")
    print(f"  Mean: {np.mean(rejected_lengths):.1f}")
    print(f"  Median: {np.median(rejected_lengths):.1f}")

In [None]:
# Visualize distributions in a 2x2 grid of plots and save the figure as a PNG.
# NOTE(review): min()/max() below raise ValueError when the score/length lists
# are empty, and savefig presumably requires the figures directory to exist -
# confirm both before running on an empty or fresh project.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Margin distribution, with the generator's minimum-margin threshold marked.
axes[0, 0].hist(margins, bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_xlabel('Score Margin')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('Score Margin Distribution')
axes[0, 0].axvline(x=pref_generator.min_margin, color='r', linestyle='--', label=f'Min threshold ({pref_generator.min_margin})')
axes[0, 0].legend()

# Chosen vs Rejected scores; the dashed y=x line spans the chosen-score range.
axes[0, 1].scatter(chosen_scores, rejected_scores, alpha=0.3, s=10)
axes[0, 1].plot([min(chosen_scores), max(chosen_scores)], [min(chosen_scores), max(chosen_scores)], 'r--', label='y=x')
axes[0, 1].set_xlabel('Chosen Score')
axes[0, 1].set_ylabel('Rejected Score')
axes[0, 1].set_title('Chosen vs Rejected Scores')
axes[0, 1].legend()

# Score distributions (chosen and rejected overlaid on the same axes).
axes[1, 0].hist(chosen_scores, bins=50, alpha=0.5, label='Chosen', edgecolor='black')
axes[1, 0].hist(rejected_scores, bins=50, alpha=0.5, label='Rejected', edgecolor='black')
axes[1, 0].set_xlabel('Score')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Score Distributions')
axes[1, 0].legend()

# Length comparison; the dashed y=x line runs from 0 to the longest chosen response.
axes[1, 1].scatter(chosen_lengths, rejected_lengths, alpha=0.3, s=10)
axes[1, 1].plot([0, max(chosen_lengths)], [0, max(chosen_lengths)], 'r--', label='y=x')
axes[1, 1].set_xlabel('Chosen Length (words)')
axes[1, 1].set_ylabel('Rejected Length (words)')
axes[1, 1].set_title('Response Lengths')
axes[1, 1].legend()

plt.tight_layout()
# Save before show() so the figure is written to disk as rendered.
plt.savefig(f"{config['paths']['evaluation_figures']}/preference_stats.png", dpi=150)
plt.show()

print(f"\nFigure saved to {config['paths']['evaluation_figures']}/preference_stats.png")

## 8. Sample Preference Pairs

In [None]:
# Show sample preference pairs
print("=" * 50)
print("SAMPLE PREFERENCE PAIRS")
print("=" * 50)

# Show top margin pairs: the three pairs with the largest score margin.
top_pairs = sorted(preference_data, key=lambda x: x['margin'], reverse=True)[:3]

for i, pair in enumerate(top_pairs):
    print(f"\n--- Sample {i+1} (margin: {pair['margin']:.3f}) ---")
    # Text is truncated for display (150 chars for the instruction, 300 for
    # responses); the trailing "..." is printed regardless of actual length.
    print(f"\nInstruction: {pair['instruction'][:150]}...")
    print(f"\nChosen (score: {pair['chosen_score']:.3f}):")
    print(f"{pair['chosen'][:300]}...")
    print(f"\nRejected (score: {pair['rejected_score']:.3f}):")
    print(f"{pair['rejected'][:300]}...")
    print("\n" + "-" * 50)

## 9. Prepare DPO Training Format

In [None]:
# Convert to DPO training format: keep only the prompt and the two responses,
# dropping the scores and the margin.
dpo_data = [
    {
        "prompt": record['instruction'],
        "chosen": record['chosen'],
        "rejected": record['rejected'],
    }
    for record in preference_data
]

# Save DPO format next to the other preference files.
DPO_DATA_PATH = f"{config['paths']['data_preference']}/dpo_data.json"
with open(DPO_DATA_PATH, 'w', encoding='utf-8') as f:
    json.dump(dpo_data, f, ensure_ascii=False, indent=2)

print(f"DPO training data saved to: {DPO_DATA_PATH}")
print(f"Total pairs: {len(dpo_data)}")

In [None]:
# Split into train/validation sets (90% / 10%) with a fixed random_state.
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(dpo_data, test_size=0.1, random_state=42)

print(f"Training pairs: {len(train_data)}")
print(f"Validation pairs: {len(val_data)}")

# Save splits alongside the full DPO file in the preference data directory.
with open(f"{config['paths']['data_preference']}/dpo_train.json", 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

with open(f"{config['paths']['data_preference']}/dpo_val.json", 'w', encoding='utf-8') as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)

print(f"\nTrain/val splits saved!")

## 10. Cleanup

In [None]:
# Free GPU memory.
# Delete the names that reference the models first (pref_generator holds its
# own references to the reward model and tokenizer), then run garbage
# collection and empty the CUDA cache.
del generator_model
del generator_tokenizer
del reward_model
del reward_tokenizer
del pref_generator
gc.collect()
torch.cuda.empty_cache()

print("Memory cleared!")

## ✓ Preference Generation Complete!

### Summary:
- Generated preference pairs saved to `data/preference/preference_data.json`
- DPO training data saved to `data/preference/dpo_data.json`
- Train/val splits saved for DPO training

### Next Steps:
1. Week 3: Proceed to `05_sft_training.ipynb` for supervised fine-tuning
2. After SFT: Use `06_dpo_training.ipynb` for DPO alignment