# 02. Magpie Data Generation
## Synthetic Instruction Tuner - Week 1 Day 3-5

This notebook generates synthetic instruction-response pairs using the Magpie method:
1. Load Llama-3.1-8B-Instruct model
2. Generate instructions using template-only prompts
3. Generate responses for each instruction
4. Save checkpoints periodically

**Expected runtime**: 8-10 hours for 15,000 samples

**Tip**: Run this overnight. If the runtime disconnects, re-run the notebook: Section 5 reloads the last saved checkpoint and continues generating from there.

## 1. Setup

In [None]:
# Attach Google Drive to the Colab file system
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Folder on Drive that holds this project; later cells read config.json from it
PROJECT_ROOT = f"{DRIVE_MOUNT_POINT}/MyDrive/synthetic-instruction-tuner"

In [None]:
# Read the project settings from config.json in the project root.
# Later cells look up config['data_generation'], config['models'] and
# config['paths'].
import json

CONFIG_FILE = f"{PROJECT_ROOT}/config.json"
with open(CONFIG_FILE, 'r') as config_file:
    config = json.load(config_file)

print("Configuration loaded!")
target_raw_samples = config['data_generation']['target_raw_samples']
print(f"Target samples: {target_raw_samples}")

In [None]:
# Install libraries if needed
!pip install -q transformers==4.36.0 accelerate bitsandbytes sentencepiece

In [None]:
import torch
import json
import os
from datetime import datetime
from tqdm import tqdm
import random
import gc

# Seed Python's and PyTorch's random number generators with one fixed value
SEED = 42
for seed_fn in (random.seed, torch.manual_seed):
    seed_fn(SEED)

# Report the runtime environment
print(f"PyTorch version: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Hugging Face login
# Authenticates this runtime with the Hugging Face Hub. login() is called
# without arguments, so no access token is written into the notebook itself.
# NOTE(review): presumably needed because the generation model is a gated
# repository - confirm against the model configured in config.json.
from huggingface_hub import login
login()

## 2. Load Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# The checkpoint used for data generation is chosen in the project config
MODEL_ID = config['models']['data_generation']

# bitsandbytes settings: 4-bit NF4 weights with double quantization and
# float16 compute, to keep the memory footprint small
_bnb_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
quantization_config = BitsAndBytesConfig(**_bnb_settings)

print(f"Loading {MODEL_ID}...")

In [None]:
# Fetch the tokenizer that belongs to MODEL_ID
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Padding setup: pad on the left and reuse the EOS token as the pad token
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

In [None]:
# Load the quantized model and let `device_map="auto"` decide its placement.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run during loading - confirm it is really required for the
# configured MODEL_ID.
_model_load_kwargs = {
    "quantization_config": quantization_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_model_load_kwargs)

# Switch to evaluation mode; this notebook only generates text
model.eval()

print("Model loaded!")
allocated_gb = torch.cuda.memory_allocated() / 1e9
print(f"GPU Memory: {allocated_gb:.2f} GB")

## 3. Magpie Generator Class

In [None]:
class MagpieGenerator:
    """Generate synthetic instructions and responses using the Magpie method.

    An instruction is sampled by prompting the model with nothing but the
    opening of a user turn (``instruction_template``) and letting the model
    write the user message itself.  The response is then sampled from a full
    prompt made of that instruction followed by the assistant header
    (``response_template``).

    Args:
        model: Causal language model; must provide ``generate()`` and
            ``device``.
        tokenizer: Tokenizer matching ``model``; it is called on the prompt
            strings and used to decode the generated ids.
        config: Project configuration.  ``config['data_generation']`` must
            contain ``temperature`` and ``max_new_tokens``.
    """

    # Chat-template markers used to locate the turns in decoded text.
    _USER_HEADER = "<|start_header_id|>user<|end_header_id|>"
    _ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>"
    # Tokens that end a turn or the whole sequence.  Generation stops on
    # either of them and both are stripped from the parsed text.
    _END_TOKENS = ("<|eot_id|>", "<|end_of_text|>")

    # Sampling settings that are not read from the config file.
    _INSTRUCTION_MAX_NEW_TOKENS = 256
    _INSTRUCTION_TOP_P = 0.95
    _RESPONSE_TEMPERATURE = 0.7  # Lower temperature for more focused responses
    _RESPONSE_TOP_P = 0.9
    # Texts with this many characters or fewer are rejected by generate_pair().
    _MIN_TEXT_CHARS = 10

    def __init__(self, model, tokenizer, config):
        self.model = model
        self.tokenizer = tokenizer
        self.config = config

        # Magpie template (Llama 3.1 format)
        self.instruction_template = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        self.response_template = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

        # Generation settings
        self.temperature = config['data_generation']['temperature']
        self.max_new_tokens = config['data_generation']['max_new_tokens']

    def _stop_token_ids(self):
        """Return the de-duplicated ids of all tokens that end generation.

        Tokens the tokenizer does not know (mapped to ``None`` or to the
        unknown-token id) are skipped so they cannot act as stop ids.
        """
        candidates = [
            self.tokenizer.convert_tokens_to_ids(token)
            for token in self._END_TOKENS
        ]
        candidates.append(self.tokenizer.eos_token_id)
        unk_id = getattr(self.tokenizer, "unk_token_id", None)

        stop_ids = []
        for token_id in candidates:
            if token_id is None or token_id == unk_id or token_id in stop_ids:
                continue
            stop_ids.append(token_id)
        return stop_ids

    def _generate(self, prompt, max_new_tokens, temperature, top_p):
        """Sample a continuation of ``prompt`` and return the decoded text.

        The returned string contains the prompt followed by the generated
        tokens, with special tokens kept so the parsers can locate the turns.
        """
        # The templates already start with <|begin_of_text|>; letting the
        # tokenizer add its own BOS would put two BOS tokens in the prompt.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False,
        ).to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=top_p,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self._stop_token_ids(),
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=False)

    def generate_instruction(self):
        """Generate a single instruction using Magpie method.

        Returns:
            The instruction text, or ``None`` if it could not be parsed.
        """
        generated = self._generate(
            self.instruction_template,
            max_new_tokens=self._INSTRUCTION_MAX_NEW_TOKENS,
            temperature=self.temperature,
            top_p=self._INSTRUCTION_TOP_P,
        )
        return self._parse_instruction(generated)

    def generate_response(self, instruction):
        """Generate a response for the given instruction.

        Args:
            instruction: User message to answer.

        Returns:
            The response text, or ``None`` if it could not be parsed.
        """
        # Format as conversation
        prompt = f"{self.instruction_template}{instruction}{self.response_template}"
        generated = self._generate(
            prompt,
            max_new_tokens=self.max_new_tokens,
            temperature=self._RESPONSE_TEMPERATURE,
            top_p=self._RESPONSE_TOP_P,
        )
        return self._parse_response(generated)

    @staticmethod
    def _cut_at(text, markers):
        """Return ``text`` truncated at the first occurrence of any marker."""
        for marker in markers:
            text = text.partition(marker)[0]
        return text

    def _parse_instruction(self, text):
        """Extract instruction from generated text.

        Returns the stripped text between the first user header and the next
        end token / header, or ``None`` when ``text`` is not a string or
        contains no user header.
        """
        if not isinstance(text, str) or self._USER_HEADER not in text:
            return None
        # Content between the first user header and any later one.
        instruction = text.split(self._USER_HEADER)[1]
        # Remove end tokens and anything that follows them.
        instruction = self._cut_at(
            instruction,
            self._END_TOKENS + ("<|start_header_id|>assistant",),
        )
        return instruction.strip()

    def _parse_response(self, text):
        """Extract response from generated text.

        Returns the stripped text after the last assistant header, cut at the
        first end token, or ``None`` when ``text`` is not a string or contains
        no assistant header.
        """
        if not isinstance(text, str) or self._ASSISTANT_HEADER not in text:
            return None
        # Get the last assistant response
        response = text.split(self._ASSISTANT_HEADER)[-1]
        return self._cut_at(response, self._END_TOKENS).strip()

    def generate_pair(self):
        """Generate a single instruction-response pair.

        Returns:
            ``{"instruction": ..., "response": ...}``, or ``None`` when either
            text is missing or not longer than 10 characters.
        """
        instruction = self.generate_instruction()
        if not instruction or len(instruction) <= self._MIN_TEXT_CHARS:
            return None

        response = self.generate_response(instruction)
        if not response or len(response) <= self._MIN_TEXT_CHARS:
            return None

        return {
            "instruction": instruction,
            "response": response
        }

# Build the generator from the loaded model, tokenizer and project config
generator = MagpieGenerator(model=model, tokenizer=tokenizer, config=config)
print("Generator initialized!")

## 4. Test Generation

In [None]:
# Smoke test: generate one pair and print it in full
separator = "=" * 50
print("Testing single generation...")
print(separator)

test_pair = generator.generate_pair()
if not test_pair:
    print("Generation failed. Check model and templates.")
else:
    print(f"Instruction:\n{test_pair['instruction']}")
    print(f"\n{separator}\n")
    print(f"Response:\n{test_pair['response']}")

In [None]:
# Generate five pairs and show the first 100 characters of each instruction
print("Testing 5 generations...")
print("=" * 50)

for attempt in range(1, 6):
    pair = generator.generate_pair()
    if not pair:
        print(f"[{attempt}] Failed")
        continue
    print(f"\n[{attempt}] Instruction: {pair['instruction'][:100]}...")

## 5. Batch Generation with Checkpoints

In [None]:
def save_checkpoint(data, checkpoint_path):
    """Atomically save data to a checkpoint file as UTF-8 JSON.

    The data is first written to ``<checkpoint_path>.tmp`` and then moved over
    the target with ``os.replace``, so an interruption in the middle of a
    write cannot leave a truncated checkpoint behind.  The target directory is
    created if it does not exist yet.

    Args:
        data: JSON-serialisable collection of samples; its length is reported.
        checkpoint_path: Destination file path.

    Raises:
        OSError: If the directory or file cannot be written.
        TypeError: If ``data`` is not JSON-serialisable.
    """
    target_dir = os.path.dirname(checkpoint_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    tmp_path = f"{checkpoint_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, checkpoint_path)
    except BaseException:
        # Do not leave a half-written temp file behind; the previous
        # checkpoint (if any) is still intact at this point.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Checkpoint saved: {len(data)} samples")

def load_checkpoint(checkpoint_path):
    """Return the JSON content of ``checkpoint_path``.

    An empty list is returned when no file exists at that path, which lets
    callers treat "no checkpoint yet" as "nothing generated so far".
    """
    if not os.path.exists(checkpoint_path):
        return []

    with open(checkpoint_path, 'r', encoding='utf-8') as checkpoint_file:
        stored = json.load(checkpoint_file)
    return stored

# Output locations: both files live in the raw-data folder named in the config
DATA_PATH = config['paths']['data_raw']
CHECKPOINT_PATH, FINAL_PATH = (
    f"{DATA_PATH}/{file_name}"
    for file_name in ("instructions_checkpoint.json", "instructions_raw.json")
)

In [None]:
# How many samples to generate and how often to checkpoint, from the config
_generation_cfg = config['data_generation']
TARGET_SAMPLES = _generation_cfg['target_raw_samples']
CHECKPOINT_INTERVAL = _generation_cfg['checkpoint_interval']

print(f"Target samples: {TARGET_SAMPLES}")
print(f"Checkpoint interval: {CHECKPOINT_INTERVAL}")

In [None]:
# Load existing checkpoint if available
generated_data = load_checkpoint(CHECKPOINT_PATH)
start_idx = len(generated_data)

print(f"Loaded {start_idx} existing samples")
# Clamp at zero so an over-full checkpoint does not report a negative count.
print(f"Remaining: {max(0, TARGET_SAMPLES - start_idx)} samples")

if start_idx:
    # Every session seeds its RNGs with the same fixed SEED, so a resumed
    # session would replay the random stream of the first one and could
    # sample the instructions that are already in the checkpoint again.
    # Offsetting the seed by the number of stored samples gives the resumed
    # run a different, but still reproducible, stream.
    resume_seed = SEED + start_idx
    random.seed(resume_seed)
    torch.manual_seed(resume_seed)
    print(f"Resuming: RNG re-seeded with {resume_seed}")

In [None]:
# Main generation loop
# Generates pairs until TARGET_SAMPLES is reached, checkpointing every
# CHECKPOINT_INTERVAL samples.  The loop gives up after `max_failures`
# consecutive failed attempts (empty result or exception).
failed_count = 0
max_failures = 100  # Stop if too many consecutive failures
last_saved_count = len(generated_data)  # samples known to be in the checkpoint

print(f"\nStarting generation from index {start_idx}...")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*50)

pbar = tqdm(total=TARGET_SAMPLES, initial=start_idx, desc="Generating")

try:
    while len(generated_data) < TARGET_SAMPLES:
        try:
            pair = generator.generate_pair()

            if pair:
                generated_data.append(pair)
                pbar.update(1)
                failed_count = 0

                # Save checkpoint
                if len(generated_data) % CHECKPOINT_INTERVAL == 0:
                    save_checkpoint(generated_data, CHECKPOINT_PATH)
                    last_saved_count = len(generated_data)

                    # Clear GPU cache periodically
                    gc.collect()
                    torch.cuda.empty_cache()
                continue
        except Exception as e:
            # Best effort: report the error and count it as a failed attempt.
            print(f"\nError: {e}")

        failed_count += 1
        if failed_count >= max_failures:
            print(f"\nToo many failures ({max_failures}). Stopping.")
            break
finally:
    # Runs on normal completion, on the failure limit and on interruption
    # (e.g. KeyboardInterrupt), so samples generated since the last periodic
    # checkpoint are not lost for a later resume.
    pbar.close()
    if len(generated_data) > last_saved_count:
        try:
            save_checkpoint(generated_data, CHECKPOINT_PATH)
        except Exception as e:
            # Do not mask the exception that ended the loop, if any.
            print(f"\nCould not save checkpoint on exit: {e}")

print(f"\nEnd time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total generated: {len(generated_data)} samples")

In [None]:
# Save final data
# Writes the full in-memory list to FINAL_PATH using the same helper (and thus
# the same JSON format) as the periodic checkpoints; the checkpoint file
# itself is left untouched by this cell.
save_checkpoint(generated_data, FINAL_PATH)
print(f"\nFinal data saved to: {FINAL_PATH}")

## 6. Data Statistics

In [None]:
# Calculate statistics
import numpy as np


def _print_length_stats(title, lengths):
    """Print mean, min, max and median of ``lengths`` under ``title``.

    An empty list is reported explicitly, because ``np.min`` / ``np.max``
    raise ``ValueError`` on empty input.
    """
    print(f"\n{title} length (words):")
    if not lengths:
        print("  (no samples)")
        return
    print(f"  Mean: {np.mean(lengths):.1f}")
    print(f"  Min: {np.min(lengths)}")
    print(f"  Max: {np.max(lengths)}")
    print(f"  Median: {np.median(lengths):.1f}")


# Lengths are measured in whitespace-separated words.
instruction_lengths = [len(d['instruction'].split()) for d in generated_data]
response_lengths = [len(d['response'].split()) for d in generated_data]

print("=" * 50)
print("DATA STATISTICS")
print("=" * 50)
print(f"Total samples: {len(generated_data)}")
_print_length_stats("Instruction", instruction_lengths)
_print_length_stats("Response", response_lengths)

In [None]:
# Print up to three randomly chosen pairs, each field cut to 200 characters
banner = "=" * 50
print("\n" + banner)
print("SAMPLE DATA")
print(banner)

preview_count = min(3, len(generated_data))
previews = random.sample(generated_data, preview_count)
for number, sample in enumerate(previews, start=1):
    print(f"\n--- Sample {number} ---")
    print(f"Instruction: {sample['instruction'][:200]}...")
    print(f"Response: {sample['response'][:200]}...")

## 7. Cleanup

In [None]:
# Drop the notebook's references to the model objects, then run the garbage
# collector and release PyTorch's cached GPU memory
del model, tokenizer, generator
gc.collect()
torch.cuda.empty_cache()

print("Memory cleared!")

## ✅ Generation Complete!

### Summary:
- Generated instruction-response pairs saved to `data/raw/instructions_raw.json`
- Checkpoint available at `data/raw/instructions_checkpoint.json`

### Next Steps:
1. Proceed to `03_quality_filtering.ipynb` for data filtering
2. Target: Filter 15,000 → 10,000 high-quality samples