# 01. Environment Setup
## Synthetic Instruction Tuner - Week 1 Day 1-2

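This notebook sets up the environment for the entire pipeline:
1. GPU verification
2. Library installation
3. Google Drive mount and project structure setup
4. Hugging Face authentication
5. Model loading test
6. Inference test
7. Magpie-style generation test
8. Cleanup
9. Configuration file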

## 1. GPU Verification

In [None]:
# Check GPU availability
# Shell escape: runs the `nvidia-smi` command-line tool and shows its output
# in the cell, so the attached GPU can be confirmed before anything is installed.
!nvidia-smi

In [None]:
import torch

# Ask PyTorch once whether CUDA is usable and reuse the answer below.
cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")

if cuda_ready:
    # Device 0 is the only GPU inspected here.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; report decimal gigabytes.
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 2. Install Required Libraries

In [None]:
# Install core libraries
!pip install -q transformers==4.36.0
!pip install -q peft==0.7.0
!pip install -q trl==0.7.4
!pip install -q datasets==2.16.0
!pip install -q accelerate==0.25.0
!pip install -q bitsandbytes==0.41.3
!pip install -q sentencepiece
!pip install -q protobuf

# Install evaluation library
!pip install -q lm-eval==0.4.0

# Install utilities
!pip install -q jsonlines
!pip install -q huggingface_hub

In [None]:
# Verify installations
# Importing each library proves it is installed; the loop then reports the
# version that was actually picked up, one "<module name>: <version>" per line.
import transformers
import peft
import trl
import datasets
import accelerate

for installed_lib in (transformers, peft, trl, datasets, accelerate):
    print(f"{installed_lib.__name__}: {installed_lib.__version__}")

## 3. Google Drive Mount (for Colab)

In [None]:
# Mount Google Drive
# Colab-only: `google.colab` is imported here, so this cell is expected to
# fail outside Google Colab. The mount point '/content/drive' is the prefix
# that PROJECT_ROOT in the next cell is built on.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Set project paths
import os

# Change this to your Google Drive path
PROJECT_ROOT = "/content/drive/MyDrive/synthetic-instruction-tuner"

# Folders of the project layout, expressed relative to PROJECT_ROOT and
# expanded to full paths in one pass.
directories = [
    f"{PROJECT_ROOT}/{subdir}"
    for subdir in (
        "data/raw",
        "data/filtered",
        "data/preference",
        "models/sft",
        "models/dpo",
        "evaluation/results",
        "evaluation/figures",
    )
]

# exist_ok=True makes this safe to re-run: existing folders are left alone.
for dir_path in directories:
    os.makedirs(dir_path, exist_ok=True)
    print(f"Created: {dir_path}")

print("\nProject structure ready!")

## 4. Hugging Face Authentication

In [None]:
from huggingface_hub import login, whoami

# Login to Hugging Face
# Get your token from: https://huggingface.co/settings/tokens
# Make sure you have accepted Llama 3.1 and 3.2 licenses
# (`whoami` is imported here but first used by the verification cell below.)

# Called without arguments, so no token is ever written into the notebook.
login()  # This will prompt for your token

In [None]:
# Verify login
# Only the account name is printed: cell outputs are stored inside the notebook
# and travel with it when it is shared or committed, so the account e-mail
# address is deliberately kept out of the output.
try:
    user_info = whoami()
    print(f"Logged in as: {user_info['name']}")
except Exception as e:
    # Best-effort check: report the failure and let the user retry the login
    # cell instead of aborting the notebook here.
    print(f"Login failed: {e}")
    print("Please run the login cell again with a valid token.")

## 5. Test Model Loading (Llama-3.1-8B-Instruct)

**Important**: Before running this, make sure you have:
1. Accepted the Llama 3.1 license at https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
2. Logged in to Hugging Face (cell above)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Model ID for data generation
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit quantization config for memory efficiency.
# The settings are gathered in a mapping first and expanded into the config
# object in a single call.
_bnb_settings = {
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
}
quantization_config = BitsAndBytesConfig(**_bnb_settings)

# Nothing is downloaded in this cell; these lines only announce the next step.
print(f"Loading {MODEL_ID}...")
print("This may take a few minutes on first download.")

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse the EOS token for padding (presumably because this tokenizer defines
# no pad token of its own - TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): vocab_size may not count added special tokens; len(tokenizer)
# would - confirm which number is wanted here.
print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

In [None]:
# Load model with 4-bit quantization
# `trust_remote_code` is intentionally left at its default (False): that flag
# lets code shipped inside the model repository run on this machine, and the
# Llama architecture is implemented in transformers itself, so it is not needed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,  # built in the cell above
    device_map="auto",  # let accelerate decide where the weights are placed
)

# Quick report of what was loaded and where it ended up.
print("Model loaded successfully!")
print(f"Model dtype: {model.dtype}")
print(f"Device: {model.device}")

In [None]:
# Check GPU memory usage
# Both counters come from PyTorch in bytes and are shown as decimal gigabytes.
if torch.cuda.is_available():
    BYTES_PER_GB = 1e9
    allocated = torch.cuda.memory_allocated() / BYTES_PER_GB
    reserved = torch.cuda.memory_reserved() / BYTES_PER_GB
    for label, gigabytes in (("Allocated", allocated), ("Reserved", reserved)):
        print(f"GPU Memory {label}: {gigabytes:.2f} GB")

## 6. Test Inference

In [None]:
# Simple inference test
test_prompt = "What is machine learning?"

# A single-turn conversation: one user message holding the test prompt.
messages = [
    {"role": "user", "content": test_prompt}
]

# Format with chat template
# tokenize=False asks for the formatted prompt as text rather than token ids;
# the text is tokenized separately in the next cell.
input_text = tokenizer.apply_chat_template(
    messages, 
    tokenize=False, 
    add_generation_prompt=True
)

# Show the exact prompt string that the next cell feeds to the model.
print("Input format:")
print(input_text)
print("="*50)

In [None]:
# Generate response
# `input_text` was produced by the chat template, which already emits the
# special tokens (including the begin-of-text token). Tokenizing with the
# default add_special_tokens=True would prepend a second one, so it is disabled.
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

# generate() returns the prompt followed by the continuation. Skip the prompt
# tokens so that only the newly generated text appears under the
# "Generated response:" label.
prompt_length = inputs["input_ids"].shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print("Generated response:")
print(response)

## 7. Test Magpie-style Generation

The Magpie method uses only the user template prefix to trigger instruction generation.

In [None]:
# Magpie-style template (only the prefix, no actual prompt)
# This triggers the model to generate an instruction
# The string is the begin-of-text token followed by a user header and a blank
# line: the user turn is opened but its content is left for the model to write.
magpie_template = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"

# repr() keeps the trailing "\n\n" visible instead of printing blank lines.
print("Magpie template:")
print(repr(magpie_template))

In [None]:
# Generate instruction using Magpie method
# `magpie_template` already starts with "<|begin_of_text|>", so the tokenizer
# must not add its own special tokens - otherwise the prompt would begin with
# two begin-of-text tokens.
inputs = tokenizer(
    magpie_template,
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.9,  # Higher temperature for diversity
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

# Special tokens are kept on purpose: the parsing cell below relies on the
# header / end-of-turn markers to locate the instruction.
generated = tokenizer.decode(outputs[0], skip_special_tokens=False)
print("Magpie-generated instruction:")
print(generated)

In [None]:
# Parse the generated instruction
def parse_magpie_output(output_text):
    """Extract the generated instruction from a Magpie-style generation.

    The text following the first user header is cut at whichever turn
    terminator occurs first, so a completion that runs on into an assistant
    turn never leaks that turn into the instruction.

    Args:
        output_text: Decoded model output with special tokens kept.

    Returns:
        The instruction with surrounding whitespace stripped (may be an empty
        string), or None when ``output_text`` is empty/None or contains no
        user header.
    """
    user_header = "<|start_header_id|>user<|end_header_id|>"
    # Markers that end the user turn; the earliest one found wins.
    terminators = (
        "<|eot_id|>",
        "<|start_header_id|>assistant",
        "<|end_of_text|>",
    )

    if not output_text or user_header not in output_text:
        return None

    # Text between the first user header and the next one (or the end).
    instruction_part = output_text.split(user_header)[1]

    # Positions of every terminator that is present; cut at the smallest so
    # the result does not depend on the order the markers are listed in.
    cut_positions = [
        position
        for position in map(instruction_part.find, terminators)
        if position != -1
    ]
    if cut_positions:
        instruction_part = instruction_part[:min(cut_positions)]

    return instruction_part.strip()

# Run the parser on the text generated by the Magpie cell above. The second
# print shows "None" when the parser found no user header in that text.
instruction = parse_magpie_output(generated)
print("Parsed instruction:")
print(instruction)

## 8. Cleanup

In [None]:
# Free GPU memory
import gc

# Drop every notebook-level reference that can keep GPU tensors alive: the
# model and tokenizer plus the `inputs` / `outputs` left over from the
# generation cells. Removing the names via pop() instead of `del` keeps this
# cell re-runnable - a name that is already gone is simply skipped.
for _name in ("model", "tokenizer", "inputs", "outputs"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects first, then hand cached blocks back.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Memory cleared!")
if torch.cuda.is_available():
    print(f"GPU Memory Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

## 9. Configuration File

In [None]:
# Save project configuration
import json

# Every section is built on its own and assembled at the end; the order of the
# keys in `config` is the order they appear in the written JSON file.

# Paths
paths_section = {
    "project_root": PROJECT_ROOT,
    "data_raw": f"{PROJECT_ROOT}/data/raw",
    "data_filtered": f"{PROJECT_ROOT}/data/filtered",
    "data_preference": f"{PROJECT_ROOT}/data/preference",
    "models_sft": f"{PROJECT_ROOT}/models/sft",
    "models_dpo": f"{PROJECT_ROOT}/models/dpo",
    "evaluation_results": f"{PROJECT_ROOT}/evaluation/results",
    "evaluation_figures": f"{PROJECT_ROOT}/evaluation/figures",
}

# Models
models_section = {
    "data_generation": "meta-llama/Llama-3.1-8B-Instruct",
    "fine_tuning_targets": [
        "meta-llama/Llama-3.2-3B",
        "mistralai/Mistral-7B-v0.1",
        "Qwen/Qwen2.5-3B",
    ],
    "reward_model": "OpenAssistant/reward-model-deberta-v3-large-v2",
}

# Data generation settings
generation_section = {
    "target_raw_samples": 15000,
    "target_filtered_samples": 10000,
    "checkpoint_interval": 1000,
    "temperature": 0.9,
    "max_new_tokens": 512,
}

# Quality filtering settings
filtering_section = {
    "min_words": 20,
    "max_words": 500,
    "jaccard_threshold": 0.8,
    "refusal_keywords": [
        "I'm an AI", "I cannot", "I don't have",
        "As an AI", "I'm not able", "I apologize",
    ],
}

# LoRA settings
lora_section = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "lora_dropout": 0.05,
}

# SFT settings
sft_section = {
    "num_epochs": 3,
    "batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-4,
    "warmup_ratio": 0.03,
    "save_steps": 500,
}

# DPO settings
dpo_section = {
    "beta": 0.1,
    "num_epochs": 1,
    "batch_size": 2,
    "gradient_accumulation_steps": 8,
    "learning_rate": 5e-5,
}

# Evaluation benchmarks
evaluation_section = {
    "benchmarks": ["ifeval", "mmlu", "truthfulqa"],
    "batch_size": 4,
}

config = {
    "project_name": "synthetic-instruction-tuner",
    "version": "0.1.0",
    "paths": paths_section,
    "models": models_section,
    "data_generation": generation_section,
    "filtering": filtering_section,
    "lora": lora_section,
    "sft": sft_section,
    "dpo": dpo_section,
    "evaluation": evaluation_section,
}

# Save config
# Serialize to text first, then write it to config.json under PROJECT_ROOT.
config_path = f"{PROJECT_ROOT}/config.json"
config_text = json.dumps(config, indent=2)
with open(config_path, 'w') as config_file:
    config_file.write(config_text)

print(f"Configuration saved to: {config_path}")

## ✅ Setup Complete!

### Checklist:
- [x] GPU verified (T4 16GB)
- [x] Libraries installed
- [x] Google Drive mounted
- [x] Hugging Face authenticated
- [x] Model loading tested
- [x] Magpie-style generation tested
- [x] Configuration saved

### Next Steps:
1. Proceed to `02_magpie_generation.ipynb` for data generation
2. Run overnight for ~8-10 hours to generate 15,000 samples