# 01. Environment Setup
## Synthetic Instruction Tuner - Week 1 Day 1-2

This notebook sets up the environment for the entire pipeline:
1. GPU verification
2. Library installation
3. Hugging Face authentication
4. Model loading test
5. Project structure setup

## 1. GPU Verification

In [2]:
# Check GPU availability
!nvidia-smi

Tue Dec 16 18:53:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
CUDA version: 12.6
GPU: Tesla T4
GPU Memory: 15.83 GB


## 2. Install Required Libraries

In [4]:
# Install core libraries with latest compatible versions
!pip install -q --upgrade transformers>=4.41.0
!pip install -q --upgrade peft>=0.7.0
!pip install -q --upgrade trl>=0.7.4
!pip install -q --upgrade datasets>=2.16.0
!pip install -q --upgrade accelerate>=0.25.0
!pip install -q --upgrade bitsandbytes>=0.41.3
!pip install -q sentencepiece protobuf

# Install evaluation library
!pip install -q lm-eval

# Install utilities
!pip install -q jsonlines huggingface_hub

print("All libraries installed successfully!")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.6/293.6 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.1/91.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:0

In [5]:
# Verify installations and check versions
import transformers
import peft
import trl
import datasets
import accelerate

print("Installed versions:")
print("=" * 50)
print(f"transformers: {transformers.__version__}")
print(f"peft: {peft.__version__}")
print(f"trl: {trl.__version__}")
print(f"datasets: {datasets.__version__}")
print(f"accelerate: {accelerate.__version__}")
print("=" * 50)
print("\n✅ All libraries installed and verified!")

Installed versions:
transformers: 4.57.3
peft: 0.18.0
trl: 0.26.1
datasets: 4.4.1
accelerate: 1.12.0

✅ All libraries installed and verified!


In [6]:
# Verify installations
import transformers
import peft
import trl
import datasets
import accelerate

print(f"transformers: {transformers.__version__}")
print(f"peft: {peft.__version__}")
print(f"trl: {trl.__version__}")
print(f"datasets: {datasets.__version__}")
print(f"accelerate: {accelerate.__version__}")

transformers: 4.57.3
peft: 0.18.0
trl: 0.26.1
datasets: 4.4.1
accelerate: 1.12.0


## 3. Google Drive Mount (for Colab)

In [7]:
# Mount Google Drive
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login

drive.mount('/content/drive')

login(token=userdata.get('HF_TOKEN'))

Mounted at /content/drive


In [8]:
# Set project paths
import os

# Change this to your Google Drive path
PROJECT_ROOT = "/content/drive/MyDrive/synthetic-instruction-tuner"

# Create project directories if they don't exist
directories = [
    f"{PROJECT_ROOT}/data/raw",
    f"{PROJECT_ROOT}/data/filtered",
    f"{PROJECT_ROOT}/data/preference",
    f"{PROJECT_ROOT}/models/sft",
    f"{PROJECT_ROOT}/models/dpo",
    f"{PROJECT_ROOT}/evaluation/results",
    f"{PROJECT_ROOT}/evaluation/figures",
]

for dir_path in directories:
    os.makedirs(dir_path, exist_ok=True)
    print(f"Created: {dir_path}")

print("\nProject structure ready!")

Created: /content/drive/MyDrive/synthetic-instruction-tuner/data/raw
Created: /content/drive/MyDrive/synthetic-instruction-tuner/data/filtered
Created: /content/drive/MyDrive/synthetic-instruction-tuner/data/preference
Created: /content/drive/MyDrive/synthetic-instruction-tuner/models/sft
Created: /content/drive/MyDrive/synthetic-instruction-tuner/models/dpo
Created: /content/drive/MyDrive/synthetic-instruction-tuner/evaluation/results
Created: /content/drive/MyDrive/synthetic-instruction-tuner/evaluation/figures

Project structure ready!


## 4. Hugging Face Authentication

In [14]:
# Verify login
from huggingface_hub import whoami

try:
    user_info = whoami()
    print(f"Logged in as: {user_info['name']}")
    print(f"Email: {user_info.get('email', 'N/A')}")
except Exception as e:
    print(f"Login failed: {e}")
    print("Please run the login cell again with a valid token.")

Logged in as: Changyong3
Email: N/A


In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Model ID for data generation
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit quantization config for memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

print(f"Loading {MODEL_ID}...")
print("This may take a few minutes on first download.")

Loading meta-llama/Llama-3.1-8B-Instruct...
This may take a few minutes on first download.


In [16]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

Tokenizer loaded. Vocab size: 128000


In [17]:
# Load model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)

print(f"Model loaded successfully!")
print(f"Model dtype: {model.dtype}")
print(f"Device: {model.device}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully!
Model dtype: torch.float16
Device: cuda:0


In [18]:
# Check GPU memory usage
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"GPU Memory Allocated: {allocated:.2f} GB")
    print(f"GPU Memory Reserved: {reserved:.2f} GB")

GPU Memory Allocated: 9.31 GB
GPU Memory Reserved: 12.69 GB


## 6. Test Inference

In [19]:
# Simple inference test
test_prompt = "What is machine learning?"

messages = [
    {"role": "user", "content": test_prompt}
]

# Format with chat template
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print("Input format:")
print(input_text)
print("="*50)

Input format:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is machine learning?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [20]:
# Generate response
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:")
print(response)

Generated response:
system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

What is machine learning?assistant

Machine learning is a subset of artificial intelligence (AI) that involves the use of algorithms and statistical models to enable computers to learn from data, make decisions, and improve their performance on a task without being explicitly programmed. In other words, machine learning allows computers to learn from experience and improve their performance over time.

Machine learning involves training algorithms on large datasets to identify patterns, relationships, and trends, and to make predictions or decisions based on that analysis. The goal of machine learning is to enable computers to learn from data and make accurate predictions or decisions, rather than being explicitly programmed to perform a specific task.

There are several types of machine learning, including:

1. **Supervised learning**: This type of machine learning involves training algori

## 7. Test Magpie-style Generation

The Magpie method uses only the user template prefix to trigger instruction generation.

In [22]:
# Magpie-style template (only the prefix, no actual prompt)
# This triggers the model to generate an instruction
magpie_template = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"

print("Magpie template:")
print(repr(magpie_template))

Magpie template:
'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n'


In [23]:
# Generate instruction using Magpie method
inputs = tokenizer(magpie_template, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.9,  # Higher temperature for diversity
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

generated = tokenizer.decode(outputs[0], skip_special_tokens=False)
print("Magpie-generated instruction:")
print(generated)

Magpie-generated instruction:
<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

## Step 1: Understand the problem
The problem asks us to find the sum of the squares of all the numbers from 1 to 100.

## Step 2: Recall the formula for the sum of squares of the first n natural numbers
The sum of squares of the first n natural numbers is given by the formula: 1^2 + 2^2 + 3^2 +... + n^2 = n * (n + 1) * (2n + 1) / 6.

## Step 3: Apply the formula to find the sum of squares of numbers from 1 to 100
Using the formula, we can substitute n = 100 to get the sum of squares of numbers from 1 to 100.

## Step 4: Calculate the sum
Substituting n = 100 in the formula, we get: sum = 100 * (100 + 1) * (2 * 100 + 1) / 6 = 100 * 101 * 201 / 6.

## Step 5: Perform the arithmetic
Performing the arithmetic, we get: sum = 100 * 101 * 201 = 2020200, then dividing by 6 gives


In [24]:
# Parse the generated instruction
def parse_magpie_output(output_text):
    """Extract instruction from Magpie-style generation."""
    # Remove the template prefix
    if "<|start_header_id|>user<|end_header_id|>" in output_text:
        parts = output_text.split("<|start_header_id|>user<|end_header_id|>")
        if len(parts) > 1:
            instruction_part = parts[1]
            # Extract until end of turn or assistant header
            if "<|eot_id|>" in instruction_part:
                instruction = instruction_part.split("<|eot_id|>")[0].strip()
            elif "<|start_header_id|>assistant" in instruction_part:
                instruction = instruction_part.split("<|start_header_id|>assistant")[0].strip()
            else:
                instruction = instruction_part.strip()
            return instruction
    return None

instruction = parse_magpie_output(generated)
print("Parsed instruction:")
print(instruction)

Parsed instruction:
## Step 1: Understand the problem
The problem asks us to find the sum of the squares of all the numbers from 1 to 100.

## Step 2: Recall the formula for the sum of squares of the first n natural numbers
The sum of squares of the first n natural numbers is given by the formula: 1^2 + 2^2 + 3^2 +... + n^2 = n * (n + 1) * (2n + 1) / 6.

## Step 3: Apply the formula to find the sum of squares of numbers from 1 to 100
Using the formula, we can substitute n = 100 to get the sum of squares of numbers from 1 to 100.

## Step 4: Calculate the sum
Substituting n = 100 in the formula, we get: sum = 100 * (100 + 1) * (2 * 100 + 1) / 6 = 100 * 101 * 201 / 6.

## Step 5: Perform the arithmetic
Performing the arithmetic, we get: sum = 100 * 101 * 201 = 2020200, then dividing by 6 gives


## 8. Cleanup

In [25]:
# Free GPU memory
import gc

del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

print("Memory cleared!")
if torch.cuda.is_available():
    print(f"GPU Memory Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

Memory cleared!
GPU Memory Allocated: 0.01 GB


## 9. Configuration File

In [26]:
# Save project configuration
import json

config = {
    "project_name": "synthetic-instruction-tuner",
    "version": "0.1.0",

    # Paths
    "paths": {
        "project_root": PROJECT_ROOT,
        "data_raw": f"{PROJECT_ROOT}/data/raw",
        "data_filtered": f"{PROJECT_ROOT}/data/filtered",
        "data_preference": f"{PROJECT_ROOT}/data/preference",
        "models_sft": f"{PROJECT_ROOT}/models/sft",
        "models_dpo": f"{PROJECT_ROOT}/models/dpo",
        "evaluation_results": f"{PROJECT_ROOT}/evaluation/results",
        "evaluation_figures": f"{PROJECT_ROOT}/evaluation/figures"
    },

    # Models
    "models": {
        "data_generation": "meta-llama/Llama-3.1-8B-Instruct",
        "fine_tuning_targets": [
            "meta-llama/Llama-3.2-3B",
            "mistralai/Mistral-7B-v0.1",
            "Qwen/Qwen2.5-3B"
        ],
        "reward_model": "OpenAssistant/reward-model-deberta-v3-large-v2"
    },

    # Data generation settings (optimized for Colab free tier)
    "data_generation": {
        "target_raw_samples": 1500,
        "target_filtered_samples": 1000,
        "checkpoint_interval": 20,
        "temperature": 0.9,
        "max_new_tokens": 512
    },

    # Quality filtering settings
    "filtering": {
        "min_words": 20,
        "max_words": 500,
        "jaccard_threshold": 0.8,
        "refusal_keywords": [
            "I'm an AI", "I cannot", "I don't have",
            "As an AI", "I'm not able", "I apologize"
        ]
    },

    # LoRA settings
    "lora": {
        "r": 8,
        "lora_alpha": 16,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "lora_dropout": 0.05
    },

    # SFT settings
    "sft": {
        "num_epochs": 3,
        "batch_size": 4,
        "gradient_accumulation_steps": 4,
        "learning_rate": 2e-4,
        "warmup_ratio": 0.03,
        "save_steps": 500
    },

    # DPO settings
    "dpo": {
        "beta": 0.1,
        "num_epochs": 1,
        "batch_size": 2,
        "gradient_accumulation_steps": 8,
        "learning_rate": 5e-5
    },

    # Evaluation benchmarks
    "evaluation": {
        "benchmarks": ["ifeval", "mmlu", "truthfulqa"],
        "batch_size": 4
    }
}

# Save config
config_path = f"{PROJECT_ROOT}/config.json"
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

print(f"Configuration saved to: {config_path}")

Configuration saved to: /content/drive/MyDrive/synthetic-instruction-tuner/config.json


## ✅ Setup Complete!

### Checklist:
- [x] GPU verified (T4 16GB)
- [x] Libraries installed
- [x] Google Drive mounted
- [x] Hugging Face authenticated
- [x] Model loading tested
- [x] Magpie-style generation tested
- [x] Configuration saved

### Next Steps:
1. Proceed to `02_magpie_generation.ipynb` for data generation
2. Run across 3 days for ~16-17 hours total to generate 1,500 samples (optimized for Colab free tier)
3. Follow the checkpoint-based generation strategy to handle Colab's 12h runtime limit