In [None]:
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import json
import time
import sys
import os

# Gemini is the backend of choice here: free to use and very good.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Without a key nothing below can work, so report the problem and exit with
# a non-zero status before touching the SDK.
if not GOOGLE_API_KEY:
    print(
        "Error: GOOGLE_API_KEY environment variable not found.",
        "Please set the environment variable before running the script.",
        sep="\n",
    )
    sys.exit(1)

genai.configure(api_key=GOOGLE_API_KEY)

# Model handle shared by every generation call in this script.
model = genai.GenerativeModel('gemini-3-flash-preview')

# Optional: map each listed harm category to BLOCK_NONE. The mapping is
# passed as `safety_settings` on every generate_content() call.
safety_settings = {
    category: HarmBlockThreshold.BLOCK_NONE
    for category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

# Diverse prompt templates used to generate varied training data.
# There are 32 templates, four in each of the eight categories below. Each
# template is a complete instruction that is sent to the model as-is (with a
# formatting suffix appended by generate_example). generate_dataset picks
# templates by index modulo the list length, so requesting more examples than
# there are templates reuses the same prompts in the same order.
GENERATION_PROMPTS = [
    # Reasoning and problem-solving
    "Create a challenging math word problem involving percentages and provide a detailed step-by-step solution.",
    "Generate a logic puzzle with clear constraints and walk through the reasoning to solve it.",
    "Create a physics problem about kinematics and show all the steps to solve it with explanations.",
    "Design a probability question involving combinations or permutations and solve it step-by-step.",
    
    # Programming and algorithms
    "Write a coding interview question about binary trees and provide a well-commented Python solution.",
    "Create a data structures question about hash tables and explain the solution with time/space complexity.",
    "Generate a dynamic programming problem and provide both recursive and optimized solutions.",
    "Design an algorithm question about graph traversal and explain the approach clearly.",
    
    # Explanations and teaching
    "Explain how photosynthesis works at the molecular level in a way that's clear but detailed.",
    "Describe the difference between supervised and unsupervised learning with concrete examples.",
    "Explain how DNS works from when you type a URL to getting the webpage, step by step.",
    "Teach the concept of recursion using a simple example and explain the base case and recursive case.",
    
    # Analysis and comparison
    "Compare and contrast democracy and republicanism, highlighting key differences and similarities.",
    "Analyze the pros and cons of different sorting algorithms (quicksort, mergesort, heapsort).",
    "Explain the tradeoffs between SQL and NoSQL databases with specific use cases.",
    "Compare renewable energy sources (solar, wind, hydro) in terms of efficiency and scalability.",
    
    # Creative and practical
    "Provide a detailed recipe for homemade pasta with tips for getting the texture right.",
    "Explain how to debug a program systematically when you encounter an error.",
    "Describe how to prepare for a technical interview, including what to study and practice.",
    "Give advice on how to learn a new programming language efficiently.",
    
    # Science and nature
    "Explain how black holes form and what happens at the event horizon.",
    "Describe the process of protein synthesis from DNA to functional protein.",
    "Explain how climate change affects ocean currents and weather patterns.",
    "Describe how vaccines work at the cellular and molecular level.",
    
    # History and social sciences
    "Explain the causes and consequences of the Industrial Revolution.",
    "Describe how economic inflation works and what central banks do to control it.",
    "Explain the psychological concept of cognitive dissonance with real-world examples.",
    "Describe how supply and demand determines market prices with a specific example.",
    
    # Technical writing and documentation
    "Write clear documentation for a REST API endpoint including parameters and examples.",
    "Explain how to set up a virtual environment in Python and why it's important.",
    "Describe best practices for writing clean, maintainable code with examples.",
    "Explain the SOLID principles in software engineering with concrete examples.",
]

def _parse_labeled_response(content):
    """Split a ``QUESTION: ... ANSWER: ...`` reply into its two parts.

    Only the text after the first ``QUESTION:`` label is considered, and it
    is split at the first ``ANSWER:`` label that follows. Any preamble
    before ``QUESTION:`` is discarded, and later occurrences of either label
    inside the body text are left untouched.

    Args:
        content: Raw text returned by the model.

    Returns:
        A ``(question, answer)`` tuple of stripped, non-empty strings, or
        ``None`` when a label is missing, the labels are out of order, or
        either part is empty.
    """
    _, question_label, remainder = content.partition("QUESTION:")
    question, answer_label, answer = remainder.partition("ANSWER:")
    question = question.strip()
    answer = answer.strip()
    if not (question_label and answer_label and question and answer):
        return None
    return question, answer


def generate_example(prompt_template, max_retries=1):
    """Generate a single training example using Gemini.

    Sends ``prompt_template`` (plus formatting instructions) to the
    module-level ``model`` and converts the reply into a chat-style record.

    Args:
        prompt_template: Instruction describing the example to generate.
        max_retries: How many extra attempts to make after a rate-limit
            (HTTP 429) error. Negative values are treated as 0. Other errors
            are never retried.

    Returns:
        ``{"user": ..., "assistant": ...}`` on success. When the reply does
        not follow the QUESTION/ANSWER format, ``prompt_template`` is used as
        the user turn and the whole reply as the assistant turn. Returns
        ``None`` when the response is blocked or empty, when a non-retryable
        error occurs, or when all rate-limit retries are exhausted.
    """
    # NOTE: the indentation of the continuation lines is part of the prompt
    # string and is kept exactly as it has always been sent to the model.
    full_prompt = f"""{prompt_template}

        Format your response as:
        QUESTION: [the question/instruction]
        ANSWER: [detailed answer]"""

    for _ in range(max(0, max_retries) + 1):
        try:
            response = model.generate_content(
                full_prompt,
                safety_settings=safety_settings,
            )

            # Check if response contains text (handles cases where Gemini
            # blocks output); reading .text stays inside the try so an SDK
            # error there is reported below instead of propagating.
            if not response.parts:
                print("Response was blocked or empty.")
                return None

            content = response.text
        except Exception as e:
            # Top-level boundary for SDK/network failures: report and decide
            # whether the request is worth retrying.
            print(f"Error generating example: {e}")
            if "429" not in str(e):
                return None
            # Rate limited: back off, then retry instead of silently
            # dropping this prompt.
            print("Rate limit hit, sleeping for 10 seconds...")
            time.sleep(10)
            continue

        parsed = _parse_labeled_response(content)
        if parsed is not None:
            question, answer = parsed
            return {
                "user": question,
                "assistant": answer
            }

        # A whitespace-only reply carries no training signal.
        if not content.strip():
            print("Response was blocked or empty.")
            return None

        # Fallback: treat entire response as answer to the prompt
        return {
            "user": prompt_template,
            "assistant": content
        }

    # Every attempt hit the rate limit.
    return None

def _write_jsonl(examples, output_file):
    """Overwrite ``output_file`` with one JSON object per line."""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(ex) + '\n' for ex in examples)


def generate_dataset(num_examples=100, output_file="synthetic_sft_data.jsonl",
                     delay_seconds=4):
    """Generate a dataset of training examples and save it as JSON Lines.

    Prompts are taken from ``GENERATION_PROMPTS`` in order, wrapping around
    when ``num_examples`` exceeds the number of templates. Attempts for which
    ``generate_example`` returns a falsy value are skipped, so the result may
    contain fewer than ``num_examples`` records.

    Args:
        num_examples: Number of generation attempts to make.
        output_file: Path of the JSONL file; it is overwritten on every save.
        delay_seconds: Pause between consecutive requests, for rate limiting.

    Returns:
        The list of successfully generated example dicts.
    """
    examples = []

    print(f"Generating {num_examples} training examples using Gemini...")

    try:
        for i in range(num_examples):
            # Cycle through prompts to ensure diversity
            prompt = GENERATION_PROMPTS[i % len(GENERATION_PROMPTS)]

            print(f"Generating example {i+1}/{num_examples}...")
            example = generate_example(prompt)

            if example:
                examples.append(example)

            # Save checkpoint every 10 examples
            if (i + 1) % 10 == 0:
                _write_jsonl(examples, output_file)
                print(f"Checkpoint: Saved {len(examples)} examples to {output_file}")

            # Rate limiting
            # Gemini Free tier has limits (15 RPM for Flash, 2 RPM for Pro).
            # Adjust delay_seconds accordingly. 4 seconds is safe for 15 RPM.
            # No pause is needed once the last request has been made.
            if i + 1 < num_examples:
                time.sleep(delay_seconds)
    finally:
        # Final save. Running it in `finally` also persists whatever was
        # collected since the last checkpoint when the run is interrupted
        # (e.g. Ctrl-C / kernel interrupt) or fails with an unexpected error.
        _write_jsonl(examples, output_file)

    print(f"\nComplete! Generated {len(examples)} examples saved to {output_file}")
    return examples

if __name__ == "__main__":
    # Script entry point: make 200 generation attempts and write the results
    # to generate_dataset's default output file.
    # Note: Depending on your tier (Free vs Paid), 200 examples might take 10-20 mins
    # because generate_dataset pauses between requests for rate limiting.
    generate_dataset(num_examples=200)

Generating 200 training examples using Gemini...
Generating example 1/200...
Generating example 2/200...
Generating example 3/200...
Generating example 4/200...
Generating example 5/200...
Generating example 6/200...
Generating example 7/200...
Generating example 8/200...
Generating example 9/200...
Generating example 10/200...
Checkpoint: Saved 10 examples to synthetic_sft_data.jsonl
Generating example 11/200...
Generating example 12/200...
Generating example 13/200...
Generating example 14/200...
Generating example 15/200...
Generating example 16/200...
Generating example 17/200...
