In [None]:
# Install the third-party packages this notebook imports (pandas, numpy) using uv's pip interface
!uv pip install pandas numpy


# Synthetic Dataset Generator for Arize AX

This notebook generates a synthetic dataset that follows Arize AX dataset conventions. The number of rows is controlled by the `NUM_ROWS` constant in the configuration cell below.

In [None]:
import pandas as pd
import json
import uuid
import random
from datetime import datetime, timedelta
import numpy as np
import os


In [None]:
# Configuration: output dataset name and number of rows to generate
DATASET_NAME = "Synthetic_Dataset"
NUM_ROWS = 2_000_000

# Prompt template shared by every row. The {persona}, {problem} and {context}
# placeholders are substituted with str.format when a row is built.
PROMPT_TEMPLATE = (
    "You are a {persona}. Help solve this problem: {problem}.\n"
    "Context: {context}\n"
    "Please provide a detailed response."
)

In [None]:
# Sample data pools for generating synthetic content.
# Each row draws one entry from each pool with random.choice.

# Roles substituted into the {persona} placeholder of the prompt template
personas = [
    "software engineer", "data scientist", "product manager", "designer",
    "marketing specialist", "financial analyst", "teacher", "consultant",
    "researcher", "entrepreneur", "writer", "analyst"
]

# Tasks substituted into the {problem} placeholder
problems = [
    "optimizing system performance", "analyzing customer data", "improving user experience",
    "reducing operational costs", "increasing team productivity", "developing new features",
    "understanding market trends", "solving technical debt", "improving communication",
    "scaling infrastructure", "enhancing security", "streamlining processes"
]

# Situations substituted into the {context} placeholder
# (a stray backtick after the last entry previously made this cell a SyntaxError)
contexts = [
    "startup environment with limited resources", "enterprise setting with compliance requirements",
    "remote team collaboration", "fast-paced development cycle", "customer-facing application",
    "data-driven decision making", "cross-functional project", "legacy system modernization",
    "mobile-first approach", "cloud migration project", "AI/ML implementation", "security audit"
]

# Opening sentences used as the start of each synthetic model output
sample_outputs = [
    "Here's a comprehensive approach to address this challenge...",
    "Based on the context provided, I recommend the following steps...",
    "To solve this problem effectively, consider these key factors...",
    "The best strategy would be to implement a phased approach...",
    "After analyzing the situation, here are my recommendations..."
]

In [None]:
def generate_synthetic_row(index):
    """Build one synthetic dataset record as a flat dictionary.

    A persona, problem and context are drawn at random from the module-level
    pools, the prompt template is rendered with them, and a canned output plus
    randomized metadata (token counts, latency, cost) is attached.

    Args:
        index: Position of the row being generated. The body does not read it.

    Returns:
        dict: A record with the keys ``id``, the two
        ``attributes.llm.prompt_template.*`` fields, ``input``, ``output``,
        ``timestamp``, ``model_name``, ``token_count_input``,
        ``token_count_output``, ``latency_ms`` and ``cost_usd``.
    """
    # One draw per pool, in the order persona -> problem -> context.
    persona, problem, context = (
        random.choice(pool) for pool in (personas, problems, contexts)
    )
    prompt_variables = dict(persona=persona, problem=problem, context=context)

    # Canned opening sentence followed by a sentence echoing the chosen values.
    output_base = random.choice(sample_outputs)
    output = f"{output_base} For a {persona} dealing with {problem} in a {context}, the key considerations are performance, scalability, and user satisfaction."

    # Timestamp is the current (naive) time shifted back by 0-30 whole days.
    now = datetime.now()
    age = timedelta(days=random.randint(0, 30))
    timestamp = now - age

    serialized_variables = json.dumps(prompt_variables)
    rendered_prompt = PROMPT_TEMPLATE.format(**prompt_variables)

    row = {
        "id": str(uuid.uuid4()),
        "attributes.llm.prompt_template.template": PROMPT_TEMPLATE,
        "attributes.llm.prompt_template.variables": serialized_variables,
        "input": rendered_prompt,
        "output": output,
        "timestamp": timestamp.isoformat(),
        "model_name": "gpt-4",
        "token_count_input": random.randint(50, 200),
        "token_count_output": random.randint(100, 300),
        "latency_ms": random.randint(500, 3000),
        "cost_usd": round(random.uniform(0.001, 0.05), 4),
    }
    return row

In [None]:
# Generate the synthetic dataset
print(f"Generating {NUM_ROWS} synthetic rows...")

# Report progress about 20 times per run instead of every 100 rows: with
# millions of rows a fixed 100-row interval prints tens of thousands of lines
# and floods the cell output. Runs of 2,000 rows or fewer keep the original
# 100-row interval.
progress_interval = max(100, NUM_ROWS // 20)

data = []
for i in range(NUM_ROWS):
    if i % progress_interval == 0:
        print(f"Generated {i} rows...")
    data.append(generate_synthetic_row(i))

print(f"Generated {len(data)} rows successfully!")

In [None]:
# Turn the list of generated row dicts into a DataFrame
df = pd.DataFrame(data=data)

# Preview the first three rows (the last expression is rendered by the notebook)
print("Sample rows:")
df.head(n=3)

In [None]:
# Write the dataset to datasets/<DATASET_NAME>_<NUM_ROWS>_rows.csv
output_filename = os.path.join("datasets", f"{DATASET_NAME}_{NUM_ROWS}_rows.csv")

# Create the target directory on first run; a no-op when it already exists
os.makedirs(os.path.dirname(output_filename), exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV
df.to_csv(output_filename, index=False)
print(f"Dataset saved to: {output_filename}")