# Smelt AI — Live Demo

Interactive walkthrough of smelt-ai across **OpenAI**, **Anthropic**, and **Google Gemini**.

Tests:
1. Basic classification (all 3 providers)
2. Sentiment analysis with score validation
3. Support ticket triage (complex schema)
4. Parameter tuning (temperature, top_p)
5. Batch configuration (batch_size, concurrency)
6. Error handling (stop_on_exhaustion)
7. Async execution
8. 2026 generation models (GPT-5.2, Claude Sonnet 4.6, Opus 4.6, Gemini 3 Flash, Gemini 3 Pro)

## Setup

In [None]:
import os
import csv
import time
from pathlib import Path
from typing import Literal

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from smelt import Model, Job, SmeltResult, SmeltMetrics, BatchError
from smelt.errors import SmeltExhaustionError

# Load variables from a local .env file (python-dotenv) before reading the keys.
load_dotenv()

OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY")
GEMINI_KEY = os.environ.get("GEMINI_API_KEY")

# Report only whether each key is present; the secret itself is never printed.
for _label, _key in (
    ("OpenAI key:   ", OPENAI_KEY),
    ("Anthropic key:", ANTHROPIC_KEY),
    ("Gemini key:   ", GEMINI_KEY),
):
    print(f"{_label} {'set' if _key else 'MISSING'}")

## Load Test Data

In [None]:
# Fixture directory, resolved relative to the notebook's working directory.
DATA_DIR = Path("..") / "tests" / "data"


def load_csv(filename: str) -> list[dict[str, str]]:
    """Read a CSV fixture and return its rows as dictionaries.

    Args:
        filename: File name joined onto ``DATA_DIR``.

    Returns:
        One dict per data row, keyed by the CSV header names.
    """
    csv_path = DATA_DIR / filename
    # newline="" lets the csv module handle line endings itself.
    with csv_path.open(newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        rows = [record for record in reader]
    return rows


# The three fixture tables used by the sections that follow.
companies = load_csv("companies.csv")
products = load_csv("products.csv")
tickets = load_csv("support_tickets.csv")

# Row counts, with headings padded so the numbers line up.
for _heading, _table in (
    ("Companies:", companies),
    ("Products: ", products),
    ("Tickets:  ", tickets),
):
    print(f"{_heading} {len(_table)} rows")
print()
# Peek at the first record to show the column names.
print("Sample company:", companies[0])

## Define Output Models

In [None]:
class IndustryClassification(BaseModel):
    """Classification of a company by industry sector."""

    # The docstring and field descriptions are kept verbatim: pydantic includes
    # them in the generated schema (presumably what smelt shows the LLM — confirm).
    sector: str = Field(
        description="Primary industry sector",
    )
    sub_sector: str = Field(
        description="More specific sub-sector",
    )
    is_public: bool = Field(
        description="Whether the company is publicly traded",
    )

class SentimentAnalysis(BaseModel):
    """Sentiment analysis of a product review."""

    # Only `sentiment` is constrained by its type; the score range and theme
    # count are stated in the descriptions but not enforced by the schema.
    sentiment: Literal["positive", "negative", "mixed"] = Field(
        description="Overall sentiment",
    )
    score: float = Field(
        description="Score from 0.0 (negative) to 1.0 (positive)",
    )
    key_themes: list[str] = Field(
        description="Main themes in the review (1-3 items)",
    )


class TicketTriage(BaseModel):
    """Support ticket triage result."""

    # `category` is a closed set, so it is typed as a Literal like `priority`;
    # a plain `str` would accept any spelling or an invented category.
    category: Literal["billing", "technical", "shipping", "account", "general"] = Field(
        description="Category: billing, technical, shipping, account, or general"
    )
    priority: Literal["low", "medium", "high", "urgent"] = Field(description="Priority level")
    requires_human: bool = Field(description="Whether human escalation is needed")
    suggested_response: str = Field(description="Brief suggested response to the customer")


class CompanySummary(BaseModel):
    """Structured company summary."""

    # Free-text fields first, then the one field restricted to fixed labels.
    one_liner: str = Field(
        description="One sentence description",
    )
    industry: str = Field(
        description="Primary industry",
    )
    company_size: Literal["startup", "small", "medium", "large", "enterprise"] = Field(
        description="Size classification based on employee count",
    )
    # Whole years; the prompt that uses this model supplies the reference year.
    age_years: int = Field(
        description="Approximate age in years",
    )


# Visible confirmation that the cell ran and the output-model classes now exist.
print("Output models defined.")

## Helper: Pretty-Print Results

In [4]:
def show_result(label: str, result: SmeltResult, max_rows: int = 3) -> None:
    """Pretty-print a SmeltResult: a metrics banner followed by a row preview.

    Args:
        label: Heading printed at the top of the banner.
        result: Result to display; reads ``success``, ``metrics``, ``errors``
            and ``data``.
        max_rows: Maximum number of rows from ``result.data`` to print. Any
            rows beyond this are summarised in a "... and N more rows" line.
    """
    status = "SUCCESS" if result.success else "FAILED"
    m = result.metrics
    rule = "=" * 70

    print(f"\n{rule}")
    print(f"  {label}")
    print(f"  Status: {status}")
    print(f"  Rows: {m.successful_rows}/{m.total_rows} successful")
    print(f"  Batches: {m.successful_batches}/{m.total_batches} successful")
    print(f"  Tokens: {m.input_tokens:,} in / {m.output_tokens:,} out")
    print(f"  Retries: {m.total_retries} | Time: {m.wall_time_seconds:.2f}s")
    if result.errors:
        print(f"  Errors: {len(result.errors)}")
        for e in result.errors:
            print(f"    - Batch {e.batch_index}: {e.error_type} ({e.attempts} attempts)")
    print(rule)
    print()

    # Print only the preview rows. Previously every row was printed and the
    # "... and N more rows" line was then added on top, which was misleading.
    rows = result.data
    shown = 0
    for i, row in enumerate(rows):
        if i >= max_rows:
            break
        print(f"  [{i}] {row}")
        shown += 1
    remaining = len(rows) - shown
    if remaining > 0:
        print(f"  ... and {remaining} more rows")

---
## 1. Basic Classification — All 3 Providers

Same task, same data, three different LLMs.

In [None]:
# OpenAI — GPT-4.1-mini
model_openai = Model(provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY)

job = Job(
    prompt="Classify each company by its primary industry sector and sub-sector. "
    "Determine if the company is publicly traded.",
    output_model=IndustryClassification,
    batch_size=10,
    stop_on_exhaustion=False,
)

result = await job.arun(model_openai, data=companies)
show_result("OpenAI / gpt-4.1-mini — Company Classification", result)

In [None]:
# Anthropic — Claude Sonnet 4
model_anthropic = Model(provider="anthropic", name="claude-sonnet-4-20250514", api_key=ANTHROPIC_KEY)

job = Job(
    prompt="Classify each company by its primary industry sector and sub-sector. "
    "Determine if the company is publicly traded.",
    output_model=IndustryClassification,
    batch_size=10,
    stop_on_exhaustion=False,
)

result = await job.arun(model_anthropic, data=companies)
show_result("Anthropic / claude-sonnet-4 — Company Classification", result)

In [7]:
# Google Gemini — Gemini 2.5 Flash
model_gemini = Model(provider="google_genai", name="gemini-2.5-flash", api_key=GEMINI_KEY)

job = Job(
    prompt="Classify each company by its primary industry sector and sub-sector. "
    "Determine if the company is publicly traded.",
    output_model=IndustryClassification,
    batch_size=10,
    stop_on_exhaustion=False,
)

result = await job.arun(model_gemini, data=companies)
show_result("Gemini / gemini-2.5-flash — Company Classification", result)

---
## 2. Sentiment Analysis — Score Validation

Analyze product reviews and verify that every score falls within the [0, 1] range.

In [None]:
model = Model(provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY, params={"temperature": 0})

job = Job(
    prompt="Analyze the sentiment of each product's customer_review. "
    "Identify the overall sentiment, assign a score between 0.0 and 1.0, "
    "and extract 1-3 key themes.",
    output_model=SentimentAnalysis,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=products)
show_result("OpenAI / gpt-4.1-mini — Sentiment Analysis", result)

# Validate scores
print("\nScore validation:")
for i, row in enumerate(result.data):
    in_range = 0.0 <= row.score <= 1.0
    print(f"  [{i}] score={row.score:.2f} sentiment={row.sentiment:8s} valid={in_range} themes={row.key_themes}")

In [None]:
model = Model(provider="anthropic", name="claude-haiku-4-5-20251001", api_key=ANTHROPIC_KEY, params={"temperature": 0})

job = Job(
    prompt="Analyze the sentiment of each product's customer_review. "
    "Identify the overall sentiment, assign a score between 0.0 and 1.0, "
    "and extract 1-3 key themes.",
    output_model=SentimentAnalysis,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=products)
show_result("Anthropic / claude-haiku-4.5 — Sentiment Analysis", result)

print("\nScore validation:")
for i, row in enumerate(result.data):
    in_range = 0.0 <= row.score <= 1.0
    print(f"  [{i}] score={row.score:.2f} sentiment={row.sentiment:8s} valid={in_range} themes={row.key_themes}")

In [None]:
model = Model(provider="google_genai", name="gemini-2.0-flash", api_key=GEMINI_KEY, params={"temperature": 0})

job = Job(
    prompt="Analyze the sentiment of each product's customer_review. "
    "Identify the overall sentiment, assign a score between 0.0 and 1.0, "
    "and extract 1-3 key themes.",
    output_model=SentimentAnalysis,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=products)
show_result("Gemini / gemini-2.0-flash — Sentiment Analysis", result)

print("\nScore validation:")
for i, row in enumerate(result.data):
    in_range = 0.0 <= row.score <= 1.0
    print(f"  [{i}] score={row.score:.2f} sentiment={row.sentiment:8s} valid={in_range} themes={row.key_themes}")

---
## 3. Support Ticket Triage — Complex Schema

Tests Literal types, booleans, and longer text generation.

In [None]:
model = Model(provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY)

job = Job(
    prompt="Triage each support ticket. Classify by category (billing, technical, "
    "shipping, account, or general), assign priority, determine if human escalation "
    "is needed, and write a brief suggested response.",
    output_model=TicketTriage,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=tickets)
show_result("OpenAI / gpt-4.1-mini — Ticket Triage", result)

print("\nFull triage results:")
for i, row in enumerate(result.data):
    print(f"\n  [{i}] {tickets[i]['ticket_id']}")
    print(f"      Category: {row.category} | Priority: {row.priority} | Human: {row.requires_human}")
    print(f"      Response: {row.suggested_response[:100]}...")

In [None]:
model = Model(provider="anthropic", name="claude-sonnet-4-20250514", api_key=ANTHROPIC_KEY)

job = Job(
    prompt="Triage each support ticket. Classify by category (billing, technical, "
    "shipping, account, or general), assign priority, determine if human escalation "
    "is needed, and write a brief suggested response.",
    output_model=TicketTriage,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=tickets)
show_result("Anthropic / claude-sonnet-4 — Ticket Triage", result)

print("\nFull triage results:")
for i, row in enumerate(result.data):
    print(f"\n  [{i}] {tickets[i]['ticket_id']}")
    print(f"      Category: {row.category} | Priority: {row.priority} | Human: {row.requires_human}")
    print(f"      Response: {row.suggested_response[:100]}...")

In [None]:
model = Model(provider="google_genai", name="gemini-2.5-flash", api_key=GEMINI_KEY)

job = Job(
    prompt="Triage each support ticket. Classify by category (billing, technical, "
    "shipping, account, or general), assign priority, determine if human escalation "
    "is needed, and write a brief suggested response.",
    output_model=TicketTriage,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=tickets)
show_result("Gemini / gemini-2.5-flash — Ticket Triage", result)

print("\nFull triage results:")
for i, row in enumerate(result.data):
    print(f"\n  [{i}] {tickets[i]['ticket_id']}")
    print(f"      Category: {row.category} | Priority: {row.priority} | Human: {row.requires_human}")
    print(f"      Response: {row.suggested_response[:100]}...")

---
## 4. Parameter Tuning — Temperature Comparison

Compare temperature=0 (deterministic), 0.5, and 1.0 (creative) on the same task with OpenAI, then vary `top_p` (with a fixed `top_k`) on Anthropic.

In [None]:
data_subset = companies[:3]

for temp in [0, 0.5, 1.0]:
    model = Model(
        provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY,
        params={"temperature": temp},
    )
    job = Job(
        prompt="Classify each company by industry sector.",
        output_model=IndustryClassification,
        batch_size=10,
        stop_on_exhaustion=False,
    )
    result = await job.arun(model, data=data_subset)
    show_result(f"OpenAI / gpt-4.1-mini — temp={temp}", result)

In [None]:
# Anthropic: top_p (mutually exclusive with temperature) and top_k
# NOTE: Anthropic does NOT allow setting both temperature and top_p simultaneously.
for top_p in [0.5, 0.9]:
    model = Model(
        provider="anthropic", name="claude-haiku-4-5-20251001", api_key=ANTHROPIC_KEY,
        params={"top_p": top_p, "top_k": 40},
    )
    job = Job(
        prompt="Classify each company by industry sector.",
        output_model=IndustryClassification,
        batch_size=10,
        stop_on_exhaustion=False,
    )
    result = await job.arun(model, data=data_subset)
    show_result(f"Anthropic / claude-haiku-4.5 — top_p={top_p}, top_k=40", result)

---
## 5. Batch Configuration — Size & Concurrency

Compare different batch_size and concurrency settings on the same dataset.

In [None]:
configs = [
    {"batch_size": 10, "concurrency": 1, "label": "1 batch, serial"},
    {"batch_size": 5, "concurrency": 2, "label": "2 batches, conc=2"},
    {"batch_size": 2, "concurrency": 5, "label": "5 batches, conc=5"},
    {"batch_size": 1, "concurrency": 10, "label": "10 batches, conc=10"},
]

model = Model(provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY, params={"temperature": 0})

for cfg in configs:
    job = Job(
        prompt="Classify each company by industry sector.",
        output_model=IndustryClassification,
        batch_size=cfg["batch_size"],
        concurrency=cfg["concurrency"],
        stop_on_exhaustion=False,
    )
    result = await job.arun(model, data=companies)
    show_result(f"Config: {cfg['label']} (batch={cfg['batch_size']}, conc={cfg['concurrency']})", result)
    
    # Verify all rows present and in order
    assert len(result.data) == len(companies), f"Row count mismatch: {len(result.data)} vs {len(companies)}"
    print(f"  Row ordering verified: {len(result.data)} rows in correct order")

---
## 6. Error Handling — stop_on_exhaustion

Demonstrate both error-handling modes: `stop_on_exhaustion=False` collects errors and still returns the successful batches,
while `stop_on_exhaustion=True` raises `SmeltExhaustionError`, which carries the partial results.

In [None]:
# stop_on_exhaustion=False: errors are collected, successful batches still returned
model = Model(provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY, params={"temperature": 0})

job = Job(
    prompt="Create a concise structured summary for each company. "
    "Calculate age based on founded year (current year is 2026).",
    output_model=CompanySummary,
    batch_size=5,
    concurrency=2,
    max_retries=2,
    stop_on_exhaustion=False,  # collect errors, don't raise
)

result = await job.arun(model, data=companies)
show_result("Company Summary (stop_on_exhaustion=False)", result)

print(f"\nsuccess property: {result.success}")
print(f"result.data has {len(result.data)} rows")
print(f"result.errors has {len(result.errors)} errors")
print(f"result.metrics: {result.metrics}")

In [None]:
# stop_on_exhaustion=True with a valid request — should succeed without raising
model = Model(provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY, params={"temperature": 0})

job = Job(
    prompt="Classify each company by industry sector.",
    output_model=IndustryClassification,
    batch_size=10,
    max_retries=3,
    stop_on_exhaustion=True,  # will raise on failure
)

try:
    result = await job.arun(model, data=companies)
    show_result("Classification (stop_on_exhaustion=True, no error expected)", result)
    print("No exception raised — all batches succeeded.")
except SmeltExhaustionError as e:
    print(f"SmeltExhaustionError: {e}")
    print(f"Partial results: {len(e.partial_result.data)} rows succeeded")
    print(f"Errors: {len(e.partial_result.errors)} batches failed")

---
## 7. Async Execution

Use `await job.arun()` directly (works in Jupyter notebooks).

In [None]:
model = Model(provider="openai", name="gpt-4.1-mini", api_key=OPENAI_KEY, params={"temperature": 0})

job = Job(
    prompt="Classify each company by industry sector.",
    output_model=IndustryClassification,
    batch_size=3,
    concurrency=4,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=companies)
show_result("OpenAI / gpt-4.1-mini — Async (batch=3, conc=4)", result)
print(f"  Batches: {result.metrics.total_batches} (ceil(10/3) = 4)")

In [None]:
model = Model(provider="anthropic", name="claude-haiku-4-5-20251001", api_key=ANTHROPIC_KEY)

job = Job(
    prompt="Analyze the sentiment of each product review.",
    output_model=SentimentAnalysis,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=products)
show_result("Anthropic / claude-haiku-4.5 — Async Sentiment", result)

In [None]:
model = Model(provider="google_genai", name="gemini-2.5-flash", api_key=GEMINI_KEY)

job = Job(
    prompt="Triage each support ticket with category, priority, escalation need, "
    "and a suggested response.",
    output_model=TicketTriage,
    batch_size=10,
    stop_on_exhaustion=False,
)

result = await job.arun(model, data=tickets)
show_result("Gemini / gemini-2.5-flash — Async Ticket Triage", result)

---
## Summary

All tests complete. Smelt successfully:
- Transforms structured data through OpenAI, Anthropic, and Google Gemini
- Returns strictly typed Pydantic models
- Handles batching and concurrency
- Provides detailed metrics (tokens, timing, retries)
- Works in both sync (`job.run()`) and async (`await job.arun()`) modes
- Tested with the latest 2026 generation models (see section 8 below): GPT-5.2, Claude Sonnet 4.6, Claude Opus 4.6, Gemini 3 Flash, Gemini 3 Pro

> **Note:** Jupyter notebooks run inside an event loop, so all cells use `await job.arun()`.
> Use `job.run()` in regular Python scripts where no event loop is running.

---
## 8. 2026 Generation Models

Test the latest models from each provider: GPT-5.2, Claude Sonnet 4.6, Claude Opus 4.6, Gemini 3 Flash, Gemini 3 Pro.

In [None]:
# OpenAI — GPT-5.2
model_gpt52 = Model(provider="openai", name="gpt-5.2", api_key=OPENAI_KEY)

job = Job(
    prompt="Classify each company by its primary industry sector and sub-sector. "
    "Determine if the company is publicly traded.",
    output_model=IndustryClassification,
    batch_size=10,
    stop_on_exhaustion=False,
)

result = await job.arun(model_gpt52, data=companies)
show_result("OpenAI / gpt-5.2 — Company Classification", result)

In [None]:
# Anthropic — Claude Sonnet 4.6
model_sonnet46 = Model(provider="anthropic", name="claude-sonnet-4-6", api_key=ANTHROPIC_KEY)

job = Job(
    prompt="Classify each company by its primary industry sector and sub-sector. "
    "Determine if the company is publicly traded.",
    output_model=IndustryClassification,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model_sonnet46, data=companies)
show_result("Anthropic / claude-sonnet-4.6 — Company Classification", result)

In [None]:
# Anthropic — Claude Opus 4.6
model_opus46 = Model(provider="anthropic", name="claude-opus-4-6", api_key=ANTHROPIC_KEY)

job = Job(
    prompt="Triage each support ticket. Classify by category, assign priority, "
    "determine if human escalation is needed, and suggest a brief response.",
    output_model=TicketTriage,
    batch_size=10,
    stop_on_exhaustion=False,
)

result = await job.arun(model_opus46, data=tickets[:5])
show_result("Anthropic / claude-opus-4.6 — Ticket Triage (5 rows)", result)

In [None]:
# Google — Gemini 3 Flash
model_gemini3 = Model(provider="google_genai", name="gemini-3-flash-preview", api_key=GEMINI_KEY)

job = Job(
    prompt="Analyze the sentiment of each product's customer_review. "
    "Identify the overall sentiment, assign a score, and extract key themes.",
    output_model=SentimentAnalysis,
    batch_size=5,
    concurrency=2,
    stop_on_exhaustion=False,
)

result = await job.arun(model_gemini3, data=products)
show_result("Gemini / gemini-3-flash-preview — Sentiment Analysis", result)

In [None]:
# Google — Gemini 3 Pro
model_gemini3pro = Model(provider="google_genai", name="gemini-3-pro-preview", api_key=GEMINI_KEY)

job = Job(
    prompt="Classify each company by its primary industry sector and sub-sector. "
    "Determine if the company is publicly traded.",
    output_model=IndustryClassification,
    batch_size=10,
    stop_on_exhaustion=False,
)

result = await job.arun(model_gemini3pro, data=companies[:5])
show_result("Gemini / gemini-3-pro-preview — Company Classification (5 rows)", result)