# Synthetic PII Data Validation using OpenAI GPT-5.1

This notebook validates the synthetically-generated PII training data
using OpenAI's GPT-5.1 model with low reasoning effort via the Batch API.

The validation performs semantic analysis that programmatic checks cannot:

-   Text coherence and naturalness
-   Entity label correctness
-   Span boundary accuracy
-   Feature dimension characteristic verification


## Imports and Environment Setup


In [1]:
import json
import os
import time
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field, model_validator
from tqdm.auto import tqdm

# Load environment variables from .env file
load_dotenv()

# Verify required API key is present; fail fast here rather than on the first
# API call further down the notebook.
OPENAI_API_KEY: str | None = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError(
        "OPENAI_API_KEY not found in environment. "
        "Create a .env file with your OpenAI API key."
    )

# Initialize the synchronous OpenAI client shared by later cells
client = OpenAI(api_key=OPENAI_API_KEY)

# Configuration
MODEL_ID: str = "gpt-5.1-2025-11-13"  # pinned model snapshot used for validation
REASONING_EFFORT: str = "low"  # forwarded as reasoning={"effort": ...} on each request
# NOTE(review): BATCH_COMPLETION_WINDOW is not referenced by any cell visible
# in this part of the notebook — confirm it is still needed.
BATCH_COMPLETION_WINDOW: str = "24h"
SYNTHETIC_DATA_PATH: Path = Path("./data/synthetic/synthetic_samples.jsonl")  # input JSONL
BATCH_OUTPUT_DIR: Path = Path("./data/validation")  # directory for validation artifacts

# Ensure output directory exists
BATCH_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("‚úì Environment loaded successfully")
# NOTE(review): the last four characters of the key are echoed here and are
# therefore stored in the saved notebook output.
print(f"  OPENAI_API_KEY: {'*' * 8}...{OPENAI_API_KEY[-4:]}")
print(f"  Model: {MODEL_ID}")
print(f"  Reasoning Effort: {REASONING_EFFORT}")
print(f"  Synthetic Data Path: {SYNTHETIC_DATA_PATH}")
print(f"  Output Directory: {BATCH_OUTPUT_DIR}")

‚úì Environment loaded successfully
  OPENAI_API_KEY: ********...GLEA
  Model: gpt-5.1-2025-11-13
  Reasoning Effort: low
  Synthetic Data Path: data\synthetic\synthetic_samples.jsonl
  Output Directory: data\validation


## Pydantic Schemas

Pydantic schemas for synthetic PII samples.

These must match the schemas used in `data-generation.ipynb` for proper
deserialization of the generated samples.


In [2]:
class FeatureDimension(str, Enum):
    """
    The six NER failure mode dimensions from Singh & Narayanan (2025).

    Each dimension represents a specific type of challenge for PII detection:
        - basic: Standard, well-formatted entities with clear boundaries
        - contextual: Ambiguous entities requiring surrounding context
        - noisy: Real-world text imperfections and formatting variations
        - evolving: Modern/emerging PII formats not in traditional training data
        - multilingual: International formats embedded in English prose
        - adversarial: Intentionally deceptive patterns designed to evade detection

    The class mixes in ``str``, so each member is also a plain string equal
    to its value (e.g. ``FeatureDimension.BASIC == "basic"``).
    """
    BASIC = "basic"
    CONTEXTUAL = "contextual"
    NOISY = "noisy"
    EVOLVING = "evolving"
    MULTILINGUAL = "multilingual"
    ADVERSARIAL = "adversarial"


class EntitySpan(BaseModel):
    """
    One annotated PII entity, located by character offsets into the sample text.

    Offsets follow Python slice semantics: ``text[start:end]`` is the entity.

    Attributes:
        start: Offset of the first character of the entity (0-based, inclusive).
        end: Offset one past the last character of the entity (exclusive).
        label: PII type label from the unified taxonomy.
        text: The entity's own text, stored so the span can be cross-checked.
    """
    start: int = Field(..., ge=0, description="Start character index (inclusive)")
    end: int = Field(..., gt=0, description="End character index (exclusive)")
    label: str = Field(..., description="PII type label")
    text: str = Field(..., min_length=1, description="Entity text content")

    @model_validator(mode="after")
    def validate_span_bounds(self) -> "EntitySpan":
        """Reject empty or reversed spans; a valid span has start < end."""
        if self.start < self.end:
            return self
        raise ValueError(f"Invalid span: start ({self.start}) must be < end ({self.end})")


class SyntheticSample(BaseModel):
    """
    A complete synthetic PII training sample with text and annotations.

    This schema captures everything needed for training and validation:
    the generated text, all entity annotations, metadata about the
    generation process, and the feature dimension being targeted.

    Attributes:
        text: The generated text containing PII entities (50-600 characters).
        entities: List of all PII entity annotations with spans (at least one).
        feature_dimension: Which NER challenge dimension this targets.
        seed_pii_type: The primary PII type used to seed generation.
        seed_pii_value: The actual PII value that was seeded.
        seed_pii_locale: Locale/region for international formats (optional).
        scenario: Brief description of the text scenario/context.
        type_variant: Specific variant or sub-type of the PII.
        generation_id: Unique identifier for this generation attempt.
        timestamp: When this sample was generated. When the field is absent
            it defaults to the current UTC time as a naive ISO-8601 string
            (no offset suffix).
    """
    text: str = Field(..., min_length=50, max_length=600, description="Generated text")
    entities: list[EntitySpan] = Field(..., min_length=1, description="Entity annotations")
    feature_dimension: FeatureDimension = Field(..., description="Target dimension")
    seed_pii_type: str = Field(..., description="Primary PII type")
    seed_pii_value: str = Field(..., description="Seeded PII value")
    seed_pii_locale: str | None = Field(None, description="Locale for international formats")
    scenario: str = Field(..., description="Text scenario description")
    type_variant: str = Field(..., description="PII format variant")
    generation_id: str = Field(..., description="Unique generation ID")
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop the tzinfo so the serialized string keeps the exact
    # naive format that utcnow().isoformat() produced.
    timestamp: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    )


# Confirm the schemas were defined and list the sample fields in declaration order
print("‚úì Pydantic schemas defined")
print(f"  SyntheticSample fields: {list(SyntheticSample.model_fields.keys())}")

‚úì Pydantic schemas defined
  SyntheticSample fields: ['text', 'entities', 'feature_dimension', 'seed_pii_type', 'seed_pii_value', 'seed_pii_locale', 'scenario', 'type_variant', 'generation_id', 'timestamp']


## Load Synthetic Data

Load the synthetically-generated samples from JSONL.

This loads all samples that were generated in `data-generation.ipynb`
and prepares them for validation.


In [3]:
def load_synthetic_samples(path: Path) -> list[SyntheticSample]:
    """
    Load synthetic samples from a JSONL file, skipping lines that fail to parse.

    Each non-blank line should be a JSON object conforming to the
    SyntheticSample schema. A line that is not valid JSON, or that fails
    schema validation, is recorded and skipped rather than aborting the load;
    a summary of the first ten failures is printed once the file is read.

    Args:
        path: Path to the JSONL file containing synthetic samples.

    Returns:
        List of the successfully parsed SyntheticSample objects, in file order.

    Raises:
        FileNotFoundError: If the specified file does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"Synthetic data file not found: {path}")

    samples: list[SyntheticSample] = []
    errors: list[tuple[int, str]] = []

    with path.open("r", encoding="utf-8") as f:
        for line_num, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append((line_num, f"JSON decode error: {e}"))
                continue

            try:
                samples.append(SyntheticSample.model_validate(data))
            except ValueError as e:
                # pydantic's ValidationError derives from ValueError; anything
                # else is a programming error and should surface, not be
                # reported as a bad data line.
                errors.append((line_num, f"Validation error: {e}"))

    if errors:
        print(f"‚ö† {len(errors)} samples failed to load:")
        for line_num, error in errors[:10]:
            print(f"  Line {line_num}: {error}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more errors")

    return samples


# Load samples (lines that fail to parse are reported by the loader and skipped)
samples: list[SyntheticSample] = load_synthetic_samples(SYNTHETIC_DATA_PATH)

print(f"‚úì Loaded {len(samples)} synthetic samples")

# Display distribution by feature dimension, sorted alphabetically by dimension name
dimension_counts = Counter(s.feature_dimension.value for s in samples)
print("\nSamples by feature dimension:")
for dim, count in sorted(dimension_counts.items()):
    print(f"  {dim}: {count}")

# Display distribution by seed PII type, most frequent first (top 10 only)
pii_type_counts = Counter(s.seed_pii_type for s in samples)
print("\nSamples by PII type (top 10):")
for pii_type, count in pii_type_counts.most_common(10):
    print(f"  {pii_type}: {count}")

‚úì Loaded 10836 synthetic samples

Samples by feature dimension:
  adversarial: 1766
  basic: 1818
  contextual: 1819
  evolving: 1813
  multilingual: 1819
  noisy: 1801

Samples by PII type (top 10):
  PASSPORT_NUMBER: 684
  DATE_OF_BIRTH: 684
  POSTAL_CODE: 682
  NAMES_OF_PLACES_OR_NOUNS: 682
  PHONE: 682
  NATIONAL_IDENTITY_SSN_AADHAR: 681
  VEHICLE_REGISTRATION: 680
  BANK_UPI_ID: 680
  OTHER_NATIONAL_IDENTITY: 679
  TAX_IDENTIFICATION: 677


## Programmatic Pre-Filter

Fast programmatic pre-filter to catch obvious issues before sending to GPT-5.1.

This is a cost optimization to avoid wasting API calls on samples that are
clearly broken. It is NOT a replacement for semantic validation.


In [4]:
@dataclass
class PreFilterResult:
    """
    Outcome of running the programmatic pre-filter on one sample.

    Attributes:
        passes_prefilter: True when no issue was detected.
        issues: One human-readable message per detected issue (empty on pass).
    """
    passes_prefilter: bool
    issues: list[str] = field(default_factory=list)


class ProgrammaticPreFilter:
    """
    Cheap structural screening applied before any sample is sent to GPT-5.1.

    The filter flags:
    - text shorter or longer than the configured bounds
    - a seed PII value that does not occur in the text
    - entity spans that fall outside the text, are empty/reversed, or whose
      slice does not equal the recorded entity text
    - samples without any entity
    - text with a high share of non-ASCII characters (rough English heuristic)

    Attributes:
        min_text_length: Minimum acceptable text length.
        max_text_length: Maximum acceptable text length.
        max_non_ascii_ratio: Maximum ratio of non-ASCII characters.
    """

    def __init__(
        self,
        min_text_length: int = 50,
        max_text_length: int = 600,
        max_non_ascii_ratio: float = 0.3,
    ) -> None:
        """
        Store the thresholds used by :meth:`check`.

        Args:
            min_text_length: Minimum acceptable text length.
            max_text_length: Maximum acceptable text length.
            max_non_ascii_ratio: Maximum ratio of non-ASCII characters.
        """
        self.min_text_length = min_text_length
        self.max_text_length = max_text_length
        self.max_non_ascii_ratio = max_non_ascii_ratio

    def check(self, sample: SyntheticSample) -> PreFilterResult:
        """
        Screen one sample and report every structural problem found.

        Checks run in a fixed order (length, seed presence, spans, entity
        count, non-ASCII ratio) and all of them run even after a failure, so
        the result lists every issue rather than only the first.

        Args:
            sample: The synthetic sample to pre-filter.

        Returns:
            PreFilterResult whose ``passes_prefilter`` is True only when the
            issue list is empty.
        """
        body = sample.text
        body_len = len(body)
        problems: list[str] = []

        # Length bounds (at most one of the two messages can apply)
        if body_len < self.min_text_length:
            problems.append(f"Text too short: {body_len} < {self.min_text_length}")
        elif body_len > self.max_text_length:
            problems.append(f"Text too long: {body_len} > {self.max_text_length}")

        # The seeded value must appear verbatim in the text
        if sample.seed_pii_value not in body:
            problems.append(f"Seed PII '{sample.seed_pii_value}' not found in text")

        # Each span must be in range, non-empty, and slice to the recorded text
        for span in sample.entities:
            lo, hi = span.start, span.end
            if lo < 0 or hi > body_len:
                problems.append(
                    f"Invalid span [{lo}:{hi}] for text length {body_len}"
                )
                continue
            if lo >= hi:
                problems.append(f"Invalid span [{lo}:{hi}]: start >= end")
                continue
            covered = body[lo:hi]
            if covered != span.text:
                problems.append(
                    f"Span text mismatch at [{lo}:{hi}]: "
                    f"expected '{span.text}', got '{covered}'"
                )

        # A sample without annotations is useless for training
        if not sample.entities:
            problems.append("No entities annotated")

        # Share of characters with code point above 127
        foreign = sum(ord(ch) > 127 for ch in body)
        foreign_ratio = foreign / body_len if body else 0
        if foreign_ratio > self.max_non_ascii_ratio:
            problems.append(
                f"High non-ASCII ratio: {foreign_ratio:.2%} > {self.max_non_ascii_ratio:.0%}"
            )

        return PreFilterResult(passes_prefilter=not problems, issues=problems)


# Run pre-filter on all samples and split them into passed / failed buckets
prefilter = ProgrammaticPreFilter()

prefilter_results: list[tuple[SyntheticSample, PreFilterResult]] = []
passed_prefilter: list[SyntheticSample] = []
failed_prefilter: list[tuple[SyntheticSample, PreFilterResult]] = []

for sample in tqdm(samples, desc="Pre-filtering"):
    result = prefilter.check(sample)
    prefilter_results.append((sample, result))

    if result.passes_prefilter:
        passed_prefilter.append(sample)
    else:
        failed_prefilter.append((sample, result))

print(f"\n‚úì Pre-filter complete")
print(f"  Passed: {len(passed_prefilter)}")
print(f"  Failed: {len(failed_prefilter)}")
# The loader returns an empty list when every line fails to parse, so guard
# the percentage against division by zero.
if samples:
    print(f"  Pass rate: {len(passed_prefilter) / len(samples) * 100:.1f}%")
else:
    print("  Pass rate: n/a (no samples loaded)")

if failed_prefilter:
    print("\nSample pre-filter failures (first 5):")
    for sample, result in failed_prefilter[:5]:
        print(f"  {sample.generation_id}: {result.issues}")

Pre-filtering:   0%|          | 0/10836 [00:00<?, ?it/s]


‚úì Pre-filter complete
  Passed: 10836
  Failed: 0
  Pass rate: 100.0%


## Validation System Prompt and Request Formatting

OpenAI GPT-5.1 validation prompt and request formatting.

The validation system prompt instructs GPT-5.1 to perform deep semantic
analysis of each synthetic sample, checking:

-   Text coherence and naturalness
-   Entity label correctness
-   Span boundary accuracy
-   Feature dimension characteristic verification


In [5]:
# System prompt for GPT-5.1 validation.
# It sets the validator role, lists the six feature dimensions and the PII
# labels, defines the scoring criteria, and spells out the JSON object the
# model must return. The keys in that JSON schema correspond to the fields of
# the OpenAIValidationResult / EntityValidation / DimensionValidation models
# that parse the response, so the two must be kept in sync.
VALIDATION_SYSTEM_PROMPT: str = """You are an expert data quality validator for PII (Personally Identifiable Information) detection training data.

Your task is to validate synthetic training samples for a Named Entity Recognition (NER) model. Each sample contains:
1. A text passage (should be natural, coherent English)
2. Entity annotations with character-level spans and labels
3. A claimed "feature dimension" representing the type of NER challenge

FEATURE DIMENSIONS (from Singh & Narayanan 2025):
- basic: Straightforward, well-formatted entities in clear context
- contextual: Entities requiring disambiguation (e.g., "Apple" as company vs fruit)
- noisy: Real-world imperfections (typos, OCR errors, abbreviations, formatting issues)
- evolving: New/emerging PII formats (crypto addresses, UPI IDs, modern handles)
- multilingual: PII in international formats embedded in English text
- adversarial: Intentionally confusing inputs designed to fool NER models

PII TYPES TO VALIDATE:
NAME, EMAIL, PHONE, DATE_OF_BIRTH, POSTAL_CODE, CREDIT_CARD, BANK_ACCOUNT,
DRIVER_LICENSE, PASSPORT_NUMBER, NATIONAL_IDENTITY_SSN_AADHAR, OTHER_NATIONAL_IDENTITY,
TAX_IDENTIFICATION, VEHICLE_REGISTRATION, INSURANCE_NUMBER, BANK_UPI_ID,
NAMES_OF_PLACES_OR_NOUNS

YOUR VALIDATION CRITERIA:

1. TEXT COHERENCE (1-5 scale):
   - 5: Perfectly natural, indistinguishable from human-written text
   - 4: Minor awkwardness but clearly understandable
   - 3: Noticeable issues but usable for training
   - 2: Significant problems affecting training quality
   - 1: Incoherent, unusable garbage
   
2. ENTITY VALIDATION:
   - Is the label semantically correct for the entity text?
   - Are the span boundaries accurate (no missing/extra characters)?
   - Is the PII format realistic for its claimed locale?
   - Could this entity be confused with something else?

3. DIMENSION VALIDATION:
   - Does the sample ACTUALLY exhibit the claimed dimension's characteristics?
   - For "noisy": Are there realistic typos/OCR errors/formatting issues?
   - For "contextual": Is there genuine ambiguity requiring context?
   - For "adversarial": Would this actually fool an NER model?

4. OVERALL VALIDITY:
   - ERROR (invalid): Critical issues that would harm model training
   - WARNING (valid with issues): Minor issues, usable but suboptimal
   - PASS (valid): Good quality sample ready for training

Respond with a JSON object matching this exact schema:
{
    "is_valid": boolean,
    "severity": "pass" | "warning" | "error",
    "text_coherence_score": 1-5,
    "text_coherence_issues": ["issue1", "issue2", ...],
    "entity_validations": [
        {
            "entity_text": "...",
            "entity_label": "...",
            "is_correct_label": boolean,
            "is_correct_boundary": boolean,
            "suggested_label": "..." or null,
            "issue_description": "..." or null
        }
    ],
    "dimension_validation": {
        "claimed_dimension": "...",
        "exhibits_characteristics": boolean,
        "characteristics_found": ["..."],
        "missing_characteristics": ["..."]
    },
    "overall_assessment": "Free-form assessment",
    "suggested_fixes": ["fix1", "fix2", ...]
}

Be rigorous but fair. We need high-quality training data, but don't reject samples over trivial issues."""


def format_sample_for_validation(sample: SyntheticSample) -> str:
    """
    Render one synthetic sample as the user message for GPT-5.1 validation.

    The message has three parts: the raw text wrapped in triple quotes, one
    line per entity annotation (the text sliced from the sample by the span
    offsets, the offsets, and the label), and the generation metadata. A
    missing locale is shown as "unspecified".

    Args:
        sample: The synthetic sample to validate.

    Returns:
        The formatted multi-line prompt string.
    """
    annotation_lines = [
        f"  - '{sample.text[span.start:span.end]}' [{span.start}:{span.end}] ‚Üí {span.label}"
        for span in sample.entities
    ]
    locale = sample.seed_pii_locale or "unspecified"

    # One list element per output line; empty strings are the blank separator lines.
    sections = [
        "SAMPLE TO VALIDATE:",
        "",
        "TEXT:",
        f'"""{sample.text}"""',
        "",
        "ENTITY ANNOTATIONS:",
        "\n".join(annotation_lines),
        "",
        f"CLAIMED FEATURE DIMENSION: {sample.feature_dimension.value}",
        f"SEED PII TYPE: {sample.seed_pii_type}",
        f"SEED PII VALUE: {sample.seed_pii_value}",
        f"SEED PII LOCALE: {locale}",
        f"SCENARIO: {sample.scenario}",
        f"TYPE VARIANT: {sample.type_variant}",
        "",
        "Validate this sample according to your criteria.",
    ]
    return "\n".join(sections)


# Preview the prompt for the first sample that passed the pre-filter, so the
# formatting can be checked by eye before any API call is made.
if passed_prefilter:
    test_sample = passed_prefilter[0]
    print("Sample validation prompt (first sample):")
    print("-" * 60)
    print(format_sample_for_validation(test_sample))
    print("-" * 60)

Sample validation prompt (first sample):
------------------------------------------------------------
SAMPLE TO VALIDATE:

TEXT:
"""Guten Tag Herr M√ºller, vielen Dank f√ºr Ihre Fintech-App-Anmeldung. Wir haben Ihren deutschen Reisepass C41531085 verifiziert. Bitte best√§tigen Sie die UPI-ID netflixuser@paytm und Ihr Ethereum-Wallet 0x742d35Cc6634C0532925a3b8D87e061Df4B15f62 f√ºr die Auszahlungen. Der 2FA-Code ist 847392, g√ºltig f√ºr 5 Minuten. Falls Probleme, kontaktieren Sie support@fintech.de."""

ENTITY ANNOTATIONS:
  - 'C41531085' [103:112] ‚Üí PASSPORT_NUMBER
  - 'netflixuser@paytm' [158:175] ‚Üí UPI_ID
  - '0x742d35Cc6634C0532925a3b8D87e061Df4B15f62' [200:242] ‚Üí CRYPTO_WALLET
  - '847392' [282:288] ‚Üí TOTP_CODE

CLAIMED FEATURE DIMENSION: evolving
SEED PII TYPE: PASSPORT_NUMBER
SEED PII VALUE: C41531085
SEED PII LOCALE: de_DE
SCENARIO: Fintech app onboarding email verifying passport and linking modern digital payment identifiers for a German user
TYPE VARIANT: standard

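Validate this sample according to your criteria.
------------------------------------------------------------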

## Test validation with a small sample set (synchronous, single-threaded).

This cell validates a handful of samples using direct API calls to verify:

1. The Responses API format is correct
2. GPT-5.1 returns properly structured JSON responses
3. The response parsing works correctly
4. The validation logic produces sensible results

Run this BEFORE running the full concurrent validation to catch issues early.


In [6]:
import json
from enum import Enum
from typing import Any

from openai import OpenAI
from pydantic import BaseModel, Field
from tqdm.auto import tqdm


# =============================================================================
# VALIDATION RESULT SCHEMAS (self-contained definitions for testing)
# =============================================================================

class ValidationSeverity(str, Enum):
    """
    Severity levels for validation issues.

    The values match the three strings the system prompt allows for the
    ``severity`` key: "pass", "warning" and "error".
    """
    PASS = "pass"
    WARNING = "warning"
    ERROR = "error"


class EntityValidation(BaseModel):
    """
    Validation result for a single entity annotation.

    Mirrors one element of the ``entity_validations`` array in the JSON
    schema given in the system prompt.

    Attributes:
        entity_text: Text of the entity being judged.
        entity_label: Label attached to that entity.
        is_correct_label: Whether the validator considers the label correct.
        is_correct_boundary: Whether the validator considers the span
            boundaries correct.
        suggested_label: Replacement label proposed by the validator, if any.
        issue_description: Description of the problem found, if any.
    """
    entity_text: str
    entity_label: str
    is_correct_label: bool
    is_correct_boundary: bool
    suggested_label: str | None = None
    issue_description: str | None = None


class DimensionValidation(BaseModel):
    """
    Validation result for feature dimension characteristics.

    Mirrors the ``dimension_validation`` object in the JSON schema given in
    the system prompt.

    Attributes:
        claimed_dimension: The dimension the sample claims to target.
        exhibits_characteristics: Whether the validator judged that the
            sample shows that dimension's characteristics.
        characteristics_found: Characteristics the validator identified
            (defaults to an empty list).
        missing_characteristics: Characteristics the validator expected but
            did not find (defaults to an empty list).
    """
    claimed_dimension: str
    exhibits_characteristics: bool
    characteristics_found: list[str] = Field(default_factory=list)
    missing_characteristics: list[str] = Field(default_factory=list)


class OpenAIValidationResult(BaseModel):
    """
    Complete validation result from OpenAI GPT-5.1 analysis.

    Top-level shape of the JSON object requested by the system prompt.

    Attributes:
        is_valid: Overall validity verdict.
        severity: One of the ValidationSeverity levels.
        text_coherence_score: Integer score, constrained to the range 1-5.
        text_coherence_issues: Coherence problems noted (defaults to empty).
        entity_validations: Per-entity verdicts (defaults to empty).
        dimension_validation: Verdict on the claimed feature dimension.
        overall_assessment: Free-form assessment text.
        suggested_fixes: Proposed fixes (defaults to empty).
    """
    is_valid: bool
    severity: ValidationSeverity
    text_coherence_score: int = Field(ge=1, le=5)
    text_coherence_issues: list[str] = Field(default_factory=list)
    entity_validations: list[EntityValidation] = Field(default_factory=list)
    dimension_validation: DimensionValidation
    overall_assessment: str
    suggested_fixes: list[str] = Field(default_factory=list)


# =============================================================================
# TEST CONFIGURATION
# =============================================================================

TEST_SAMPLE_COUNT: int = 3  # Keep small for quick testing
TEST_MODEL: str = MODEL_ID
TEST_REASONING_EFFORT: str = REASONING_EFFORT

# Verify prerequisites
if not passed_prefilter:
    raise ValueError("No samples passed pre-filter. Run Cell 4 first.")

# Select diverse test samples in a single pass: the first sample of each new
# dimension is taken directly, and samples with an already-seen dimension are
# kept as spares in case there are fewer dimensions than TEST_SAMPLE_COUNT.
test_samples: list[SyntheticSample] = []
seen_dimensions: set[str] = set()
spare_samples: list[SyntheticSample] = []

for sample in passed_prefilter:
    if len(test_samples) >= TEST_SAMPLE_COUNT:
        break  # quota met; no need to scan the remaining samples
    dim = sample.feature_dimension.value
    if dim not in seen_dimensions:
        test_samples.append(sample)
        seen_dimensions.add(dim)
    elif len(spare_samples) < TEST_SAMPLE_COUNT:
        spare_samples.append(sample)

# Fill remaining slots from the spares, in their original order. Selecting by
# position (not by equality against the chosen samples) guarantees this
# terminates even when several samples compare equal.
test_samples.extend(spare_samples[: TEST_SAMPLE_COUNT - len(test_samples)])

print(f"Testing with {len(test_samples)} samples:")
for i, sample in enumerate(test_samples, 1):
    print(f"  {i}. {sample.generation_id} ({sample.feature_dimension.value}, {sample.seed_pii_type})")


# =============================================================================
# VALIDATION FUNCTION (Responses API)
# =============================================================================

def validate_single_sample_sync(
    sample: SyntheticSample,
    client: OpenAI,
    model: str = TEST_MODEL,
    reasoning_effort: str = TEST_REASONING_EFFORT,
) -> tuple[OpenAIValidationResult | None, dict[str, Any], str | None]:
    """
    Validate a single sample synchronously using the Responses API.

    The function never raises for request or parsing problems; failures are
    reported through the third tuple element so a batch of test samples can
    be processed without interruption. Failures are classified by stage
    (API call, empty output, JSON parsing, schema validation), and once a
    response has been received its metadata and raw text are returned even
    when parsing fails, so the offending output can be inspected.

    Args:
        sample: The synthetic sample to validate.
        client: OpenAI client instance.
        model: Model ID to use.
        reasoning_effort: Reasoning effort level.

    Returns:
        Tuple of (parsed_result, raw_info, error_message).
        On success ``parsed_result`` is set and ``error_message`` is None.
        On failure ``parsed_result`` is None and ``error_message`` describes
        the failing stage; ``raw_info`` is empty if the API call itself
        failed and otherwise holds ``model``, ``id``, ``usage`` and
        ``raw_content``.
    """
    user_prompt = format_sample_for_validation(sample)

    # Stage 1: the request itself. This is the one place where a broad catch
    # is appropriate: any SDK/network failure becomes an "API error" result.
    try:
        response = client.responses.create(
            model=model,
            input=[
                {
                    "role": "developer",
                    "content": [{"type": "input_text", "text": VALIDATION_SYSTEM_PROMPT}],
                },
                {
                    "role": "user",
                    "content": [{"type": "input_text", "text": user_prompt}],
                },
            ],
            reasoning={"effort": reasoning_effort},
            text={"format": {"type": "json_object"}},
            max_output_tokens=2000,
        )

        content: str | None = response.output_text
        usage = response.usage
        input_tokens = usage.input_tokens if usage else 0
        output_tokens = usage.output_tokens if usage else 0
        raw_info: dict[str, Any] = {
            "model": response.model,
            "id": response.id,
            "usage": {
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "total_tokens": input_tokens + output_tokens,
            },
            "raw_content": content,
        }
    except Exception as e:
        return None, {}, f"API error: {type(e).__name__}: {e}"

    # Stage 2: the model returned no text at all.
    if not content:
        return None, raw_info, "Empty response (output_text is None)"

    # Stage 3: the text must be JSON.
    try:
        result_dict = json.loads(content)
    except json.JSONDecodeError as e:
        return None, raw_info, f"JSON parse error: {e}"

    # Stage 4: the JSON must match the result schema. pydantic's
    # ValidationError derives from ValueError.
    try:
        parsed_result = OpenAIValidationResult.model_validate(result_dict)
    except ValueError as e:
        return None, raw_info, f"Schema validation error: {e}"

    return parsed_result, raw_info, None


# =============================================================================
# RUN TEST
# =============================================================================

print(f"\n{'='*70}")
print("RUNNING TEST VALIDATION (Responses API)")
print(f"Model: {TEST_MODEL} | Reasoning: {TEST_REASONING_EFFORT}")
print(f"{'='*70}\n")

# One (sample, parsed result, raw response info, error message) tuple per test sample
test_results: list[tuple[SyntheticSample, OpenAIValidationResult | None, dict[str, Any], str | None]] = []

# Requests are issued one at a time using the synchronous client
for sample in tqdm(test_samples, desc="Validating"):
    result, raw_info, error = validate_single_sample_sync(sample, client)
    test_results.append((sample, result, raw_info, error))

# Display results
total_success = 0
total_tokens = 0

for i, (sample, result, raw_info, error) in enumerate(test_results, 1):
    print(f"\n{'‚îÄ'*50}")
    print(f"SAMPLE {i}: {sample.generation_id}")
    print(f"  Dimension: {sample.feature_dimension.value} | PII: {sample.seed_pii_type}")
    print(f"  Text: {sample.text[:60]}...")
    
    if error:
        print(f"\n  ‚ùå ERROR: {error}")
    else:
        # Only successful validations count towards the token total
        total_success += 1
        total_tokens += raw_info["usage"]["total_tokens"]
        
        print(f"\n  ‚úÖ RESULT:")
        print(f"     Severity: {result.severity.value} | Coherence: {result.text_coherence_score}/5")
        print(f"     Valid: {result.is_valid} | Exhibits Dimension: {result.dimension_validation.exhibits_characteristics}")
        print(f"     Assessment: {result.overall_assessment[:80]}...")
        print(f"     Tokens: {raw_info['usage']['total_tokens']}")

# Summary
print(f"\n{'='*70}")
print("TEST SUMMARY")
print(f"{'='*70}")
print(f"  Successful: {total_success}/{len(test_samples)}")
print(f"  Total tokens: {total_tokens}")

if total_success > 0:
    # Extrapolate from the test samples to the full set of pre-filtered samples.
    avg_tokens = total_tokens / total_success
    est_total = avg_tokens * len(passed_prefilter)
    # Hard-coded estimate: 40 requests in flight at 25 seconds each, converted to minutes.
    est_time_min = len(passed_prefilter) / 40 * 25 / 60
    print(f"  Avg tokens/sample: {avg_tokens:.0f}")
    print(f"\n  üìà FULL RUN ESTIMATES ({len(passed_prefilter)} samples):")
    print(f"     Total tokens: ~{est_total:,.0f}")
    print(f"     Time (40 concurrent): ~{est_time_min:.0f} minutes")

if total_success == len(test_samples):
    print(f"\n‚úÖ All tests passed! Safe to proceed with full validation.")
else:
    print(f"\n‚ö†Ô∏è  Some tests failed. Review errors before proceeding.")

Testing with 3 samples:
  1. evolving_PASSPORT_NUMBER_e519c0e4 (evolving, PASSPORT_NUMBER)
  2. multilingual_OTHER_NATIONAL_IDENTITY_b00d3f36 (multilingual, OTHER_NATIONAL_IDENTITY)
  3. basic_PASSPORT_NUMBER_06a75db0 (basic, PASSPORT_NUMBER)

RUNNING TEST VALIDATION (Responses API)
Model: gpt-5.1-2025-11-13 | Reasoning: low



Validating:   0%|          | 0/3 [00:00<?, ?it/s]


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
SAMPLE 1: evolving_PASSPORT_NUMBER_e519c0e4
  Dimension: evolving | PII: PASSPORT_NUMBER
  Text: Guten Tag Herr M√ºller, vielen Dank f√ºr Ihre Fintech-App-Anme...

  ‚úÖ RESULT:
     Valid: False | Exhibits Dimension: True
     Assessment: The text is coherent, natural German and fits the described fintech onboarding s...
     Tokens: 2569

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
SAMPLE 2: multilingual_OTHER_NATIONAL_IDENTITY_b00d3f36
  Dimension: multilingual | PII: OTHER_NATIONAL_IDENTITY
  Text: Dear Mr. Patel, thank you for submitting your employment app...

  ‚úÖ RESULT:
     Valid: True | Exhibits Dimension: True
     Assessment: The text is coherent, natural, and suitable for training. Entity boundaries are ...
     Toke

## Concurrent async validation using the Responses API.

Uses asyncio with a semaphore to run 40 requests simultaneously.

With 40 concurrent requests and ~25s average per request:

-   10,836 samples ÷ 40 = 271 batches
-   271 × 25s = 6,775s ≈ 113 minutes ≈ 2 hours


In [None]:
import asyncio
import json
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any

from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from tqdm.asyncio import tqdm_asyncio

# Handle nested event loops in Jupyter.
# nest_asyncio is treated as an optional dependency: it is applied when it can
# be imported and silently skipped otherwise.
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass  # Not needed if not in nested loop situation


# =============================================================================
# VALIDATION RESULT SCHEMAS
# =============================================================================

class ValidationSeverity(str, Enum):
    """
    Severity levels for validation issues.

    Redefines the class of the same name from the synchronous test cell with
    identical members, so this cell does not depend on that cell having run.
    """
    PASS = "pass"
    WARNING = "warning"
    ERROR = "error"


class EntityValidation(BaseModel):
    """
    Validation result for a single entity annotation.

    Attributes:
        entity_text: Text of the entity being judged.
        entity_label: Label attached to that entity.
        is_correct_label: Whether the validator considers the label correct.
        is_correct_boundary: Whether the validator considers the span
            boundaries correct.
        suggested_label: Replacement label proposed by the validator, if any.
        issue_description: Description of the problem found, if any.
    """
    entity_text: str
    entity_label: str
    is_correct_label: bool
    is_correct_boundary: bool
    suggested_label: str | None = None
    issue_description: str | None = None


class DimensionValidation(BaseModel):
    """
    Validation result for feature dimension characteristics.

    Attributes:
        claimed_dimension: The dimension the sample claims to target.
        exhibits_characteristics: Whether the validator judged that the
            sample shows that dimension's characteristics.
        characteristics_found: Characteristics the validator identified
            (defaults to an empty list).
        missing_characteristics: Characteristics the validator expected but
            did not find (defaults to an empty list).
    """
    claimed_dimension: str
    exhibits_characteristics: bool
    characteristics_found: list[str] = Field(default_factory=list)
    missing_characteristics: list[str] = Field(default_factory=list)


class OpenAIValidationResult(BaseModel):
    """
    Complete validation result from OpenAI GPT-5.1 analysis.

    Attributes:
        is_valid: Overall validity verdict.
        severity: One of the ValidationSeverity levels.
        text_coherence_score: Integer score, constrained to the range 1-5.
        text_coherence_issues: Coherence problems noted (defaults to empty).
        entity_validations: Per-entity verdicts (defaults to empty).
        dimension_validation: Verdict on the claimed feature dimension.
        overall_assessment: Free-form assessment text.
        suggested_fixes: Proposed fixes (defaults to empty).
    """
    is_valid: bool
    severity: ValidationSeverity
    text_coherence_score: int = Field(ge=1, le=5)
    text_coherence_issues: list[str] = Field(default_factory=list)
    entity_validations: list[EntityValidation] = Field(default_factory=list)
    dimension_validation: DimensionValidation
    overall_assessment: str
    suggested_fixes: list[str] = Field(default_factory=list)


# =============================================================================
# RESULT WRAPPER
# =============================================================================

@dataclass
class ValidationResultWrapper:
    """
    Container for validation results with metadata.

    Exactly one of ``result`` / ``error`` is expected to be populated by the
    validation coroutine: ``result`` on success, ``error`` on failure.

    Attributes:
        sample: Original synthetic sample.
        result: Parsed validation result (None if failed).
        error: Error message if failed (None on success).
        tokens_used: Input plus output tokens reported for the successful
            request; stays 0 on failure or when no usage was reported.
        duration_seconds: Elapsed seconds recorded by the validation coroutine.
    """
    sample: SyntheticSample
    result: OpenAIValidationResult | None = None
    error: str | None = None
    tokens_used: int = 0
    duration_seconds: float = 0.0


# =============================================================================
# CONFIGURATION
# =============================================================================

# Tunables for the concurrent validation run.
MAX_CONCURRENT: int = 40  # size of the semaphore that caps in-flight requests
MAX_RETRIES: int = 3  # total attempts per sample, counting the first one
RETRY_DELAY_BASE: float = 2.0  # seconds; doubled on each further attempt
CHECKPOINT_EVERY: int = 500  # completed samples between checkpoint files

# Async client shared by every validation task.
async_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

print("=" * 70)
print("CONCURRENT VALIDATION CONFIGURATION")
print("=" * 70)
for config_line in (
    f"  Model: {MODEL_ID}",
    f"  Reasoning effort: {REASONING_EFFORT}",
    f"  Concurrent requests: {MAX_CONCURRENT}",
    f"  Samples to validate: {len(passed_prefilter)}",
    f"  Checkpoint interval: every {CHECKPOINT_EVERY} samples",
):
    print(config_line)

# Rough wall-clock estimate. NOTE(review): the 25 looks like an assumed
# average number of seconds per request -- confirm against real runs.
est_minutes = len(passed_prefilter) / MAX_CONCURRENT * 25 / 60
print(f"  Estimated time: ~{est_minutes:.0f} minutes ({est_minutes/60:.1f} hours)")


# =============================================================================
# ASYNC VALIDATION FUNCTION
# =============================================================================

# Substrings that mark an error as transient when no HTTP status is available.
_RETRYABLE_MARKERS: tuple[str, ...] = (
    "rate limit", "timeout", "timed out", "connection", "server error",
    "503", "502", "429", "500",
)


def _is_retryable_error(exc: Exception) -> bool:
    """
    Decide whether a failed request is worth another attempt.

    Args:
        exc: The exception raised while requesting or parsing a validation.

    Returns:
        True for transient transport/server failures, False otherwise.
    """
    # Parse and schema failures (JSON decoding, pydantic validation, empty
    # output) are ValueErrors; they describe the response content, not the
    # transport, so they are never treated as transient.
    if isinstance(exc, ValueError):
        return False

    # Prefer the HTTP status when the exception carries one: free-text
    # matching on an error body can both miss and over-match.
    status_code = getattr(exc, "status_code", None)
    if isinstance(status_code, int):
        return status_code in (408, 409, 429) or status_code >= 500

    if isinstance(exc, (TimeoutError, asyncio.TimeoutError, ConnectionError)):
        return True

    # Fall back to text matching. The class name is included because some
    # timeout errors say "timed out" in the message and only carry the word
    # "timeout" in their type name.
    description = f"{type(exc).__name__} {exc}".lower()
    return any(marker in description for marker in _RETRYABLE_MARKERS)


async def validate_sample_async(
    sample: SyntheticSample,
    client: AsyncOpenAI,
    semaphore: asyncio.Semaphore,
    model: str = MODEL_ID,
    reasoning_effort: str = REASONING_EFFORT,
) -> ValidationResultWrapper:
    """
    Validate a single sample asynchronously with retry logic.

    Uses the semaphore to limit concurrency and retries transient failures
    with exponential backoff (``RETRY_DELAY_BASE * 2 ** attempt`` seconds),
    up to ``MAX_RETRIES`` attempts in total.

    Args:
        sample: Sample to validate.
        client: Async OpenAI client used for the request.
        semaphore: Shared semaphore bounding the number of in-flight requests.
        model: Model identifier passed to the API.
        reasoning_effort: Reasoning effort passed to the API.

    Returns:
        A wrapper holding the parsed result on success, or the last error
        message on failure. This coroutine does not raise for request or
        parse errors; they are reported through ``error``.
        ``duration_seconds`` covers the time spent inside the semaphore
        (requests plus any backoff), not the time spent waiting for a slot.
    """
    user_prompt = format_sample_for_validation(sample)
    loop = asyncio.get_running_loop()

    async with semaphore:
        # Start timing only once a slot is held; otherwise every task's
        # duration would include its queueing time behind earlier tasks.
        start_time = loop.time()
        last_error: str | None = None

        for attempt in range(MAX_RETRIES):
            try:
                response = await client.responses.create(
                    model=model,
                    input=[
                        {
                            "role": "developer",
                            "content": [{"type": "input_text", "text": VALIDATION_SYSTEM_PROMPT}],
                        },
                        {
                            "role": "user",
                            "content": [{"type": "input_text", "text": user_prompt}],
                        },
                    ],
                    reasoning={"effort": reasoning_effort},
                    text={"format": {"type": "json_object"}},
                    max_output_tokens=2000,
                )

                content = response.output_text
                if not content:
                    raise ValueError("Empty response (output_text is None)")

                result_dict = json.loads(content)
                parsed_result = OpenAIValidationResult.model_validate(result_dict)

                tokens = 0
                if response.usage:
                    tokens = response.usage.input_tokens + response.usage.output_tokens

                return ValidationResultWrapper(
                    sample=sample,
                    result=parsed_result,
                    tokens_used=tokens,
                    duration_seconds=loop.time() - start_time,
                )

            except json.JSONDecodeError as e:
                last_error = f"JSON parse error: {e}"
                break  # Don't retry JSON errors

            except Exception as e:
                last_error = f"{type(e).__name__}: {e}"

                attempts_left = attempt < MAX_RETRIES - 1
                if attempts_left and _is_retryable_error(e):
                    await asyncio.sleep(RETRY_DELAY_BASE * (2 ** attempt))
                else:
                    break

        return ValidationResultWrapper(
            sample=sample,
            error=last_error,
            duration_seconds=loop.time() - start_time,
        )


# =============================================================================
# CHECKPOINT FUNCTION
# =============================================================================

def save_checkpoint(
    results: list[ValidationResultWrapper],
    path: Path,
) -> None:
    """
    Write all results gathered so far to a checkpoint file.

    The file at ``path`` is overwritten with one JSON object per line. The
    sample itself is not stored, only its ``generation_id``.

    Args:
        results: Validation wrappers collected so far.
        path: Destination JSONL file.
    """

    def to_line(wrapper: ValidationResultWrapper) -> str:
        # Summary fields are always present; the parsed result only on success.
        entry = {
            "generation_id": wrapper.sample.generation_id,
            "success": wrapper.result is not None,
            "error": wrapper.error,
            "tokens_used": wrapper.tokens_used,
            "duration_seconds": wrapper.duration_seconds,
        }
        if wrapper.result:
            entry["validation_result"] = wrapper.result.model_dump()
        return json.dumps(entry, ensure_ascii=False) + "\n"

    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(to_line(wrapper) for wrapper in results)


# =============================================================================
# MAIN VALIDATION RUNNER
# =============================================================================

async def run_validation(
    samples: list[SyntheticSample],
    client: AsyncOpenAI,
) -> list[ValidationResultWrapper]:
    """
    Validate every sample concurrently, showing a progress bar.

    Results are returned in completion order, not input order. After every
    ``CHECKPOINT_EVERY`` completed samples, all results gathered so far are
    written to a new numbered checkpoint file in ``BATCH_OUTPUT_DIR``.

    Args:
        samples: Samples to validate.
        client: Async OpenAI client shared by all requests.

    Returns:
        One wrapper per input sample.
    """
    limiter = asyncio.Semaphore(MAX_CONCURRENT)
    pending = [validate_sample_async(sample, client, limiter) for sample in samples]

    collected: list[ValidationResultWrapper] = []

    for next_done in tqdm_asyncio.as_completed(pending, total=len(pending), desc="Validating"):
        collected.append(await next_done)

        completed = len(collected)
        if completed % CHECKPOINT_EVERY != 0:
            continue

        # The count grows by one per iteration, so the N-th checkpoint is
        # written exactly when N * CHECKPOINT_EVERY samples have finished.
        checkpoint_number = completed // CHECKPOINT_EVERY
        save_checkpoint(
            collected,
            BATCH_OUTPUT_DIR / f"checkpoint_{checkpoint_number:03d}.jsonl",
        )

    return collected


# =============================================================================
# EXECUTE VALIDATION
# =============================================================================

print(f"\n{'='*70}")
print("STARTING VALIDATION")
print(f"{'='*70}")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Run the async validation
validation_results = await run_validation(passed_prefilter, async_client)

print(f"\nEnd time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


# =============================================================================
# PROCESS RESULTS
# =============================================================================

# Split wrappers by outcome: a wrapper without a parsed result is a failure.
successful = [r for r in validation_results if r.result is not None]
failed = [r for r in validation_results if r.result is None]

n_results = len(validation_results)
total_tokens = sum(r.tokens_used for r in validation_results)
total_duration = sum(r.duration_seconds for r in validation_results)
avg_duration = total_duration / n_results if n_results else 0

# Guard the percentages the same way as the average above, so an empty run
# prints a summary instead of raising ZeroDivisionError.
success_pct = len(successful) / n_results * 100 if n_results else 0.0
failure_pct = len(failed) / n_results * 100 if n_results else 0.0

print(f"\n{'='*70}")
print("VALIDATION COMPLETE")
print(f"{'='*70}")
print(f"  Total samples: {n_results}")
print(f"  Successful: {len(successful)} ({success_pct:.1f}%)")
print(f"  Failed: {len(failed)} ({failure_pct:.1f}%)")
print(f"  Total tokens: {total_tokens:,}")
print(f"  Avg duration/sample: {avg_duration:.1f}s")

if failed:
    # What follows is a count per error type (the text before the first
    # colon of each message), not a list of samples -- the header says so.
    print("\nFailure breakdown (top 5 error types):")
    error_counts = Counter(
        r.error.split(":")[0] if r.error else "Unknown" for r in failed
    )
    for err_type, count in error_counts.most_common(5):
        print(f"  {err_type}: {count}")


# =============================================================================
# SAVE FINAL RESULTS
# =============================================================================

# Persist every wrapper (successes and failures) with its full sample, one
# JSON object per line.
final_path = BATCH_OUTPUT_DIR / "validation_results_final.jsonl"
with open(final_path, "w", encoding="utf-8") as f:
    for r in validation_results:
        record = {
            "generation_id": r.sample.generation_id,
            "sample": r.sample.model_dump(),
            "success": r.result is not None,
            "error": r.error,
            "tokens_used": r.tokens_used,
            "duration_seconds": r.duration_seconds,
            # The parsed result is appended as the last key, on success only.
            **({"validation_result": r.result.model_dump()} if r.result else {}),
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"\n‚úì Saved: {final_path}")


# =============================================================================
# CREATE OUTPUT FOR DOWNSTREAM CELLS
# =============================================================================

# Pair each sample with its parsed result: the (sample, result) shape that
# Cell 10 (Generate Validation Report) expects.
successful_validations: list[tuple[SyntheticSample, OpenAIValidationResult]] = [
    (wrapper.sample, wrapper.result) for wrapper in successful
]

# Failures keep the sample next to its error text, for Cell 11.
failed_validations: list[tuple[SyntheticSample, str]] = [
    (wrapper.sample, wrapper.error if wrapper.error else "Unknown error")
    for wrapper in failed
]

print(f"‚úì Created successful_validations: {len(successful_validations)} entries")
print(f"‚úì Created failed_validations: {len(failed_validations)} entries")
print("\n‚Üí Ready for Cell 10 (Generate Validation Report)")

## Generate Validation Report

Generate a comprehensive validation report from the validation results.

This provides aggregate statistics, per-dimension breakdowns, and
actionable insights for understanding synthetic data quality.


In [None]:
def generate_validation_report(
    results: list[tuple[SyntheticSample, OpenAIValidationResult]],
) -> dict[str, Any]:
    """
    Aggregate validation outcomes into a single report dictionary.

    Args:
        results: List of (sample, result) tuples from validation.

    Returns:
        A dictionary with the keys ``summary``, ``coherence``,
        ``entity_accuracy``, ``per_dimension``, ``per_pii_type`` and
        ``top_issues``. When ``results`` is empty, the dictionary holds only
        an ``error`` key.
    """
    total = len(results)
    if total == 0:
        return {"error": "No results to report"}

    passed = warnings = errors = 0
    coherence_scores: list[int] = []
    dimension_stats: dict[str, dict[str, int]] = {}
    pii_type_stats: dict[str, dict[str, int]] = {}
    all_issues: list[str] = []
    total_entities = correct_labels = correct_boundaries = 0

    # Every statistic is accumulated in one walk over the results.
    for sample, result in results:
        if result.severity == ValidationSeverity.PASS:
            passed += 1
        elif result.severity == ValidationSeverity.WARNING:
            warnings += 1
        elif result.severity == ValidationSeverity.ERROR:
            errors += 1

        coherence_scores.append(result.text_coherence_score)

        # The severity's string value doubles as the counter key below.
        severity_key = result.severity.value

        dim_bucket = dimension_stats.setdefault(
            sample.feature_dimension.value,
            {"total": 0, "pass": 0, "warning": 0, "error": 0, "exhibits_dim": 0},
        )
        dim_bucket["total"] += 1
        dim_bucket[severity_key] += 1
        if result.dimension_validation.exhibits_characteristics:
            dim_bucket["exhibits_dim"] += 1

        pii_bucket = pii_type_stats.setdefault(
            sample.seed_pii_type,
            {"total": 0, "pass": 0, "warning": 0, "error": 0},
        )
        pii_bucket["total"] += 1
        pii_bucket[severity_key] += 1

        # Issue texts: coherence issues first, then per-entity descriptions.
        all_issues.extend(result.text_coherence_issues)
        for entity_check in result.entity_validations:
            if entity_check.issue_description:
                all_issues.append(entity_check.issue_description)

            total_entities += 1
            if entity_check.is_correct_label:
                correct_labels += 1
            if entity_check.is_correct_boundary:
                correct_boundaries += 1

    score_tally = Counter(coherence_scores)
    summary = {
        "total_samples": total,
        "passed": passed,
        "warnings": warnings,
        "errors": errors,
        "pass_rate": passed / total,
        "usable_rate": (passed + warnings) / total,
    }
    coherence = {
        "average_score": sum(coherence_scores) / len(coherence_scores),
        # Scores 1 through 5 are always listed, with 0 for unused scores.
        "score_distribution": {score: score_tally[score] for score in range(1, 6)},
    }
    # Results may carry no entity validations at all; report 0 in that case.
    has_entities = total_entities > 0
    entity_accuracy = {
        "total_entities": total_entities,
        "correct_labels": correct_labels,
        "correct_boundaries": correct_boundaries,
        "label_accuracy": correct_labels / total_entities if has_entities else 0,
        "boundary_accuracy": correct_boundaries / total_entities if has_entities else 0,
    }

    return {
        "summary": summary,
        "coherence": coherence,
        "entity_accuracy": entity_accuracy,
        "per_dimension": dimension_stats,
        "per_pii_type": pii_type_stats,
        "top_issues": Counter(all_issues).most_common(20),
    }


def print_validation_report(report: dict[str, Any]) -> None:
    """
    Print a formatted validation report to stdout.

    Args:
        report: Report dictionary from generate_validation_report. When it
            lacks the ``summary`` section (the report generator returns only
            an ``error`` key for empty input), a short notice is printed
            instead of the full report.
    """
    print("=" * 80)
    print("OPENAI GPT-5.1 VALIDATION REPORT")
    print("=" * 80)

    # An error-only report has none of the sections read below; indexing
    # them would raise KeyError, so report the condition and stop.
    if "summary" not in report:
        print(f"\nNo report available: {report.get('error', 'unknown error')}")
        print("=" * 80)
        return

    s = report["summary"]
    print("\nOVERALL SUMMARY:")
    print(f"  Total samples validated: {s['total_samples']}")
    print(f"  Passed: {s['passed']} ({s['pass_rate']*100:.1f}%)")
    print(f"  Warnings: {s['warnings']}")
    print(f"  Errors: {s['errors']}")
    print(f"  Usable (pass + warning): {s['usable_rate']*100:.1f}%")

    c = report["coherence"]
    print("\nTEXT COHERENCE:")
    print(f"  Average score: {c['average_score']:.2f}/5.0")
    print(f"  Distribution: {c['score_distribution']}")

    e = report["entity_accuracy"]
    print("\nENTITY ANNOTATION ACCURACY:")
    print(f"  Total entities validated: {e['total_entities']}")
    print(f"  Correct labels: {e['correct_labels']} ({e['label_accuracy']*100:.1f}%)")
    print(f"  Correct boundaries: {e['correct_boundaries']} ({e['boundary_accuracy']*100:.1f}%)")

    print("\nPER-DIMENSION BREAKDOWN:")
    for dim, stats in sorted(report["per_dimension"].items()):
        if stats["total"] > 0:
            exhibit_rate = stats["exhibits_dim"] / stats["total"] * 100
            pass_rate = stats["pass"] / stats["total"] * 100
            print(f"  {dim}:")
            print(
                f"    Total: {stats['total']}, Pass: {stats['pass']} ({pass_rate:.1f}%), "
                f"Warning: {stats['warning']}, Error: {stats['error']}"
            )
            print(f"    Exhibits dimension characteristics: {exhibit_rate:.1f}%")

    print("\nPER-PII-TYPE BREAKDOWN (top 10 by error rate):")
    pii_stats = report["per_pii_type"]
    # Sort by error rate descending; empty buckets sort as a 0 rate.
    sorted_pii = sorted(
        pii_stats.items(),
        key=lambda x: x[1]["error"] / x[1]["total"] if x[1]["total"] > 0 else 0,
        reverse=True,
    )
    for pii_type, stats in sorted_pii[:10]:
        if stats["total"] > 0:
            error_rate = stats["error"] / stats["total"] * 100
            print(
                f"  {pii_type}: {stats['total']} samples, "
                f"Pass: {stats['pass']}, Warning: {stats['warning']}, "
                f"Error: {stats['error']} ({error_rate:.1f}%)"
            )

    print("\nTOP 20 ISSUES:")
    for i, (issue, count) in enumerate(report["top_issues"], 1):
        # Keep each issue on one line: cut long texts at 70 characters.
        truncated = issue[:70] + "..." if len(issue) > 70 else issue
        print(f"  {i:2}. [{count:3}x] {truncated}")

    print("=" * 80)


# Generate and print report
report = generate_validation_report(successful_validations)
print_validation_report(report)

# Save report to JSON. ensure_ascii=False matches the other JSON writers in
# this notebook, so non-ASCII issue text is stored readably rather than as
# \uXXXX escapes.
report_path = BATCH_OUTPUT_DIR / "validation_report.json"
with open(report_path, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)
print(f"\n‚úì Saved report: {report_path}")

## Filter Valid Samples and Save

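Samples that pass validation (or carry only warnings) are kept.
Samples with a coherence score below 3 are filtered out per the `PLAN.md` requirements,
as are samples with `error` severity and samples that do not exhibit the
characteristics of their claimed feature dimension.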


In [None]:
def filter_valid_samples(
    validation_results: list[tuple[SyntheticSample, OpenAIValidationResult]],
    include_warnings: bool = True,
    min_coherence_score: int = 3,
) -> tuple[list[SyntheticSample], list[tuple[SyntheticSample, OpenAIValidationResult, str]]]:
    """
    Split validated samples into accepted and rejected groups.

    A sample is rejected on the first check it fails, in this order:
    coherence score below the threshold, ERROR severity, WARNING severity
    (only when warnings are excluded), missing dimension characteristics.

    Args:
        validation_results: List of (sample, result) tuples.
        include_warnings: Whether to include samples with warnings.
        min_coherence_score: Minimum coherence score to pass (default 3 per PLAN.md).

    Returns:
        Tuple of (valid_samples, rejected_with_reasons), where each rejected
        entry is a (sample, result, reason) tuple.
    """

    def first_failed_check(outcome: OpenAIValidationResult) -> str | None:
        # Return the reason text for the first failing check, or None.
        if outcome.text_coherence_score < min_coherence_score:
            return f"Coherence score {outcome.text_coherence_score} < {min_coherence_score}"
        if outcome.severity == ValidationSeverity.ERROR:
            return f"Severity ERROR: {outcome.overall_assessment[:100]}"
        if outcome.severity == ValidationSeverity.WARNING and not include_warnings:
            return f"Severity WARNING (not included): {outcome.overall_assessment[:100]}"
        if not outcome.dimension_validation.exhibits_characteristics:
            return (
                f"Does not exhibit {outcome.dimension_validation.claimed_dimension} characteristics"
            )
        return None

    valid: list[SyntheticSample] = []
    rejected: list[tuple[SyntheticSample, OpenAIValidationResult, str]] = []

    for sample, result in validation_results:
        reason = first_failed_check(result)
        if reason:
            rejected.append((sample, result, reason))
        else:
            valid.append(sample)

    return valid, rejected


# Filter samples
valid_samples, rejected_samples = filter_valid_samples(
    successful_validations,
    include_warnings=True,
    min_coherence_score=3,
)

print(f"‚úì Filtering complete")
print(f"  Valid samples: {len(valid_samples)}")
print(f"  Rejected samples: {len(rejected_samples)}")
print(f"  Pre-filter failures: {len(failed_prefilter)}")
print(f"  API/parse failures: {len(failed_validations)}")

total_input = len(samples)
total_valid = len(valid_samples)
# An empty input set would otherwise raise ZeroDivisionError here.
valid_pct = total_valid / total_input * 100 if total_input else 0.0
print(f"\n  Final valid rate: {total_valid}/{total_input} ({valid_pct:.1f}%)")

# Show rejection reasons breakdown
if rejected_samples:
    reason_counts: Counter[str] = Counter()
    for _, _, reason in rejected_samples:
        # Categorize by the fixed prefix that filter_valid_samples writes.
        # Reasons embed free-text model output after that prefix, so a
        # substring test anywhere in the reason could pick the wrong bucket.
        if reason.startswith("Coherence score"):
            reason_counts["Low coherence score"] += 1
        elif reason.startswith("Severity ERROR"):
            reason_counts["Severity ERROR"] += 1
        elif reason.startswith("Does not exhibit"):
            reason_counts["Missing dimension characteristics"] += 1
        else:
            reason_counts["Other"] += 1

    print("\nRejection reasons breakdown:")
    for reason, count in reason_counts.most_common():
        print(f"  {reason}: {count}")

## Export Validated Samples

Export validated samples in multiple formats for downstream use.

The validated samples are saved in JSONL, CSV, and Parquet formats
for flexibility in the training pipeline.


In [None]:
def _write_jsonl(path: Path, records: list[dict[str, Any]]) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")


def export_validated_samples(
    valid_samples: list[SyntheticSample],
    rejected_samples: list[tuple[SyntheticSample, OpenAIValidationResult, str]],
    output_dir: Path,
    prefilter_failures: list[tuple[SyntheticSample, Any]] | None = None,
) -> dict[str, Path]:
    """
    Export validated samples in multiple formats.

    Writes valid samples as JSONL, CSV and (best effort) Parquet, rejected
    samples and pre-filter failures as JSONL, and one JSON file per feature
    dimension that has at least one valid sample.

    Args:
        valid_samples: List of samples that passed validation.
        rejected_samples: List of (sample, result, reason) tuples for rejected samples.
        output_dir: Directory to save output files; created if missing.
        prefilter_failures: (sample, prefilter_result) pairs to export; each
            result must expose ``issues``. When omitted, the notebook-level
            ``failed_prefilter`` list is used, as before.

    Returns:
        Dictionary mapping format names to file paths. ``valid_parquet`` is
        absent when the Parquet export was skipped.
    """
    if prefilter_failures is None:
        # Backward-compatible fallback for callers that rely on notebook state.
        prefilter_failures = failed_prefilter

    output_dir.mkdir(parents=True, exist_ok=True)
    exported: dict[str, Path] = {}

    # 1. Valid samples - JSONL
    valid_jsonl_path = output_dir / "validated_samples.jsonl"
    _write_jsonl(valid_jsonl_path, [sample.model_dump() for sample in valid_samples])
    exported["valid_jsonl"] = valid_jsonl_path
    print(f"‚úì Exported valid samples (JSONL): {valid_jsonl_path}")

    # 2. Valid samples - CSV (flattened; entities are packed into a JSON string column)
    valid_csv_data = [
        {
            "generation_id": sample.generation_id,
            "text": sample.text,
            "feature_dimension": sample.feature_dimension.value,
            "seed_pii_type": sample.seed_pii_type,
            "seed_pii_value": sample.seed_pii_value,
            "seed_pii_locale": sample.seed_pii_locale,
            "scenario": sample.scenario,
            "type_variant": sample.type_variant,
            "num_entities": len(sample.entities),
            "entities_json": json.dumps([e.model_dump() for e in sample.entities]),
            "timestamp": sample.timestamp,
        }
        for sample in valid_samples
    ]

    valid_df = pd.DataFrame(valid_csv_data)
    valid_csv_path = output_dir / "validated_samples.csv"
    valid_df.to_csv(valid_csv_path, index=False, encoding="utf-8")
    exported["valid_csv"] = valid_csv_path
    print(f"‚úì Exported valid samples (CSV): {valid_csv_path}")

    # 3. Valid samples - Parquet. Deliberately best effort: any failure (for
    # example a missing Parquet engine) is reported and the export continues.
    try:
        valid_parquet_path = output_dir / "validated_samples.parquet"
        valid_df.to_parquet(valid_parquet_path, index=False)
        exported["valid_parquet"] = valid_parquet_path
        print(f"‚úì Exported valid samples (Parquet): {valid_parquet_path}")
    except Exception as e:
        print(f"  Note: Parquet export skipped ({e})")

    # 4. Rejected samples - JSONL with reasons
    rejected_jsonl_path = output_dir / "rejected_samples.jsonl"
    _write_jsonl(
        rejected_jsonl_path,
        [
            {
                "sample": sample.model_dump(),
                "validation_result": result.model_dump(),
                "rejection_reason": reason,
            }
            for sample, result, reason in rejected_samples
        ],
    )
    exported["rejected_jsonl"] = rejected_jsonl_path
    print(f"‚úì Exported rejected samples (JSONL): {rejected_jsonl_path}")

    # 5. Pre-filter failures - JSONL
    prefilter_failures_path = output_dir / "prefilter_failures.jsonl"
    _write_jsonl(
        prefilter_failures_path,
        [
            {
                "sample": sample.model_dump(),
                "prefilter_issues": result.issues,
            }
            for sample, result in prefilter_failures
        ],
    )
    exported["prefilter_failures"] = prefilter_failures_path
    print(f"‚úì Exported pre-filter failures (JSONL): {prefilter_failures_path}")

    # 6. Per-dimension valid samples (dimensions without valid samples get no file)
    for dimension in FeatureDimension:
        dim_samples = [s for s in valid_samples if s.feature_dimension == dimension]
        if dim_samples:
            dim_path = output_dir / f"validated_{dimension.value}.json"
            with open(dim_path, "w", encoding="utf-8") as f:
                json.dump(
                    [s.model_dump() for s in dim_samples],
                    f,
                    indent=2,
                    ensure_ascii=False,
                )
            exported[f"valid_{dimension.value}"] = dim_path
    print(f"‚úì Exported dimension-specific JSON files")

    return exported


# Write every export into the validation output directory.
exported_files = export_validated_samples(valid_samples, rejected_samples, BATCH_OUTPUT_DIR)

print(f"\nTotal files exported: {len(exported_files)}")
print("\nExported files:")
# List each exported file with its on-disk size in KB (bytes / 1024).
for name, path in exported_files.items():
    size_kb = path.stat().st_size / 1024
    print(f"  {name}: {path} ({size_kb:.1f} KB)")

## Summary Statistics

Final summary of the validation pipeline results.


In [None]:
print("=" * 80)
print("VALIDATION PIPELINE SUMMARY")
print("=" * 80)

# Both percentages below divide by the input size; fall back to 0.0 so an
# empty input set prints a summary instead of raising ZeroDivisionError.
total_samples = len(samples)
prefilter_pass_pct = len(passed_prefilter) / total_samples * 100 if total_samples else 0.0
final_valid_pct = len(valid_samples) / total_samples * 100 if total_samples else 0.0

print(f"\nüìä INPUT:")
print(f"   Total synthetic samples: {total_samples}")

print(f"\nüîç PRE-FILTER (Programmatic):")
print(f"   Passed: {len(passed_prefilter)}")
print(f"   Failed: {len(failed_prefilter)}")
print(f"   Pass rate: {prefilter_pass_pct:.1f}%")

print(f"\nü§ñ GPT-5.1 VALIDATION:")
print(f"   Samples validated: {len(successful_validations)}")
print(f"   API/parse failures: {len(failed_validations)}")

print(f"\n‚úÖ FINAL OUTPUT:")
print(f"   Valid samples: {len(valid_samples)}")
print(f"   Rejected (semantic): {len(rejected_samples)}")
print(f"   Final valid rate: {final_valid_pct:.1f}%")

print(f"\nüìÅ OUTPUT FILES:")
print(f"   Directory: {BATCH_OUTPUT_DIR}")
print(f"   Main file: validated_samples.jsonl")

# The report has no metric sections when it was built from zero results,
# hence the .get() guards.
print(f"\nüìà QUALITY METRICS:")
if report.get("coherence"):
    print(f"   Avg coherence score: {report['coherence']['average_score']:.2f}/5.0")
if report.get("entity_accuracy"):
    print(f"   Entity label accuracy: {report['entity_accuracy']['label_accuracy']*100:.1f}%")
    print(f"   Entity boundary accuracy: {report['entity_accuracy']['boundary_accuracy']*100:.1f}%")

# Distribution of valid samples by dimension
valid_by_dim = Counter(s.feature_dimension.value for s in valid_samples)
print(f"\nüìä VALID SAMPLES BY DIMENSION:")
for dim in sorted(valid_by_dim.keys()):
    print(f"   {dim}: {valid_by_dim[dim]}")

print("=" * 80)
print("‚úì Validation pipeline complete. Ready for Notebook 3 (training).")
print("=" * 80)