# VSDF Privacy Safeguards

Evaluate how the differential privacy budget (`dp_epsilon`) influences fidelity and privacy risk when generating synthetic data.

In [None]:
import pandas as pd
from vsdf import SchemaLearner, ConstraintSpecification, ConstraintCompiler, ConstraintDrivenSampler, ConstraintVerifier

In [None]:
# Ten-row reference sample: two numeric columns (age, income) and two
# categorical columns (segment, city).
reference_columns = {
    'age': [25, 32, 40, 28, 36, 52, 47, 30, 45, 38],
    'income': [
        50000, 62000, 58000, 52000, 61000,
        75000, 68000, 54000, 72000, 59000,
    ],
    'segment': ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'B', 'A', 'B'],
    'city': [
        'Denver', 'Denver', 'Boulder', 'Denver', 'Boulder',
        'Denver', 'Boulder', 'Denver', 'Boulder', 'Denver',
    ],
}
reference = pd.DataFrame(reference_columns)

# Learn a schema from the reference frame, then bind a constraint compiler
# to that schema; both are reused by the cells that follow.
learner = SchemaLearner()
schema = learner.learn(reference)
compiler = ConstraintCompiler(schema)

In [None]:
# Sweep settings, named once so the experiment is easy to re-run with
# different values.
EPSILONS = [2.0, 5.0, 8.0, 12.0]      # dp_epsilon values to compare
CORRELATION_PAIR = ('age', 'income')  # pair constrained AND reported below
SAMPLE_SIZE = 200                     # synthetic rows drawn per epsilon
RANDOM_STATE = 123                    # same seed for every epsilon

results = []
for epsilon in EPSILONS:
    # Identical constraints on every pass; only the privacy budget changes.
    spec = ConstraintSpecification(
        marginal_columns=['segment', 'city'],
        correlation_pairs=[CORRELATION_PAIR],
        marginal_tolerance=0.1,
        correlation_tolerance=0.1,
        denial_predicates=['age < 21 and income > 60000'],
        denial_tolerance=0.0,
        dp_epsilon=epsilon,
    )
    constraints = compiler.learn(reference, spec)
    sampler = ConstraintDrivenSampler(
        schema, constraints, random_state=RANDOM_STATE
    )
    synthetic = sampler.sample(SAMPLE_SIZE)
    report = ConstraintVerifier(constraints, privacy_threshold=0.1).verify(
        synthetic, reference
    )

    # Worst marginal distance across columns; 0.0 when the report has none.
    # `default=` avoids relying on the truthiness of the values view.
    fidelity_gap = max(report.marginal_distances.values(), default=0.0)

    results.append({
        'epsilon': epsilon,
        'privacy_risk': report.privacy_risk,
        'max_marginal_distance': fidelity_gap,
        # Looked up with the same pair given to the spec so the two cannot
        # drift apart; falls back to 0.0 if the report has no entry for it.
        'correlation_delta': report.correlation_deltas.get(
            CORRELATION_PAIR, 0.0
        ),
    })

# One row per epsilon; the bare expression renders the table in the notebook.
summary = pd.DataFrame(results)
summary

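Lower epsilon values inject more noise, which reduces privacy risk at the cost of slightly larger fidelity gaps (the `max_marginal_distance` and `correlation_delta` columns in the summary table). Tune `privacy_threshold` and the marginal and correlation tolerances to meet policy requirements.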