# In [4]:

import warnings

import numpy as np
import pandas as pd
from mbi import Dataset, Domain, FactoredInference
from scipy.sparse import csc_matrix, identity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings("ignore")

# Placeholder for NIST-MST selection function (adapt from Team RMcKenna's code)
def select_measure_generate(dataset, epsilon, cliques, public_data):
    """
    Stand-in for the NIST-MST select-measure-generate pipeline.

    Projects ``dataset`` onto every clique in ``cliques`` and returns the
    resulting ``(clique, projection)`` pairs in input order. No selection,
    noise addition, or generation happens here; ``epsilon`` and
    ``public_data`` are accepted but not used by this placeholder.
    For the real algorithm see Team RMcKenna's implementation
    (https://github.com/ryan112358/private-pgm).
    """
    return [(clique, dataset.project(clique)) for clique in cliques]

# Load HW1 dataset
data_path = r"C:\Users\USER\Documents\NTUST\2nd Sem\Data Privacy\anonymized_adult.csv"
data = pd.read_csv(data_path)

# Preprocess: Handle k-anonymized data
# Assume columns include age, workclass, education, occupation, sex, income (adjust if different)
columns = data.columns
print("Dataset columns:", columns)  # Debug: Verify columns

# Discretize numerical attributes into 10 equal-width bins. k-anonymization may
# already have generalized these columns into string ranges, which pd.cut
# cannot bin, so only columns that are still numeric are discretized.
for numeric_col in ('age', 'hours-per-week'):
    if numeric_col in columns and pd.api.types.is_numeric_dtype(data[numeric_col]):
        data[numeric_col] = pd.cut(
            data[numeric_col], bins=10, labels=False, include_lowest=True
        ).astype(str)

# Handle missing/suppressed values (common in k-anonymized data)
data = data.fillna('missing')  # Treat missing as a category
data = data.astype(str)  # Convert all to categorical for NIST-MST

# mbi's Dataset is built from (DataFrame, Domain) and expects every attribute
# to be integer-coded in range(domain size). Keep the sorted label list per
# column so the synthetic codes can be mapped back to the original categories.
category_labels = {col: sorted(data[col].unique()) for col in data.columns}
encoded = pd.DataFrame(
    {
        col: pd.Categorical(data[col], categories=category_labels[col]).codes.astype(int)
        for col in data.columns
    },
    index=data.index,
)
domain = Domain(
    list(data.columns),
    [len(category_labels[col]) for col in data.columns],
)

# Initialize MBI Dataset object
dataset = Dataset(encoded, domain)

# Parameters
epsilon = 1.0  # Privacy budget (moderate privacy)
cliques = [('age', 'sex'), ('education', 'occupation'), ('workclass', 'income')]  # Example marginals
# Simulated public dataset for mutual information (in practice, use a similar public dataset)
public_data = Dataset(encoded.sample(frac=0.1, random_state=42), domain)

# Step 1: Select marginals using MST (simplified)
measurements = select_measure_generate(dataset, epsilon, cliques, public_data)

# Step 2: Measure selected marginals with Gaussian noise
# NOTE(review): sigma = sensitivity / epsilon is kept from the original script.
# It is not a standard (epsilon, delta) Gaussian-mechanism calibration, and the
# same epsilon is applied to every clique rather than being split across them -
# confirm the intended privacy accounting before reporting a guarantee.
sensitivity = np.sqrt(2)  # L2 sensitivity for marginals
sigma = sensitivity / epsilon
noisy_measurements = []
for cl, proj in measurements:
    query = proj.datavector()
    noise = np.random.normal(0, sigma, query.size)
    # FactoredInference.estimate expects (Q, y, sigma, clique) tuples: Q is the
    # query matrix (identity = the full marginal), y the noisy answers, sigma
    # the noise standard deviation, and clique the measured attribute tuple.
    noisy_measurements.append((identity(query.size), query + noise, sigma, cl))

# Step 3: Generate synthetic data using Private-PGM
engine = FactoredInference(domain, iters=1000)
model = engine.estimate(noisy_measurements)
synthetic_codes = model.synthetic_data(rows=len(data)).df

# Decode the integer codes back to the original category labels so the result
# is a plain DataFrame directly comparable with `data` column by column.
synthetic_data = pd.DataFrame(
    {
        col: np.asarray(category_labels[col], dtype=object)[
            synthetic_codes[col].to_numpy(dtype=int)
        ]
        for col in data.columns
    }
)

# Save synthetic dataset
synthetic_data.to_csv("synthetic_anonymized_adult.csv", index=False)
print("Synthetic dataset generated and saved as 'synthetic_anonymized_adult.csv'")

# Evaluation: Total Variation Distance (TVD) for marginals
def tvd(original, synthetic, column):
    """
    Return the total variation distance between two empirical marginals.

    Parameters
    ----------
    original, synthetic : objects indexable by ``column`` that yield a
        pandas Series (e.g. DataFrames).
    column : label of the attribute to compare.

    Returns
    -------
    Half the L1 distance between the normalized value frequencies of
    ``column`` in the two inputs: 0 for identical distributions, 1 for
    distributions with disjoint support.
    """
    orig_counts = original[column].value_counts(normalize=True)
    synth_counts = synthetic[column].value_counts(normalize=True)
    # Align on the UNION of categories: a value seen in only one dataset has
    # probability 0 in the other and must still contribute to the distance.
    # Restricting to the intersection would silently drop that mass.
    diff = orig_counts.sub(synth_counts, fill_value=0.0)
    return 0.5 * np.abs(diff).sum()

# Per-attribute TVD between original and synthetic marginals; attributes that
# are absent from the loaded dataset are skipped.
tvd_results = {}
for col in ('age', 'sex', 'education'):
    if col not in columns:
        continue
    tvd_results[col] = tvd(data, synthetic_data, col)
    print(f"TVD for {col}: {tvd_results[col]:.4f}")

# Downstream utility check: predict income (only when that column exists)
if 'income' in columns:
    # Split features from the target for both datasets
    X, y = data.drop('income', axis=1), data['income']
    X_synthetic, y_synthetic = synthetic_data.drop('income', axis=1), synthetic_data['income']

    # Fit the label encoder on the original target and reuse it for the
    # synthetic target so both share the same class ids
    le = LabelEncoder()
    y = le.fit_transform(y)
    y_synthetic = le.transform(y_synthetic)

    # One-hot encode the features; the synthetic frame is re-indexed onto the
    # original dummy columns (missing ones filled with 0, extra ones dropped)
    X = pd.get_dummies(X)
    X_synthetic = pd.get_dummies(X_synthetic).reindex(columns=X.columns, fill_value=0)

    # Hold out 20% of the original data as the common test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    clf = LogisticRegression(max_iter=1000)

    # Train on synthetic data, evaluate on the original hold-out
    accuracy = clf.fit(X_synthetic, y_synthetic).score(X_test, y_test)
    print(f"Classification accuracy (synthetic data): {accuracy:.4f}")

    # Baseline: refit the same estimator on the original training split
    baseline_accuracy = clf.fit(X_train, y_train).score(X_test, y_test)
    print(f"Classification accuracy (original data): {baseline_accuracy:.4f}")

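# Cell output: ModuleNotFoundError: No module named 'mbi'
# (the `mbi` package comes from the private-pgm repository referenced above; install it before running this cell)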