In [None]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import jensenshannon
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from ganblr import GANBLR  # Assuming GANBLR is saved in a module named ganblr

# Utility function to compute descriptive statistics comparison
def compute_statistics(real_data, synthetic_data):
    """Print and return the per-column gaps in mean and variance.

    Both inputs are reduced along axis 0 with ``np.mean`` and ``np.var``
    (NumPy's default variance, i.e. ``ddof=0``), and the absolute difference
    between the real and synthetic result is taken for each statistic.

    Args:
        real_data: Array-like of real samples, one row per sample.
        synthetic_data: Array-like of synthetic samples with matching columns.

    Returns:
        A ``(mean_diff, var_diff)`` tuple of absolute per-column differences.
    """
    gaps = []
    for reducer in (np.mean, np.var):
        real_stat = reducer(real_data, axis=0)
        synthetic_stat = reducer(synthetic_data, axis=0)
        gaps.append(np.abs(real_stat - synthetic_stat))
    mean_diff, var_diff = gaps

    report = (
        ("\nMean Differences between Real and Synthetic Data:", mean_diff),
        ("\nVariance Differences between Real and Synthetic Data:", var_diff),
    )
    for heading, values in report:
        print(heading)
        print(values)

    return mean_diff, var_diff

# Utility function to compute distribution similarity using Kolmogorov-Smirnov test
def compute_distribution_similarity(real_data, synthetic_data):
    """Run a two-sample Kolmogorov-Smirnov test on every column.

    Column ``i`` of ``real_data`` is compared against column ``i`` of
    ``synthetic_data`` with ``scipy.stats.ks_2samp``; the number of columns is
    taken from ``real_data``.

    Args:
        real_data: 2-D array of real samples (indexed as ``[:, i]``).
        synthetic_data: 2-D array of synthetic samples (indexed as ``[:, i]``).

    Returns:
        A ``(ks_statistic, ks_pvalue)`` tuple of 1-D arrays with one entry per
        column.
    """
    statistics = []
    pvalues = []
    n_columns = real_data.shape[1]
    for column in range(n_columns):
        outcome = ks_2samp(real_data[:, column], synthetic_data[:, column])
        statistics.append(outcome.statistic)
        pvalues.append(outcome.pvalue)

    ks_statistic = np.array(statistics)
    ks_pvalue = np.array(pvalues)

    print("\nKolmogorov-Smirnov Test Results:")
    print("KS Statistic:", ks_statistic)
    print("P-Value:", ks_pvalue)

    return ks_statistic, ks_pvalue

# Utility function to compute Jensen-Shannon divergence
def compute_js_divergence(real_data, synthetic_data, bins=20):
    """Print and return the per-column Jensen-Shannon divergence.

    Each column is discretised into a histogram and the two histograms are
    compared. Both histograms of a column are built on the SAME bin edges
    (derived from the pooled real + synthetic values); with independent edges,
    bin ``k`` would cover different value ranges in the two arrays and a
    shifted or rescaled synthetic column could score as a perfect match.

    Args:
        real_data: 2-D array of real samples (indexed as ``[:, i]``).
        synthetic_data: 2-D array of synthetic samples (indexed as ``[:, i]``).
        bins: Number of equal-width histogram bins per column (default 20).

    Returns:
        1-D array with one divergence per column of ``real_data``, using the
        natural logarithm: 0 for identical histograms, ``ln(2)`` for histograms
        with no overlapping bins.
    """
    js_divergence = []
    for i in range(real_data.shape[1]):
        real_col = real_data[:, i]
        synthetic_col = synthetic_data[:, i]

        # Derive one set of edges from the pooled values so both histograms
        # are defined over identical intervals.
        _, edges = np.histogram(np.concatenate([real_col, synthetic_col]), bins=bins)
        real_dist = np.histogram(real_col, bins=edges)[0]
        synthetic_dist = np.histogram(synthetic_col, bins=edges)[0]

        # jensenshannon normalises its inputs and returns the JS *distance*
        # (square root of the divergence); square it to report the divergence.
        jsd = jensenshannon(real_dist, synthetic_dist) ** 2
        js_divergence.append(jsd)

    js_divergence = np.array(js_divergence)

    print("\nJensen-Shannon Divergence between Real and Synthetic Data:")
    print(js_divergence)

    return js_divergence

# Privacy evaluation: Membership Inference Attack
def membership_inference_attack(real_data, synthetic_data, model, test_size=0.2):
    """Train ``model`` to tell real rows from synthetic rows and score it.

    Real rows are labelled 1 and synthetic rows 0; the stacked data is split
    into train/test parts, ``model`` is fitted on the train part and evaluated
    on the held-out part. Note that, as implemented, the classifier separates
    real from synthetic samples (it does not test membership of individual
    records in a training set). Scores near 0.5 mean the classifier cannot
    distinguish the two sources.

    Args:
        real_data: 2-D array-like of real samples.
        synthetic_data: 2-D array-like of synthetic samples with the same
            number of columns.
        model: Unfitted scikit-learn style classifier exposing ``fit`` and
            ``predict``; ``predict_proba`` or ``decision_function`` is used
            for the AUC when available.
        test_size: Fraction of the combined data held out for evaluation.

    Returns:
        ``(accuracy, auc)`` measured on the held-out split.
    """
    real_data = shuffle(real_data, random_state=42)
    synthetic_data = shuffle(synthetic_data, random_state=42)

    # Train a binary classifier to distinguish between real and synthetic data
    labels = np.concatenate([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
    combined_data = np.vstack([real_data, synthetic_data])

    X_train, X_test, y_train, y_test = train_test_split(
        combined_data, labels, test_size=test_size, random_state=42
    )

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # ROC AUC is a ranking metric: feeding it hard 0/1 predictions collapses
    # the curve to a single threshold. Use continuous scores when the model
    # provides them and only fall back to the hard labels otherwise.
    if hasattr(model, "predict_proba"):
        positive_column = list(model.classes_).index(1)
        scores = model.predict_proba(X_test)[:, positive_column]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        scores = predictions
    auc = roc_auc_score(y_test, scores)

    print("\nMembership Inference Attack Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC: {auc:.4f}")

    return accuracy, auc

# Load Dataset (e.g., UCI Adult Dataset)
from sklearn.datasets import fetch_openml
# as_frame=True makes the pandas return type explicit; the missing-value
# filter below relies on DataFrame/Series methods.
dataset = fetch_openml(name='adult', version=2, as_frame=True)
X, y = dataset.data, dataset.target

# Adult (version 2) has missing entries (e.g. in workclass / occupation /
# native-country). OrdinalEncoder cannot represent NaN with an integer dtype
# and raises ValueError, so keep only the fully observed rows before encoding.
complete_rows = X.notna().all(axis=1)
X, y = X[complete_rows], y[complete_rows]

# Encode categorical data
# Every column (numeric ones included) is mapped to integer codes 0..n-1 in
# sorted category order; the target is mapped to integer class ids.
ordinal_encoder = OrdinalEncoder(dtype=int, handle_unknown='use_encoded_value', unknown_value=-1)
X = ordinal_encoder.fit_transform(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y).astype(int)

# Split dataset into training and testing sets
# 80/20 split with a fixed seed so the run is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize GANBLR model and fit on training data
ganblr = GANBLR()
ganblr.fit(X_train, y_train, k=2, batch_size=32, epochs=50, warmup_epochs=10, verbose=1)

# Generate synthetic data
# Request as many synthetic rows as there are training rows.
synthetic_data = ganblr.sample(size=len(X_train))
# NOTE(review): assumes sample() returns a 2-D array whose last column is the
# label and whose remaining columns are the features - confirm against GANBLR.
synthetic_X, synthetic_y = synthetic_data[:, :-1], synthetic_data[:, -1]

# Data Quality Evaluation
# Each helper prints its result and returns one value per feature column.
mean_diff, var_diff = compute_statistics(X_train, synthetic_X)
ks_statistic, ks_pvalue = compute_distribution_similarity(X_train, synthetic_X)
js_divergence = compute_js_divergence(X_train, synthetic_X)

# Privacy Evaluation: Membership Inference Attack
# A logistic-regression classifier is trained on real training rows vs.
# synthetic rows; its held-out accuracy and AUC are kept for reporting.
logistic_model = LogisticRegression()
mia_accuracy, mia_auc = membership_inference_attack(X_train, synthetic_X, logistic_model)

# Plotting results for better visualization
def plot_results(mean_diff, var_diff, ks_statistic, js_divergence):
    """Show the four per-feature metrics as bar charts on a 2x2 grid.

    Layout (row, column): mean differences (0, 0), variance differences
    (0, 1), KS statistic (1, 0), Jensen-Shannon divergence (1, 1). Bars are
    positioned at 0..len(values)-1. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Args:
        mean_diff: Sequence of per-feature mean differences.
        var_diff: Sequence of per-feature variance differences.
        ks_statistic: Sequence of per-feature KS statistics.
        js_divergence: Sequence of per-feature Jensen-Shannon values.
    """
    _, axes = plt.subplots(2, 2, figsize=(15, 10))

    panels = (
        (axes[0, 0], mean_diff, 'Mean Differences'),
        (axes[0, 1], var_diff, 'Variance Differences'),
        (axes[1, 0], ks_statistic, 'KS Statistic'),
        (axes[1, 1], js_divergence, 'Jensen-Shannon Divergence'),
    )
    for axis, values, title in panels:
        axis.bar(range(len(values)), values)
        axis.set_title(title)

    plt.show()

# Render the bar charts for the metrics computed above (the KS p-values are not plotted).
plot_results(mean_diff, var_diff, ks_statistic, js_divergence)