In [2]:
import os
import numpy as np 
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, multiply, LeakyReLU, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.legacy import Adam
import tensorflow.keras.backend as K

# ---------------------- Load and Preprocess ARFF ----------------------
data, meta = arff.loadarff("data/adult 1.arff")
df = pd.DataFrame(data)
df = df.applymap(lambda x: x.decode('utf-8').replace("\\", "").replace("'", "").strip() if isinstance(x, bytes) else x)
df['class'] = df['class'].map({'<=50K': 0, '>50K': 1})
df.dropna(inplace=True)

X = df.drop('class', axis=1)
y = df['class']
X = pd.get_dummies(X, drop_first=True)

scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X)

# ---------------------- JSD Utility ----------------------
def compute_jsd_columnwise(real, fake, bins=50):
    jsd_scores = []
    for i in range(real.shape[1]):
        try:
            r_hist, _ = np.histogram(real[:, i], bins=bins, density=True)
            f_hist, _ = np.histogram(fake[:, i], bins=bins, density=True)
            r_hist += 1e-8
            f_hist += 1e-8
            jsd = jensenshannon(r_hist, f_hist, base=2)
            jsd_scores.append(jsd)
        except:
            continue
    return np.mean(jsd_scores)

# ---------------------- Improved WCGAN-GP ----------------------
class ImprovedWCGANGP:
    def __init__(self, data_dim, num_classes, latent_dim=32, gp_weight=10):
        self.latent_dim = latent_dim
        self.data_dim = data_dim
        self.num_classes = num_classes
        self.gp_weight = gp_weight
        optimizer = Adam(0.0001, beta_1=0.5, beta_2=0.9)

        self.generator = self.build_generator()
        self.critic = self.build_critic()

        self.critic.trainable = True
        self.critic.compile(loss=self.wasserstein_loss, optimizer=optimizer)

        self.critic.trainable = False
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        fake_data = self.generator([noise, label])
        validity = self.critic([fake_data, label])
        self.combined = Model([noise, label], validity)
        self.combined.compile(loss=self.wasserstein_loss, optimizer=optimizer)

    def wasserstein_loss(self, y_true, y_pred):
        return K.mean(y_true * y_pred)

    def gradient_penalty(self, real_samples, fake_samples, labels):
        alpha = tf.random.uniform([real_samples.shape[0], 1], 0.0, 1.0)
        interpolated = alpha * real_samples + (1 - alpha) * fake_samples
        with tf.GradientTape() as gp_tape:
            gp_tape.watch(interpolated)
            validity_interpolated = self.critic([interpolated, labels])
        grads = gp_tape.gradient(validity_interpolated, interpolated)
        grad_l2 = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1))
        gp = tf.reduce_mean((grad_l2 - 1.0) ** 2)
        return gp

    def build_generator(self):
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))
        model_input = multiply([noise, label_embedding])
        x = Dense(128)(model_input)
        x = LeakyReLU(0.2)(x)
        x = BatchNormalization()(x)
        x = Dense(256)(x)
        x = LeakyReLU(0.2)(x)
        x = BatchNormalization()(x)
        x = Dense(512)(x)
        x = LeakyReLU(0.2)(x)
        x = BatchNormalization()(x)
        output = Dense(self.data_dim, activation='tanh')(x)
        return Model([noise, label], output)

    def build_critic(self):
        data_input = Input(shape=(self.data_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.data_dim)(label))
        model_input = multiply([data_input, label_embedding])
        x = Dense(512)(model_input)
        x = LeakyReLU(0.2)(x)
        x = Dense(256)(x)
        x = LeakyReLU(0.2)(x)
        x = Dense(128)(x)
        x = LeakyReLU(0.2)(x)
        output = Dense(1)(x)
        return Model([data_input, label], output)

    def train(self, X_train, y_train, epochs=150, batch_size=64, n_critic=5):
        valid = -np.ones((batch_size, 1))
        fake = np.ones((batch_size, 1))
        for epoch in range(epochs):
            for _ in range(n_critic):
                idx = np.random.randint(0, X_train.shape[0], batch_size)
                real_samples = X_train[idx]
                labels = y_train[idx].reshape(-1, 1)

                noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
                fake_samples = self.generator.predict([noise, labels], verbose=0)

                d_loss_real = self.critic.train_on_batch([real_samples, labels], valid)
                d_loss_fake = self.critic.train_on_batch([fake_samples, labels], fake)
                gp = self.gradient_penalty(real_samples, fake_samples, labels)
                d_loss = 0.5 * (d_loss_real + d_loss_fake) + self.gp_weight * gp

            g_loss = self.combined.train_on_batch([noise, labels], valid)

            if epoch % 50 == 0:
                print(f"Epoch {epoch}/{epochs} | D Loss: {d_loss:.4f} | G Loss: {g_loss:.4f}")

# ---------------------- Create Folder for Synthetic Data ----------------------
folder_path = "Adult - generated data"
if not os.path.exists(folder_path):
    os.makedirs(folder_path)  # Create the folder if it doesn't exist

# ---------------------- Evaluation Setup ----------------------
kf = KFold(n_splits=2, shuffle=True, random_state=42)
repeats = 3
sample_fraction = 0.5
models = {
    'LR': LogisticRegression(max_iter=200),
    'MLP': MLPClassifier(max_iter=200),
    'RF': RandomForestClassifier(),
    'XGB': XGBClassifier(eval_metric='logloss')
}
results = []

for repeat in range(repeats):
    print(f"\n--- Repeat {repeat + 1} ---")
    for fold, (train_index, test_index) in enumerate(kf.split(X_scaled)):
        print(f"\n--- Fold {fold + 1} ---")
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index].values, y.iloc[test_index].values

        wcgan = ImprovedWCGANGP(data_dim=X_train.shape[1], num_classes=2)
        wcgan.train(X_train, y_train, epochs=150)

        train_df = pd.DataFrame(X_train)
        train_df['income'] = y_train
        majority = train_df[train_df['income'] == 0]
        minority = train_df[train_df['income'] == 1]
        minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
        balanced_train = pd.concat([majority, minority_upsampled])

        X_balanced = balanced_train.drop('income', axis=1).values
        y_balanced = balanced_train['income'].values.reshape(-1, 1)

        # Ensure sample size is 50% of the training set size
        sample_size = int(0.5 * len(X_balanced))

        noise = np.random.normal(0, 1, (sample_size, wcgan.latent_dim))
        y_synthetic_labels = y_balanced[:sample_size]
        fake_samples = wcgan.generator.predict([noise, y_synthetic_labels], verbose=0)

        fake_df = pd.DataFrame(fake_samples, columns=X.columns)
        fake_df = pd.DataFrame(scaler.inverse_transform(fake_df), columns=X.columns)
        fake_df = fake_df.clip(lower=X.min(), upper=X.max(), axis=1)
        fake_df = pd.DataFrame(scaler.fit_transform(fake_df), columns=X.columns)

        # Save only the synthetic samples (fake data) to the "Adult - generated data" folder
        synthetic_file_path = os.path.join(folder_path, f'synthetic_samples_repeat_{repeat + 1}_fold_{fold + 1}.csv')
        fake_df.to_csv(synthetic_file_path, index=False)
        print(f"Saved synthetic samples for Repeat {repeat + 1}, Fold {fold + 1} to {synthetic_file_path}")

        for model_name, model in models.items():
            model.fit(X_balanced, y_balanced)
            preds = model.predict(X_test)
            acc = accuracy_score(y_test, preds)

            jsd = compute_jsd_columnwise(X_train, fake_df.values)
            wd = np.mean([
                wasserstein_distance(X_train[:, i], fake_df.values[:, i])
                for i in range(X_train.shape[1])
            ])

            results.append({
                "Repeat": repeat + 1,
                "Fold": fold + 1,
                "Model": model_name,
                "TSTR Accuracy": acc,
                "JSD": jsd,
                "Wasserstein": wd
            })

# ---------------------- Results Summary ----------------------
results_df = pd.DataFrame(results)
print("\n--- Final Evaluation Summary ---")
print(results_df.groupby(["Repeat", "Fold", "Model"]).mean())


  df = df.applymap(lambda x: x.decode('utf-8').replace("\\", "").replace("'", "").strip() if isinstance(x, bytes) else x)



--- Repeat 1 ---

--- Fold 1 ---
Epoch 0/150 | D Loss: 9.6523 | G Loss: -0.0149
Epoch 50/150 | D Loss: 8653.6406 | G Loss: -53.7885
Epoch 100/150 | D Loss: 380491.5000 | G Loss: -852.5272
Saved synthetic samples for Repeat 1, Fold 1 to Adult - generated data\synthetic_samples_repeat_1_fold_1.csv


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  model.fit(X_balanced, y_balanced)



--- Fold 2 ---
Epoch 0/150 | D Loss: 9.6407 | G Loss: -0.0285
Epoch 50/150 | D Loss: 15193.4482 | G Loss: -42.0088
Epoch 100/150 | D Loss: 1548128.7500 | G Loss: -1044.2056
Saved synthetic samples for Repeat 1, Fold 2 to Adult - generated data\synthetic_samples_repeat_1_fold_2.csv


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  model.fit(X_balanced, y_balanced)



--- Repeat 2 ---

--- Fold 1 ---
Epoch 0/150 | D Loss: 9.6421 | G Loss: -0.0185
Epoch 50/150 | D Loss: 9826.2441 | G Loss: -65.3757
Epoch 100/150 | D Loss: 502604.5625 | G Loss: -1109.1370
Saved synthetic samples for Repeat 2, Fold 1 to Adult - generated data\synthetic_samples_repeat_2_fold_1.csv


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  model.fit(X_balanced, y_balanced)



--- Fold 2 ---
Epoch 0/150 | D Loss: 9.6736 | G Loss: -0.0134
Epoch 50/150 | D Loss: 11246.0918 | G Loss: -62.3670
Epoch 100/150 | D Loss: 1526327.1250 | G Loss: -1166.4490
Saved synthetic samples for Repeat 2, Fold 2 to Adult - generated data\synthetic_samples_repeat_2_fold_2.csv


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  model.fit(X_balanced, y_balanced)



--- Repeat 3 ---

--- Fold 1 ---
Epoch 0/150 | D Loss: 9.6782 | G Loss: -0.0208
Epoch 50/150 | D Loss: 11025.9092 | G Loss: -66.6659
Epoch 100/150 | D Loss: 732354.0625 | G Loss: -1170.7943
Saved synthetic samples for Repeat 3, Fold 1 to Adult - generated data\synthetic_samples_repeat_3_fold_1.csv


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return n/db/n.sum(), bin_edges
  return n/db/n.sum(), bin_edges
  y = column_or_1d(y, warn=True)
  return n/db/n.sum(), bin_edges
  return n/db/n.sum(), bin_edges
  model.fit(X_balanced, y_balanced)
  return n/db/n.sum(), bin_edges
  return n/db/n.sum(), bin_edges
  return n/db/n.sum(), bin_edges
  return n/db/n.sum(), bin_edges



--- Fold 2 ---
Epoch 0/150 | D Loss: 9.7140 | G Loss: -0.0052
Epoch 50/150 | D Loss: 6827.8765 | G Loss: -31.5947
Epoch 100/150 | D Loss: 391324.0938 | G Loss: -774.8862


  fake_df = fake_df.clip(lower=X.min(), upper=X.max(), axis=1)


Saved synthetic samples for Repeat 3, Fold 2 to Adult - generated data\synthetic_samples_repeat_3_fold_2.csv


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  model.fit(X_balanced, y_balanced)



--- Final Evaluation Summary ---
                   TSTR Accuracy       JSD  Wasserstein
Repeat Fold Model                                      
1      1    LR          0.835429  0.999038     0.508194
            MLP         0.850538  0.999038     0.508194
            RF          0.831416  0.999038     0.508194
            XGB         0.832972  0.999038     0.508194
       2    LR          0.837967  0.998517     0.419534
            MLP         0.836862  0.998517     0.419534
            RF          0.825765  0.998517     0.419534
            XGB         0.833586  0.998517     0.419534
2      1    LR          0.835429  0.998620     0.422201
            MLP         0.824782  0.998620     0.422201
            RF          0.831334  0.998620     0.422201
            XGB         0.832972  0.998620     0.422201
       2    LR          0.837967  0.998052     0.563770
            MLP         0.825028  0.998052     0.563770
            RF          0.826092  0.998052     0.563770
            XG