In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, multiply, LeakyReLU, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# ---------------------- PREPROCESSING ----------------------
print("Loading and preprocessing car dataset...")
data = pd.read_csv("car.csv")  # Update path as needed
target_column = 'Class'

# Label encode all categorical columns including target
encoders = {}
df_encoded = data.copy()
for col in df_encoded.columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    encoders[col] = le

X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

# ---------------------- IMPROVED WCGAN-GP MODEL ----------------------
class ImprovedWCGANGP:
    def __init__(self, data_dim, num_classes, latent_dim=32):
        self.latent_dim = latent_dim
        self.data_dim = data_dim
        self.num_classes = num_classes
        optimizer = Adam(0.0001, beta_1=0.5, beta_2=0.9)
        
        self.generator = self.build_generator()
        self.critic = self.build_critic()

        self.critic.compile(loss=self.wasserstein_loss, optimizer=optimizer)

        self.critic.trainable = False
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        fake_data = self.generator([noise, label])
        validity = self.critic([fake_data, label])
        self.combined = Model([noise, label], validity)
        self.combined.compile(loss=self.wasserstein_loss, optimizer=optimizer)

    def wasserstein_loss(self, y_true, y_pred):
        return tf.reduce_mean(y_true * y_pred)

    def build_generator(self):
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))
        model_input = multiply([noise, label_embedding])

        x = Dense(64)(model_input)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dropout(0.3)(x)

        x = Dense(128)(x)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dropout(0.3)(x)

        output = Dense(self.data_dim, activation='linear')(x)

        return Model([noise, label], output)

    def build_critic(self):
        data_input = Input(shape=(self.data_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.data_dim)(label))
        model_input = multiply([data_input, label_embedding])

        x = Dense(128)(model_input)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dense(64)(x)
        x = LeakyReLU(negative_slope=0.2)(x)
        output = Dense(1, activation='linear')(x)

        return Model([data_input, label], output)

    def train(self, X_train, y_train, epochs=300, batch_size=64):
        valid = -np.ones((batch_size, 1))
        fake = np.ones((batch_size, 1))
        for epoch in range(epochs):
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            real_samples, labels = X_train[idx], y_train.iloc[idx].values.reshape(-1, 1)

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            gen_samples = self.generator.predict([noise, labels], verbose=0)

            d_loss_real = self.critic.train_on_batch([real_samples, labels], valid)
            d_loss_fake = self.critic.train_on_batch([gen_samples, labels], fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            g_loss = self.combined.train_on_batch([noise, labels], valid)

            if epoch % 50 == 0:
                print(f"Epoch [{epoch}/{epochs}] | D Loss: {d_loss:.4f} | G Loss: {g_loss:.4f}")


# ---------------------- TRAIN THE GAN ----------------------
wcgan = ImprovedWCGANGP(data_dim=X_train.shape[1], num_classes=len(np.unique(y_train)))
wcgan.train(X_train, y_train, epochs=5000)

# ---------------------- GENERATE SYNTHETIC DATA ----------------------
def generate_samples(wcgan, n_samples):
    noise = np.random.normal(0, 1, (n_samples, wcgan.latent_dim))
    labels = np.random.randint(0, wcgan.num_classes, n_samples).reshape(-1, 1)
    return wcgan.generator.predict([noise, labels], verbose=0), labels

fake_samples, fake_labels = generate_samples(wcgan, X_train.shape[0])
fake_df = pd.DataFrame(fake_samples)
fake_labels = fake_labels.flatten()

# ---------------------- EVALUATE CLASSIFIERS ----------------------
models = {
    'LR': LogisticRegression(max_iter=300),
    'MLP': MLPClassifier(max_iter=300),
    'RF': RandomForestClassifier(),
    'XGBT': XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
}

results = []

for name, model in models.items():
    model.fit(fake_df, fake_labels)
    preds = model.predict(X_test)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, average='weighted', zero_division=0),
        "Recall": recall_score(y_test, preds, average='weighted'),
        "F1 Score": f1_score(y_test, preds, average='weighted')
    })

# ---------------------- DISPLAY RESULTS ----------------------
results_df = pd.DataFrame(results).set_index("Model")
print("\n📊 Evaluation Results on Real Test Set using GAN-Synthetic Data:")
print(results_df.round(6).to_string())

Loading and preprocessing car dataset...




Epoch [0/5000] | D Loss: 0.0036 | G Loss: 0.0000
Epoch [50/5000] | D Loss: 0.0023 | G Loss: -0.0000
Epoch [100/5000] | D Loss: 0.0022 | G Loss: -0.0001
Epoch [150/5000] | D Loss: 0.0023 | G Loss: -0.0001
Epoch [200/5000] | D Loss: 0.0023 | G Loss: -0.0002
Epoch [250/5000] | D Loss: 0.0023 | G Loss: -0.0002
Epoch [300/5000] | D Loss: 0.0024 | G Loss: -0.0003
Epoch [350/5000] | D Loss: 0.0025 | G Loss: -0.0004
Epoch [400/5000] | D Loss: 0.0025 | G Loss: -0.0005
Epoch [450/5000] | D Loss: 0.0026 | G Loss: -0.0007
Epoch [500/5000] | D Loss: 0.0027 | G Loss: -0.0009
Epoch [550/5000] | D Loss: 0.0028 | G Loss: -0.0011
Epoch [600/5000] | D Loss: 0.0029 | G Loss: -0.0013
Epoch [650/5000] | D Loss: 0.0031 | G Loss: -0.0016
Epoch [700/5000] | D Loss: 0.0032 | G Loss: -0.0019
Epoch [750/5000] | D Loss: 0.0034 | G Loss: -0.0022
Epoch [800/5000] | D Loss: 0.0036 | G Loss: -0.0026
Epoch [850/5000] | D Loss: 0.0038 | G Loss: -0.0030
Epoch [900/5000] | D Loss: 0.0040 | G Loss: -0.0034
Epoch [950/5000]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.




📊 Evaluation Results on Real Test Set using GAN-Synthetic Data:
       Accuracy  Precision    Recall  F1 Score
Model                                         
LR     0.031792   0.001011  0.031792  0.001959
MLP    0.170520   0.724697  0.170520  0.243441
RF     0.031792   0.001011  0.031792  0.001959
XGBT   0.031792   0.001011  0.031792  0.001959


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, multiply, LeakyReLU, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# ---------------------- PREPROCESSING ----------------------
print("Loading and preprocessing dataset...")
data = pd.read_csv("car.csv")  # Update path as needed

# Encode all columns
label_encoders = {}
for col in data.columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

X = data.drop("Class", axis=1)
y = data["Class"]
num_classes = len(np.unique(y))

scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ---------------------- WCGAN-GP MODEL ----------------------
class ImprovedWCGANGP:
    def __init__(self, data_dim, num_classes, latent_dim=32):
        self.latent_dim = latent_dim
        self.data_dim = data_dim
        self.num_classes = num_classes
        optimizer = Adam(0.0001, beta_1=0.5, beta_2=0.9)

        self.generator = self.build_generator()
        self.critic = self.build_critic()

        self.critic.compile(loss=self.wasserstein_loss, optimizer=optimizer)

        self.critic.trainable = False
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        fake_data = self.generator([noise, label])
        validity = self.critic([fake_data, label])
        self.combined = Model([noise, label], validity)
        self.combined.compile(loss=self.wasserstein_loss, optimizer=optimizer)

    def wasserstein_loss(self, y_true, y_pred):
        return tf.reduce_mean(y_true * y_pred)

    def build_generator(self):
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))
        model_input = multiply([noise, label_embedding])

        x = Dense(128)(model_input)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dropout(0.3)(x)

        x = Dense(256)(x)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dropout(0.3)(x)

        x = Dense(512)(x)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dropout(0.3)(x)

        output = Dense(self.data_dim, activation='linear')(x)

        return Model([noise, label], output)

    def build_critic(self):
        data_input = Input(shape=(self.data_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.data_dim)(label))
        model_input = multiply([data_input, label_embedding])

        x = Dense(512)(model_input)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dense(256)(x)
        x = LeakyReLU(negative_slope=0.2)(x)
        x = Dense(128)(x)
        x = LeakyReLU(negative_slope=0.2)(x)
        output = Dense(1, activation='linear')(x)

        return Model([data_input, label], output)

    def train(self, X_train, y_train, epochs=1000, batch_size=64):
        valid = -np.ones((batch_size, 1))
        fake = np.ones((batch_size, 1))

        for epoch in range(epochs):
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            real_samples, labels = X_train[idx], y_train.iloc[idx]

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            gen_samples = self.generator.predict([noise, labels], verbose=0)

            d_loss_real = self.critic.train_on_batch([real_samples, labels], valid)
            d_loss_fake = self.critic.train_on_batch([gen_samples, labels], fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            g_loss = self.combined.train_on_batch([noise, labels], valid)

            if epoch % 100 == 0:
                print(f"Epoch [{epoch}/{epochs}] | D Loss: {d_loss:.4f} | G Loss: {g_loss:.4f}")

# Instantiate and train
wcgan = ImprovedWCGANGP(data_dim=X_train.shape[1], num_classes=num_classes)
wcgan.train(X_train, y_train, epochs=5000)

# ---------------------- GENERATE SYNTHETIC DATA ----------------------
def generate_samples(wcgan, n_samples):
    noise = np.random.normal(0, 1, (n_samples, wcgan.latent_dim))
    labels = np.random.randint(0, wcgan.num_classes, n_samples).reshape(-1, 1)
    return wcgan.generator.predict([noise, labels], verbose=0), labels

fake_samples, fake_labels = generate_samples(wcgan, X_train.shape[0])

# ---------------------- EVALUATION ----------------------
models = {
    'LR': LogisticRegression(max_iter=200),
    'MLP': MLPClassifier(max_iter=200),
    'RF': RandomForestClassifier(),
    'XGBT': XGBClassifier(eval_metric='logloss')
}

results = []

for name, model in models.items():
    model.fit(fake_samples, fake_labels.ravel())
    preds = model.predict(X_test)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, average='macro', zero_division=0),
        "Recall": recall_score(y_test, preds, average='macro'),
        "F1 Score": f1_score(y_test, preds, average='macro')
    })

results_df = pd.DataFrame(results)
print("\n--- Structured Evaluation Results (Synthetic Data) ---")
print(results_df.to_string(index=False))

Loading and preprocessing dataset...




Epoch [0/5000] | D Loss: 0.0013 | G Loss: 0.0000
Epoch [100/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [200/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [300/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [400/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [500/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [600/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [700/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [800/5000] | D Loss: 0.0009 | G Loss: 0.0000
Epoch [900/5000] | D Loss: 0.0009 | G Loss: -0.0000
Epoch [1000/5000] | D Loss: 0.0009 | G Loss: -0.0000
Epoch [1100/5000] | D Loss: 0.0009 | G Loss: -0.0000
Epoch [1200/5000] | D Loss: 0.0009 | G Loss: -0.0000
Epoch [1300/5000] | D Loss: 0.0009 | G Loss: -0.0001
Epoch [1400/5000] | D Loss: 0.0010 | G Loss: -0.0003
Epoch [1500/5000] | D Loss: 0.0012 | G Loss: -0.0005
Epoch [1600/5000] | D Loss: 0.0014 | G Loss: -0.0011
Epoch [1700/5000] | D Loss: 0.0018 | G Loss: -0.0019
Epoch [1800/5000] | D Loss: 0.0025 | G Loss: -0.0031
Epoch [1900/50