In [None]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the adapter with a specific privacy setting
data_path = "data/car/car.csv"
ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path = data_path)
ctabgan_adapter.load_model(
                 test_ratio = 0.20,
                 categorical_columns = ['Buying','Maint','Doors','Persons','Lug_boot','Safety','Class'], 
                 log_columns = [],
                 mixed_columns= {},
                 general_columns = [],
                 non_categorical_columns = [],
                 integer_columns = [],
                 problem_type= {"Classification": 'Class'}) 

df = ctabgan_adapter.raw_df
x_train = ctabgan_adapter.x_train
y_train = ctabgan_adapter.y_train
x_test = ctabgan_adapter.x_test
y_test = ctabgan_adapter.y_test

'''
data_path = 'data/car/car.csv'
df = pd.read_csv(data_path)
print(df[3:10])
labelencoder=preprocessing.LabelEncoder()
df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
print(df[3:10])
'''


2025-04-10 23:56:04,647 - katabatic.models.ctabganp - INFO - CTABGANP module initialized


---Initialise CTABGANP Model
mps
raw_df.shape: (1728, 7)


"\ndata_path = 'data/car/car.csv'\ndf = pd.read_csv(data_path)\nprint(df[3:10])\nlabelencoder=preprocessing.LabelEncoder()\ndf= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)\nprint(df[3:10])\n"

In [21]:
ctabgan_adapter.fit(epochs=100)

100%|██████████| 100/100 [00:41<00:00,  2.39it/s]

Finished training in 41.91309213638306  seconds.





In [15]:
synthetic_df = ctabgan_adapter.generate(size=1000)
for column in synthetic_df.columns:
    print(column, ":", synthetic_df[column].value_counts())

self.batch_size: 1000
Buying : Buying
1    617
0    436
3    383
2    292
Name: count, dtype: int64
Maint : Maint
1    605
0    384
2    379
3    360
Name: count, dtype: int64
Doors : Doors
3    580
0    533
1    331
2    284
Name: count, dtype: int64
Persons : Persons
0    606
2    580
1    542
Name: count, dtype: int64
Lug_boot : Lug_boot
0    776
1    504
2    448
Name: count, dtype: int64
Safety : Safety
0    762
1    503
2    463
Name: count, dtype: int64
Class : Class
2    1112
0     351
1     153
3     112
Name: count, dtype: int64


In [16]:
x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
y_sync_train = synthetic_df.iloc[ :, -1:].values

In [20]:

# TSTR (train synthetic test real)
tstr_score_lr  = LogisticRegression().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_rf  = RandomForestClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_mlp = MLPClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
y_sync_train = LabelEncoder().fit_transform(y_sync_train)
xgbt_classifier = XGBClassifier(eval_metric='logloss')
tstr_score_xgbt = xgbt_classifier.fit(x_sync_train, y_sync_train).score(x_test, y_test)

df_evaluate = pd.DataFrame([
    ['TSTR', tstr_score_lr, tstr_score_rf, tstr_score_mlp, tstr_score_xgbt]
], columns=['Evaluated Item', 'LR', 'RF', 'MLP', 'XGBT'])
print(df_evaluate)


  Evaluated Item        LR        RF       MLP      XGBT
0           TSTR  0.699422  0.630058  0.687861  0.586705


In [19]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(x_sync_train, y_sync_train)
    y_pred = model.predict(x_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, average='macro', zero_division=0)
    }
results_df = pd.DataFrame(results).T
print("Classifier Evaluation on Syn Data:")
print(results_df)

Classifier Evaluation on Syn Data:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.699422   0.174855  0.250000  0.205782
Random Forest        0.635838   0.297753  0.272035  0.272188
MLP Classifier       0.690751   0.258408  0.253542  0.221242
XGB Classifier       0.586705   0.286104  0.282365  0.282425
