In [1]:
from katabatic.models.TableGAN import TableGANAdapter, TableGAN, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Initialize the adapter with a specific privacy setting
tablegan_adapter = TableGANAdapter(type='continuous', privacy_setting='high')
data_path = 'data/car/car.csv'
df = pd.read_csv(data_path)
print(df[3:10])
labelencoder=preprocessing.LabelEncoder()
df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
print(df[3:10])


2025-04-10 20:34:12,997 - katabatic.models.TableGAN - INFO - TableGAN module initialized


  Buying  Maint Doors Persons Lug_boot Safety  Class
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc
5  vhigh  vhigh     2       2      med   high  unacc
6  vhigh  vhigh     2       2      big    low  unacc
7  vhigh  vhigh     2       2      big    med  unacc
8  vhigh  vhigh     2       2      big   high  unacc
9  vhigh  vhigh     2       4    small    low  unacc
   Buying  Maint  Doors  Persons  Lug_boot  Safety  Class
3       3      3      0        0         1       1      2
4       3      3      0        0         1       2      2
5       3      3      0        0         1       0      2
6       3      3      0        0         0       1      2
7       3      3      0        0         0       2      2
8       3      3      0        0         0       0      2
9       3      3      0        1         2       1      2


In [2]:
x = df.copy().drop('Class', axis=1)
y = df['Class']

x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=42)

In [3]:
tablegan_adapter.fit(x_train, y_train, epochs=100, batch_size=64)

---FIT TableGAN Model with high privacy setting
---Initialise TableGAN Model
Epoch 10/100: [D loss: -3.7164] [G loss: 7.6249] [C loss: 0.6898]
Epoch 20/100: [D loss: -1.3554] [G loss: 0.7643] [C loss: 0.4933]
Epoch 30/100: [D loss: -1.1842] [G loss: 0.6978] [C loss: 0.3973]
Epoch 40/100: [D loss: -1.2413] [G loss: 0.9794] [C loss: 0.3513]
Epoch 50/100: [D loss: -1.2197] [G loss: 1.1847] [C loss: 0.3204]
Epoch 60/100: [D loss: -1.2909] [G loss: 0.8826] [C loss: 0.2881]
Epoch 70/100: [D loss: -1.2470] [G loss: 0.8594] [C loss: 0.2495]
Epoch 80/100: [D loss: -1.2695] [G loss: 0.9777] [C loss: 0.2070]
Epoch 90/100: [D loss: -1.2648] [G loss: 1.1397] [C loss: 0.1671]
Epoch 100/100: [D loss: -1.2069] [G loss: 1.3221] [C loss: 0.1335]


In [4]:
# Generate synthetic data
synthetic_data = tablegan_adapter.generate(size=1000)

---Generate from TableGAN Model


In [5]:
synthetic_df = pd.DataFrame(synthetic_data)
x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
y_sync_train = synthetic_df.iloc[ :, -1:].values

In [None]:
# TSTR (train synthetic test real)
tstr_score_lr  = LogisticRegression().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_rf  = RandomForestClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_mlp = MLPClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
y_sync_train = LabelEncoder().fit_transform(y_sync_train)
xgbt_classifier = XGBClassifier(eval_metric='logloss')
tstr_score_xgbt = xgbt_classifier.fit(x_sync_train, y_sync_train).score(x_test, y_test)

# TRTR (train real test real)
trtr_score_lr  = LogisticRegression().fit(x_train, y_train).score(x_test, y_test)
trtr_score_rf  = RandomForestClassifier().fit(x_train, y_train).score(x_test, y_test)
trtr_score_mlp = MLPClassifier().fit(x_train, y_train).score(x_test, y_test)
xgbt_classifier = XGBClassifier(eval_metric='logloss', use_label_encoder=True)
trtr_score_xgbt = xgbt_classifier.fit(x_train, y_train).score(x_test, y_test)
df_evaluate = pd.DataFrame([
    ['TSTR', tstr_score_lr, tstr_score_rf, tstr_score_mlp, tstr_score_xgbt],
    ['TRTR', trtr_score_lr,trtr_score_rf,trtr_score_mlp, trtr_score_xgbt]
], columns=['Evaluated Item', 'LR', 'RF', 'MLP', 'XGBT'])
print(df_evaluate)



  Evaluated Item        LR        RF       MLP      XGBT
0           TSTR  0.031792  0.072254  0.031792  0.089595
1           TRTR  0.658960  0.965318  0.930636  0.979769


Parameters: { "use_label_encoder" } are not used.

