In [None]:
from katabatic.models.TableGAN import TableGANAdapter, TableGAN, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Initialize the adapter with a specific privacy setting
tablegan_adapter = TableGANAdapter(type='continuous', privacy_setting='high')
data_path = 'data/letter/letter-recognition.data'
df = pd.read_csv(data_path, header = None)
labelencoder=LabelEncoder()
df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
x = df.drop(0, axis = 1).values
y = df[0].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, train_size=0.8, random_state=42, shuffle=True, stratify=y)
print(f"{type(x_train)}, {x_train.shape}")
print(f"{type(y_train)}, {y_train.shape}")
print(f"{type(x_test)}, {x_test.shape}")
print(f"{type(y_test)}, {y_test.shape}")


2025-04-13 10:26:42,402 - katabatic.models.TableGAN - INFO - TableGAN module initialized


<class 'numpy.ndarray'>, (16000, 16)
<class 'numpy.ndarray'>, (16000,)
<class 'numpy.ndarray'>, (4000, 16)
<class 'numpy.ndarray'>, (4000,)


In [2]:
tablegan_adapter.fit(x_train, y_train, epochs=100, batch_size=64)

---FIT TableGAN Model with high privacy setting
---Initialise TableGAN Model
Epoch 10/100: [D loss: -0.4345] [G loss: 1.9206] [C loss: 0.9790]
Epoch 20/100: [D loss: -0.3769] [G loss: 2.0223] [C loss: 0.7003]
Epoch 30/100: [D loss: -0.3881] [G loss: 1.5136] [C loss: 0.5387]
Epoch 40/100: [D loss: -0.4160] [G loss: 1.2035] [C loss: 0.4098]
Epoch 50/100: [D loss: -0.4586] [G loss: 1.1152] [C loss: 0.3109]
Epoch 60/100: [D loss: -0.5038] [G loss: 1.1858] [C loss: 0.2431]
Epoch 70/100: [D loss: -0.5389] [G loss: 0.9937] [C loss: 0.1948]
Epoch 80/100: [D loss: -0.5740] [G loss: 0.8050] [C loss: 0.1595]
Epoch 90/100: [D loss: -0.6054] [G loss: 0.7671] [C loss: 0.1327]
Epoch 100/100: [D loss: -0.6294] [G loss: 0.7079] [C loss: 0.1106]


In [7]:
# Generate synthetic data
synthetic_data = tablegan_adapter.generate(size=1000)
#synthetic_data = tablegan_adapter.generate(size=100)

---Generate from TableGAN Model


In [8]:
synthetic_df = pd.DataFrame(synthetic_data)
x_sync_train = synthetic_df.drop(0, axis = 1).values
y_sync_train = synthetic_df[0].values
#x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
#y_sync_train = synthetic_df.iloc[ :, -1:].values
print(f"{type(x_sync_train)}, {x_sync_train.shape}")
print(f"{type(y_sync_train)}, {y_sync_train.shape}")

<class 'numpy.ndarray'>, (1000, 16)
<class 'numpy.ndarray'>, (1000,)


In [10]:
# TSTR (train synthetic test real)
y_sync_train = LabelEncoder().fit_transform(y_sync_train)
tstr_score_lr  = LogisticRegression().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_rf  = RandomForestClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_mlp = MLPClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
xgbt_classifier = XGBClassifier(eval_metric='logloss')
tstr_score_xgbt = xgbt_classifier.fit(x_sync_train, y_sync_train).score(x_test, y_test)

# TRTR (train real test real)
trtr_score_lr  = LogisticRegression().fit(x_train, y_train).score(x_test, y_test)
trtr_score_rf  = RandomForestClassifier().fit(x_train, y_train).score(x_test, y_test)
trtr_score_mlp = MLPClassifier().fit(x_train, y_train).score(x_test, y_test)
xgbt_classifier = XGBClassifier(eval_metric='logloss', use_label_encoder=True)
trtr_score_xgbt = xgbt_classifier.fit(x_train, y_train).score(x_test, y_test)
df_evaluate = pd.DataFrame([
    ['TSTR', tstr_score_lr, tstr_score_rf, tstr_score_mlp, tstr_score_xgbt],
    ['TRTR', trtr_score_lr,trtr_score_rf,trtr_score_mlp, trtr_score_xgbt]
], columns=['Evaluated Item', 'LR', 'RF', 'MLP', 'XGBT'])
print(df_evaluate)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



  Evaluated Item       LR       RF     MLP    XGBT
0           TSTR  0.00000  0.00125  0.0000  0.0395
1           TRTR  0.76275  0.96900  0.9295  0.9635
