In [6]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data_path = 'data/shuttle/shuttle.tst'
headers = ['f0','f1','f2','f3','f4','f5','f6','f7','f8','class']
ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path = data_path)
ctabgan_adapter.load_model(
                 test_ratio = 0.20,
                 headers=headers,
                 sep=" ",
                 categorical_columns = [], 
                 log_columns = [],
                 mixed_columns= {},
                 general_columns = [],
                 non_categorical_columns = [],
                 integer_columns = ['f0','f1','f2','f3','f4','f5','f6','f7','f8','class'],
                 problem_type= {"Classification": 'class'}) 

df = ctabgan_adapter.raw_df
x_train = ctabgan_adapter.x_train
y_train = ctabgan_adapter.y_train
x_test = ctabgan_adapter.x_test
y_test = ctabgan_adapter.y_test

---Initialise CTABGANP Model


In [7]:
ctabgan_adapter.fit(epochs=100)

100%|██████████| 100/100 [12:10<00:00,  7.31s/it]

Finished training in 740.4923467636108  seconds.





In [8]:
synthetic_df = ctabgan_adapter.generate(size=1000)

self.batch_size: 1000


In [9]:
x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
y_sync_train = synthetic_df.iloc[ :, -1:].values

In [13]:
y_sync_train = LabelEncoder().fit_transform(y_sync_train)

models = {
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(x_sync_train, y_sync_train)
    y_pred = model.predict(x_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, average='macro', zero_division=0)
    }
results_df = pd.DataFrame(results).T
print("Classifier Evaluation on Syn Data:")
print(results_df)

Classifier Evaluation on Syn Data:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.010345   0.011521  0.001867  0.003213
Random Forest        0.003448   0.003239  0.000622  0.001044
MLP Classifier       0.001034   0.000985  0.000187  0.000314
XGB Classifier       0.004483   0.004173  0.000809  0.001355
