In [2]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def preprocessing_raw_df(raw_df):
    """Label-encode the object-dtype columns of ``raw_df``.

    Each column whose dtype is ``object`` is replaced by the output of
    ``LabelEncoder.fit_transform`` for that column; every other column is
    passed through unchanged.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame to encode. It is not modified; ``DataFrame.apply`` builds a
        new frame.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns, object columns replaced by their codes.
    """
    # One encoder instance is enough: fit_transform refits it on every call,
    # so nothing learned from one column carries over to the next.
    encoder = preprocessing.LabelEncoder()

    def _encode_if_object(column):
        if column.dtype == 'object':
            return encoder.fit_transform(column)
        return column

    encoded_df = raw_df.apply(_encode_if_object)
    return encoded_df

# Path to the raw Nursery CSV, relative to the notebook's working directory.
data_path = 'data/Nursery/Nursery.csv'

ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path=data_path)

# Options forwarded to load_model. Every listed column, including the
# classification target, is declared categorical; the other column-type
# collections are left empty.
# NOTE(review): test_ratio is presumably the held-out fraction of rows --
# confirm against CTABGANPAdapter.load_model.
load_options = {
    'test_ratio': 0.20,
    'categorical_columns': [
        'parents',
        'has_nurs',
        'form',
        'children',
        'housing',
        'finance',
        'social',
        'health',
        'Target',
    ],
    'log_columns': [],
    'mixed_columns': {},
    'general_columns': [],
    'non_categorical_columns': [],
    'integer_columns': [],
    'problem_type': {"Classification": 'Target'},
    'preprocessing_raw_df': preprocessing_raw_df,
}
ctabgan_adapter.load_model(**load_options)

# Pull the frame and the splits exposed by the adapter into notebook-level
# names used by the later cells.
df = ctabgan_adapter.raw_df
x_train, y_train = ctabgan_adapter.x_train, ctabgan_adapter.y_train
x_test, y_test = ctabgan_adapter.x_test, ctabgan_adapter.y_test

---Initialise CTABGANP Model


In [3]:
# Number of training epochs passed to the adapter. The recorded run of this
# cell took roughly 761 seconds for 100 epochs.
N_EPOCHS = 100
ctabgan_adapter.fit(epochs=N_EPOCHS)

100%|██████████| 100/100 [12:39<00:00,  7.59s/it]

Finished training in 761.4685468673706  seconds.





In [4]:
# Sample a synthetic frame from the fitted adapter.
# NOTE(review): the recorded output below shows 12960 rows per column even
# though size=1000 is requested, and prints "self.batch_size: 1000" --
# confirm how CTABGANPAdapter.generate interprets `size`.
synthetic_df = ctabgan_adapter.generate(size=1000)

# Print the value distribution of every synthetic column.
# NOTE(review): in the recorded output the 'parents' column holds 12960
# distinct near-integer floats although it was declared categorical --
# verify the adapter's post-processing for that column.
for column in synthetic_df.columns:
    column_counts = synthetic_df[column].value_counts()
    print(column, ":", column_counts)

self.batch_size: 1000
 parents :  parents
1.997078    1
1.999967    1
1.995806    1
1.001567    1
0.002103    1
           ..
1.997890    1
1.996102    1
0.998354    1
1.998714    1
0.999985    1
Name: count, Length: 12960, dtype: int64
has_nurs : has_nurs
0    2840
1    2743
4    2592
3    2494
2    2291
Name: count, dtype: int64
form : form
2    3996
1    3613
0    2997
3    2354
Name: count, dtype: int64
children : children
1    3444
2    3384
3    3113
0    3019
Name: count, dtype: int64
housing : housing
0    4594
1    4591
2    3775
Name: count, dtype: int64
finance : finance
1    6829
0    6131
Name: count, dtype: int64
social : social
2    4774
0    4142
1    4044
Name: count, dtype: int64
health : health
2    4427
0    4371
1    4162
Name: count, dtype: int64
Target : Target
1    4800
0    4472
3    3512
4     172
2       4
Name: count, dtype: int64


In [5]:
# Split the synthetic frame positionally: every column except the last is a
# feature, the last column is the label.
x_sync_train = synthetic_df.iloc[:, :-1].to_numpy()

# Select the label column with a scalar position (-1, not the slice -1:) so
# the result is a 1-D vector of shape (n_samples,). The slice form yields an
# (n_samples, 1) column vector, which scikit-learn label utilities only
# accept after flattening it and emitting a DataConversionWarning.
y_sync_train = synthetic_df.iloc[:, -1].to_numpy()

In [8]:
# Train each classifier on the synthetic data and score it on the real
# held-out split (x_test / y_test).

# Encode the synthetic labels as consecutive integer codes for training.
# The labels are read from synthetic_df as a 1-D vector instead of
# re-encoding y_sync_train in place, so re-running this cell always fits the
# encoder on the original label values and produces the same result.
label_encoder = LabelEncoder()
y_sync_train = label_encoder.fit_transform(synthetic_df.iloc[:, -1].to_numpy())

models = {
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    # `use_label_encoder` is a deprecated XGBoost parameter and is dropped.
    # The recorded Target distribution has five classes, so the multi-class
    # metric 'mlogloss' is used instead of the binary 'logloss'.
    "XGB Classifier": XGBClassifier(eval_metric='mlogloss', random_state=42),
}

# Macro-averaged scores, with zero_division=0 so a class that is never
# predicted contributes 0 rather than raising a warning.
macro_metrics = {
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

results = {}
for name, model in models.items():
    model.fit(x_sync_train, y_sync_train)
    # Map predicted codes back to the synthetic data's original label values
    # before scoring. If the generator failed to emit some class, the
    # encoder's codes no longer coincide with the original labels, and
    # comparing raw codes with y_test would silently misalign classes.
    # Assumes y_test uses the same label values as the synthetic label
    # column -- TODO confirm against the adapter.
    y_pred = label_encoder.inverse_transform(model.predict(x_test))
    scores = {"Accuracy": accuracy_score(y_test, y_pred)}
    for metric_name, metric_fn in macro_metrics.items():
        scores[metric_name] = metric_fn(y_test, y_pred, average='macro', zero_division=0)
    results[name] = scores

# One row per classifier, one column per metric.
results_df = pd.DataFrame(results).T
print("Classifier Evaluation on Syn Data:")
print(results_df)

Classifier Evaluation on Syn Data:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.760031   0.571717  0.582136  0.575903
Random Forest        0.856096   0.615720  0.534190  0.536397
MLP Classifier       0.884645   0.668151  0.679209  0.671762
XGB Classifier       0.850309   0.645871  0.652166  0.645332
