In [1]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def preprocessing_raw_df(raw_df):
    """Clean the raw loan DataFrame before it is handed to the model.

    Steps, in order:
      1. ``CCAvg``: values written with ``/`` as the decimal separator
         (e.g. ``"1/60"``) are rewritten with ``.`` and the column is cast
         to float. A column that is already numeric is only cast.
      2. ``Experience``: replaced by its absolute value.
      3. Every column that still has ``object`` dtype is label-encoded to
         integer codes.

    Args:
        raw_df: DataFrame with at least ``CCAvg`` and ``Experience`` columns.

    Returns:
        A new DataFrame (the result of ``DataFrame.apply``) with the three
        steps applied.

    Raises:
        KeyError: if ``CCAvg`` or ``Experience`` is missing.
        ValueError: if a ``CCAvg`` value is not parseable as a float after
            the separator fix.

    Note:
        Steps 1 and 2 assign into ``raw_df`` itself, so the caller's frame
        is modified as well. That side effect is kept on purpose.
        NOTE(review): whether the adapter relies on it is not visible here.
    """
    ccavg = raw_df['CCAvg']
    # The .str accessor only exists on string data; skip the separator fix
    # when the CSV reader has already produced a numeric column.
    if not pd.api.types.is_numeric_dtype(ccavg):
        # astype(str) first so stray non-string cells are not turned into NaN
        # by .str; regex=False makes "/" a literal on every pandas version.
        ccavg = ccavg.astype(str).str.replace("/", ".", regex=False)
    raw_df['CCAvg'] = ccavg.astype(float)

    raw_df["Experience"] = raw_df["Experience"].abs()

    # fit_transform refits on every call, so one encoder instance can be
    # reused for all object columns.
    labelencoder = preprocessing.LabelEncoder()
    raw_df = raw_df.apply(
        lambda col: labelencoder.fit_transform(col) if col.dtype == 'object' else col
    )
    return raw_df

# Source CSV for this experiment (path is relative to the working directory).
data_path = 'data/loan/Bank_Personal_Loan.csv'

# Build the adapter and configure it for the loan dataset. Only 'CCAvg' is
# declared as a general column; every other column is declared as integer.
# 'CreditCard' is the classification target.
# NOTE(review): load_model presumably reads the CSV, applies
# preprocessing_raw_df and holds out test_ratio of the rows — confirm against
# the adapter implementation.
ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path=data_path)
ctabgan_adapter.load_model(
    test_ratio=0.20,
    categorical_columns=[],
    log_columns=[],
    mixed_columns={},
    general_columns=['CCAvg'],
    non_categorical_columns=[],
    integer_columns=[
        'ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family',
        'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
        'CD Account', 'Online', 'CreditCard',
    ],
    problem_type={"Classification": 'CreditCard'},
    preprocessing_raw_df=preprocessing_raw_df,
)

# Pull the frame and the train/test splits off the adapter for later cells.
df = ctabgan_adapter.raw_df
x_train = ctabgan_adapter.x_train
y_train = ctabgan_adapter.y_train
x_test = ctabgan_adapter.x_test
y_test = ctabgan_adapter.y_test




2025-04-12 18:32:35,267 - katabatic.models.ctabganp - INFO - CTABGANP module initialized


---Initialise CTABGANP Model


"\ndata_path = 'data/car/car.csv'\ndf = pd.read_csv(data_path)\nprint(df[3:10])\nlabelencoder=preprocessing.LabelEncoder()\ndf= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)\nprint(df[3:10])\n"

In [2]:
# Train the generator. The recorded run of 100 epochs took roughly 11 minutes,
# so expect a long wait when re-running this cell.
N_EPOCHS = 100
ctabgan_adapter.fit(epochs=N_EPOCHS)

100%|██████████| 100/100 [11:05<00:00,  6.66s/it]

Finished training in 672.2980937957764  seconds.





In [3]:
# Draw synthetic rows from the trained adapter, then print the frequency
# table of every column as a quick sanity check of the generated values.
# NOTE(review): the recorded output shows more than 1000 rows (3126 distinct
# IDs alone), so `size` may not be the number of returned rows — confirm
# against the adapter.
synthetic_df = ctabgan_adapter.generate(size=1000)
for column in synthetic_df.columns:
    counts = synthetic_df[column].value_counts()
    print(f"{column} : {counts}")

self.batch_size: 1000
ID : ID
4160    8
2459    6
2158    6
3157    6
594     6
       ..
4050    1
4580    1
1456    1
1311    1
3019    1
Name: count, Length: 3126, dtype: int64
Age : Age
63    247
62    234
61    197
53    173
56    167
60    165
54    164
65    163
64    158
58    150
55    144
57    143
49    140
52    136
50    134
51    133
46    130
45    127
59    126
48    124
43    122
30    118
47    115
31    112
32    112
33    104
29    101
42    101
44    100
34     95
41     94
40     84
66     80
35     78
38     77
36     70
39     65
37     60
28     51
27     44
67     27
26     20
25     11
24      3
23      1
Name: count, dtype: int64
Experience : Experience
34    188
36    179
22    177
3     170
4     167
35    166
33    155
32    150
20    147
26    145
25    144
23    141
37    140
5     139
18    130
21    130
15    129
24    126
28    122
19    121
2     120
16    118
31    118
38    116
10    115
30    113
14    109
17    108
29    106
12    105
13    104


In [4]:
# Positional split of the synthetic frame: every column except the last is a
# feature, the last column is the label.
# NOTE(review): assumes the target ('CreditCard') is the last column of
# synthetic_df — confirm against the adapter's output column order.
x_sync_train = synthetic_df.iloc[:, :-1].values

# Take the label as a 1-D array of shape (n_samples,). Slicing with `-1:`
# would give an (n_samples, 1) column vector, which scikit-learn estimators
# only accept after flattening it and emitting a DataConversionWarning.
y_sync_train = synthetic_df.iloc[:, -1].values

In [5]:
# Train-on-synthetic / test-on-real evaluation: each classifier is fitted on
# the generated rows and scored against the held-out real test split.
SEED = 42
MAX_ITER = 300

models = {
    "Logistic Regression": LogisticRegression(max_iter=MAX_ITER, random_state=SEED),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "MLP Classifier": MLPClassifier(max_iter=MAX_ITER, random_state=SEED),
    "XGB Classifier": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=SEED,
    ),
}

# Metrics that share the same macro-averaged call signature. Accuracy takes
# no averaging arguments, so it is computed separately and listed first.
macro_metrics = {
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# NOTE(review): features are passed unscaled. In the recorded output the
# Logistic Regression and MLP rows have recall 0.5 and precision equal to
# half the accuracy, which looks like single-class predictions — consider
# standardising the inputs for those two models.
results = {}
for name, model in models.items():
    model.fit(x_sync_train, y_sync_train)
    y_pred = model.predict(x_test)

    scores = {"Accuracy": accuracy_score(y_test, y_pred)}
    for label, metric in macro_metrics.items():
        scores[label] = metric(y_test, y_pred, average='macro', zero_division=0)
    results[name] = scores

# One row per classifier, one column per metric.
results_df = pd.DataFrame(results).transpose()
print("Classifier Evaluation on Syn Data:")
print(results_df)

Classifier Evaluation on Syn Data:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.706   0.353000  0.500000  0.413834
Random Forest           0.717   0.649021  0.553444  0.535590
MLP Classifier          0.294   0.147000  0.500000  0.227202
XGB Classifier          0.690   0.590429  0.556156  0.552724
