In [1]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def preprocessing_raw_df(raw_df):
    """Label-encode the object-dtype columns of ``raw_df``.

    Each column whose dtype equals ``'object'`` is replaced by the integer
    codes produced by a scikit-learn ``LabelEncoder`` fitted on that column;
    every other column is passed through unchanged.

    Args:
        raw_df: pandas DataFrame to encode.

    Returns:
        The result of ``raw_df.apply`` with the encoded columns.
    """
    # One encoder instance is shared; fit_transform refits it for each column.
    encoder = preprocessing.LabelEncoder()

    def _encode_if_object(column):
        if column.dtype == 'object':
            return encoder.fit_transform(column)
        return column

    return raw_df.apply(_encode_if_object)

# Location of the raw CSV handed to the adapter.
data_path = "data/Magic/magic_gamma.csv"
ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path=data_path)

# Split ratio, column roles and the raw-frame preprocessing hook, forwarded
# unchanged (same keywords, same order) to ``load_model``.
load_model_options = dict(
    test_ratio=0.20,
    categorical_columns=['class'],
    log_columns=[],
    mixed_columns={},
    general_columns=[],
    non_categorical_columns=[
        'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
        'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    ],
    integer_columns=[],
    problem_type={"Classification": 'class'},
    preprocessing_raw_df=preprocessing_raw_df,
)
ctabgan_adapter.load_model(**load_model_options)

# Expose the adapter's raw frame and train/test splits under short names for
# the cells that follow.
df = ctabgan_adapter.raw_df
x_train, y_train = ctabgan_adapter.x_train, ctabgan_adapter.y_train
x_test, y_test = ctabgan_adapter.x_test, ctabgan_adapter.y_test

# Disabled scratch code for the car dataset, kept as a bare string literal
# rather than as comments. Because it is the last expression of the cell, the
# notebook echoes the string as the cell's output.
'''
data_path = 'data/car/car.csv'
df = pd.read_csv(data_path)
print(df[3:10])
labelencoder=preprocessing.LabelEncoder()
df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
print(df[3:10])
'''


2025-04-12 19:09:43,764 - katabatic.models.ctabganp - INFO - CTABGANP module initialized


---Initialise CTABGANP Model


"\ndata_path = 'data/car/car.csv'\ndf = pd.read_csv(data_path)\nprint(df[3:10])\nlabelencoder=preprocessing.LabelEncoder()\ndf= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)\nprint(df[3:10])\n"

In [2]:
# Train the adapter for 100 epochs. The recorded run of this cell reports
# about 52 minutes (~3136 s), so expect a long wait when re-running.
ctabgan_adapter.fit(epochs=100)

100%|██████████| 100/100 [51:55<00:00, 31.15s/it] 

Finished training in 3135.7605905532837  seconds.





In [3]:
# Ask the trained adapter for synthetic data, then print the value
# frequencies of every column.
# NOTE(review): the recorded output shows "self.batch_size: 1000" and
# value_counts lengths of about 19k, so `size` may not be the number of rows
# returned -- confirm against the adapter.
synthetic_df = ctabgan_adapter.generate(size=1000)
for column in synthetic_df.columns:
    counts = synthetic_df[column].value_counts()
    print(column, ":", counts)

self.batch_size: 1000
fLength : fLength
70.646540    1
56.190448    1
20.118097    1
35.731656    1
21.679917    1
            ..
28.365525    1
69.013954    1
24.902097    1
74.475946    1
70.983330    1
Name: count, Length: 19020, dtype: int64
fWidth : fWidth
8.204658     2
27.337040    1
14.981843    1
17.384717    1
13.466804    1
            ..
17.211399    1
18.586560    1
37.877297    1
9.519795     1
39.685986    1
Name: count, Length: 19019, dtype: int64
fSize : fSize
2.853829    2
3.911169    1
2.333172    1
3.132010    1
2.589564    1
           ..
3.189424    1
2.784278    1
2.783940    1
2.669791    1
3.158091    1
Name: count, Length: 19019, dtype: int64
fConc : fConc
0.499937    2
0.104237    1
0.722729    1
0.261524    1
0.490917    1
           ..
0.269816    1
0.384679    1
0.316606    1
0.631864    1
0.079714    1
Name: count, Length: 19019, dtype: int64
fConc1 : fConc1
0.053386    1
0.252644    1
0.268351    1
0.309780    1
0.215763    1
           ..
0.212337    1


In [4]:
# Split the synthetic frame by position: every column except the last is a
# feature, the last column is the target.
# NOTE(review): assumes the target ('class' in the load_model call) is the
# last column of synthetic_df -- confirm.
x_sync_train = synthetic_df.iloc[:, :-1].values
# Index with ``-1`` (not ``-1:``) so the target is a 1-D array. An (n, 1)
# column vector makes scikit-learn's LabelEncoder ravel it and emit a
# DataConversionWarning.
y_sync_train = synthetic_df.iloc[:, -1].values

In [6]:
# Encode the synthetic labels as integer class ids before fitting.
# NOTE(review): this encoder is fitted on the synthetic labels only; it
# presumes the resulting ids line up with the encoding of y_test -- confirm.
y_sync_train = LabelEncoder().fit_transform(y_sync_train)

# Classifiers fitted on the synthetic data; random_state is fixed on each one.
models = {
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    # `use_label_encoder` is no longer passed: it is deprecated in XGBoost,
    # recent releases ignore it with a warning, and the labels are already
    # integer-encoded above.
    "XGB Classifier": XGBClassifier(eval_metric='logloss', random_state=42),
}

# Fit each model on the synthetic split and score it on the adapter's test
# split (x_test / y_test), using macro-averaged precision, recall and F1.
results = {}
for name, model in models.items():
    model.fit(x_sync_train, y_sync_train)
    y_pred = model.predict(x_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 scores a class with no predicted/true samples as 0
        # instead of raising a warning.
        "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, average='macro', zero_division=0),
    }

# One row per classifier, one column per metric.
results_df = pd.DataFrame(results).T
print("Classifier Evaluation on Syn Data:")
print(results_df)

Classifier Evaluation on Syn Data:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.798896   0.786963  0.759936  0.769573
Random Forest        0.840694   0.829133  0.816618  0.822094
MLP Classifier       0.818875   0.835837  0.762526  0.780939
XGB Classifier       0.836488   0.822871  0.814912  0.818552
