In [1]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Initialize the adapter with a specific privacy setting
#data_path = 'data/car/car.csv'
data_path = "Real_Datasets/Adult.csv"
ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path = data_path)
ctabgan_adapter.load_model(
                 test_ratio = 0.20,
                 categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income'], 
                 log_columns = [],
                 mixed_columns= {'capital-loss':[0.0],'capital-gain':[0.0]},
                 general_columns = ["age"],
                 non_categorical_columns = [],
                 integer_columns = ['age', 'fnlwgt','capital-gain', 'capital-loss','hours-per-week'],
                 problem_type= {"Classification": 'income'}) 

df = ctabgan_adapter.raw_df
print(df[3:10])
x_train = ctabgan_adapter.x_train
x_test = ctabgan_adapter.y_train
y_train = ctabgan_adapter.x_test
y_test = ctabgan_adapter.y_test

'''
data_path = 'data/car/car.csv'
df = pd.read_csv(data_path)
print(df[3:10])
labelencoder=preprocessing.LabelEncoder()
df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
print(df[3:10])
'''


2025-04-10 14:23:59,638 - katabatic.models.ctabganp - INFO - CTABGANP module initialized


---Initialise CTABGANP Model
   age  workclass  fnlwgt  education  marital-status  occupation  \
3   44          4  160323         15               2           7   
4   18          0  103497         15               4           0   
5   34          4  198693          0               4           8   
6   29          0  227026         11               4           0   
7   63          6  104626         14               2          10   
8   24          4  369667         15               4           8   
9   55          4  104996          5               2           3   

   relationship  race  gender  capital-gain  capital-loss  hours-per-week  \
3             0     2       1          7688             0              40   
4             3     4       0             0             0              30   
5             1     4       1             0             0              30   
6             4     2       1             0             0              40   
7             0     4       1          31

"\ndata_path = 'data/car/car.csv'\ndf = pd.read_csv(data_path)\nprint(df[3:10])\nlabelencoder=preprocessing.LabelEncoder()\ndf= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)\nprint(df[3:10])\n"

In [2]:
print(type(x_train), x_train.shape)
print(x_train[:2])
print(type(y_train), y_train.shape)
print(y_train)
print(type(x_test), x_test.shape)
print(x_train[:2])
print(type(y_test), y_test.shape)
print(y_test)



<class 'pandas.core.frame.DataFrame'> (39073, 14)
       age  workclass  fnlwgt  education  marital-status  occupation  \
34342   71          4   77253         11               4           6   
18559   17          4  329783          0               4          12   

       relationship  race  gender  capital-gain  capital-loss  hours-per-week  \
34342             1     4       1             0             0              17   
18559             2     4       0             0             0              10   

       native-country  income  
34342              39       0  
18559              39       0  
<class 'pandas.core.series.Series'> (39073,)
34342    0
18559    0
12477    0
560      0
3427     0
        ..
38073    1
16306    1
26860    1
20602    0
42656    0
Name: income, Length: 39073, dtype: int64
<class 'pandas.core.frame.DataFrame'> (9769, 13)
       age  workclass  fnlwgt  education  marital-status  occupation  \
34342   71          4   77253         11               4        

In [3]:
ctabgan_adapter.fit(epochs=100)

100%|██████████| 100/100 [2:14:41<00:00, 80.81s/it]  

Finished training in 8103.610156297684  seconds.





In [4]:
synthetic_df = ctabgan_adapter.generate(size=1000)

self.batch_size: 1000


In [5]:
synthetic_df[3:10]

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
3,31,4,180291,9,0,4,1,4,1,0,0,45,39,0
4,57,7,117640,11,2,11,0,4,1,0,0,40,39,1
5,66,2,168316,11,2,3,0,4,1,0,0,50,39,1
6,27,4,220625,15,4,5,1,4,1,0,0,50,39,0
7,50,4,231368,11,0,4,4,4,0,0,0,40,28,0
8,33,4,254069,11,2,13,0,4,1,0,0,40,39,1
9,59,4,134456,11,3,3,1,4,1,0,0,40,39,0


In [6]:
x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
y_sync_train = synthetic_df.iloc[ :, -1:].values

In [7]:
# TSTR (train synthetic test real)
tstr_score_lr  = LogisticRegression().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_rf  = RandomForestClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_mlp = MLPClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
y_sync_train = LabelEncoder().fit_transform(y_sync_train)
xgbt_classifier = XGBClassifier(eval_metric='logloss')
tstr_score_xgbt = xgbt_classifier.fit(x_sync_train, y_sync_train).score(x_test, y_test)

# TRTR (train real test real)
trtr_score_lr  = LogisticRegression().fit(x_train, y_train).score(x_test, y_test)
trtr_score_rf  = RandomForestClassifier().fit(x_train, y_train).score(x_test, y_test)
trtr_score_mlp = MLPClassifier().fit(x_train, y_train).score(x_test, y_test)
xgbt_classifier = XGBClassifier(eval_metric='logloss', use_label_encoder=True)
trtr_score_xgbt = xgbt_classifier.fit(x_train, y_train).score(x_test, y_test)
df_evaluate = pd.DataFrame([
    ['TSTR', tstr_score_lr, tstr_score_rf, tstr_score_mlp, tstr_score_xgbt],
    ['TRTR', trtr_score_lr,trtr_score_rf,trtr_score_mlp, trtr_score_xgbt]
], columns=['Evaluated Item', 'LR', 'RF', 'MLP', 'XGBT'])
print(df_evaluate)

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[0 1] and y_pred=['0' '1']. Make sure that the predictions provided by the classifier coincides with the true labels.