In [None]:
'''
import pandas as pd
headers = ['letter','f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13','f14','f15']
raw_csv_path = 'data/letter/letter-recognition.data'
raw_df = pd.read_csv(raw_csv_path, names=headers)
print(raw_df[10:])
raw_df2 = pd.read_csv(raw_csv_path)
print(raw_df2[10:])
'''

In [None]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the adapter with a specific privacy setting
data_path = 'data/letter/letter-recognition.data'
ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path = data_path)
headers = ['letter','f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13','f14','f15']
ctabgan_adapter.load_model(
                 test_ratio = 0.20,
                 headers=headers,
                 categorical_columns = ['letter'], 
                 log_columns = [],
                 mixed_columns= {},
                 general_columns = [],
                 non_categorical_columns = ['letter','f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13','f14','f15'],
                 integer_columns = [],
                 problem_type= {"Classification": 'letter'}) 

df = ctabgan_adapter.raw_df
x_train = ctabgan_adapter.x_train
y_train = ctabgan_adapter.y_train
x_test = ctabgan_adapter.x_test
y_test = ctabgan_adapter.y_test

---Initialise CTABGANP Model


"\ndata_path = 'data/car/car.csv'\ndf = pd.read_csv(data_path)\nprint(df[3:10])\nlabelencoder=preprocessing.LabelEncoder()\ndf= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)\nprint(df[3:10])\n"

In [10]:
ctabgan_adapter.fit(epochs=100)

100%|██████████| 100/100 [21:06<00:00, 12.67s/it]

Finished training in 1285.5108559131622  seconds.





In [11]:
synthetic_df = ctabgan_adapter.generate(size=1000)
for column in synthetic_df.columns:
    print(column, ":", synthetic_df[column].value_counts())

self.batch_size: 1000
f0 : f0
3.999686    2
4.997896    2
7.952455    1
3.001439    1
3.999295    1
           ..
6.000132    1
2.997576    1
1.986836    1
5.002874    1
1.998025    1
Name: count, Length: 19998, dtype: int64
f1 : f1
6.997688     2
9.001662     2
11.551180    1
10.896780    1
6.992893     1
            ..
3.423263     1
5.984914     1
9.000145     1
10.197598    1
1.328091     1
Name: count, Length: 19998, dtype: int64
f2 : f2
8.989765     2
7.004435     2
10.405760    1
3.994481     1
5.996553     1
            ..
9.033621     1
3.992681     1
3.996502     1
6.006312     1
2.018149     1
Name: count, Length: 19998, dtype: int64
f3 : f3
5.999217    2
5.998837    2
7.999971    1
4.995864    1
8.003322    1
           ..
4.001180    1
6.001867    1
4.998339    1
4.001642    1
2.003202    1
Name: count, Length: 19998, dtype: int64
f4 : f4
6.005200    2
6.002215    2
6.006940    2
3.001230    2
1.998286    1
           ..
4.998499    1
1.999157    1
2.002722    1
3.002217  

In [12]:
x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
y_sync_train = synthetic_df.iloc[ :, -1:].values

In [16]:
y_sync_train = LabelEncoder().fit_transform(y_sync_train)
models = {
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(x_sync_train, y_sync_train)
    y_pred = model.predict(x_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, average='macro', zero_division=0)
    }
results_df = pd.DataFrame(results).T
print("Classifier Evaluation on Syn Data:")
print(results_df)

Classifier Evaluation on Syn Data:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression   0.02300   0.041824  0.023104  0.022129
Random Forest         0.03700   0.051307  0.036919  0.037914
MLP Classifier        0.03875   0.057785  0.038801  0.038089
XGB Classifier        0.03375   0.050068  0.033670  0.035493
