In [1]:
imports_path = ".\\imports.ipynb"
tableGAN_path = ".\\tableGAN.ipynb"
utils_path = ".\\utils.ipynb"

%run "$imports_path"

In [2]:
%run "$utils_path"
%run "$tableGAN_path"

image_dir = ".\\Images"
model_dir = ".\\Model\\tableGAN_ckpt\\compare_pred_on_synthetic_adult\\"
dataset_dir = ".\\Datasets\\"
dataset_train_path = os.path.join(dataset_dir, "df_adult_edited_train.csv")
dataset_test_path = os.path.join(dataset_dir, "df_adult_edited_test.csv")
dataset_gen_dir = os.path.join(dataset_dir, "Generated_for_pred_eval_adult//")

data= pd.read_csv('Datasets\\df_adult_edited.csv')
discrete_columns = data.columns[data.dtypes == "object"]
data_train, data_test = train_test_split(data, test_size=0.4, random_state=1)
print(f"Train size: {data_train.shape[0]}, Test size: {data_test.shape[0]}")

data_train.to_csv(dataset_train_path)
data_test.to_csv(dataset_test_path)

Train size: 19536, Test size: 13025


In [3]:
# --- Optimisation settings -------------------------------------------------
# n_epochs is passed to tgan.train(...) and to CTGANSynthesizer(epochs=...);
# n_critic is passed to TableGAN(n_critic=...) and CTGAN's discriminator_steps.
n_epochs = 20
n_critic = 10
batch_size = 500
adam_lr = 2e-3
adam_beta1 = 0.5

# Forwarded unchanged to every TableGAN constructor below.
noise_discrete_unif_max = 0

# --- Bookkeeping -----------------------------------------------------------
# Not referenced by any cell visible in this notebook; presumably consumed by
# the training code loaded via %run - confirm before removing.
ckpt_every = 1
loss_plot_update_every = 10

# Number of independently generated synthetic datasets per model.
n_synthetic_datasets = 5

# When False, the (slow) generation cells are skipped and the evaluation
# relies on datasets already present on disk.
retrain = False

In [4]:
%run "$tableGAN_path"
tg = TableGAN(data, n_critic = n_critic, adam_lr = adam_lr, adam_beta1 = adam_beta1,
              quantile_transformation_int = False, quantile_rand_transformation = False,
             noise_discrete_unif_max = noise_discrete_unif_max)

tg_qt = TableGAN(data, n_critic = n_critic, adam_lr = adam_lr, adam_beta1 = adam_beta1,
              quantile_transformation_int = True, quantile_rand_transformation = False,
             noise_discrete_unif_max = noise_discrete_unif_max)

tg_qtr = TableGAN(data, n_critic = n_critic, adam_lr = adam_lr, adam_beta1 = adam_beta1,
              quantile_transformation_int = True, quantile_rand_transformation = True,
             noise_discrete_unif_max = noise_discrete_unif_max)

In [5]:
%run "$tableGAN_path"
def generate_multiple_datasets(tgan, dataset_dir, subfolder = None, n_datasets = None,
                               epochs = None, train_batch_size = None):
    """Train ``tgan`` repeatedly and save one generated dataset per training run.

    For each run the model is trained with ``restart_training = True``, a dataset
    is drawn with ``tgan.generate_data()`` and written to ``gen{i}.csv`` (``i``
    starting at 0) inside the output directory.

    Parameters
    ----------
    tgan : object exposing ``train(...)`` and ``generate_data()``
        The model to train; the notebook passes TableGAN instances.
    dataset_dir : str
        Base output directory. Created if it does not exist.
    subfolder : str, optional
        If given, files are written to ``dataset_dir/subfolder`` instead.
    n_datasets : int, optional
        Number of datasets to generate. Defaults to the notebook-level
        ``n_synthetic_datasets``.
    epochs : int, optional
        Training epochs per run. Defaults to the notebook-level ``n_epochs``.
    train_batch_size : int, optional
        Batch size per run. Defaults to the notebook-level ``batch_size``.

    Notes
    -----
    The CSV files are written with pandas' default ``index=True``, so the first
    column of each file is the DataFrame index.
    """
    # Fall back to the notebook configuration cell when no override is given,
    # which keeps existing three-argument calls behaving exactly as before.
    if n_datasets is None:
        n_datasets = n_synthetic_datasets
    if epochs is None:
        epochs = n_epochs
    if train_batch_size is None:
        train_batch_size = batch_size

    if subfolder is not None:
        dataset_dir = os.path.join(dataset_dir, subfolder)
    os.makedirs(dataset_dir, exist_ok = True)

    for i in tqdm(range(n_datasets), desc = "Generated datasets"):
        tgan.train(epochs, batch_size = train_batch_size, restart_training = True, plot_loss = False,
                   progress_bar = True, progress_bar_desc = f"Progress generating dataset {i+1}")
        fake_train = tgan.generate_data()
        fake_train.to_csv(os.path.join(dataset_dir, f"gen{i}.csv"))
    

In [6]:
# Regenerate the plain tabGAN datasets only when retraining was requested.
if retrain:
    generate_multiple_datasets(tgan=tg, dataset_dir=dataset_gen_dir, subfolder="tabGAN")

In [7]:
# Regenerate the tabGAN-qt datasets only when retraining was requested.
if retrain:
    generate_multiple_datasets(tgan=tg_qt, dataset_dir=dataset_gen_dir, subfolder="tabGAN-qt")

In [8]:
# Regenerate the tabGAN-qtr datasets only when retraining was requested.
if retrain:
    generate_multiple_datasets(tgan=tg_qtr, dataset_dir=dataset_gen_dir, subfolder="tabGAN-qtr")

In [9]:
# CTGAN baseline: regenerate its synthetic datasets only when retraining was requested.
if retrain:
    dataset_dir_ctgan = os.path.join(dataset_gen_dir, "CTGAN")
    os.makedirs(dataset_dir_ctgan, exist_ok=True)

    for i in tqdm(range(n_synthetic_datasets), desc="Generated datasets"):
        # A new synthesizer is built for every dataset, using the same epoch
        # count and critic-step count as the tabGAN runs.
        ctgan = CTGANSynthesizer(epochs=n_epochs, discriminator_steps=n_critic)
        # The list of discrete columns is taken from the tabGAN instance `tg`.
        ctgan.fit(data_train, tg.columns_discrete)

        # Sample as many rows as the training split contains.
        fake_train = ctgan.sample(len(data_train))
        fake_train.to_csv(os.path.join(dataset_dir_ctgan, f"gen{i}.csv"))

        # Drop the fitted model before the next iteration builds a new one.
        del ctgan

In [6]:
%run "$utils_path"

subfolders = ["tabGAN", "tabGAN-qt", "tabGAN-qtr", "CTGAN", "TGAN", "tabFairGAN", "tabFairGAN-orig"]
evaluate_tabGAN_through_prediction(data_train, data_test, dataset_gen_dir, subfolders, n_synthetic_datasets)

  0%|          | 0/36 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,SD Accuracy,SD AUC
0,Train dataset,0.82856,0.870968,0.0,0.0
1,tabGAN,0.808614,0.842037,0.008287,0.005719
2,tabGAN-qt,0.813221,0.839334,0.004017,0.00671
3,tabGAN-qtr,0.808384,0.837567,0.006518,0.006155
4,CTGAN,0.806787,0.821325,0.005266,0.004287
5,TGAN,0.71005,0.529143,0.024137,0.103481
6,tabFairGAN,0.798403,0.815467,0.001824,0.005832
7,tabFairGAN-orig,0.759432,0.572379,0.004786,0.035611


In [None]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50, target='income'):
    """Fit an XGBoost sklearn-style classifier and report its training-set fit.

    Parameters
    ----------
    alg : XGBoost sklearn-API classifier (e.g. ``XGBClassifier``)
        Modified in place: ``n_estimators`` may be replaced by the
        cross-validated round count, ``eval_metric`` is set to ``'auc'``, and
        the model is fitted.
    dtrain : pandas.DataFrame
        Training frame containing the predictor columns and the label column.
    predictors : list of str
        Names of the feature columns in ``dtrain``.
    useTrainCV : bool, default True
        If true, run ``xgb.cv`` with early stopping first and use the number of
        rows of its result as ``n_estimators``.
    cv_folds : int, default 5
        Number of folds handed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience handed to ``xgb.cv``.
    target : str, default 'income'
        Name of the label column. Previously the cross-validation branch read an
        undefined global ``target`` while the rest of the function hard-coded
        ``'income'``; both now use this single parameter.

    Returns
    -------
    None
        Prints accuracy and AUC measured on the training data and draws a bar
        chart of the booster's feature scores.
    """
    features = dtrain[predictors]
    labels = dtrain[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        # One result row per boosting round that survived early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. The metric is set on the estimator because
    # recent XGBoost releases no longer accept eval_metric as a fit() argument.
    alg.set_params(eval_metric='auc')
    alg.fit(features, labels)

    # Predict training set:
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:,1]

    # Print model report (scores are on the training data, not a hold-out set):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, dtrain_predprob))

    # get_booster() is the accessor for the underlying Booster; the former
    # booster() method no longer exists on the sklearn wrapper.
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    ax = feat_imp.plot(kind='bar', title='Feature Importances')
    ax.set_ylabel('Feature Importance Score')

In [None]:
# `train`, `target` and `IDcol` were referenced here without being defined in any
# cell of this notebook, so the cell failed with NameError on a fresh kernel.
# They are bound to the notebook's own objects: the label column is 'income'
# (the column modelfit fits and scores against) and the training frame is the
# split created earlier. No ID column is defined anywhere in the notebook, so
# only the label is excluded from the predictors.
# NOTE(review): columns with dtype "object" may need encoding before XGBoost can
# consume them - confirm against the contents of df_adult_edited.csv.
target = 'income'
train = data_train

#Choose all predictors except the target
predictors = [x for x in train.columns if x != target]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)


modelfit(xgb1, train, predictors)

In [None]:
# Second configuration: same settings as xgb1 except min_child_weight, which is 0 here.
# Relies on `train` and `predictors` from the preceding cell.
xgb2_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 0,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb2 = XGBClassifier(**xgb2_params)
modelfit(xgb2, train, predictors)