# Imports

In [2]:
import sys
import warnings
warnings.simplefilter(action='ignore', category=Warning)
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, auc, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.calibration import CalibrationDisplay
from sklearn.calibration import CalibratedClassifierCV

import argparse


In [4]:
def get_column_types(df, num_unique_threshold=20):
    """
    This function takes a dataframe and a threshold for the number of unique values
    and returns a dictionary with the numerical and discrete columns.
    """
    cat_cols = [col for col in df.columns if df[col].dtype == 'object']
    other_cols = [col for col in df.columns if col not in cat_cols]
    other_cols_cat = [col for col in other_cols if df[col].nunique() <= num_unique_threshold]
    discrete_cols = cat_cols + other_cols_cat
    numerical_cols = [col for col in df.columns if col not in discrete_cols]

    return {"numerical_cols": numerical_cols, "discrete_cols": discrete_cols}


### Models


In [7]:
def xgboost(X_train, y_train, EPOCHS=200):
    # defining the XGBoost train and test loaders
    xgb_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)

    # defining the hyperparameters and training the model
    n = EPOCHS
    params = {
        'objective': 'binary:logistic',
        
    }

    model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=n)

    # Predicting on the training set
    preds_train = model.predict(xgb_train)
    y_pred_train = [round(pred) for pred in preds_train]
    print(sum(y_pred_train))
    accuracy = accuracy_score(y_train, y_pred_train)
    print('Training Accuracy of the model is:', accuracy*100)
    
    return model

In [8]:
def xgb_predict(model, X_test, y_test, threshold=0.5):
    preds = model.predict(xgb.DMatrix(X_test))
    y_pred = [pred>=threshold for pred in preds]
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy*100)
    preds = np.array([1-preds, preds]).T
    return accuracy*100

# Train with real data

In [10]:
parser = argparse.ArgumentParser()
parser.add_argument('--dataname', type=str, default='adult')
parser.add_argument('--target', type=str, default='income')
parser.add_argument('--method', type=str, default='tabsyn')
args = parser.parse_args()

DATANAME = args.dataname
TARGET = args.target
METHOD = args.method

usage: ipykernel_launcher.py [-h] [--dataname DATANAME] [--target TARGET]
                             [--method METHOD]
ipykernel_launcher.py: error: unrecognized arguments: --f=/mnt/home/swethamagesh/.local/share/jupyter/runtime/kernel-v2-3483620xXRrhFivm2mh.json


SystemExit: 2

In [11]:
real_data = pd.read_csv(f'data/{DATANAME}/original.csv')

NameError: name 'DATANAME' is not defined

In [10]:
X_train = real_data.drop(TARGET, axis=1)
y_train = real_data[TARGET]
X_test = X_train[:200]
y_test = y_train[:200]

xgboost_model = xgboost(X_train, y_train)

8377
Training Accuracy of the model is: 86.62376719295918
Cat features ['workclass', 'educational-num', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']




Learning rate set to 0.027336
0:	learn: 0.6636915	total: 414ms	remaining: 6m 53s
1:	learn: 0.6353602	total: 559ms	remaining: 4m 39s
2:	learn: 0.6115886	total: 1.2s	remaining: 6m 39s
3:	learn: 0.5902828	total: 1.6s	remaining: 6m 38s
4:	learn: 0.5685559	total: 1.66s	remaining: 5m 30s
5:	learn: 0.5502685	total: 1.72s	remaining: 4m 44s
6:	learn: 0.5329101	total: 1.82s	remaining: 4m 17s
7:	learn: 0.5166893	total: 2.45s	remaining: 5m 3s
8:	learn: 0.5019977	total: 2.86s	remaining: 5m 15s
9:	learn: 0.4888244	total: 2.96s	remaining: 4m 53s
10:	learn: 0.4764401	total: 3.02s	remaining: 4m 31s
11:	learn: 0.4656549	total: 3.09s	remaining: 4m 14s
12:	learn: 0.4550791	total: 3.64s	remaining: 4m 36s
13:	learn: 0.4460302	total: 4.18s	remaining: 4m 54s
14:	learn: 0.4376434	total: 4.28s	remaining: 4m 40s
15:	learn: 0.4297617	total: 4.33s	remaining: 4m 26s
16:	learn: 0.4214066	total: 4.39s	remaining: 4m 13s
17:	learn: 0.4138795	total: 4.85s	remaining: 4m 24s
18:	learn: 0.4074035	total: 5.48s	remaining: 4m

In [None]:
def acc_no_cond(no_cond):
    test0 = no_cond[no_cond[TARGET]==0]
    test1 = no_cond[no_cond[TARGET]==1]

    X_test0 = test0.drop(TARGET, axis=1)
    y_test0 = test0[TARGET]

    X_test1 = test1.drop(TARGET, axis=1)
    y_test1 = test1[TARGET]

    print('C0:', end='')
    xgb_predict(xgboost_model, X_test0, y_test0)
    print('C1:', end='')
    xgb_predict(xgboost_model, X_test1, y_test1)

def acc_cond(cond):
    # apply the condition cond == 2 gets 1 else 0
    cond[TARGET] = cond['cond'].apply(lambda x: 1 if x==2 else 0)
    cond[TARGET].value_counts()
    test0 = cond[cond['cond']==0]
    test1 = cond[cond['cond']==1]
    test2 = cond[cond['cond']==2]

    X_test0 = test0.drop([TARGET, 'cond'], axis=1)
    y_test0 = test0[TARGET]

    X_test1 = test1.drop([TARGET, 'cond'], axis=1)
    y_test1 = test1[TARGET]

    X_test2 = test2.drop([TARGET, 'cond'], axis=1)
    y_test2 = test2[TARGET]

    print('C00: ',end='')
    c0=xgb_predict(xgboost_model, X_test0, y_test0)
    print('C1:', end='')
    c2=xgb_predict(xgboost_model, X_test2, y_test2)


    

# CTABGAN+


## No cond

In [22]:
path = f'data/{DATANAME}/{METHOD}/syn_noord.csv'
no_cond = pd.read_csv(path)
no_cond[TARGET].value_counts()


income
0    510534
1      7166
Name: count, dtype: int64

## Cond

In [24]:
path = f'data/{DATANAME}/{METHOD}/syn_cond.csv'
cond = pd.read_csv(path)
cond['cond'].value_counts()


cond
0    501261
2     10558
1      5873
Name: count, dtype: int64

In [None]:
print('Accuracy of No ord ')
acc_no_cond(no_cond)
print('Accuracy of ORD')
acc_cond(cond)
