In [5]:
# Experiment configuration read by the cells below.
# DATANAME: sub-folder of ../data/ that the CSV files are loaded from.
DATANAME = 'adult'
# TARGET: name of the label column that is split off from the feature columns.
TARGET = 'income'
# METHOD: sub-folder of ../data/{DATANAME}/ holding the synthetic CSV files.
METHOD = 'tabsyn'

## xgboost


In [4]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Modelling
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV

In [11]:
def xgboost(X_train, y_train, X_test, y_test, EPOCHS=200):
    """Train an XGBoost binary classifier and print its train and test accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    EPOCHS : number of boosting rounds passed to ``xgb.train``.

    For the training set first and the test set second, two lines are printed:
    the sum of the rounded predictions (the number of rows predicted as 1) and
    the accuracy as a percentage. Nothing is returned.
    """
    dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
    dtest = xgb.DMatrix(X_test, y_test, enable_categorical=True)

    booster = xgb.train(
        params={
            'objective': 'binary:logistic',
            'learning_rate': 0.1,
        },
        dtrain=dtrain,
        num_boost_round=EPOCHS,
    )

    # Same report for both sets: training set first, then the test set.
    for dmatrix, labels in ((dtrain, y_train), (dtest, y_test)):
        hard_preds = list(map(round, booster.predict(dmatrix)))
        print(sum(hard_preds))
        print('Accuracy of the model is:', accuracy_score(labels, hard_preds)*100)

### ORD After synthesis

In [None]:
# Load the original imbalanced dataset and separate the label column from the features.
unb_data = pd.read_csv(f'../data/{DATANAME}/imbalanced_noord.csv')
y_train = unb_data[TARGET]
X_train = unb_data.drop(columns=[TARGET])

# Hyperparameter tuning; the search space holds a single candidate (n_estimators=50).
param_dist = {'n_estimators': [50]}
rf = RandomForestClassifier()
rand_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
)
rand_search.fit(X_train, y_train)

# Keep the refitted best estimator as the model trained on the real data.
real_model = rand_search.best_estimator_
print('Best hyperparameters:',  rand_search.best_params_)

In [21]:
def boundary_function_helper(X_train, y_train, X_test, y_test, test, THRESHOLD=0.4):
    """Flag the rows of ``test`` that a random forest gets wrong or is unsure about.

    A ``RandomForestClassifier`` is fitted on ``(X_train, y_train)`` through
    ``RandomizedSearchCV`` and evaluated on ``(X_test, y_test)``. A test row is
    marked as a boundary row when its predicted label differs from ``y_test``,
    or when the predicted probabilities of the first two classes are both
    ``>= THRESHOLD``.

    Parameters
    ----------
    X_train, y_train : features and labels the forest is fitted on.
    X_test, y_test : features and labels the forest is evaluated on.
    test : DataFrame whose rows correspond, in order, to the rows of
        ``X_test`` / ``y_test``. It is not modified.
    THRESHOLD : probability cut-off used for the uncertainty check.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test`` with an added integer column ``isBoundary``
        (1 for boundary rows, 0 otherwise).

    Side effects: prints the selected hyperparameters, train/test accuracy,
    uncertainty counts and the ``isBoundary`` value counts, and plots the
    confusion matrix of the test predictions.
    """
    # Hyperparameter tuning; the search space holds a single candidate.
    # The forest is created without a random_state, so repeated runs may differ.
    param_dist = {'n_estimators': [50]}
    rf = RandomForestClassifier()
    rand_search = RandomizedSearchCV(rf,
                                    param_distributions = param_dist,
                                    n_iter=5,
                                    cv=5)
    rand_search.fit(X_train, y_train)
    best_rf = rand_search.best_estimator_
    print('Best hyperparameters:',  rand_search.best_params_)

    y_pred = best_rf.predict(X_test)
    y_pred_train = best_rf.predict(X_train)
    proba_test = best_rf.predict_proba(X_test)
    proba_train = best_rf.predict_proba(X_train)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    print(f'Training accuray is: {accuracy_train}')
    print(f'Testing accuray is: {accuracy}')
    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()

    def both_reach_threshold(proba):
        # True for rows where the first two class probabilities are both >= THRESHOLD.
        return (proba[:, 0] >= THRESHOLD) & (proba[:, 1] >= THRESHOLD)

    uncertain_train_mask = both_reach_threshold(proba_train)
    uncertain_test_mask = both_reach_threshold(proba_test)
    uncertain_train = int(uncertain_train_mask.sum())
    uncertain_test = int(uncertain_test_mask.sum())
    print(f"{uncertain_train} out of {len(proba_train)} samples have higher uncertainity from the training set")
    print(f"{uncertain_test} out of {len(proba_test)} samples have higher uncertainity from the test set")

    # Boundary rows of the test set: wrongly predicted, or uncertain by the
    # probability threshold. The comparison is positional (plain arrays), so it
    # does not depend on the index labels carried by y_test.
    misclassified = np.asarray(y_test) != np.asarray(y_pred)
    is_boundary = misclassified | uncertain_test_mask
    print(f"There are {int(is_boundary.sum())} points that have been wrongly predicted or are at the boundary")

    # Work on a copy so the caller's frame is left untouched, and assign the
    # whole column at once instead of chained indexing
    # (df['isBoundary'][i] = 1), which warns and may write to a temporary.
    df1 = test.copy()
    df1['isBoundary'] = is_boundary.astype(int)
    print(df1['isBoundary'].value_counts())

    return df1
def find_boundary(df, TARGET,  RANDOM_STATE=42, threshold=0.4):
    """Run boundary detection over ``df`` with a two-fold hold-out scheme.

    ``df`` is shuffled with ``RANDOM_STATE`` and cut into two halves. Each half
    is held out once while the other half is used for training, and
    ``boundary_function_helper`` labels the held-out rows.

    Parameters
    ----------
    df : DataFrame containing the feature columns and the ``TARGET`` column.
    TARGET : name of the label column.
    RANDOM_STATE : seed used for the shuffle.
    threshold : probability cut-off forwarded to ``boundary_function_helper``.

    Returns
    -------
    pandas.DataFrame
        The two frames returned by the helper, stacked row-wise (first half,
        then second half). Each part keeps its own 0-based index.
    """
    shuffled = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    half = len(shuffled) // 2
    held_out_slices = (slice(0, half), slice(half, len(shuffled)))

    labelled_parts = []
    for split_no, held_out in enumerate(held_out_slices, start=1):
        print(f"Split {split_no}")
        held_out_rows = shuffled.iloc[held_out]
        train = shuffled.drop(index=held_out_rows.index).reset_index(drop=True)
        test = held_out_rows.reset_index(drop=True)

        # Label the held-out half with a model fitted on the other half.
        labelled_parts.append(
            boundary_function_helper(
                train.drop(TARGET, axis=1),
                train[TARGET],
                test.drop(TARGET, axis=1),
                test[TARGET],
                test,
                threshold,
            )
        )
    return pd.concat(labelled_parts, axis=0)

In [None]:
# Find the overlap (boundary) rows in the synthetic_noord dataset, which was
# generated from a model trained on the entire real dataset.
# Next step (from the original notes): remove c01 and train on synthetic_noord.
synthetic_noord = (
    pd.read_csv(f'../data/{DATANAME}/{METHOD}/syn_noord.csv')
    .sample(frac=1, random_state=42)
)

no_ord_bndry = find_boundary(df=synthetic_noord, TARGET=TARGET, threshold=0.3)



In [23]:
# Class 0: keep only the rows that were NOT flagged as boundary.
# Class 1: keep every row, flagged or not.
# The helper column 'isBoundary' is removed from both parts.
class_0 = no_ord_bndry.loc[
    (no_ord_bndry[TARGET] == 0) & (no_ord_bndry['isBoundary'] == 0),
    no_ord_bndry.columns != 'isBoundary',
]
# drop() without inplace returns a new frame instead of writing into a filtered
# slice of no_ord_bndry, which is what raised SettingWithCopyWarning before.
class_1 = no_ord_bndry.loc[no_ord_bndry[TARGET] == 1].drop(columns='isBoundary')

test_method1 = pd.concat([class_0, class_1], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_1.drop(columns='isBoundary', inplace=True)


In [24]:
# Display the frame assembled above: non-boundary class-0 rows followed by all class-1 rows.
test_method1

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,30.0,4,43550.890,7,0,2,-1,1,0,0.0,0.0000,40.000000,1,0
1,33.0,6,59467.965,13,0,6,0,0,0,0.0,0.0000,44.776950,1,0
2,65.0,4,303210.560,5,2,2,1,0,0,0.0,0.0000,40.000000,1,0
4,35.0,6,174580.780,9,0,6,0,0,0,0.0,0.0000,59.270763,1,0
5,46.0,4,263935.720,5,2,2,1,1,0,0.0,0.0000,40.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15370,38.0,2,167023.700,13,2,6,1,0,0,0.0,1977.5924,46.000000,1,1
15382,51.0,4,203756.200,12,2,6,1,0,0,0.0,0.0000,40.000000,1,1
15409,45.0,4,230600.140,13,2,2,1,0,0,0.0,0.0000,20.000000,1,1
15440,53.0,4,211121.560,9,2,6,1,0,0,0.0,0.0000,40.000000,1,1


In [25]:
# Balance the classes: draw as many class-0 rows from test_method1 as there are
# class-1 rows, stack both groups and shuffle the result.
class_0 = test_method1.loc[test_method1[TARGET] == 0].sample(n=len(class_1), random_state=42)
test_method1 = pd.concat([class_0, class_1], axis=0).sample(frac=1, random_state=42)

# Split the balanced frame into labels and features for training.
y_train = test_method1[TARGET]
X_train = test_method1.drop(columns=[TARGET])


In [13]:
# Read the evaluation set from test.csv and split it into features and labels.
test = pd.read_csv(f'../data/{DATANAME}/test.csv')
X_test, y_test = test.drop(columns=[TARGET]), test[TARGET]

In [27]:
# Train on the balanced, boundary-filtered synthetic data and evaluate on the test set.
xgboost(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

480
Accuracy of the model is: 81.9634703196347
2167
Accuracy of the model is: 77.575


### ORD Before synthesis

In [6]:
# Load the synthetic data from syn_ord.csv; the cells below read its 'cond' column.
pre_cond = pd.read_csv(f'../data/{DATANAME}/{METHOD}/syn_ord.csv')
# Bare expression so the notebook renders a preview of the frame.
pre_cond

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,cond
0,42.0,2,384161.977373,14,-1,7,2,0,1,0.0,0.0,40.0,1,0
1,30.0,4,226836.884092,10,0,6,3,0,0,0.0,0.0,40.0,1,0
2,59.0,4,117146.652459,10,2,2,1,-1,0,0.0,0.0,40.0,0,0
3,38.0,2,38555.556500,13,0,7,0,0,1,0.0,0.0,40.0,1,0
4,18.0,4,107401.151217,10,0,2,-1,0,1,0.0,0.0,40.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55215,23.0,4,185512.696245,13,0,3,3,0,1,0.0,0.0,40.0,1,0
55216,22.0,4,26873.719695,10,0,4,0,0,1,0.0,0.0,40.0,1,0
55217,36.0,4,87562.446755,9,-1,2,2,0,0,0.0,0.0,40.0,1,0
55218,29.0,4,118735.947026,9,0,2,0,0,0,0.0,0.0,37.0,0,0


In [7]:
# sample equal amts of class 0 and class 1 from pre_cond

# The relabelling cell below maps cond == 2 (and nothing else) to TARGET == 1,
# so the positive group has to be selected with cond == 2. Selecting cond == 1
# here left no cond == 2 rows in pre_cond, which made every TARGET label 0.
# NOTE(review): this assumes cond == 1 rows are meant to be left out of
# training -- confirm against how syn_ord.csv was generated.
class_1 = pre_cond[pre_cond['cond'] == 2]
class_0 = pre_cond[pre_cond['cond'] == 0].sample(n=len(class_1), random_state=42)
pre_cond = pd.concat([class_0, class_1], axis=0)
pre_cond = pre_cond.reset_index(drop=True)


In [9]:
# Binary label: 1 where cond equals 2, otherwise 0; the raw cond column is then removed.
pre_cond[TARGET] = pre_cond['cond'].eq(2).astype('int64')
pre_cond = pre_cond.drop(columns=['cond'])

In [None]:
# Split the relabelled frame into labels and features, then train and evaluate.
pre_cond_y = pre_cond[TARGET]
pre_cond_x = pre_cond.drop(columns=[TARGET])

xgboost(X_train=pre_cond_x, y_train=pre_cond_y, X_test=X_test, y_test=y_test)