# Federated Boosted Decision Trees with Differential Privacy
### Maddock et al.

In [1]:
import pandas as pd
from collections import Counter

from experiments.experiment_helpers.data_loader import DataLoader

In [2]:
# --- Configuration ----------------------------------------------------------
# The paper removes rows with missing values, so that is the default here.
REMOVE_MISSING = True
data_path = './data/'

# --- Load -------------------------------------------------------------------
# Read the Credit 1 training CSV and discard the 'Unnamed: 0' column
# (presumably the row index that was written out with the CSV -- verify against the file).
credit1_df = pd.read_csv(data_path + "Kaggle_Credit_1/credit1-training.csv").drop(columns=['Unnamed: 0'])

if REMOVE_MISSING:
    # Remember the pre-filter row count so the log line can show both sizes.
    num_rows = len(credit1_df)
    credit1_df = credit1_df.dropna()  # default how='any': a row is dropped if any field is NaN
    print("[Data Loader] Removing nans from Credit 1", num_rows, "vs", len(credit1_df))

# --- Split into features / target -------------------------------------------
credit1_X = credit1_df.drop(columns=["SeriousDlqin2yrs"])
credit1_y = credit1_df["SeriousDlqin2yrs"]
credit1_counter = Counter(credit1_y)  # label value -> number of rows

[Data Loader] Removing nans from Credit 1 150000 vs 120269


In [3]:
# Report the frame's size (rows, then columns); the per-column dtypes are
# shown as the cell's displayed output.
print('num samples', credit1_df.shape[0])
print('num columns', credit1_df.shape[1])
credit1_df.dtypes

num samples 120269
num columns 11


SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [4]:
# Preview the first rows of the Credit 1 frame (target column plus features).
credit1_df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## Credit 1 Features

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
# NOTE(review): in the displayed output several maxima sit far above the 75th
# percentile (e.g. 98 in the past-due count columns) -- possibly sentinel or
# outlier values; confirm before relying on them.
credit1_X.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0,120269.0
mean,5.899873,51.289792,0.381769,26.598777,6670.221,8.758475,0.211925,1.054519,0.187829,0.851832
std,257.040685,14.426684,3.499234,424.446457,14384.67,5.172835,3.465276,1.149273,3.447901,1.148391
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.035084,40.0,0.0,0.143388,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.177282,51.0,0.0,0.296023,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.579428,61.0,0.0,0.482559,8249.0,11.0,0.0,2.0,0.0,2.0
max,50708.0,103.0,98.0,61106.5,3008750.0,58.0,98.0,54.0,98.0,20.0


## Credit 1 Labels

In [8]:
# Binary classification: the class balance of the labels is all we need to look at here.
credit1_y.value_counts().to_frame()

Unnamed: 0,SeriousDlqin2yrs
0,111912
1,8357


## Training

In [16]:
# NOTE(review): notebook convention is to keep all imports in the first cell;
# these are left in place because the sys.path change below must precede them.
import sys
# Add the parent directory to the module search path (presumably so the project
# packages imported below resolve from there -- confirm the notebook's working directory).
# NOTE(review): the first cell already imports from `experiments` before this
# path is added; check that it succeeds on a fresh kernel.
sys.path.append("../")

from federated_gbdt.models.gbdt.private_gbdt import PrivateGBDT
# NOTE(review): DataLoader is also imported in the first cell; this repeat is redundant.
from experiments.experiment_helpers.data_loader import DataLoader
from sklearn.metrics import roc_auc_score

# Train on the complete feature matrix and label vector (no hold-out split is made here).
X_train = credit1_X
y_train = credit1_y

In [17]:
# XGBoost Training (No DP)
xgb_model = PrivateGBDT(num_trees=100, epsilon=0)
xgb_model = xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict_proba(credit1_X)[:, 1]

roc_auc_score(y_train, y_pred)

0.8382241335743097