In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split

In [None]:
# Load the feature table produced by the earlier EDA / feature-extraction step.
# No index column is requested, so the rows keep pandas' default integer index.
features_csv = "data_with_features.csv"
data_features = pd.read_csv(features_csv)
#data_features.set_index(data_features['account_id'], inplace=True)

In [None]:
# Print a summary of the loaded frame (column dtypes, non-null counts, memory use)
# as a quick sanity check on the CSV load.
data_features.info()

In [None]:
# Display the column labels of the loaded frame.
data_features.columns

In [None]:
# XGBoost needs categorical values to be one-hot encoded, so every binned
# column of data_features is expanded into indicator columns.


def _dummies_for(column, prefix):
    """Return the one-hot indicator columns for data_features[column].

    The new columns are named '<prefix>_<category value>' (pandas' default
    separator).
    """
    return pd.get_dummies(data_features[column], prefix=prefix)


# One encoded frame per binned column; the names are kept at module level so
# they remain available for inspection in other cells.
fico_bins_ds = _dummies_for('fico_bin', 'FICO')
ntrigger_bins_df = _dummies_for('ntriggers_bin', 'NTRIGGERS')
num_times_neg_bins_df = _dummies_for('num_times_neg_bin', "NUMtimesNEG")
max_days_neg_bins_df = _dummies_for('max_days_neg_bin', "MAXdaysNEG")
ck_returns_bins_df = _dummies_for('ck_returns_bin', "CTRETURNS")
aveBal6_bins_df = _dummies_for('avg_bal6_binned', 'AVEBAL6')
aveBal3_bins_df = _dummies_for('avg_bal3_binned', 'AVEBAL3')

# Widen the table one encoded frame at a time (index-aligned join); the
# resulting column order follows the order of this tuple.
one_hot_features = fico_bins_ds
for encoded_block in (
    ntrigger_bins_df,
    num_times_neg_bins_df,
    max_days_neg_bins_df,
    ck_returns_bins_df,
    aveBal6_bins_df,
    aveBal3_bins_df,
):
    one_hot_features = one_hot_features.join(encoded_block)

print("Shape of one-hot features:", one_hot_features.shape)

In [None]:
# Join one-hot features to general dataset.
# Append the indicator columns to the original table. DataFrame.join aligns on
# the row index, which one_hot_features shares with data_features because it was
# built from data_features' own columns.
data_features2 = data_features.join(one_hot_features)

In [None]:
# Report the dimensions and the full column list of the widened table to
# confirm the one-hot columns were attached.
combined_shape = data_features2.shape
combined_columns = data_features2.columns
print("size of data_features2:", combined_shape)
print("Columns in data_features2", combined_columns)

In [None]:
# Column groups to remove before handing the data to XGBoost.
# Each list extends the previous one, so pick the broadest list that applies.

# Binned categorical columns; these are the ones expanded into one-hot columns.
drop_columns = [
    'fico_bin',
    'ntriggers_bin',
    'num_times_neg_bin',
    'max_days_neg_bin',
    'ck_returns_bin',
    'avg_bal6_binned',
    'avg_bal3_binned',
]

# Adds seven further columns ahead of the binned ones
# (presumably the raw fields the bins were derived from — TODO confirm).
drop_columns2 = [
    'fico_b',
    'ntriggers',
    'num_times_neg',
    'max_days_neg',
    'ck_returns',
    'avg_bal6',
    'avg_bal3',
    *drop_columns,
]

# Optional: additionally drop the account identifier.
drop_columns3 = ['account_id', *drop_columns2]

In [None]:
# Build the modelling table by removing every column named in drop_columns2.
# That list does not contain 'account_id', so the identifier is not dropped here.
analytic_dataset = data_features2.drop(columns=drop_columns2)
print(analytic_dataset.shape)

# Export for a manual check in Excel - did it go okay?
# to_csv is called with its defaults, so the row index is written as well.
analytic_dataset_csv = "analytic_dataset.csv"
analytic_dataset.to_csv(analytic_dataset_csv)

The story so far...
The data from Step 1 (the EDA and feature-extraction phase) was read into Python. All of the categorical features were converted to a one-hot encoding, since XGBoost works best with numerical data.
After the one-hot encoded features were added to the analytic dataset, many of the original fields were removed before submitting the data to XGBoost.

What to do next...
1. Split the data into charged-off accounts and active accounts.
2. Select 20% of the active accounts for training.
3. Prepare XGBoost for imbalanced data.
4. Run XGBoost.
5. Score active accounts for propensity to defraud.
6. Print out the decision tree.
7. Print out the features and their relative importance.

In [None]:
# 1. Split the data into active accounts and charged-off accounts.
# The model's objective is to identify accounts that are most likely to commit
# fraudulent activities, so the non-charged-off pool is limited to accounts
# whose status is 'Active'.
is_active = analytic_dataset['status'] == 'Active'
is_charged_off = analytic_dataset['cos'] == 1

active_accounts = analytic_dataset[is_active]
data_CO = analytic_dataset[is_charged_off]

In [None]:
# Stack the active and charged-off subsets row-wise into a single frame.
# pd.concat must be given the DataFrame objects themselves, not their names as
# string literals: str items make it raise TypeError.
tmp = pd.concat([active_accounts, data_CO])

In [None]:
# Report the (rows, columns) size of each subset to show the class balance.
for label, subset in (
    ("Active accounts shape:", active_accounts),
    ("Charged-off accounts shape:", data_CO),
):
    print(label, subset.shape)

In [None]:
# Randomly select 20% of the active (non-charged-off) accounts.
# sample() is called without a random_state, so NumPy's global seed is set
# immediately beforehand; presumably this is what makes the draw repeatable.
SAMPLE_SEED = 147
ACTIVE_SAMPLE_FRACTION = 0.2

np.random.seed(SAMPLE_SEED)
active_accounts_frac20 = active_accounts.sample(frac=ACTIVE_SAMPLE_FRACTION)


In [None]:
# Display the (rows, columns) size of the 20% sample of active accounts.
active_accounts_frac20.shape

In [None]:
# Preview the first rows of the 20% sample as plain text
# (head() returns 5 rows when no count is given).
print(active_accounts_frac20.head())

In [None]:
# For training set, add the 20%-sample of active accounts with all of the charged-off accounts.
# For the training set, combine the 20% sample of active accounts with all of
# the charged-off accounts.
# Both frames are row subsets of analytic_dataset and therefore have identical
# columns, so they are stacked row-wise with pd.concat. DataFrame.join would try
# to place them side by side and raise ValueError, because every column name
# overlaps and no suffix is given.
# NOTE(review): an account that is both 'Active' and has cos == 1 would appear
# twice in the result - confirm the two groups are disjoint.
analytic_dataset_model = pd.concat([active_accounts_frac20, data_CO])

In [None]:
# NOTE(review): disabled draft of the model-training step; nothing in this cell runs.
# Points to resolve before re-enabling it:
#   - `data_dmatrix` is not defined in the cells shown here (presumably an
#     xgb.DMatrix built from the training frame - TODO confirm).
#   - the cross-validation call pairs metrics="rmse" with the "binary:logistic"
#     objective; check that this is the intended evaluation metric.
#   - 'reg:linear' is a deprecated objective name in newer XGBoost releases
#     (verify against the installed version).
#
# xg_reg = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1,
#                           max_depth=5, alpha=10, n_estimators=10)
#
# params = {"objective": "binary:logistic", 'colsample_bytree': 0.3, 'learning_rate': 0.1,
#           'max_depth': 5, 'alpha': 10}
#
# cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
#                     num_boost_round=50, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)