In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv('dataset_1.csv')

In [32]:
train, test = train_test_split(data, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

In [33]:
X_train = train.drop('Target', axis=1)
y_train = train['Target']

In [34]:
X_val = val.drop('Target', axis=1)
y_val = val['Target']

In [35]:
X_test = test.drop('Target', axis=1)
y_test = test['Target']

In [36]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'max_depth': 10,
    'learning_rate': 0.1,
    'n_estimators': 100,
     'gamma' : 0.1,
    'reg_alpha' : 0.1,
    'reg_lamdba':0.1
}

In [37]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [43]:
model = xgb.train(params, dtrain, evals=[(dval, 'validation')], early_stopping_rounds=10)

Parameters: { "n_estimators", "reg_lamdba" } are not used.

[0]	validation-logloss:0.61053
[1]	validation-logloss:0.54228
[2]	validation-logloss:0.48371
[3]	validation-logloss:0.43636
[4]	validation-logloss:0.40194
[5]	validation-logloss:0.36575
[6]	validation-logloss:0.33482
[7]	validation-logloss:0.30867
[8]	validation-logloss:0.28473
[9]	validation-logloss:0.26478


In [45]:
dtest = xgb.DMatrix(X_test)

In [46]:
y_pred = model.predict(dtest)
y_pred = np.round(y_pred)

In [47]:
accuracy = accuracy_score(y_test, y_pred)

In [48]:
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Accuracy: 100.00%


In [51]:
# initialize the XGBClassifier model with pruning parameters
clf = XGBClassifier(
    max_depth=3,          # Maximum tree depth
    learning_rate=0.1,    # Learning rate
    n_estimators=100,     # Number of trees in the forest
    gamma=0.1,            # Minimum loss reduction to make a further partition
    reg_alpha=0.1,        # L1 regularization on leaf weights
    reg_lambda=0.1,       # L2 regularization on leaf weights
    subsample=0.8,        # Subsample ratio of the training set
    colsample_bytree=0.8  # Subsample ratio of columns when constructing each tree
)

In [52]:
# train the XGBClassifier model on the training set
clf.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [53]:
# evaluate the accuracy of the model on the testing set
accuracy = clf.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.55%


In [54]:
# extract the decision tree from the trained model
tree = clf.get_booster().get_dump()[0]


In [55]:
print(tree)

0:[LastStatementMinimumPaymentDueAmount<59.5] yes=1,no=2,missing=1
	1:[LastStatementMinimumPaymentDueAmount<31.4599991] yes=3,no=4,missing=3
		3:[LastStatementMinimumPaymentDueAmount<6.05000019] yes=7,no=8,missing=7
			7:leaf=-0.199711815
			8:leaf=-0.178434268
		4:[LastStatementBalanceAmount<70.3050003] yes=9,no=10,missing=9
			9:leaf=0.181250006
			10:leaf=-0.17327936
	2:[LastStatementPaymentTotalAmount<107.5] yes=5,no=6,missing=5
		5:[LastStatementPurchaseAmount<458] yes=11,no=12,missing=11
			11:leaf=0.183807835
			12:leaf=-0.0904761925
		6:[LastStatementMinimumPaymentDueAmount<286] yes=13,no=14,missing=13
			13:leaf=-0.193611786
			14:leaf=0.0534296036



In [60]:
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

In [68]:
# clf = xgb.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, gamma=0.1,
#                         reg_alpha=0.1, reg_lambda=0.1, subsample=0.8, colsample_bytree=0.8)
# clf.fit(X_train, y_train)

# # plot the first tree
# plt.figure(figsize=(40, 20))
# xgb.plot_tree(clf, num_trees=0, rankdir='LR')
# plt.show()
