In [209]:
import pandas as pd
import math
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
%matplotlib inline

In [22]:
# Start by importing the data
bank = pd.read_csv('bank-full.csv', delimiter=';')

In [35]:
# We'll clean up the data a bit by encoding the categories using a label encoder
lb_make = LabelEncoder()

categories = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']
for col in categories: 
    bank[str(col)+"_code"] = lb_make.fit_transform(bank[col])

X = bank.drop(['y_code', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y'],axis=1)
y = bank['y_code']

## Decision tree

In [94]:
# We then split the data into a training and testing set and train the decision tree 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [95]:
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.93      0.93      0.93      7979
          1       0.45      0.45      0.45      1064

avg / total       0.87      0.87      0.87      9043



In [220]:
# Let's look at the feature importance
print(dtree.feature_importances_)
print(dtree.features)

[ 0.10095648  0.1179351   0.09254255  0.28902364  0.03600549  0.054445
  0.01136128  0.04592145  0.02054748  0.0216739   0.00127859  0.02274056
  0.00823583  0.01861549  0.10006475  0.05865239]


AttributeError: 'DecisionTreeClassifier' object has no attribute 'features'

In [176]:
# We will now look at how the accuracy of the tree is affected by variations in the max tree depth

# We then split the data into a training and testing set and train the decision tree 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

for i in range(1, 2):    
    dtree = DecisionTreeClassifier(max_depth=i)
    dtree.fit(X_train,y_train)
    predictions = dtree.predict(X_test)
    print(accuracy_score(y_test, predictions))

0.881897600354


In [189]:
# Calculate ROC AUC
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)
print(roc_auc_score(y_test, predictions))
print(accuracy_score(y_test, predictions))

0.710364452824
0.875926130709


## Random forest classifier

In [177]:
# We will now use a random forest classifier to see if we can get a better estimate, note that we use the same x and y data sets
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

In [46]:
# Comparing the two trees, we can see that although there is only a small improvement is total precision, our ability to predict a positive 'yes' outcome has dramatically improved
print(classification_report(y_test,rfc_pred))

             precision    recall  f1-score   support

          0       0.93      0.97      0.95      8005
          1       0.65      0.44      0.53      1038

avg / total       0.90      0.91      0.90      9043



In [49]:
cv_results_3 = cross_val_score(rfc, X, y, cv=3)
print(np.mean(cv_results_3))

0.632650391773


In [50]:
cv_results_5 = cross_val_score(rfc, X, y, cv=5)
print(np.mean(cv_results_5))

0.696743010511


In [51]:
cv_results_10 = cross_val_score(rfc, X, y, cv=10)
print(np.mean(cv_results_5))

0.696743010511


In [182]:
# We will now look at how the accuracy of the tree is affected by variations in the max tree depth

# We then split the data into a training and testing set and train the decision tree 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

for i in range(1, 2):    
    dtree = RandomForestClassifier(n_estimators=50, max_depth=i)
    dtree.fit(X_train,y_train)
    predictions = dtree.predict(X_test)
    print(accuracy_score(y_test, predictions))

0.880902355413


In [188]:
# Calculate ROC AUC
dtree = RandomForestClassifier(n_estimators=500)
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)
print(roc_auc_score(y_test, predictions))
print(accuracy_score(y_test, predictions))

0.693724093414
0.903903571824


## XGBoost

In [276]:
categories = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']
for col in categories: 
    bank[str(col)+"_code"] = lb_make.fit_transform(bank[col])

X = bank.drop(['y_code', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y'],axis=1)
y = bank['y_code']

X.head()
X = X.drop(['pdays', 'loan_code'], axis=1)
bank_dmatrix = xgb.DMatrix(data=X, label=y)

In [None]:
# Create the parameter dictionary for each tree (boosting round)
params = {"objective":"reg:logistic", "max_depth":8}

eta_vals = [0.15]

for curr_val in eta_vals:

    params["eta"] = curr_val
    
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=bank_dmatrix, params=params, nfold=15, num_boost_round=100, metrics="auc", as_pandas=True)

print(cv_results)