Make sure the `dmba` package is available.

In [1]:
pip install dmba

Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.8/11.8 MB 37.8 MB/s eta 0:00:00
Installing collected packages: dmba
Successfully installed dmba-0.2.4


In [13]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pylab as plt
import dmba
from dmba import plotDecisionTree, classificationSummary, regressionSummary
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Read the riding-mower data set through dmba and preview its first five rows
mower_df = dmba.load_data('RidingMowers.csv')
mower_df.head(n=5)

In [None]:
# Separate the predictors from the 'Ownership' outcome, then fit a seeded
# classification tree with no depth limit passed.
mower_X = mower_df.drop(columns=['Ownership'])
mower_y = mower_df['Ownership']
classTree = DecisionTreeClassifier(random_state=0)
classTree.fit(mower_X, mower_y)



In [None]:
# Label the splits with the same predictor columns the tree was fitted on
# (everything except the 'Ownership' outcome) instead of assuming that the
# predictors happen to be the first two columns of the frame.
mower_features = mower_df.drop(columns=['Ownership']).columns
plotDecisionTree(classTree, feature_names=mower_features, class_names=classTree.classes_)

In [None]:
# Refit the mower tree with growth capped at a depth of 3 (same seed as before)
classTree = DecisionTreeClassifier(max_depth=3, random_state=0)
classTree.fit(X=mower_df.drop(columns=['Ownership']), y=mower_df['Ownership'])

In [None]:
# Label the splits with the same predictor columns the tree was fitted on
# (everything except the 'Ownership' outcome) instead of assuming that the
# predictors happen to be the first two columns of the frame.
mower_features = mower_df.drop(columns=['Ownership']).columns
plotDecisionTree(classTree, feature_names=mower_features, class_names=classTree.classes_)

The entries of the `value` vector shown in each box are listed in the same order as `classTree.classes_`.

In [None]:
# Read the Universal Bank data set through dmba and preview its first five rows
bank_df = dmba.load_data('UniversalBank.csv')
bank_df.head(n=5)

In [4]:
# Remove the ID and ZIP Code columns before modelling.
# errors='ignore' keeps this cell re-runnable: bank_df is overwritten with the result, so
# on a second execution the columns are already gone and a plain drop would raise KeyError.
bank_df = bank_df.drop(columns=['ID', 'ZIP Code'], errors='ignore')

In [11]:
# The outcome is the 'Personal Loan' column; every remaining column is a predictor
y = bank_df['Personal Loan']
X = bank_df.drop(columns='Personal Loan')

In [9]:
# Partition the records: 40% are held out for testing, with a fixed seed for the shuffle
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.4,
)

In [None]:
# Grow a tree with default settings on the training partition.
# scikit-learn permutes the candidate features at every split, so ties between equally
# good splits can be resolved differently from run to run; fixing random_state (as the
# other seeded estimators in this notebook do) makes the fitted tree reproducible.
fullClassTree = DecisionTreeClassifier(random_state=1)
fullClassTree.fit(train_X, train_y)

In [None]:
# Draw the fully grown tree, labelling the splits with the training predictor names
predictor_names = train_X.columns
plotDecisionTree(fullClassTree, feature_names=predictor_names)

In [None]:
# Predict both partitions with the full tree, then summarise training followed by test
full_train_pred = fullClassTree.predict(train_X)
full_test_pred = fullClassTree.predict(test_X)
classificationSummary(train_y, full_train_pred)
classificationSummary(test_y, full_test_pred)

In [None]:
# Constrained tree: depth at most 30, at least 20 records needed to split a node, and a
# split must reduce impurity by at least 0.01.
# random_state is fixed so that ties between equally good splits are resolved the same
# way on every run and the plotted tree is reproducible.
smallClassTree = DecisionTreeClassifier(
    max_depth=30,
    min_samples_split=20,
    min_impurity_decrease=0.01,
    random_state=1,
)
smallClassTree.fit(train_X, train_y)

plotDecisionTree(smallClassTree, feature_names=train_X.columns)

In [None]:
# Predict both partitions with the constrained tree, then summarise training followed by test
small_train_pred = smallClassTree.predict(train_X)
small_test_pred = smallClassTree.predict(test_X)
classificationSummary(train_y, small_train_pred)
classificationSummary(test_y, small_test_pred)

In [None]:
# regressionSummary is the summary to use when the dependent variable (DV) is continuous.
# The DV in this data set is binary, however.
# NOTE(review): presumably shown for comparison only - confirm the intent.
small_tree_train_pred = smallClassTree.predict(train_X)
small_tree_test_pred = smallClassTree.predict(test_X)
regressionSummary(train_y, small_tree_train_pred)
regressionSummary(test_y, small_tree_test_pred)

In [None]:
# Five-fold cross-validation of the full decision tree classifier
treeClassifier = DecisionTreeClassifier()
scores = cross_val_score(treeClassifier, train_X, train_y, cv=5)

In [None]:
# Convert each fold score to a rounded built-in float before printing: printing a list of
# NumPy scalars shows their repr (e.g. `np.float64(...)` under NumPy 2), which is noisy.
print('Accuracy scores of each fold: ', [round(float(acc), 3) for acc in scores])
print('Accuracy:', scores.mean())


In [None]:
# Fit one tree per depth limit and record the misclassification rate
# (1 - accuracy) on both the training and the test partition.
max_depth_range = range(1, 21)  # depths 1 through 20; adjust to scan more or fewer depths
train_errors, test_errors = [], []

for depth in max_depth_range:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(train_X, train_y)

    # Same computation for each partition: predict, score, store the error
    for features, actual, errors in (
        (train_X, train_y, train_errors),
        (test_X, test_y, test_errors),
    ):
        errors.append(1 - accuracy_score(actual, clf.predict(features)))

# Draw both error curves against tree depth on a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_depth_range, train_errors, label='Training Error', color='blue', marker='o')
ax.plot(max_depth_range, test_errors, label='Test Error', color='red', marker='o')
ax.set_title('Error vs Tree Depth')
ax.set_xlabel('Tree Depth')
ax.set_ylabel('Error (1 - Accuracy)')
ax.legend()
ax.grid(True)
plt.show()

# Random Forest

In [None]:
# Reload the Universal Bank data and discard the ID and ZIP Code columns in one step
bank_df = dmba.load_data('UniversalBank.csv').drop(columns=['ID', 'ZIP Code'])

In [None]:
# Separate the 'Personal Loan' outcome from the predictors, then hold out 40% of the
# records for testing using a fixed seed.
outcome = 'Personal Loan'
y = bank_df[outcome]
X = bank_df.drop(columns=[outcome])
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1, test_size=0.4)

In [None]:
# Train a seeded random forest of 500 trees on the training partition
rf = RandomForestClassifier(random_state=1, n_estimators=500)
rf.fit(X=train_X, y=train_y)

In [None]:
# Predict the held-out records with the forest and summarise them against the actual classes
rf_test_pred = rf.predict(test_X)
classificationSummary(test_y, rf_test_pred)

Variable importance plot

In [None]:
# Pair every predictor with its random-forest importance, sorted from lowest to highest
importances = rf.feature_importances_
df = (
    pd.DataFrame({'feature': train_X.columns, 'importance': importances})
    .sort_values('importance')
)
print(df)

# Horizontal bar chart with one bar per feature; the y-axis label is blanked out
ax = df.plot.barh(x='feature', legend=False)
ax.set_ylabel('')

plt.tight_layout()
plt.show()

**Boosted Trees**

In [None]:
# Fit gradient-boosted trees on the training partition and summarise test-set performance.
# random_state is pinned, as it already is for the train/test split and the random forest,
# so the fitted ensemble and the reported summary are reproducible across runs.
boost = GradientBoostingClassifier(random_state=1)
boost.fit(train_X, train_y)
classificationSummary(test_y, boost.predict(test_X))