In [30]:
import pandas as pd

import numpy as np

import warnings

# Silence every warning for the rest of the session; note that this also hides
# library deprecation notices that would otherwise appear under the cells below.
warnings.filterwarnings('ignore')

In [31]:
from sklearn.model_selection import train_test_split

# Census data: every column except the last is used as a feature,
# and the last column is used as the target.
df_census = pd.read_csv('census_cleaned.csv')
X, y = df_census.iloc[:, :-1], df_census.iloc[:, -1]

# Fixed seed so the hold-out split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [32]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Baseline: an untuned decision tree fit on the training split and scored on
# the held-out split.
clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is the same either way, but passing the arguments in
# the documented order keeps this call consistent with the other metric calls
# in the notebook.
accuracy_score(y_test, y_pred)

0.8131679154894976

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Bike rentals: all columns but the last are features, the last is the target.
df_bikes = pd.read_csv('bike_rentals_cleaned.csv')
X_bikes, y_bikes = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

# 5-fold cross-validation of an untuned regression tree. The scorer returns
# negated MSE values, one per fold.
reg = DecisionTreeRegressor(random_state=2)
scores = cross_val_score(
    reg,
    X_bikes,
    y_bikes,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [34]:
# The fold scores are negated MSE values: flip the sign, then take the square
# root to get one RMSE per fold.
rmse = np.sqrt(-scores)
print('RMSE mean: %0.2f' % rmse.mean())

RMSE mean: 1233.36


In [35]:
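# Disabled scratch code, kept for reference only. It fits an untuned regression
# tree and scores it on the same data it was trained on, so the RMSE it would
# print is a training error rather than an estimate of generalization error.
# reg = DecisionTreeRegressor()
# reg.fit(X_train, y_train)
# y_pred = reg.predict(X_train)
# from sklearn.metrics import mean_squared_error
# reg_mse = mean_squared_error(y_train, y_pred)
# reg_rmse = np.sqrt(reg_mse)
# print(reg_rmse)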

In [75]:
from sklearn.model_selection import GridSearchCV

# Re-split using the bike data. This rebinds X_train / X_test / y_train /
# y_test, which held the census split earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

# Exhaustive 5-fold search over tree depth and minimum leaf size.
params = {
    'max_depth': [6, 7, 8, 9, 10],
    'min_samples_leaf': [3, 5, 7, 9],
}
reg = DecisionTreeRegressor(random_state=2)
grid_reg = GridSearchCV(
    reg,
    params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_reg.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=2), n_jobs=-1,
             param_grid={'max_depth': [6, 7, 8, 9, 10],
                         'min_samples_leaf': [3, 5, 7, 9]},
             scoring='neg_mean_squared_error')

In [76]:
# Parameter combination that the fitted grid search selected as best.
best_params = grid_reg.best_params_
print("Best params:", best_params)

Best params: {'max_depth': 9, 'min_samples_leaf': 7}


In [77]:
# The search was scored with 'neg_mean_squared_error', so negate best_score_
# before taking the square root to express it as an RMSE.
# NOTE(review): despite the "Training score" label, this looks like the mean
# 5-fold cross-validation score of the best candidate, not a score on the
# training data itself -- confirm before quoting it.
best_score = np.sqrt(-grid_reg.best_score_)
print("Training score: {:.3f}".format(best_score))

Training score: 888.905


In [78]:
# mean_squared_error is not imported anywhere else in the notebook (only inside
# a commented-out cell), so import it here to keep this cell runnable on a
# fresh kernel.
from sklearn.metrics import mean_squared_error

# Score the best estimator found by the grid search on the held-out split.
best_model = grid_reg.best_estimator_
y_pred = best_model.predict(X_test)

# Square root of the MSE gives the RMSE, in the same units as the target.
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print('Test score: {:.3f}'.format(rmse_test))

Test score: 878.538


In [61]:
def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):
    """Run a 5-fold grid search on the current training split and return it.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV`` unchanged.
    reg : estimator, optional
        Estimator to tune. The default instance is created once, when this
        function is defined, and is shared by every call that omits ``reg``.

    Returns
    -------
    GridSearchCV
        The search object after fitting on the notebook-level ``X_train`` and
        ``y_train``; read ``best_params_``, ``best_score_`` or call
        ``predict`` on it.
    """
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    grid_reg.fit(X_train, y_train)

    # Return the fitted search: `grid_reg` is local to this function, so
    # without a return value the whole search would be thrown away.
    return grid_reg

In [62]:
# mean_squared_error is not imported anywhere else in the notebook (only inside
# a commented-out cell), so import it here to keep this cell runnable on a
# fresh kernel.
from sklearn.metrics import mean_squared_error

# Report on the notebook-level `grid_reg` search object.
# NOTE(review): the recorded output of this cell shows
# {'min_samples_leaf': 8}, a value that is not in the max_depth /
# min_samples_leaf grid fitted earlier in this notebook. The output presumably
# came from a different search than the one a top-to-bottom run leaves in
# `grid_reg` -- confirm and re-run.
best_params = grid_reg.best_params_
print("Best params:", best_params)

# Negated MSE -> RMSE.
best_score = np.sqrt(-grid_reg.best_score_)
print("Training score: {:.3f}".format(best_score))

# Evaluate the fitted search on the held-out split.
y_pred = grid_reg.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print('Test score: {:.3f}'.format(rmse_test))

Best params: {'min_samples_leaf': 8}
Training score: 896.083
Test score: 855.620


In [63]:
# Sanity check: (rows, columns) of whichever training split X_train currently holds.
X_train.shape

(548, 12)

In [80]:
# Load the heart-disease data and preview the first rows.
df_heart = pd.read_csv('heart_disease.csv')
df_heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [81]:
from sklearn.model_selection import train_test_split

# Heart data: all columns but the last are features, the last is the target.
# This rebinds X, y and the train/test names used by earlier sections.
X, y = df_heart.iloc[:, :-1], df_heart.iloc[:, -1]

# Fixed seed so the hold-out split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [82]:
# Baseline for the heart data: 5-fold cross-validated accuracy of an untuned
# decision tree over the full feature matrix.
model = DecisionTreeClassifier(random_state=2)
scores = cross_val_score(model, X, y, cv=5)

print('Accuracy:', scores.round(2))
print('Accuracy mean: %0.2f' % scores.mean())


Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


In [85]:
from sklearn.model_selection import RandomizedSearchCV
def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):
    """Tune ``clf`` with a randomized 5-fold search and report its scores.

    Parameters
    ----------
    params : dict
        Search space handed to ``RandomizedSearchCV`` unchanged.
    runs : int, optional
        Number of sampled candidates (``n_iter``).
    clf : estimator, optional
        Estimator to tune. The default instance is created once, when this
        function is defined.

    Returns
    -------
    estimator
        ``best_estimator_`` of the search, which was fitted on the
        notebook-level ``X_train`` / ``y_train``.

    Prints the search's ``best_score_`` (labelled "Training score") and the
    accuracy of the best estimator on the notebook-level ``X_test`` /
    ``y_test`` (labelled "Test score").
    """
    search = RandomizedSearchCV(
        clf,
        params,
        n_iter=runs,
        cv=5,
        n_jobs=-1,
        random_state=2,
    )
    search.fit(X_train, y_train)
    tuned_clf = search.best_estimator_

    print("Training score: {:.3f}".format(search.best_score_))

    # Held-out accuracy of the selected model.
    test_accuracy = accuracy_score(y_test, tuned_clf.predict(X_test))
    print('Test score: {:.3f}'.format(test_accuracy))

    return tuned_clf

In [86]:
# Broad random search over the main decision-tree hyperparameters.
randomized_search_clf(
    params={
        'criterion': ['entropy', 'gini'],
        'splitter': ['random', 'best'],
        'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
        'min_samples_leaf': [1, 0.01, 0.02, 0.03, 0.04],
        'min_impurity_decrease': [0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
        'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45, 50, None],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no
        # longer accepted by current scikit-learn releases.
        'max_features': ['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
        'max_depth': [None, 2, 4, 6, 8],
        # This key used to appear twice in the literal. Python keeps only the
        # last value for a repeated key, so the shorter first list was dead
        # code; the effective list is kept here.
        'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
    }
)

Training score: 0.798
Test score: 0.855


DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=0.8,
                       max_leaf_nodes=45, min_samples_leaf=0.04,
                       min_samples_split=10, min_weight_fraction_leaf=0.05,
                       random_state=2)

In [87]:
# Narrowed search around the values picked by the broad search, with more
# sampled candidates.
randomized_search_clf(
    params={
        'max_depth': [None, 6, 7],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no
        # longer accepted by current scikit-learn releases.
        'max_features': ['sqrt', 0.78],
        'max_leaf_nodes': [45, None],
        'min_samples_leaf': [1, 0.035, 0.04, 0.045, 0.05],
        'min_samples_split': [2, 9, 10],
        'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
    },
    runs=100,
)

Training score: 0.802
Test score: 0.868


DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06, random_state=2)

In [92]:
# Cross-validate a tree with explicitly spelled-out hyperparameters on the
# full data set (5 folds).
model = DecisionTreeClassifier(
    class_weight=None,
    criterion='gini',
    max_depth=7,
    max_features=0.78,
    max_leaf_nodes=45,
    min_impurity_decrease=0.0,
    min_samples_leaf=0.045,
    min_samples_split=9,
    min_weight_fraction_leaf=0.06,
    random_state=2,
    splitter='best',
)
scores = cross_val_score(model, X, y, cv=5)

print('Accuracy:', scores.round(2))
print('Accuracy mean: %0.2f' % scores.mean())

Accuracy: [0.82 0.9  0.8  0.8  0.78]
Accuracy mean: 0.82


In [94]:
# Final classifier, fitted on the complete data set so that the feature
# importances reflect every row.
# NOTE(review): these hyperparameters (max_depth=9, max_features=0.8,
# max_leaf_nodes=47, min_samples_leaf=1, min_samples_split=8, ...) are not the
# ones printed by the randomized searches earlier in the notebook -- confirm
# they are intentional.
best_clf = DecisionTreeClassifier(
    class_weight=None,
    criterion='gini',
    max_depth=9,
    max_features=0.8,
    max_leaf_nodes=47,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=8,
    min_weight_fraction_leaf=0.05,
    random_state=2,
    splitter='best',
)
best_clf.fit(X, y)

best_clf.feature_importances_

array([0.04830121, 0.04008887, 0.47546568, 0.        , 0.        ,
       0.        , 0.        , 0.00976578, 0.        , 0.02445397,
       0.02316427, 0.1774694 , 0.20129082])

In [96]:
import operator

# Map each feature name to its importance in the fitted tree.
feature_dict = {
    name: weight
    for name, weight in zip(X.columns, best_clf.feature_importances_)
}

# Three most important features, largest importance first.
sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True)[:3]

[('cp', 0.47546567857183675),
 ('thal', 0.20129082387838435),
 ('ca', 0.1774694042213901)]