# Random forest vs decision tree accuracy

## Applying a decision tree to the diabetes dataset


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the diabetes dataset from its hosted CSV file and preview the first rows.
DATA_URL = 'https://storage.googleapis.com/scsu-data-science/diabetes.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Replace zeros and NaN with the column average for the following columns:
# Glucose, BloodPressure, SkinThickness, BMI, Insulin
# (presumably a zero in these columns encodes a missing measurement -- confirm
# against the dataset description).
#
# NOTE(review): the averages are computed over the whole frame, i.e. before
# the train/test split performed later in this notebook, so held-out rows
# contribute to the imputed values.

for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']:
    # Mark zeros as missing. `np.nan` is the canonical spelling; the `np.NaN`
    # alias was removed in NumPy 2.0 and raises AttributeError there.
    df[col] = df[col].replace(0, np.nan)
    # Series.mean() skips NaN, so the average covers observed values only.
    mean = df[col].mean()
    df[col] = df[col].fillna(mean)

df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [None]:
# Define the feature matrix X (independent variables) and the target y
# (the label - what we want to predict).

# Features: the 8 columns from 'Pregnancies' through 'Age' (label slices with
# .loc include both endpoints). Naming the first feature explicitly keeps any
# leading index-like column (such as 'Unnamed: 0') out of the model inputs.
X = df.loc[:, 'Pregnancies':'Age']

# Target: the 'Outcome' column
y = df.loc[:, 'Outcome']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so ties between equally good splits are otherwise broken differently
# from run to run.
tree_clf = DecisionTreeClassifier(
    criterion='entropy', max_depth=5, random_state=0)
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)
print('Test accuracy: ', tree_clf.score(X_test, y_test))

Test accuracy:  0.7402597402597403


# Better accuracy with a random forest?

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training split; the fixed seed keeps the
# fitted ensemble reproducible.
forest_model = RandomForestClassifier(random_state=0, n_estimators=100).fit(
    X_train, y_train)

# Keep the test-set predictions and report mean accuracy on the held-out rows.
y_pred = forest_model.predict(X_test)
forest_accuracy = forest_model.score(X_test, y_test)
print('Test accuracy: ', forest_accuracy)

Test accuracy:  0.8051948051948052


# k-fold cross-validation to obtain a better estimate of model accuracy

In [None]:
# 5-fold cross-validation of the decision tree model on the full X / y.

from sklearn.model_selection import cross_val_score, cross_validate

scores = cross_val_score(tree_clf, X, y, cv=5)

# Summarise the per-fold accuracies once and reuse the values below.
# The reported interval is the mean +/- 2 standard deviations of the fold scores.
mean_score = np.mean(scores)
two_sigma = 2 * np.std(scores)

print('CV accuracy scores for decision tree model: ', scores)
print('Mean CV accuracy score: ', mean_score)
print('95% confidence interval for CV accuracy score: ({}, {})'.format(
    mean_score - two_sigma,
    mean_score + two_sigma))

CV accuracy scores for decision tree model:  [0.73376623 0.66883117 0.74675325 0.77124183 0.73202614]
Mean CV accuracy score:  0.7305237246413717
95% confidence interval for CV accuracy score: (0.6627588129633102, 0.7982886363194333)


In [None]:
# Repeat the 5-fold cross-validation, this time for the random forest model.

scores = cross_val_score(forest_model, X, y, cv=5)

# Summarise the per-fold accuracies once and reuse the values below.
# The reported interval is the mean +/- 2 standard deviations of the fold scores.
mean_score = np.mean(scores)
two_sigma = 2 * np.std(scores)

print('CV accuracy scores for random forest model: ', scores)
print('Mean CV accuracy score: ', mean_score)
print('95% confidence interval for CV accuracy score: ({}, {})'.format(
    mean_score - two_sigma,
    mean_score + two_sigma))

CV accuracy scores for random forest model:  [0.73376623 0.73376623 0.75974026 0.83660131 0.75163399]
Mean CV accuracy score:  0.7631016042780748
95% confidence interval for CV accuracy score: (0.6868591534308192, 0.8393440551253305)
