In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as scp
import IPython.display as dsp

from sklearn import tree, ensemble
from sklearn.model_selection import cross_val_score
from IPython.display import Image

import time

import pydotplus
import graphviz

__Attribute Information:__

1. Sample code number: id number (ID)
2. Clump Thickness: 1 - 10 (ct)
3. Uniformity of Cell Size: 1 - 10 (csu)
4. Uniformity of Cell Shape: 1 - 10 (cshu)
5. Marginal Adhesion: 1 - 10 (ma)
6. Single Epithelial Cell Size: 1 - 10 (secs)
7. Bare Nuclei: 1 - 10 (bn)
8. Bland Chromatin: 1 - 10 (bc)
9. Normal Nucleoli: 1 - 10 (nn)
10. Mitoses: 1 - 10 (mito)
11. Class: (2 for benign, 4 for malignant) (class)

In [31]:
breast = pd.read_csv('data/breast/breast-cancer-wisconsin.data.txt')

breast.columns = range(len(breast.columns))
breast.rename(columns = {0 : 'ID', 1 : 'ct', 2 : 'csu', 3 : 'cshu', 
                        4 : 'ma', 5 : 'secs', 6 : 'bn', 7 : 'bc',
                        8 : 'nn', 9 : 'mito', 10 : 'class'}, inplace = True)

breast.replace('?', np.nan, inplace = True)
breast.dropna(inplace = True)

breast.drop(['ID'], axis = 1, inplace = True)

#reset index
breast.index = range(len(breast))

breast = breast.apply(pd.to_numeric)

breast.head()

Unnamed: 0,ct,csu,cshu,ma,secs,bn,bc,nn,mito,class
0,5,4,4,5,7,10,3,2,1,2
1,3,1,1,1,2,2,3,1,1,2
2,6,8,8,1,3,4,3,7,1,2
3,4,1,1,3,2,1,3,1,1,2
4,8,10,10,8,7,10,9,7,1,4


In [32]:
X = breast[['ct','csu','cshu','ma','secs', 'bn', 'bc', 'nn', 'mito']]
Y = breast['class']

In [33]:
#Decision Tree

#Initialize tree
start_dt = time.time() #Timer start

decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy',
                                           max_features = 1,
                                           max_depth = 4)
#Fit tree
decision_tree.fit(X, Y)

end_dt = time.time() #Timer end

#Random Forest

#Initialize Random Forest

start_rf = time.time()
rfc = ensemble.RandomForestClassifier()

#Fit forest
rfc.fit(X, Y)
end_rf = time.time()

dt_scores = cross_val_score(decision_tree, X, Y, cv = 5)
rf_scores = cross_val_score(rfc, X, Y, cv = 5)

print('Decision Tree Stats:')
print('Decision Tree Scores:\n', dt_scores)
print('Average Score:', np.mean(dt_scores))
print('Runtime: ', end_dt - start_dt)

print('\nRandom Forest Stats:')
print('Random Forest Scores:\n',cross_val_score(rfc, X, Y, cv = 5))
print('Average Score:', np.mean(rf_scores))
print('Runtime: ', end_rf - start_rf)



Decision Tree Stats:
Decision Tree Scores:
 [ 0.86861314  0.94890511  0.97080292  0.96323529  0.96296296]
Average Score: 0.942903884993
Runtime:  0.005326032638549805

Random Forest Stats:
Random Forest Scores:
 [ 0.89781022  0.94890511  0.97080292  0.97794118  0.97777778]
Average Score: 0.960530111477
Runtime:  0.04987335205078125


In [35]:
#Render Tree (breaks on some computers)
try:
    dot_data = tree.export_graphviz(
        decision_tree, out_file = None,
        feature_names = X.columns,
        class_names = ['Benign', 'Malignant'],
        filled = True
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    Image(graph.create_png())
except:
    print('Tree render function not supported on this device.')

Tree render function not supported on this device.


# Decision Trees VS Random Forest

__Note:__ I did not seed my random number generators. Results will vary every time you run the program.

Decision Tree Stats:
Decision Tree Scores:
 [ 0.86861314  0.94890511  0.97080292  0.96323529  0.96296296]
Average Score: 0.942903884993
Runtime:  0.005326032638549805

Random Forest Stats:
Random Forest Scores:
 [ 0.89781022  0.94890511  0.97080292  0.97794118  0.97777778]
Average Score: 0.960530111477
Runtime:  0.04987335205078125

Eve though the random forest (comprised of many decision trees over sections of the data) intuitively seems more complex, the timing appears the same.