# 8-3-1 Fitting Classification Trees

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn without model_selection
    from sklearn.cross_validation import cross_val_score
try:
    from sklearn.externals.six import StringIO
except ImportError:  # newer scikit-learn no longer bundles six
    from io import StringIO

# Load the Carseats data set and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='../Data/Carseats.csv')
df.head(n=3)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,High
1,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes,Yes
2,11.22,111,48,16,260,83,Good,65,10,Yes,Yes,Yes
3,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes,Yes


Next we use pandas to convert the continuous `Sales` variable to a binary variable called `High`: `High` is `No` if `Sales` is less than or equal to 8, and `Yes` otherwise. The CSV file loaded above already contains a `High` column, so the conversion cell below is left commented out.

In [2]:
# Optional: derive the binary `High` label from `Sales` (Sales <= 8 -> 'No',
# otherwise 'Yes'). Needed only if the High column is missing from df.
#df['High'] =  pd.cut(df.Sales,bins=[-np.inf,8,np.inf],labels=['No','Yes'])
#df.head(3)

In [3]:
# Convert the string-valued columns to pandas categoricals.
# ShelveLoc's categories are listed as Bad, Medium, Good so that the integer
# category codes follow that order (Bad=0, Medium=1, Good=2).
# reorder_categories returns a new Series, so its result is assigned back to
# the frame; the `inplace=True` form is not accepted by current pandas.
df['ShelveLoc'] = (
    df.ShelveLoc.astype('category')
    .cat.reorder_categories(['Bad', 'Medium', 'Good'])
)
df['Urban'] = df.Urban.astype('category')
df['US'] = df.US.astype('category')
df['High'] = df.High.astype('category')
df.dtypes

Sales           float64
CompPrice         int64
Income            int64
Advertising       int64
Population        int64
Price             int64
ShelveLoc      category
Age               int64
Education         int64
Urban          category
US             category
High           category
dtype: object

In [4]:
# Build the feature table used to grow the tree: every column except `Sales`
# and `High`, with each categorical column replaced by its integer category
# codes.
data = df.drop(['Sales', 'High'], axis=1)
data = data.assign(**{name: data[name].cat.codes
                      for name in ('ShelveLoc', 'Urban', 'US')})
data.head(3)

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
1,138,73,11,276,120,0,42,17,1,1
2,111,48,16,260,83,2,65,10,1,1
3,113,35,10,269,80,1,59,12,1,1


Next, we create and train a decision tree classifier called `TR`. We limit the number of leaf nodes to 27 to approximately match the tree classifier trained in ISLR. We also compute the training classification error rate, i.e. the error on the same observations that were used to fit the tree.

In [5]:
# train decision tree
# Fit a classification tree (at most 27 leaves) on the full data set and
# report its training error rate, depth and node count.
x = data.values
y = df.High.cat.codes
# random_state is fixed so that re-running the cell rebuilds the same tree;
# without it scikit-learn may resolve ties between equally good splits
# differently from run to run.
TR = tree.DecisionTreeClassifier(max_leaf_nodes=27, random_state=0)
TR.fit(x,y)
y_pred = TR.predict(x)
# error rate = 1 - accuracy, measured on the training observations
error = 1 - accuracy_score(y,y_pred)
print('training error rate = %0.2f' % error)
print('tree depth = %d' % TR.tree_.max_depth)
print('total node count = %d' % TR.tree_.node_count)

training error rate = 0.07
tree depth = 9
total node count = 53


In [6]:
# Write the fitted tree to a Graphviz .dot file; the file can then be
# visualized with xdot and printed to PDF.
# (requires Graphviz to be installed)
# http://www.graphviz.org/Download
# Feature names are the columns of df other than Sales and High.
features = df.drop(['Sales','High'],axis=1).columns.values
with open('Carseats-9.dot','w') as f:
    # export_graphviz writes into the open handle; its return value is not
    # needed, so `f` is not rebound and keeps referring to the file.
    tree.export_graphviz(TR,out_file=f,
                         feature_names=features)

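Next we guard against overfitting by using 10-fold cross-validation to choose the maximum tree depth. In general this is less accurate than the cost-complexity pruning procedure described in the book (ISLR). scikit-learn releases before 0.22 do not implement that pruning procedure; later releases expose minimal cost-complexity pruning through the `ccp_alpha` parameter of `DecisionTreeClassifier`.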

In [7]:
# Choose the tree depth by 10-fold cross-validation: for each candidate depth
# from 1 to 20, average the per-fold error rates and keep the depth with the
# lowest mean error seen so far. The comparison is strict, so on a tie the
# shallower tree is kept. A line is printed each time the best error improves.
error_opt = 1.0
for depth in range(1, 21):
    # random_state is fixed so the selected depth is reproducible across runs.
    TR = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    # cross_val_score returns one accuracy per fold; 1 - accuracy is the error
    error = (1 - cross_val_score(TR,x,y,cv=10)).mean()
    if error < error_opt:
        error_opt = error
        max_depth_opt = depth
        print('max_depth = %d  test error rate %0.3f' % (max_depth_opt,error_opt))

max_depth = 1  test error rate 0.293
max_depth = 2  test error rate 0.280
max_depth = 4  test error rate 0.251


In [8]:
# Refit on the full data set at the depth selected by cross-validation
# (max_depth_opt) instead of a hard-coded value, then export the tree to a
# Graphviz .dot file named after that depth.
# random_state is fixed so that re-running the cell rebuilds the same tree.
TR = tree.DecisionTreeClassifier(max_depth=max_depth_opt, random_state=0)
TR.fit(x,y)
# Feature names are the columns of df other than Sales and High.
features = df.drop(['Sales','High'],axis=1).columns.values
with open('Carseats-%d.dot' % max_depth_opt,'w') as f:
    # export_graphviz writes into the open handle; its return value is not
    # needed, so `f` is not rebound and keeps referring to the file.
    tree.export_graphviz(TR,out_file=f,
                         feature_names=features)