# Decision Trees

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model as sk_lm
from sklearn import model_selection as sk_msel
from sklearn import metrics as sk_metrics
from sklearn import preprocessing as sk_pre
from sklearn import tree as sk_tree
import statsmodels.api as sm
import graphviz

### Classification

In [3]:
# Read the Carseats data from the relative path data/Carseats.csv.
df_cs = pd.read_csv(filepath_or_buffer='data/Carseats.csv')

In [4]:
# 'Unnamed: 0' is the name pandas gives a header-less first CSV column
# (presumably the saved row index); it is not a predictor, so discard it.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df_cs = df_cs.drop(columns='Unnamed: 0', errors='ignore')
df_cs.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [5]:
# List the distinct values found in the ShelveLoc column.
df_cs['ShelveLoc'].unique()

array(['Bad', 'Good', 'Medium'], dtype=object)

In [6]:
# Build the binary response and turn the categorical predictors into integers.
# High = 1 when Sales > 8, otherwise 0. pd.cut intervals are right-closed by
# default, so Sales == 8 lands in class 0.
bins = [-np.inf, 8, np.inf]
labels = [0, 1]
df_cs['High'] = pd.cut(df_cs['Sales'], bins=bins, labels=labels)

# Series.map converts every value that is missing from the mapping into NaN,
# so mapping an already-encoded column would silently wipe it out. Only encode
# columns that are not numeric yet, which makes this cell safe to re-run.
encodings = {
    'Urban': {'Yes': 1, 'No': 0},
    'US': {'Yes': 1, 'No': 0},
    'ShelveLoc': {'Bad': 0, 'Medium': 1, 'Good': 2},
}
for column, mapping in encodings.items():
    if not pd.api.types.is_numeric_dtype(df_cs[column]):
        df_cs[column] = df_cs[column].map(mapping)

df_cs.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,High
0,9.5,138,73,11,276,120,0,42,17,1,1,1
1,11.22,111,48,16,260,83,2,65,10,1,1,1
2,10.06,113,35,10,269,80,1,59,12,1,1,1
3,7.4,117,100,4,466,97,1,55,14,1,1,0
4,4.15,141,64,3,340,128,0,38,13,1,0,0


In [7]:
df_cs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Sales        400 non-null    float64 
 1   CompPrice    400 non-null    int64   
 2   Income       400 non-null    int64   
 3   Advertising  400 non-null    int64   
 4   Population   400 non-null    int64   
 5   Price        400 non-null    int64   
 6   ShelveLoc    400 non-null    int64   
 7   Age          400 non-null    int64   
 8   Education    400 non-null    int64   
 9   Urban        400 non-null    int64   
 10  US           400 non-null    int64   
 11  High         400 non-null    category
dtypes: category(1), float64(1), int64(10)
memory usage: 35.0 KB


In [8]:
# Predictors: every column except the response (High) and Sales, the column
# High was binned from. Response: the binary High label.
X = df_cs.drop(['Sales', 'High'], axis=1)
y = df_cs['High']

In [9]:
# Depth-limited classification tree fitted on the full data set.
# scikit-learn randomly permutes the candidate features at each split, so
# without a fixed seed the fitted tree (and every score derived from it) can
# differ between runs. Pin random_state so Restart & Run All is reproducible.
tree_cs = sk_tree.DecisionTreeClassifier(max_depth=6, random_state=0)
tree_cs = tree_cs.fit(X, y)

In [10]:
# Accuracy on the same data the tree was fitted to (an optimistic estimate).
train_accuracy = tree_cs.score(X, y)
train_accuracy

0.92

In [11]:
# Mean accuracy over 10 cross-validation folds.
cv_scores = sk_msel.cross_val_score(tree_cs, X, y, cv=10)
cv_scores.mean()

0.7125

In [24]:
# Confusion matrix built from out-of-fold predictions (10-fold CV): each
# observation is predicted by a model that did not see it during fitting.
# Rows index the true class, columns the predicted class (scikit-learn's
# convention).
y_pre = sk_msel.cross_val_predict(estimator=tree_cs, X=X, y=y, cv=10)
sk_metrics.confusion_matrix(y_true=y, y_pred=y_pre)

array([[181,  55],
       [ 64, 100]])

### Regression