# Challenge: If a tree falls in the forest...
Pick a dataset. It could be old or new. Then build the best decision tree you can.
Now try to match that with the simplest random forest you can. For our purposes measure simplicity with runtime.

In [18]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

# The models be used
from sklearn import ensemble
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

import time

In [2]:
# Import and Read Data
data = pd.read_csv('auto-mpg.csv')
data.head(1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu


In [3]:
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')

In [4]:
data.dropna(inplace=True)

In [5]:
new = data["car name"].str.split(" ", n = 1, expand = True)
data['brand'] = new[0]

data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,brand
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,chevrolet
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,buick
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,plymouth
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,amc
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,ford


In [16]:
data['mpg'].nunique()

127

In [6]:
# Dropping old Genre column 
data.drop(columns =["car name"], inplace = True) 

In [25]:
# A convenience for displaying visualizations.
# from IPython.display import Image

# Packages for rendering our tree.
# import pydotplus
# import graphviz



# Separate features from the target
X = data.drop('mpg', 1)
Y = data['mpg']
X = pd.get_dummies(X)
X = X.dropna(axis=1) 

# Initialize and train our tree.
dtc = tree.DecisionTreeRegressor(
    max_features=1,
    max_depth=7
)
dtc.fit(X, Y)

# Render our tree.
# dot_data = tree.export_graphviz(
#     decision_tree, out_file=None,
#     feature_names=customers.columns,
#     class_names=['Not Returning', 'Returning'],
#     filled=True
# )
# graph = pydotplus.graph_from_dot_data(dot_data)
# Image(graph.create_png())

"""""""""""
for whatever reason, I can't get graphviz to work. 
It's been installed, but I keep getting the error
ModuleNotFoundError: No module named 'graphviz'
"""""""""""


start_time = time.time()
print(cross_val_score(dtc, X, Y, cv=10))
print('\n')
print("Runtime --- %s seconds ---" % (time.time() - start_time))

[-0.22750116 -0.4804024   0.05503014  0.55182527  0.00171927 -0.09383339
 -0.70728115 -0.14086553  0.03085952 -0.08530729]


Runtime --- 0.07306289672851562 seconds ---


In [26]:
# Separate features from the target
rfc = ensemble.RandomForestRegressor()
X = data.drop('mpg', 1)
Y = data['mpg']
X = pd.get_dummies(X)
X = X.dropna(axis=1) 

rfc.fit(X,Y)

start_time = time.time()
print("Runtime --- %s seconds ---" % (time.time() - start_time))
print('\n')
print(cross_val_score(dtc, X, Y, cv=10))

Runtime --- 8.416175842285156e-05 seconds ---


[-0.19768142  0.38696305 -0.73795461 -0.38232816  0.61842622  0.02760237
  0.1083712  -0.23857036 -0.84386132 -0.36759726]


In [27]:
# Get the most important features
feature_imp = pd.Series(rfc.feature_importances_,index=X.columns).sort_values(ascending=False)[:10]
feature_imp

displacement        0.510473
weight              0.184383
model year          0.129564
cylinders           0.065107
horsepower          0.061144
acceleration        0.019095
brand_vw            0.007940
origin              0.004138
brand_volkswagen    0.002487
brand_oldsmobile    0.002474
dtype: float64

In [47]:
from sklearn.grid_search import GridSearchCV

# dtc = tree.DecisionTreeRegressor(
#     max_features=3,
#     max_depth=10
# )

features = X[['displacement', 'weight', 'model year', 'cylinders', 'horsepower']]
dtc.fit(features, Y)
# cross_val_score(dtc, features, Y, cv=10)

parameters = {
    'max_features':[1,3,5],
    'max_depth':[1,2,3,4,5,6,7,8,9,10]    
}

grid = GridSearchCV(dtc, parameters, cv=10, verbose=0)
#Fit the Data
grid.fit(features, Y)

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=3,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [1, 3, 5], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [48]:
print(grid.best_score_)
print(grid.best_params_)

0.6626477688996976
{'max_depth': 6, 'max_features': 1}


In [56]:
dtc = tree.DecisionTreeRegressor(
    max_features=1,
    max_depth=6
)

features = X[['displacement', 'weight', 'model year', 'cylinders', 'horsepower', 'acceleration']]
dtc.fit(features, Y)

start_time = time.time()
print("Runtime --- %s seconds ---" % (time.time() - start_time))
print('\n')
cross_val_score(dtc, features, Y, cv=10)

Runtime --- 7.295608520507812e-05 seconds ---




array([0.51955017, 0.88392739, 0.57481244, 0.75213828, 0.58981028,
       0.89149944, 0.65133877, 0.54036133, 0.12033724, 0.21317073])

In [57]:
rfc = ensemble.RandomForestRegressor()

start_time = time.time()
print("Runtime --- %s seconds ---" % (time.time() - start_time))
print('\n')
cross_val_score(rfc, features, Y, cv=10)

Runtime --- 6.29425048828125e-05 seconds ---




array([0.86059663, 0.88368803, 0.80176605, 0.86518996, 0.75883005,
       0.88774647, 0.80285438, 0.75131535, 0.35191108, 0.39081134])