In [1]:
import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score



# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd")

Now that you've learned about random forests and decision trees, let's do an exercise in accuracy. You know that random forests are basically a collection of decision trees. But how do the accuracies of the two models compare?

So here's what you should do. Pick a dataset. It could be one you've worked with before or it could be a new one. Then build the best decision tree you can.

Now try to match that with the simplest random forest you can. For our purposes, measure simplicity with runtime. Compare that to the runtime of the decision tree. This is imperfect, but just go with it.

Hopefully out of this you'll see the power of random forests, but also their potential costs. Remember, in the real world you won't necessarily be dealing with thousands of rows. It could be millions, billions, or even more.

Submit a link to your models below.

In [2]:
# Read the spreadsheet, skipping its 4 leading rows, then discard the rows
# labelled 348-350.
df = (
    pd.read_excel('nyc_crimes.xls', skiprows=4)
    .drop(index=[348, 349, 350])
)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861.0,0.0,0.0,,0.0,0.0,0.0,12.0,2.0,10.0,0.0,0.0
1,Addison Town and Village,2577.0,3.0,0.0,,0.0,0.0,3.0,24.0,3.0,20.0,1.0,0.0
2,Akron Village,2846.0,3.0,0.0,,0.0,0.0,3.0,16.0,1.0,15.0,0.0,0.0
3,Albany,97956.0,791.0,8.0,,30.0,227.0,526.0,4090.0,705.0,3243.0,142.0,
4,Albion Village,6388.0,23.0,0.0,,3.0,4.0,16.0,223.0,53.0,165.0,5.0,


In [4]:
# Replace every missing value in the frame with 0.
df = df.fillna(value=0)

In [5]:
# Summary statistics for the numeric columns, computed after missing values
# were filled with 0.
df.describe()

Unnamed: 0,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,40037.632,201.595,1.566,0.0,5.865,72.902,121.261,792.606,119.684,637.017,35.905,1.006
std,450037.368,2815.269,18.304,0.0,60.425,1031.033,1706.132,7659.725,924.949,6346.054,403.424,7.885
min,526.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3003.0,2.0,0.0,0.0,0.0,0.0,1.0,40.5,6.0,31.0,0.0,0.0
50%,7233.5,6.0,0.0,0.0,0.0,1.0,4.0,112.5,17.5,94.0,2.0,0.0
75%,18427.5,22.0,0.0,0.0,2.0,5.0,14.0,341.0,51.25,287.25,7.0,0.0
max,8396126.0,52384.0,335.0,0.0,1112.0,19170.0,31767.0,141971.0,16606.0,117931.0,7434.0,132.0


In [6]:
# Select feature set and outcome of interest.
# NOTE(review): the features still include the individual violent-crime
# counts (murder, rape, robbery, aggravated assault). In the previewed rows
# these add up exactly to the violent-crime total, so the target is largely
# recoverable from the inputs -- confirm this is intended.
X = df.drop(columns=['City', 'Violent\ncrime'])
# 1-D Series target, matching how the later tree cell selects it.
y = df['Violent\ncrime']

# Instantiate our model.
# * criterion is left at its default (mean squared error): the explicit
#   'mse' spelling was deprecated in scikit-learn 1.0 and removed in 1.2,
#   whereas the default selects the same loss on every version.
# * max_features=1 makes each split consider a single randomly drawn feature,
#   so random_state is fixed to keep the scores reproducible between runs.
decision_tree = tree.DecisionTreeRegressor(
    max_features=1,
    max_depth=5,
    random_state=42)

decision_tree.fit(X, y)
y_pred = decision_tree.predict(X)

# In-sample R^2, then the R^2 of each of the 10 cross-validation folds.
print(decision_tree.score(X, y))
print(cross_val_score(decision_tree, X, y, cv=10))

0.9997538988200193
[0.92552617 0.8710441  0.33407379 0.37860602 0.86385764 0.82455219
 0.09312366 0.67562443 0.975176   0.74460225]


In [7]:
tree_time = %timeit decision_tree.fit(X, y)

233 µs ± 16.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [8]:
# Re-display the first rows; the previously missing entries now show as 0.
df.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,2.0,10.0,0.0,0.0
1,Addison Town and Village,2577.0,3.0,0.0,0.0,0.0,0.0,3.0,24.0,3.0,20.0,1.0,0.0
2,Akron Village,2846.0,3.0,0.0,0.0,0.0,0.0,3.0,16.0,1.0,15.0,0.0,0.0
3,Albany,97956.0,791.0,8.0,0.0,30.0,227.0,526.0,4090.0,705.0,3243.0,142.0,0.0
4,Albion Village,6388.0,23.0,0.0,0.0,3.0,4.0,16.0,223.0,53.0,165.0,5.0,0.0


In [9]:
# List the raw column labels (several contain embedded newline characters,
# which must be reproduced exactly when selecting columns).
df.columns

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3'],
      dtype='object')

In [10]:
# Select feature set and outcome of interest: a hand-picked subset of five
# columns as predictors, with the violent-crime total as the target.
X = df[['Population', 'Murder and\nnonnegligent\nmanslaughter', 'Robbery', 'Aggravated\nassault', 'Property\ncrime']]
y = df['Violent\ncrime']

# Instantiate our model.
# * criterion is left at its default (mean squared error): the explicit
#   'mse' spelling was deprecated in scikit-learn 1.0 and removed in 1.2,
#   whereas the default selects the same loss on every version.
# * max_features=1 makes each split consider a single randomly drawn feature,
#   so random_state is fixed to keep the scores reproducible between runs.
decision_tree = tree.DecisionTreeRegressor(
    max_features=1,
    max_depth=5,
    random_state=42)

decision_tree.fit(X, y)
y_pred = decision_tree.predict(X)

# In-sample R^2, then the R^2 of each of the 10 cross-validation folds.
print(decision_tree.score(X, y))
print(cross_val_score(decision_tree, X, y, cv=10))

0.9997779914937674
[ 0.92908443  0.8278059   0.19442297 -0.02037965  0.89615137  0.93523057
  0.09314056  0.67352965  0.94664842  0.93050851]


In [11]:
# Time repeated re-fits of the decision tree on the current X and y.
%timeit decision_tree.fit(X, y)

218 µs ± 9.94 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
# Baseline random forest.
# * n_estimators is pinned because the library default changed from 10 to 100
#   trees in scikit-learn 0.22; leaving it implicit makes the runtime
#   comparison depend on the installed version. 10 is presumably what the
#   recorded ~12 ms fit time reflects -- confirm.
# * random_state fixes the bootstrap sampling so the scores are reproducible.
rfr = RandomForestRegressor(n_estimators=10, random_state=42)
rfr.fit(X, y)

# In-sample R^2, then the R^2 of each of the 10 cross-validation folds.
print(rfr.score(X, y))
print(cross_val_score(rfr, X, y, cv=10))

0.9208537772633936
[0.97899675 0.79868149 0.95363237 0.9674167  0.9782446  0.98896502
 0.07789626 0.99546398 0.97696643 0.91076314]


In [13]:
# Time repeated re-fits of the random forest on the current X and y.
%timeit rfr.fit(X, y)

12.2 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
# Depth-limited random forest.
# * n_estimators is pinned because the library default changed from 10 to 100
#   trees in scikit-learn 0.22; leaving it implicit makes the runtime
#   comparison depend on the installed version. 10 is presumably what the
#   recorded ~12 ms fit time reflects -- confirm.
# * random_state fixes the bootstrap sampling so the scores are reproducible.
rfr = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=42)
rfr.fit(X, y)

# In-sample R^2, then the R^2 of each of the 10 cross-validation folds.
print(rfr.score(X, y))
print(cross_val_score(rfr, X, y, cv=10))

0.9648489010603581
[0.95404816 0.80296293 0.96110556 0.97863267 0.96680734 0.98193457
 0.0846909  0.9934425  0.9411987  0.98452889]


In [19]:
# Time repeated re-fits of the depth-limited random forest on X and y.
%timeit rfr.fit(X, y)

12.1 ms ± 517 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
