In [17]:
%matplotlib inline
import math
import scipy
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

## World Happiness Report: decision tree vs random forest

### Prep Data

I want to use the same data as the last lesson. I want to see how decision tree and random forest does compared to ordinary least squares and k nearest neighbors.

In [11]:
# Read data
df = pd.read_csv('../Data/world_happiness_report_2/2017.csv')
df.columns

Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
       'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
       'Health..Life.Expectancy.', 'Freedom', 'Generosity',
       'Trust..Government.Corruption.', 'Dystopia.Residual'],
      dtype='object')

In [12]:
# Clean data
df = df[['Happiness.Score', 'Economy..GDP.per.Capita.', 'Family',
       'Health..Life.Expectancy.', 'Freedom', 'Generosity',
       'Trust..Government.Corruption.']]

df = df.rename(index=str, columns={'Happiness.Score': 'happiness_score',
                              'Economy..GDP.per.Capita.': 'gdp_per_capita',
                              'Health..Life.Expectancy.': 'life_expectancy',
                              'Trust..Government.Corruption.': 'trust_in_government',
                              'Family': 'family',
                              'Freedom': 'freedom',
                              'Generosity': 'generosity'})

df.head()

Unnamed: 0,happiness_score,gdp_per_capita,family,life_expectancy,freedom,generosity,trust_in_government
0,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964
1,7.522,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077
2,7.504,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527
3,7.494,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007
4,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612


### Decision Tree

In [84]:
from sklearn import tree
from sklearn.model_selection import train_test_split
import time

X = df.drop(['happiness_score'], axis=1)
y = df['happiness_score']

# Split data to test later
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X, y, test_size=0.2)

# Start clock
start_time = time.time()

# Initialize and train our tree.
dtree = tree.DecisionTreeRegressor(
    criterion='mse',
    max_features=1,
)

dtree.fit(X_train_dt, y_train_dt)

# Model Accuracy
print('Decision Tree Accuracy: %0.3f' % dtree.score(X_test_dt, y_test_dt))

# Print runtime
dt_time = time.time() - start_time
print("--- %s seconds ---" % dt_time)


Decision Tree Accuracy: 0.446
--- 0.0024878978729248047 seconds ---


Our decision tree, in this instance, it 44.6% accurate. But this number does fluctuate quite a bit everytime we reinstance the model.

### Random Forest

In [86]:
from sklearn import ensemble

X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.2)

# Start clock
start_time = time.time()

# Initialize random forest regressor
rfr = ensemble.RandomForestRegressor(n_estimators=100)

# Fit model to training data
rfr.fit(X_train_rf, y_train_rf)

# Test model with test data
score_1 = rfr.score(X_test_rf, y_test_rf)
print('Accuracy: %0.3f' % score_1)

# Print runtime
rf_time = time.time() - start_time
print("--- %s seconds ---" % rf_time)


Accuracy: 0.702
--- 0.07568931579589844 seconds ---


As expected, the random forest model gives us a more accurate predictions. 

In [88]:
# Let's compare the time for running the two models
diff = abs(dt_time - rf_time)/dt_time
print(diff)

29.422999520843316


So the runtime for the random forest with 100 estimators, or 100 decision trees, is nearly 30 times more time-intensive than running the lone decision tree for this dataset.