In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
petrol_data = pd.read_csv('petrol_consumption.csv')
X = petrol_data.drop('Petrol_Consumption', axis=1)
y = petrol_data['Petrol_Consumption']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [2]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier

# Create the model with 100 trees
randomForestModel = RandomForestClassifier(n_estimators=200,
                               bootstrap = True,
                               max_features = 'sqrt')
# Fit on training data
randomForestModel.fit(X_train, y_train)
RandomForestClassifier(max_features='sqrt', n_estimators=200)
rf_predictions = randomForestModel.predict(X_test)

In [3]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rf_predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_predictions)))

Mean Absolute Error: 55.7
Mean Squared Error: 5066.7
Root Mean Squared Error: 71.18075582627652


In [4]:
feature_list=X_train.columns
# Get numerical feature importances
importances = list(randomForestModel.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Average_income       Importance: 0.29
Variable: Paved_Highways       Importance: 0.28
Variable: Population_Driver_licence(%) Importance: 0.28
Variable: Petrol_tax           Importance: 0.15


In [5]:
type(feature_list)
feature_list

Index(['Petrol_tax', 'Average_income', 'Paved_Highways',
       'Population_Driver_licence(%)'],
      dtype='object')

In [6]:
# New random forest with only the two most important variables
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
rf_most_important = RandomForestRegressor(n_estimators= 500, random_state=5)
# Extract the two most important features
important_indices = [feature_list[2], feature_list[1]]
train_important = X_train.loc[:, ['Paved_Highways','Average_income','Population_Driver_licence(%)']]
test_important = X_test.loc[:, ['Paved_Highways','Average_income','Population_Driver_licence(%)']]
# Train the random forest
rf_most_important.fit(train_important, y_train)
# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
predictions

array([605.83 , 484.104, 623.094, 589.88 , 628.962, 607.238, 604.546,
       572.176, 473.598, 510.536])

In [7]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

Mean Absolute Error: 56.80640000000001
Mean Squared Error: 4410.0591032
Root Mean Squared Error: 66.40827586378072
