In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score


In [3]:
# Load the data set
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv')

# drop features with low correlation
features_lights = df.drop(['date', 'Appliances','rv1', 'rv2', 'Visibility'], axis=1)

# Define target variables for lights energy use
target_lights = df['lights']



In [4]:
# Define decision tree hyperparameter values
parameters={
"max_depth" :  [3, 5, 7,10,13],
"max_leaf_nodes" : [10, 20, 30,40,50],
"min_samples_leaf" : [1, 2, 3],
"criterion" : ['friedman_mse', 'absolute_error', 'squared_error'],
"min_weight_fraction_leaf" : [0.1,0.2,0.3,0.5]}



In [5]:
# Select top features for lights energy use based on mutual information gain
selector_lights = SelectKBest(mutual_info_regression, k=10)
selector_lights.fit(features_lights, target_lights)
X_train_top_lights = selector_lights.transform(features_lights)
mask_lights = selector_lights.get_support() # boolean values of which features were selected
top_features_lights = features_lights.columns[mask_lights].tolist() # list of top features

In [6]:
# Split the data into training and testing sets for lights energy use
X_train_lights, X_test_lights, y_train_lights, y_test_lights = train_test_split(X_train_top_lights, target_lights, test_size=0.2, random_state=42)


In [7]:
# Define and fit the decision tree model(initial selection for hyperparameters) for lights energy use
dt_top_lights = DecisionTreeRegressor()
dt_top_lights.fit(X_train_lights, y_train_lights)

In [8]:
# Predict the lights energy use for the testing set using the top features
y_pred_lights = dt_top_lights.predict(X_test_lights)

In [9]:
# Calculate the mean absolute error for the lights energy use using the top features
scores_before_tuning = cross_val_score(dt_top_lights, X_train_lights, y_train_lights, cv=5, scoring='neg_mean_absolute_error')
print('Lights Energy use results')
print('Mean Absolute Error (before tuning):', -scores_before_tuning.mean())
print('Standard deviation (before tuning):', scores_before_tuning.std())

Lights Energy use results
Mean Absolute Error (before tuning): 0.0019001372744865848
Standard deviation (before tuning): 0.0025333492243427183


In [10]:
# use GridSearch to find the best hyperparameters for the decision tree
tuning_lights_model = GridSearchCV(dt_top_lights,param_grid=parameters,scoring='neg_mean_squared_error',cv=3,verbose=3)
tuning_lights_model.fit(X_train_lights, y_train_lights)


Fitting 3 folds for each of 900 candidates, totalling 2700 fits
[CV 1/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.1;, score=-3.369 total time=   0.0s
[CV 2/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.1;, score=-4.164 total time=   0.0s
[CV 3/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.1;, score=-4.050 total time=   0.0s
[CV 1/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2;, score=-13.025 total time=   0.0s
[CV 2/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2;, score=-14.419 total time=   0.0s
[CV 3/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2;, score=-14.566 total time=   0.0s
[CV 1/3] END criterio

In [11]:
#define and fit a new model that will use the best hyperparameters
lights_tuned_hyper_model= DecisionTreeRegressor(**tuning_lights_model.best_params_)
lights_tuned_hyper_model.fit(X_train_lights,y_train_lights)

In [12]:
#predict the lights energy use with tuned model
tuned_lights_pred=lights_tuned_hyper_model.predict(X_test_lights)

In [13]:
# Define and fit the decision tree model with tuned hyperparameters for lights energy use
scores_after_tuning = cross_val_score(lights_tuned_hyper_model, X_train_lights, y_train_lights, cv=5, scoring='neg_mean_absolute_error')
print('Mean Absolute Error (after tuning):', -scores_after_tuning.mean())
print('Standard deviation (after tuning):', scores_after_tuning.std())


Mean Absolute Error (after tuning): 0.5511671765862942
Standard deviation (after tuning): 0.015112035945263766
