In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score


In [4]:
# Load the data set
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv')

In [5]:
# drop features with low correlation
features_appliances = df.drop(['date', 'lights','rv1', 'rv2', 'Visibility'], axis=1)

# Define target variables for Appliances energy use
target_appliances = df['Appliances']


In [6]:

# Define decision tree hyperparameter values\
parameters={
"max_depth" :  [3, 5, 7,10,13],
"max_leaf_nodes" : [10, 20, 30,40,50],
"min_samples_leaf" : [1, 2, 3],
"criterion" : ['friedman_mse', 'absolute_error', 'squared_error'],
"min_weight_fraction_leaf" : [0.1,0.2,0.3,0.5]}

# Select top features for Appliances energy use based on mutual information gain
selector_appliances = SelectKBest(mutual_info_regression, k=1)
selector_appliances.fit(features_appliances, target_appliances)
X_train_top_appliances = selector_appliances.transform(features_appliances)
mask_appliances = selector_appliances.get_support() # boolean values of which features were selected
top_features_appliances = features_appliances.columns[mask_appliances].tolist() # list of top features

# Split the data into training and testing sets for Appliances energy use
X_train_appliances, X_test_appliances, y_train_appliances, y_test_appliances = train_test_split(X_train_top_appliances, target_appliances, test_size=0.2, random_state=42)




In [15]:
# Define and fit the decision tree model(initial selection for hyperparameters) for Appliances energy use
dt_top_appliances = DecisionTreeRegressor()
dt_top_appliances.fit(X_train_appliances, y_train_appliances)

# Predict the Appliances energy use for the testing set using the top features
y_pred_appliances = dt_top_appliances.predict(X_test_appliances)

# Calculate the mean absolute error for the appliances energy use using the top features
scores_before_tuning = cross_val_score(dt_top_appliances, X_train_appliances, y_train_appliances, cv=5, scoring='neg_mean_absolute_error')
print('Appliances energy used results')
print('Mean Absolute Error (before tuning):', -scores_before_tuning.mean())
print('Standard deviation (before tuning):', scores_before_tuning.std())

Appliances energy used results
Mean Absolute Error (before tuning): 0.009500886978141802
Standard deviation (before tuning): 0.00447847765226594


In [8]:
# use GridSearch to find the best hyperparameters for the decision tree
tuning_appliances_model = GridSearchCV(dt_top_appliances,param_grid=parameters,scoring='neg_mean_squared_error',cv=3,verbose=3)
tuning_appliances_model.fit(X_train_appliances, y_train_appliances)


Fitting 3 folds for each of 900 candidates, totalling 2700 fits
[CV 1/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.1;, score=-2262.035 total time=   0.0s
[CV 2/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.1;, score=-2298.005 total time=   0.0s
[CV 3/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.1;, score=-1717.502 total time=   0.0s
[CV 1/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2;, score=-5229.254 total time=   0.0s
[CV 2/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2;, score=-5163.424 total time=   0.0s
[CV 3/3] END criterion=friedman_mse, max_depth=3, max_leaf_nodes=10, min_samples_leaf=1, min_weight_fraction_leaf=0.2;, score=-4596.590 total time=   0.0s
[CV 1/

In [11]:
tuning_appliances_model.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 5,
 'max_leaf_nodes': 10,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.1}

In [12]:
#define and fit a new model that will use the best hyperparameters
appliances_tuned_hyper_model= DecisionTreeRegressor(**tuning_appliances_model.best_params_)
appliances_tuned_hyper_model.fit(X_train_appliances,y_train_appliances)

In [13]:
#predict the Appliance energy use with tuned model
tuned_appliances_pred=appliances_tuned_hyper_model.predict(X_test_appliances)

In [14]:
#evaluate hypertuned model using cross validation
scores_after_tuning = cross_val_score(appliances_tuned_hyper_model, X_train_appliances, y_train_appliances, cv=5, scoring='neg_mean_absolute_error')
print('Mean Absolute Error (after tuning):', -scores_after_tuning.mean())
print('Standard deviation (after tuning):', scores_after_tuning.std())



Mean Absolute Error (after tuning): 15.20162503708068
Standard deviation (after tuning): 1.120613933142862
