In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import multioutput
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

In [4]:
raw_data = pd.read_csv('../data/ENB2012_data.csv')

df = raw_data.copy()  # keep the raw data as original

# Seperate the features and the target variables
X = df.iloc[:,0:-2]
y = np.log(df[['heating', 'cooling']])

In [5]:
# Devide them to training, validation and test parts (60:20:20): 
X_train_full_df, X_test_df, y_train_full_df, y_test = train_test_split(X, y, test_size = 0.20, random_state = 155)
X_train_df, X_val_df, y_train, y_val = train_test_split(X_train_full_df, y_train_full_df, test_size = 0.25, random_state = 155)

# Vectorize feature matrices in the form of dictionary (with renewed indexes):
dv = DictVectorizer(sparse=False)

X_train_df = X_train_df.reset_index(drop=True)
X_train_dict = X_train_df.to_dict(orient='records')
X_train = dv.fit_transform(X_train_dict)

X_val_df = X_val_df.reset_index(drop=True)
X_val_dict = X_val_df.to_dict(orient='records')
X_val = dv.fit_transform(X_val_dict)

X_test_df = X_test_df.reset_index(drop=True)
X_test_dict = X_test_df.to_dict(orient='records')
X_test = dv.fit_transform(X_test_dict)

# Renew the index of target variables
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# Linear Regression

In [6]:
# Apply the base model
lin = LinearRegression()
lin.fit(X_train, y_train)

#GEt prediction with validation dataset
y_pred = lin.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred))
rmse= np.sqrt(mean_squared_error(y_val,y_pred))
print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.923   RMSE: 0.122


### Parameter Tuning

In [7]:
lin_parameters = {'normalize' : [True,False],
                'positive' : [True,False],
                'fit_intercept' : [True,False]            
                }


lin_v2 = GridSearchCV(lin, lin_parameters, cv=10, n_jobs=-1)


lin_v2.fit(X_train, y_train)

print('Best Paraneters', lin_v2.best_params_)

y_pred_ = lin_v2.best_estimator_.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred_))
rmse= np.sqrt(mean_squared_error(y_val,y_pred_))
print(f"Best r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

{'fit_intercept': True, 'normalize': True, 'positive': False}
r2 score: 0.923   RMSE: 0.122


# Lasso Regression

In [8]:
# Apply the base model
ls = Lasso()
ls.fit(X_train, y_train)

#GEt prediction with validation dataset
y_pred = ls.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred))
rmse= np.sqrt(mean_squared_error(y_val,y_pred))
print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.812   RMSE: 0.194


### Parameter Tuning

In [9]:
ls_parameters = {'normalize' : [True,False],
                'positive' : [True,False],
                'fit_intercept' : [True,False],
                'max_iter': [1000, 5000, 10000],
                'alpha': [0.5, 1, 1.5, 2.0]                
                }

ls_v2 = GridSearchCV(ls, ls_parameters, cv=10, n_jobs=-1)


ls_v2.fit(X_train, y_train)

print('Best Paraneters', ls_v2.best_params_)

y_pred_ = ls_v2.best_estimator_.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred_))
rmse= np.sqrt(mean_squared_error(y_val,y_pred_))
print(f"Best r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

{'alpha': 0.5, 'fit_intercept': True, 'max_iter': 1000, 'normalize': False, 'positive': False}
r2 score: 0.817   RMSE: 0.192


# Ridge Regression

In [10]:
# Apply the base model
rd = Ridge()
rd.fit(X_train, y_train)

#GEt prediction with validation dataset
y_pred = rd.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred))
rmse= np.sqrt(mean_squared_error(y_val,y_pred))
print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.921   RMSE: 0.123


### Parameter Tuning

In [11]:
rd_parameters = {'normalize' : [True,False],
                'fit_intercept' : [True,False],
                'max_iter': [1000, 5000, 10000],
                'alpha': [0.5, 1, 1.5]
                }

rd_v2 = GridSearchCV(rd, rd_parameters, cv=10, n_jobs=-1)


rd_v2.fit(X_train, y_train)

print('Best Paraneters',rd_v2.best_params_)

y_pred_ = rd_v2.best_estimator_.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred_))
rmse= np.sqrt(mean_squared_error(y_val,y_pred_))
print(f"Best r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

{'alpha': 0.5, 'fit_intercept': True, 'max_iter': 1000, 'normalize': False}
r2 score: 0.922   RMSE: 0.123


# Decision Tree Regression

In [12]:
# Apply the base model
dt = DecisionTreeRegressor(random_state=155)
dt.fit(X_train, y_train)

#GEt prediction with validation dataset
y_pred = dt.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred))
rmse= np.sqrt(mean_squared_error(y_val,y_pred))
print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.978   RMSE: 0.06


### Parameter Tuning

In [14]:
parameters = {'max_depth' : [5,10,15],
              'min_samples_leaf' : [1,3,5,10,50]           
              }


dt_v2 = GridSearchCV(dt, parameters, cv=10, n_jobs=-1)


dt_v2.fit(X_train, y_train)

print('Best Paraneters', dt_v2.best_params_)

y_pred_ = dt_v2.best_estimator_.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred_))
rmse= np.sqrt(mean_squared_error(y_val,y_pred_))
print(f"Best r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

{'max_depth': 10, 'min_samples_leaf': 3}
r2 score: 0.983   RMSE: 0.055


# Random Forest Regression

In [15]:
# Apply the base model
rf = RandomForestRegressor(random_state=155)
rf.fit(X_train, y_train)

#GEt prediction with validation dataset
y_pred = rf.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred))
rmse= np.sqrt(mean_squared_error(y_val,y_pred))
print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.986   RMSE: 0.048


### Parameter Tuning

In [16]:
rf_parameters = {'max_depth' : [None,5,10,15,25],
                'min_samples_leaf' : [1,3,5,10,50],
                'n_estimators': np.arange(50,501,50)
                }

rf_v2 = GridSearchCV(rf, rf_parameters, cv=10, n_jobs=-1)


rf_v2.fit(X_train, y_train)

print('Best Paraneters', rf_v2.best_params_)

y_pred_ = rf_v2.best_estimator_.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred_))
rmse= np.sqrt(mean_squared_error(y_val,y_pred_))
print(f"Best r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

{'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 150}
r2 score: 0.986   RMSE: 0.048


# Extra Tree Regression

In [17]:
# Apply the model
xt = ExtraTreesRegressor(random_state=155)
xt.fit(X_train, y_train)

#GEt prediction with validation dataset
y_pred = xt.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred))
rmse= np.sqrt(mean_squared_error(y_val,y_pred))
print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.986   RMSE: 0.049


### Parameter Tuning

In [18]:
xt_parameters = {'max_depth' : [5,10,15,25],
                'min_samples_leaf' : [1,3,5,10],
                'n_estimators': np.arange(50,501,50)
                }

xt_v2 = GridSearchCV(xt, xt_parameters, cv=10, n_jobs=-1)


xt_v2.fit(X_train, y_train)

print('Best Paraneters', xt_v2.best_params_)

y_pred_ = xt_v2.best_estimator_.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred_))
rmse= np.sqrt(mean_squared_error(y_val,y_pred_))
print(f"Best r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

{'max_depth': 15, 'min_samples_leaf': 3, 'n_estimators': 50}
r2 score: 0.985   RMSE: 0.051


# XGboost Regression

In [19]:
xgb = XGBRegressor(n_estimators=2000, max_depth=20, learning_rate=0.01)
mxgb = multioutput.MultiOutputRegressor(xgb)
mxgb.fit(X_train, y_train)
y_pred = mxgb.predict(X_val)

r2 = np.mean(r2_score(y_val, y_pred))
rmse= np.sqrt(mean_squared_error(y_val,y_pred))
print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.994   RMSE: 0.033


### Parameter Tuning

In [20]:
for n in range(500,2501,500):
    for d in range(5,21,5):
        for l in [x / 10.0 for x in range(2, 11, 2)]:
            xgb = XGBRegressor(n_estimators=n, max_depth=d, learning_rate=l)
            mxgb = multioutput.MultiOutputRegressor(xgb)
            mxgb.fit(X_train, y_train)
            y_pred = mxgb.predict(X_val)

            r2 = np.mean(r2_score(y_val, y_pred))
            rmse= np.sqrt(mean_squared_error(y_val,y_pred))
            print(f"For ({n},{d},{l}) r2 score: {r2.round(5)}   RMSE: {rmse.round(5)}")


For (500,5,0.2) r2 score: 0.99648   RMSE: 0.02503
For (500,5,0.4) r2 score: 0.99593   RMSE: 0.02651
For (500,5,0.6) r2 score: 0.99559   RMSE: 0.02765
For (500,5,0.8) r2 score: 0.9962   RMSE: 0.026
For (500,5,1.0) r2 score: 0.99558   RMSE: 0.02854
For (500,10,0.2) r2 score: 0.99467   RMSE: 0.03002
For (500,10,0.4) r2 score: 0.99484   RMSE: 0.02953
For (500,10,0.6) r2 score: 0.99531   RMSE: 0.02839
For (500,10,0.8) r2 score: 0.99311   RMSE: 0.03418
For (500,10,1.0) r2 score: 0.98706   RMSE: 0.0476
For (500,15,0.2) r2 score: 0.99467   RMSE: 0.03
For (500,15,0.4) r2 score: 0.99473   RMSE: 0.02981
For (500,15,0.6) r2 score: 0.99471   RMSE: 0.03005
For (500,15,0.8) r2 score: 0.99364   RMSE: 0.03295
For (500,15,1.0) r2 score: 0.98992   RMSE: 0.04218
For (500,20,0.2) r2 score: 0.99467   RMSE: 0.03
For (500,20,0.4) r2 score: 0.99473   RMSE: 0.02981
For (500,20,0.6) r2 score: 0.99471   RMSE: 0.03005
For (500,20,0.8) r2 score: 0.99364   RMSE: 0.03294
For (500,20,1.0) r2 score: 0.98994   RMSE: 0.0

Best (n,d,l)=(500,5,0.2) with r2 score: 0.99648   RMSE: 0.02503