In [1]:
import pandas as pd
import numpy as np

from sklearn import multioutput
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import pickle

In [2]:
# Load the dataset and work on a copy so the frame read from disk stays untouched.
raw_data = pd.read_csv('../data/ENB2012_data.csv')
df = raw_data.copy()

# Features: every column except the last two.
X = df.iloc[:, :-2]

# Targets: natural log of the heating and cooling columns.
target_columns = ['heating', 'cooling']
y = np.log(df[target_columns])

## Data Preparation

In [3]:
# Divide the data into training, validation and test parts (60:20:20).
# The second split takes 25% of the remaining 80%, i.e. 20% of the full set.
X_train_full_df, X_test_df, y_train_full_df, y_test = train_test_split(X, y, test_size = 0.20, random_state = 155)
X_train_df, X_val_df, y_train, y_val = train_test_split(X_train_full_df, y_train_full_df, test_size = 0.25, random_state = 155)

# Vectorize the feature matrices from per-row dictionaries (with renewed indexes).
dv = DictVectorizer(sparse=False)

# The vectorizer is fitted on the training rows ONLY: the learned feature
# layout must be the one the model is trained on.
X_train_df = X_train_df.reset_index(drop=True)
X_train_dict = X_train_df.to_dict(orient='records')
X_train = dv.fit_transform(X_train_dict)

# Validation and test rows are only transformed. Refitting here would replace
# the training-time feature mapping, so the `dv` object left in scope would be
# fitted on held-out data rather than on the data the model was trained on.
X_val_df = X_val_df.reset_index(drop=True)
X_val_dict = X_val_df.to_dict(orient='records')
X_val = dv.transform(X_val_dict)

X_test_df = X_test_df.reset_index(drop=True)
X_test_dict = X_test_df.to_dict(orient='records')
X_test = dv.transform(X_test_dict)

# Renew the index of the target variables so they line up with the feature frames.
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


## XGBoost Regression

In [4]:
# Base learner; MultiOutputRegressor wraps it so both target columns are handled.
xgb_params = dict(n_estimators=500, max_depth=5, learning_rate=0.2)
xgb = XGBRegressor(**xgb_params)
mxgb = multioutput.MultiOutputRegressor(xgb)
mxgb.fit(X_train, y_train)

# Score on the validation split. The targets were log-transformed, so the
# RMSE below is measured in log units.
y_pred = mxgb.predict(X_val)
r2 = np.mean(r2_score(y_val, y_pred))
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"r2 score: {r2.round(3)}   RMSE: {rmse.round(3)}")

r2 score: 0.996   RMSE: 0.025


## Saving the Model

In [5]:
# Path the fitted vectorizer and model are pickled to.
output_file = "model.bin"

In [6]:
# Persist the vectorizer and the model together as one tuple so they are
# always loaded as a matching pair.
artifacts = (dv, mxgb)
with open(output_file, 'wb') as model_fh:
    pickle.dump(artifacts, model_fh)

## Load Model

In [7]:
import pickle


In [8]:
# Path of the pickled (vectorizer, model) pair to load.
model_file = "model.bin"

In [9]:
# Read back the (vectorizer, model) pair.
# NOTE: unpickling executes code from the file, so only load files you trust.
with open(model_file, 'rb') as model_fh:
    artifacts = pickle.load(model_fh)
dv, model = artifacts

In [11]:
# Hand-written sample record used to try out the loaded model.
# (A row of the training frame, e.g. X_train_df.iloc[384], could be used instead.)
building = dict(
    compactness=0.9,
    surface_area=563.5,
    wall_area=318.5,
    roof_area=122.5,
    height=7.0,
    orientation=5.0,
    glazing_area=0.4,
    glazing_distribution=4.0,
)

In [12]:
# The vectorizer works on a list of records, so the single sample is wrapped in a list.
# NOTE: this rebinds `X`, which earlier in the notebook held the feature DataFrame.
sample_records = [building]
X = dv.transform(sample_records)

In [13]:
# Predict both targets for the vectorized sample using the reloaded model.
# The model was fitted on np.log of the targets, so these predictions are in
# log space and must be passed through np.exp to recover the loads.
y_pred = model.predict(X)

array([[3.5898054, 3.5801654]], dtype=float32)

In [19]:
# The single prediction row holds the heating value first, then the cooling value.
# np.exp undoes the log applied to the targets before training.
heating_log, cooling_log = y_pred[0]
print("Estimated heating load:", round(np.exp(heating_log), 2))
print("Estimated cooling load:", round(np.exp(cooling_log), 2))

Estimated heating load: 12.2
Estimated cooling load: 14.59
