In [65]:
# Import Libraries for analysis and visualisation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

#import libraries for ML-Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import  MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.stats import boxcox
from scipy.stats import boxcox_normmax
from sklearn.ensemble import ExtraTreesRegressor
import pickle

## Suppress all warnings so they do not clutter the notebook output (this also hides deprecation warnings)
import warnings
warnings.filterwarnings("ignore")

In [49]:
# Load the raw dataset from a CSV file located in the working directory
DATA_PATH = 'data.csv'
data = pd.read_csv(DATA_PATH)

In [50]:
# Parse the timestamp column; entries that cannot be parsed become NaT
data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce')

# Every other column still stored as generic objects is coerced to numeric;
# entries that cannot be parsed become NaN
object_columns = [
    name for name in data.columns
    if name != 'timestamp' and data[name].dtype == 'object'
]
for name in object_columns:
    data[name] = pd.to_numeric(data[name], errors='coerce')

In [51]:
# Derive calendar features from the timestamp, one new column per component.
# For 'weekday', Monday=0 and Sunday=6.
for part in ('hour', 'minute', 'day', 'weekday', 'month'):
    data[part] = getattr(data['timestamp'].dt, part)

# The original timestamp column is no longer needed once the parts are extracted
data = data.drop(columns=['timestamp'])

In [52]:
# Keep only rows whose target is non-negative. Rows with a missing target are
# removed as well, because `NaN >= 0` evaluates to False.
# The explicit copy makes `data` an independent frame, so the column assignment
# below does not write into a slice of the unfiltered frame.
data = data[data['equipment_energy_consumption'] >= 0].copy()

# Columns in which a negative reading is treated as invalid
invalid_negative_columns = ['lighting_energy', 'zone1_humidity',
       'zone2_humidity', 'zone3_humidity', 'zone4_humidity', 'zone5_humidity', 'zone6_humidity', 'zone7_humidity',
       'zone8_humidity', 'zone9_humidity', 'outdoor_humidity', 'wind_speed', 'visibility_index']

# Replace negative entries with NaN in one vectorised step (existing NaN values
# stay NaN because `NaN < 0` is False). This replaces the deprecated, per-cell
# `applymap` call.
data[invalid_negative_columns] = data[invalid_negative_columns].mask(
    data[invalid_negative_columns] < 0
)

In [53]:
# Impute missing feature values with the column median; the target column is
# deliberately left untouched.
# The filled column is assigned back to the frame: calling
# `data[column].fillna(..., inplace=True)` operates on a temporary selection and
# does not update `data` when pandas Copy-on-Write is active.
for column in data.columns:
    if column != 'equipment_energy_consumption':
        median_value = data[column].median()
        data[column] = data[column].fillna(median_value)

In [54]:
# Remove rows that are exact duplicates, keeping the first occurrence of each
data = data.drop_duplicates(keep='first')

In [55]:
# Handling outliers: cap every column at 1.5 * IQR beyond its quartiles.
# The loop runs over all columns of `data`, so the target and the calendar
# features are capped as well.
for ftr in data.columns:
    q_25, q_75 = np.percentile(data[ftr], [25, 75])
    iqr = q_75 - q_25
    # calculate the outlier cutoff
    cut_off = iqr * 1.5
    lower = q_25 - cut_off
    upper = q_75 + cut_off
    # Only touch the column when at least one value lies outside the fences,
    # so columns without outliers keep their values and dtype unchanged
    has_outliers = ((data[ftr] < lower) | (data[ftr] > upper)).any()
    if has_outliers:
        # Vectorised capping: values above `upper` become `upper`, values below
        # `lower` become `lower`. This replaces a row-wise DataFrame.apply whose
        # helper function shadowed the builtin `bin`.
        data[ftr] = data[ftr].clip(lower=lower, upper=upper)

In [56]:
# Feature engineering: summarise the nine per-zone readings into averages and
# compare them with the outdoor readings, to reduce correlation among features
zone_ids = range(1, 10)
temperature_columns = [f'zone{zone}_temperature' for zone in zone_ids]
humidity_columns = [f'zone{zone}_humidity' for zone in zone_ids]

# Mean temperature across all zones
data['avg_zone_temperature'] = data[temperature_columns].mean(axis=1)

# Absolute gap between the average zone temperature and the outdoor temperature
data['Temperature_difference'] = (data['avg_zone_temperature'] - data['outdoor_temperature']).abs()

# Mean humidity across all zones
data['avg_zone_humidity'] = data[humidity_columns].mean(axis=1)

# Absolute gap between the average zone humidity and the outdoor humidity
data['Humidity_difference'] = (data['avg_zone_humidity'] - data['outdoor_humidity']).abs()

In [57]:
# Columns removed from the modelling frame
excluded_columns = {
    'lighting_energy', 'zone9_temperature', 'zone6_temperature', 'avg_zone_humidity',
    'outdoor_temperature', 'zone9_humidity', 'Temperature_difference', 'zone8_humidity',
    'zone8_temperature', 'zone7_humidity', 'zone7_temperature', 'zone6_humidity',
    'zone5_humidity', 'avg_zone_temperature', 'outdoor_humidity', 'zone1_temperature',
    'atmospheric_pressure', 'zone3_humidity', 'zone5_temperature', 'zone3_temperature',
    'zone1_humidity', 'zone4_temperature', 'zone4_humidity', 'zone2_humidity',
    'zone2_temperature', 'random_variable1', 'random_variable2', 'visibility_index',
}

# Keep the columns reported by describe() that are not excluded, in their original order
described_columns = data.describe().columns
data = data[[name for name in described_columns if name not in excluded_columns]]

In [58]:


# Box-Cox transform of the target column.
# NOTE(review): scipy's Box-Cox requires strictly positive input, while the row
# filter earlier in the notebook keeps target values equal to 0 — confirm the
# target contains no zeros.
target = data['equipment_energy_consumption']

# Estimate the lambda used for the transformation
optimal_lambda = boxcox_normmax(target)

# Overwrite the target with its Box-Cox transformed values
data['equipment_energy_consumption'] = boxcox(target, lmbda=optimal_lambda)

In [59]:
# Transform individual features
# log10(1 + x) of the wind speed, computed on the whole column at once rather
# than through a per-element Python lambda
data['wind_speed'] = np.log10(data['wind_speed'] + 1)
# Square the humidity difference
data['Humidity_difference'] = data['Humidity_difference'] ** 2

In [60]:
# Separate the features from the target
X = data.drop(columns=['equipment_energy_consumption'])
y = data['equipment_energy_consumption']

# Ordered (non-shuffled) split by row position: the first 80% of the rows form
# the training set, the remaining 20% the test set
split_idx = int(0.8 * len(data))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [61]:
def _add_lag_features(frame, target='equipment_energy_consumption',
                      lags=(1, 2, 3, 4, 5, 6), windows=(3, 6, 12)):
    """Return a copy of `frame` extended with lagged target features.

    For every entry in `lags` a column `energy_lag_<lag>` holds the target
    shifted down by that many rows. For every entry in `windows` a column
    `energy_rolling_<window>h_mean` holds the rolling mean of the target over
    that many rows, computed on the target shifted by one row so the current
    row's own value is never included. Rows containing any NaN (including the
    leading rows that lack enough history) are dropped.
    """
    out = frame.copy()
    for lag in lags:
        out[f'energy_lag_{lag}'] = out[target].shift(lag)
    for window in windows:
        out[f'energy_rolling_{window}h_mean'] = out[target].shift(1).rolling(window).mean()
    return out.dropna()


# If a target arrives as a bare array, wrap it in a Series aligned with the
# index of its OWN feature frame
if isinstance(y_train, np.ndarray):
    y_train = pd.Series(y_train, name='equipment_energy_consumption', index=X_train.index)
if isinstance(y_test, np.ndarray):
    # Fixed: this previously used X_train.index, which does not match the test rows
    y_test = pd.Series(y_test, name='equipment_energy_consumption', index=X_test.index)

# Combine features and target, then build the lag features separately for each
# split, so test rows only use history taken from the test split
train = _add_lag_features(pd.concat([X_train, y_train], axis=1))
test = _add_lag_features(pd.concat([X_test, y_test], axis=1))

# Split back into features and target
X_train = train.drop(columns='equipment_energy_consumption')
y_train = train['equipment_energy_consumption']
X_test = test.drop(columns='equipment_energy_consumption')
y_test = test['equipment_energy_consumption']

In [62]:
# Min-max scale the features. The scaler is fitted on the training features
# only and then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [63]:
# ExtraTreesRegressor: fit on the training split and report RMSE / R² for both splits.
# The target was Box-Cox transformed earlier, so the metrics are on the transformed scale.
etr_params = dict(
    n_estimators=500,
    max_depth=100,
    min_samples_split=50,
    random_state=42,
    n_jobs=-1,
)
ETR_model = ExtraTreesRegressor(**etr_params)
ETR_model.fit(X_train, y_train)


def _rmse_and_r2(actual, predicted):
    """Return the pair (RMSE, R²) for one set of predictions."""
    return np.sqrt(mean_squared_error(actual, predicted)), r2_score(actual, predicted)


train_preds = ETR_model.predict(X_train)
test_preds = ETR_model.predict(X_test)
train_rmse, train_r2 = _rmse_and_r2(y_train, train_preds)
test_rmse, test_r2 = _rmse_and_r2(y_test, test_preds)
print(f"Train: RMSE = {train_rmse:.4f}, R² = {train_r2:.4f}")
print(f"Test : RMSE = {test_rmse:.4f}, R² = {test_r2:.4f}")

Train: RMSE = 0.0981, R² = 0.7732
Test : RMSE = 0.0976, R² = 0.6680


In [71]:
# Persist the fitted model to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE(review): only the model is saved here; the fitted `scaler` is not
# persisted, so it must be saved separately if the model is to be reused on
# raw features.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(ETR_model, model_file)

In [72]:
# Reload the model that was just written and confirm it reproduces the test R².
# The context manager closes the file handle once loading is done.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
y_pred_best = loaded_model.predict(X_test)
print(r2_score(y_test, y_pred_best))

0.6679815986589731
