In [10]:
# Required Libraries
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Create the output folders used later for the CSV splits ('data') and the
# pickled artifacts ('model'); existing folders are left untouched.
for output_folder in ('data', 'model'):
    os.makedirs(output_folder, exist_ok=True)


In [11]:
# Read the raw workbook into a DataFrame. Despite the '.csv' in its name the
# file carries an .xlsx extension and is parsed with the Excel reader.
df = pd.read_excel(io='Energy_consumption_dataset(project).csv.xlsx')

# Preview the first five rows as the cell's rich output.
df.head(n=5)


Unnamed: 0,Month,Hour,DayOfWeek,Holiday,Temperature,Humidity,SquareFootage,Occupancy,HVACUsage,LightingUsage,RenewableEnergy,EnergyConsumption
0,1,0,Saturday,No,25.139433,43.431581,1565.693999,5,On,Off,2.774699,75.364373
1,1,1,Saturday,No,27.731651,54.225919,1411.064918,1,On,On,21.831384,83.401855
2,1,2,Saturday,No,28.704277,58.907658,1755.715009,2,Off,Off,6.764672,78.270888
3,1,3,Saturday,No,20.080469,50.371637,1452.316318,1,Off,On,8.623447,56.51985
4,1,4,Saturday,No,23.097359,51.401421,1094.130359,9,On,Off,3.071969,70.811732


In [12]:
# Build the feature matrix X and the target y from the raw frame `df`.

# Strip stray whitespace from the headers. Non-string labels are passed
# through unchanged so a numeric header cannot raise AttributeError on .strip().
df = df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)

TARGET_COLUMN = 'EnergyConsumption'
# 'Unnamed: 0' looks like a row index that was saved along with the data
# (it counts 0, 1, 2, ... in the preview). It is not a measurement, so it is
# excluded from the features instead of being fed to the model.
INDEX_ARTIFACT_COLUMNS = ['Unnamed: 0']
# Text columns that must be converted to numbers (adjust to your dataset).
CATEGORICAL_COLUMNS = ['DayOfWeek', 'Holiday', 'HVACUsage', 'LightingUsage']

# Separate target (y) and features (X). errors='ignore' only matters for the
# optional index-artifact column; a missing target already fails on the line
# that reads y.
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN, *INDEX_ARTIFACT_COLUMNS], errors='ignore')

# Encode each categorical column as integer codes. Codes follow the sorted
# order of the values present in the column.
# NOTE: .cat.codes represents missing values as -1, so missing categorical
# entries are NOT touched by the mean fill below.
for column in CATEGORICAL_COLUMNS:
    X[column] = X[column].astype('category').cat.codes

# Fill remaining missing values with the column mean. numeric_only=True keeps
# this from raising if a non-numeric column is still present.
X = X.fillna(X.mean(numeric_only=True))


In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Re-attach the target to each feature split so every CSV is self-contained.
train_df, test_df = X_train.copy(), X_test.copy()
train_df['EnergyConsumption'] = y_train
test_df['EnergyConsumption'] = y_test

# Write both splits without the pandas index column.
for split_frame, csv_path in ((train_df, 'data/train.csv'), (test_df, 'data/test.csv')):
    split_frame.to_csv(csv_path, index=False)
print("✅ Train and test CSVs saved in the 'data' folder.")

✅ Train and test CSVs saved in the 'data' folder.


In [14]:
# Learn the per-feature min/max from the training split only, so no
# information from the test split influences the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the fitted scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [15]:
# Fit a random forest on the scaled training features; fit() returns the
# estimator itself, so `model` is the trained regressor. The fixed seed makes
# the forest reproducible.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    random_state=42,
).fit(X_train_scaled, y_train)
print("✅ Model training completed.")


✅ Model training completed.


In [16]:
# Score the held-out split with the trained model.
y_pred = model.predict(X_test_scaled)

# R²: share of the target's variance explained by the predictions.
r2 = r2_score(y_test, y_pred)

# RMSE is derived from the MSE by hand instead of relying on the 'squared'
# keyword, so this works regardless of the installed scikit-learn version.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(
    '✅ Model Evaluation:',
    f'RMSE: {rmse:.2f}',
    f'R² Score: {r2:.2f}',
    sep='\n',
)


✅ Model Evaluation:
RMSE: 7.95
R² Score: 0.24


In [17]:
# Persist the fitted estimator and its scaler side by side; the scaler must be
# stored too because predictions are made on scaled features.
for artifact, destination in (
    (model, 'model/ml_model.sav'),
    (scaler, 'model/scaler.sav'),
):
    with open(destination, 'wb') as sink:
        pickle.dump(artifact, sink)

print("✅ Model and scaler saved in the 'model' folder.")


✅ Model and scaler saved in the 'model' folder.


In [18]:
# Round-trip check: reload the pickled artifacts and score the saved test CSV.
# NOTE: pickle.load can execute arbitrary code, so only load files from a
# trusted source (here: the files this notebook wrote itself).
with open('model/ml_model.sav', 'rb') as handle:
    loaded_model = pickle.load(handle)
with open('model/scaler.sav', 'rb') as handle:
    loaded_scaler = pickle.load(handle)

# Read the held-out rows back from disk and drop the target column.
test_data = pd.read_csv('data/test.csv')
X_new = test_data.drop('EnergyConsumption', axis=1)

# Apply the same scaling that was used in training, then predict.
X_new_scaled = loaded_scaler.transform(X_new)
predictions = loaded_model.predict(X_new_scaled)

# Show only the first ten predictions to keep the output short.
print(f"✅ Predictions on test data: {predictions[:10]}")


✅ Predictions on test data: [75.23852779 80.74827779 74.53706665 85.57517214 72.9168771  68.3254156
 77.15494509 76.74457612 69.1693014  79.58892994]
