**Data Loading**

In [1]:
import pandas as pd
# Read the raw household energy readings from the CSV file into a DataFrame.
csv_path = 'household_energy_consumption.csv'
data = pd.read_csv(csv_path)

# Preview the first rows as the cell's rich output.
data.head()

Unnamed: 0,Household_ID,Date,Energy_Consumption_kWh,Household_Size,Avg_Temperature_C,Has_AC,Peak_Hours_Usage_kWh
0,H00001,2025-04-01,8.4,4,17.8,No,3.2
1,H00001,2025-04-02,7.9,4,17.3,No,2.8
2,H00001,2025-04-03,9.2,4,18.6,No,3.0
3,H00001,2025-04-04,7.9,4,18.2,No,2.7
4,H00001,2025-04-05,9.6,4,11.9,No,3.2


In [2]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Household_ID            90000 non-null  object 
 1   Date                    90000 non-null  object 
 2   Energy_Consumption_kWh  90000 non-null  float64
 3   Household_Size          90000 non-null  int64  
 4   Avg_Temperature_C       90000 non-null  float64
 5   Has_AC                  90000 non-null  object 
 6   Peak_Hours_Usage_kWh    90000 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 4.8+ MB


**Data Cleaning and Preprocessing**

In [3]:
# Boolean mask marking every row that is an exact repeat of an earlier row.
duplicate = data.duplicated()

# Report how many such rows exist.
duplicate_count = duplicate.sum()
print(duplicate_count)

0


In [4]:
# Boolean frame that is True wherever a cell is NaN/None.
missing = data.isna()

# Report the number of missing entries per column.
missing_per_column = missing.sum()
print(missing_per_column)

Household_ID              0
Date                      0
Energy_Consumption_kWh    0
Household_Size            0
Avg_Temperature_C         0
Has_AC                    0
Peak_Hours_Usage_kWh      0
dtype: int64


In [5]:
# Parse the date strings into datetime64 so the .dt accessor can be used later.
# Converting a column that is already datetime leaves its values unchanged,
# so this line is safe to re-run.
data['Date'] = pd.to_datetime(data['Date'])

# Encode Has_AC as 1 ('Yes') / 0 ('No').
# Only map while the column still holds the raw labels: running .map() a second
# time on the already-encoded 0/1 values would replace every entry with NaN.
if not pd.api.types.is_numeric_dtype(data['Has_AC']):
    has_ac_encoded = data['Has_AC'].map({'Yes': 1, 'No': 0})

    # Fail loudly on labels other than 'Yes'/'No' instead of silently
    # introducing NaN into a model feature.
    unmapped = has_ac_encoded.isna()
    if unmapped.any():
        unexpected = data.loc[unmapped, 'Has_AC'].unique()
        raise ValueError(f"Unexpected Has_AC values: {unexpected!r}")

    data['Has_AC'] = has_ac_encoded.astype('int64')

# Confirm the new dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Household_ID            90000 non-null  object        
 1   Date                    90000 non-null  datetime64[ns]
 2   Energy_Consumption_kWh  90000 non-null  float64       
 3   Household_Size          90000 non-null  int64         
 4   Avg_Temperature_C       90000 non-null  float64       
 5   Has_AC                  90000 non-null  int64         
 6   Peak_Hours_Usage_kWh    90000 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 4.8+ MB


In [6]:
# Show summary statistics (count, mean, min, quartiles, max, std) for the frame's columns.
data.describe()

Unnamed: 0,Date,Energy_Consumption_kWh,Household_Size,Avg_Temperature_C,Has_AC,Peak_Hours_Usage_kWh
count,90000,90000.0,90000.0,90000.0,90000.0,90000.0
mean,2025-04-04 00:00:03.840000256,10.571988,3.487811,17.505802,0.494356,4.319557
min,2025-04-01 00:00:00,0.5,1.0,10.0,0.0,0.2
25%,2025-04-02 00:00:00,6.0,2.0,15.8,0.0,2.3
50%,2025-04-04 00:00:00,10.4,3.0,17.5,0.0,4.0
75%,2025-04-06 00:00:00,14.8,5.0,19.2,1.0,6.0
max,2025-04-08 00:00:00,20.0,6.0,25.0,1.0,10.0
std,,5.519494,1.709761,2.491621,0.499971,2.531432


In [7]:
# Derive calendar features from the parsed Date column.
date_parts = data['Date'].dt
data['Month'] = date_parts.month
data['Day'] = date_parts.day
data['DayOfWeek'] = date_parts.dayofweek  # pandas convention: Monday=0 ... Sunday=6

# Saturday (5) and Sunday (6) are flagged as weekend days.
data['Is_Weekend'] = data['DayOfWeek'].ge(5).astype(int)
data.head()

Unnamed: 0,Household_ID,Date,Energy_Consumption_kWh,Household_Size,Avg_Temperature_C,Has_AC,Peak_Hours_Usage_kWh,Month,Day,DayOfWeek,Is_Weekend
0,H00001,2025-04-01,8.4,4,17.8,0,3.2,4,1,1,0
1,H00001,2025-04-02,7.9,4,17.3,0,2.8,4,2,2,0
2,H00001,2025-04-03,9.2,4,18.6,0,3.0,4,3,3,0
3,H00001,2025-04-04,7.9,4,18.2,0,2.7,4,4,4,0
4,H00001,2025-04-05,9.6,4,11.9,0,3.2,4,5,5,1


**Data Splitting**

In [8]:
from sklearn.model_selection import train_test_split

# The target is the consumption column; the household identifier and the raw
# date are dropped from the feature matrix along with the target itself.
target_column = 'Energy_Consumption_kWh'
y = data[target_column]
X = data.drop(columns=['Date', target_column, 'Household_ID'])

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('Training Data:', X_train.shape)
print("Testing Data:", X_test.shape)

Training Data: (72000, 8)
Testing Data: (18000, 8)


**Model Training**

In [9]:
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score

# Candidate regressors to compare on the same train/test split.
# The estimators that use randomness internally (random forest, gradient
# boosting, decision tree) are seeded so the metrics printed below can be
# reproduced after a kernel restart; LinearRegression and SVR take no seed.
RANDOM_STATE = 42

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "SVR" : SVR()
}

# Fit each candidate on the training split and score it on the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
    r2 = r2_score(y_test, y_pred)

    print(f"\n{name} Results:")
    print("Mean Absolute Error:", mae)
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)
    print("R-squared:", r2)


Linear Regression Results:
Mean Absolute Error: 0.6271657518512228
Mean Squared Error: 0.623080388431848
Root Mean Squared Error: 0.789354412435788
R-squared: 0.9795403615744133

Random Forest Results:
Mean Absolute Error: 0.5293686135323009
Mean Squared Error: 0.5236786642824482
Root Mean Squared Error: 0.7236564545987607
R-squared: 0.982804343835346

Gradient Boosting Results:
Mean Absolute Error: 0.4849197638241205
Mean Squared Error: 0.4175120466393194
Root Mean Squared Error: 0.64615172106814
R-squared: 0.9862904599933472

Decision Tree Results:
Mean Absolute Error: 0.6335476455026455
Mean Squared Error: 0.7860728616465105
Root Mean Squared Error: 0.8866075014607707
R-squared: 0.9741882960464691

SVR Results:
Mean Absolute Error: 0.5223474969607002
Mean Squared Error: 0.4551585643877295
Root Mean Squared Error: 0.6746544036673365
R-squared: 0.9850542886173657


**Selecting the best model**

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Refit Gradient Boosting, the candidate with the lowest errors and highest
# R-squared in the comparison, as the model to persist.
# It is seeded so that the estimator written to disk is the same on every
# run instead of changing with each refit.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out split from the refitted model.
y_pred = model.predict(X_test)

**Saving the trained model**

In [11]:
import joblib
# Serialize the fitted model to 'energy_prediction.pkl' in the working directory.
# joblib.dump returns the list of file names it wrote, shown as the cell output.
joblib.dump(model, 'energy_prediction.pkl')

['energy_prediction.pkl']

**Predicting an output**

In [15]:
import pandas as pd
import joblib

# Reload the persisted regressor from disk.
model = joblib.load('energy_prediction.pkl')

# A single hand-written observation, keyed by the 8 feature columns the model
# was trained on (same order as the training frame).
sample = {
    'Household_Size': 4,
    'Avg_Temperature_C': 20.0,
    'Has_AC': 1,
    'Peak_Hours_Usage_kWh': 3.0,
    'Month': 4,
    'Day': 15,
    'DayOfWeek': 1,
    'Is_Weekend': 0,
}

# One-row frame whose columns follow the dict's insertion order.
input_data = pd.DataFrame([sample])

print(model.predict(input_data))

[8.14345247]


In [26]:
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb
!sudo dpkg -i cloudflared-linux-amd64.deb


--2025-11-15 11:36:30--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2025.11.1/cloudflared-linux-amd64.deb [following]
--2025-11-15 11:36:30--  https://github.com/cloudflare/cloudflared/releases/download/2025.11.1/cloudflared-linux-amd64.deb
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/106867604/8a32f7c6-649c-4f0d-806d-e14c19d0786d?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-11-15T12%3A14%3A22Z&rscd=attachment%3B+filename%3Dcloudflared-linux-amd64.deb&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4d

In [27]:
!streamlit run app.py --server.port 8501 &>/content/log.txt &


In [28]:
# Open a Cloudflare quick tunnel that forwards to the local server on port 8501
# (the port the Streamlit app is started on); the public URL is printed in the log output.
# NOTE(review): this command appears to stay in the foreground streaming logs,
# so the cell likely has to be interrupted to stop the tunnel — confirm.
!cloudflared tunnel --url http://localhost:8501 --no-autoupdate


[90m2025-11-15T11:37:24Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-11-15T11:37:24Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-11-15T11:37:28Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-11-15T11:37:28Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025