Importing the Necessary Libraries :-

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

Loading the Dataset:-

In [None]:
# Location of the raw workbook. The path is specific to the environment this
# notebook was run in; adjust it when running elsewhere.
DATA_PATH = "/content/EcoWatt_Homes_Realistic_Updated.xlsx"

df = pd.read_excel(DATA_PATH)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Home_ID,State,City,Weather/Season,Home_Type,No_Of_Residents,Electricity_Tariff_Type,AC_Type,Monthly_AC_Usage_Hours,Fan_Type,...,Refrigerator_Usage_Hrs_Monthly,TV_Type,Monthly_TV_Usage_Hours,Geyser_Type,Monthly_Geyser_Usage_Minutes,Washing_Machine_Type,Monthly_Washing_Machine_Usage_Cycles,Washing_Machine_Age,Monthly_Kwh_Consumption,Usage_Category
0,1,Gujarat,Mumbai,Summer,Apartment,6,LT-1,,145,,...,720,LCD,74,15-25L,508,,14,,193.73,Medium Usage
1,2,Delhi,Ahmedabad,Summer,Apartment,6,LT-1,Split AC,147,Havells,...,720,SmartTV,87,,632,6kg,18,Mid,465.02,High Usage
2,3,Gujarat,Bengaluru,Summer,Row House,5,LT-1,Window AC,130,Bajaj,...,720,LCD,95,15-25L,688,8kg,21,Mid,364.61,High Usage
3,4,Others,Mumbai,Winter,Bungalow,2,LT-1,Window AC,71,Polycab,...,720,SmartTV,85,15-25L,945,10kg,14,Old,196.48,Medium Usage
4,5,Karnataka,Delhi,Winter,Row House,5,LT-1,,63,Bajaj,...,720,,90,6-10L,986,8kg,22,New,221.98,Medium Usage


In [None]:
# Column dtypes and non-null counts; columns with fewer non-null entries than
# rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Home_ID                               500 non-null    int64  
 1   State                                 500 non-null    object 
 2   City                                  500 non-null    object 
 3   Weather/Season                        500 non-null    object 
 4   Home_Type                             500 non-null    object 
 5   No_Of_Residents                       500 non-null    int64  
 6   Electricity_Tariff_Type               500 non-null    object 
 7   AC_Type                               379 non-null    object 
 8   Monthly_AC_Usage_Hours                500 non-null    int64  
 9   Fan_Type                              414 non-null    object 
 10  Monthly_Fan_Usage_Hours               500 non-null    int64  
 11  Refrigerator_Type  

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(500, 22)

Replacing the "NaN" Values :-

In [None]:
# Replace missing values with the literal label "None", but only in the
# non-numeric columns. Filling the whole frame would also turn any missing
# numeric value into the string "None", silently changing that column to
# object dtype and breaking the numeric scaling later on.
non_numeric_cols = df.select_dtypes(exclude="number").columns
df[non_numeric_cols] = df[non_numeric_cols].fillna("None")

In [None]:
# For every appliance whose type is recorded as "None", force the matching
# usage figure to zero so the usage column agrees with the type column.
usage_column_by_type_column = {
    'AC_Type': 'Monthly_AC_Usage_Hours',
    'Fan_Type': 'Monthly_Fan_Usage_Hours',
    'Refrigerator_Type': 'Refrigerator_Usage_Hrs_Monthly',
    'TV_Type': 'Monthly_TV_Usage_Hours',
    'Geyser_Type': 'Monthly_Geyser_Usage_Minutes',
    'Washing_Machine_Type': 'Monthly_Washing_Machine_Usage_Cycles',
}

for type_column, usage_column in usage_column_by_type_column.items():
    type_is_none = df[type_column] == 'None'
    df.loc[type_is_none, usage_column] = 0

Checking for Missing Values after Value Replacement Task :-

In [None]:
# Re-inspect the non-null counts now that missing values have been replaced.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Home_ID                               500 non-null    int64  
 1   State                                 500 non-null    object 
 2   City                                  500 non-null    object 
 3   Weather/Season                        500 non-null    object 
 4   Home_Type                             500 non-null    object 
 5   No_Of_Residents                       500 non-null    int64  
 6   Electricity_Tariff_Type               500 non-null    object 
 7   AC_Type                               500 non-null    object 
 8   Monthly_AC_Usage_Hours                500 non-null    int64  
 9   Fan_Type                              500 non-null    object 
 10  Monthly_Fan_Usage_Hours               500 non-null    int64  
 11  Refrigerator_Type  

In [None]:
# Preview the first rows after the replacement and zeroing steps.
df.head()

Unnamed: 0,Home_ID,State,City,Weather/Season,Home_Type,No_Of_Residents,Electricity_Tariff_Type,AC_Type,Monthly_AC_Usage_Hours,Fan_Type,...,Refrigerator_Usage_Hrs_Monthly,TV_Type,Monthly_TV_Usage_Hours,Geyser_Type,Monthly_Geyser_Usage_Minutes,Washing_Machine_Type,Monthly_Washing_Machine_Usage_Cycles,Washing_Machine_Age,Monthly_Kwh_Consumption,Usage_Category
0,1,Gujarat,Mumbai,Summer,Apartment,6,LT-1,,0,,...,720,LCD,74,15-25L,508,,0,,193.73,Medium Usage
1,2,Delhi,Ahmedabad,Summer,Apartment,6,LT-1,Split AC,147,Havells,...,720,SmartTV,87,,0,6kg,18,Mid,465.02,High Usage
2,3,Gujarat,Bengaluru,Summer,Row House,5,LT-1,Window AC,130,Bajaj,...,720,LCD,95,15-25L,688,8kg,21,Mid,364.61,High Usage
3,4,Others,Mumbai,Winter,Bungalow,2,LT-1,Window AC,71,Polycab,...,720,SmartTV,85,15-25L,945,10kg,14,Old,196.48,Medium Usage
4,5,Karnataka,Delhi,Winter,Row House,5,LT-1,,0,Bajaj,...,720,,0,6-10L,986,8kg,22,New,221.98,Medium Usage


In [None]:
# List every column name; used to pick the feature and target columns.
df.columns

Index(['Home_ID', 'State', 'City', 'Weather/Season', 'Home_Type',
       'No_Of_Residents', 'Electricity_Tariff_Type', 'AC_Type',
       'Monthly_AC_Usage_Hours', 'Fan_Type', 'Monthly_Fan_Usage_Hours',
       'Refrigerator_Type', 'Refrigerator_Usage_Hrs_Monthly', 'TV_Type',
       'Monthly_TV_Usage_Hours', 'Geyser_Type', 'Monthly_Geyser_Usage_Minutes',
       'Washing_Machine_Type', 'Monthly_Washing_Machine_Usage_Cycles',
       'Washing_Machine_Age', 'Monthly_Kwh_Consumption', 'Usage_Category'],
      dtype='object')

Dividing the data into Independent & Dependent Columns :-

In [None]:
# Column being predicted.
TARGET_COLUMN = 'Monthly_Kwh_Consumption'

# Model inputs, listed explicitly in the order they appear in the frame.
FEATURE_COLUMNS = [
    'State',
    'City',
    'Weather/Season',
    'Home_Type',
    'No_Of_Residents',
    'Electricity_Tariff_Type',
    'AC_Type',
    'Monthly_AC_Usage_Hours',
    'Fan_Type',
    'Monthly_Fan_Usage_Hours',
    'Refrigerator_Type',
    'Refrigerator_Usage_Hrs_Monthly',
    'TV_Type',
    'Monthly_TV_Usage_Hours',
    'Geyser_Type',
    'Monthly_Geyser_Usage_Minutes',
    'Washing_Machine_Type',
    'Monthly_Washing_Machine_Usage_Cycles',
    'Washing_Machine_Age',
]

X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

Performing Data Transformation :-

In [None]:
# Column groups, one per preprocessing strategy.
# One-hot encoded; categories are learned from the data passed to fit.
Categorical_cols = ["State", "City", "Weather/Season"]

# Rescaled to the [0, 1] range.
Numerical_cols = [
    'No_Of_Residents',
    'Monthly_AC_Usage_Hours',
    'Monthly_Fan_Usage_Hours',
    'Refrigerator_Usage_Hrs_Monthly',
    'Monthly_TV_Usage_Hours',
    'Monthly_Geyser_Usage_Minutes',
    'Monthly_Washing_Machine_Usage_Cycles',
]

# Integer-coded against the explicit category lists below.
Ordinal_cols = ['AC_Type', "Fan_Type", 'Refrigerator_Type', 'TV_Type', 'Geyser_Type', 'Washing_Machine_Age']
Cat_cols = ['Home_Type', 'Electricity_Tariff_Type', 'Washing_Machine_Type']

# One inner list per column, in the same order as `Ordinal_cols`. The integer
# assigned to a value is its position in the list.
# NOTE(review): several of these columns (e.g. fan brand) look nominal rather
# than ordered, so the resulting integer order is arbitrary - confirm this is
# intended.
categories_ordinal = [
    ['None', 'Split AC', 'Window AC', "Inverter AC"],
    ['Crompton', 'Bajaj', 'Polycab', 'Havells', 'None'],
    [
        'Double Door 3★', 'Double Door 4★', 'Side-by-Side 5★', 'None',
        'Single Door 5★', 'Side-by-Side 4★', 'Single Door 4★',
        'Single Door 3★', 'Double Door 5★',
    ],
    ['SmartTV', 'LCD', 'LED', "None"],
    ['15-25L', '30+L', 'None', '6-10L'],
    ['New', 'Mid', 'Old', 'None'],
]

# One inner list per column, in the same order as `Cat_cols`.
categories_cat = [
    ['Apartment', 'Row House', 'Bungalow'],
    ['LT-1'],
    ['12kg', '6kg', '7kg', '8kg', 'None', '10kg'],
]

transformer = ColumnTransformer(
    transformers=[
        # The one-hot categories come from whatever rows the transformer is
        # fitted on, so a State/City/Season absent from those rows would make
        # transform() raise. handle_unknown="ignore" encodes such a value as
        # all zeros instead of failing.
        ("tnf1", OneHotEncoder(drop="first", handle_unknown="ignore"), Categorical_cols),
        ("tnf2", OrdinalEncoder(categories=categories_ordinal), Ordinal_cols),
        ("tnf3", MinMaxScaler(), Numerical_cols),
        ("tnf4", OrdinalEncoder(categories=categories_cat), Cat_cols),
    ],
    # Any column not named in a group above is forwarded unchanged.
    remainder="passthrough",
)

In [None]:
# Wrap the column transformer in a one-step pipeline so the whole
# preprocessing stage can be fitted, applied and saved as a single object.
# `Pipeline` is imported in the notebook's top import cell.
Pipe = Pipeline(steps=[("tnf1", transformer)])


Dividing the Data Into Train-Test Splits :-

In [None]:
# Keep 80 % of the rows for training; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Fit the preprocessing on the training rows only, then apply the fitted
# transformation to the held-out rows.
X_train_trans = Pipe.fit_transform(X_train)
X_test_trans = Pipe.transform(X_test)

In [None]:
# Baseline: a random forest with default hyper-parameters, trained on the
# already-transformed split (X_train_trans / X_test_trans come from the
# train/test split cell; the pipeline is not refitted here).
# A fixed seed makes the printed score reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_trans, y_train)

# R^2 on the held-out rows.
y_pred = rf.predict(X_test_trans)
print(r2_score(y_test, y_pred))

0.8215854865734258


Model Training And Evaluation :-

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# A small, seeded forest: 10 trees, each trained on a bootstrap sample drawn
# from 80 % of the training rows.
rf = RandomForestRegressor(
    n_estimators=10,
    max_samples=0.8,
    random_state=42,
)

In [None]:
# Train the forest on the transformed training features.
rf.fit(X_train_trans ,y_train)

In [None]:
# Predict on the held-out rows and report the R^2 score.
y_pred = rf.predict(X_test_trans)
holdout_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(holdout_r2)

0.7912536050361068


Performing Hyperparameter Tuning :-

In [None]:
# Candidate values for each random-forest hyper-parameter in the grid search.
n_estimators = list(range(10, 30, 5))    # number of trees: 10, 15, 20, 25
max_depth = list(range(4, 7))            # tree depth limit: 4, 5, 6
max_features = [0.75, 0.8, 0.9, 0.95]    # fraction of features per split
max_samples = [0.7, 0.8, 0.9]            # fraction of rows per tree

In [None]:
# Search space handed to the grid search: parameter name -> candidate values.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    max_samples=max_samples,
)
print(param_grid)

{'n_estimators': [10, 15, 20, 25], 'max_features': [0.75, 0.8, 0.9, 0.95], 'max_depth': [4, 5, 6], 'max_samples': [0.7, 0.8, 0.9]}


In [None]:
# Base estimator for the grid search. The seed keeps every candidate's
# cross-validation score, and therefore the selected parameters, reproducible
# from run to run.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Exhaustive search over `param_grid` with 10-fold cross-validation.
# verbose=2 logs progress; n_jobs=-1 uses every available CPU core.
search_options = {"cv": 10, "verbose": 2, "n_jobs": -1}
rf_grid = GridSearchCV(rf, param_grid, **search_options)

In [None]:
# Run the search on the transformed training data only; the held-out rows
# are not used for model selection.
rf_grid.fit(X_train_trans , y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


In [None]:
# Parameter combination with the best mean cross-validation score.
rf_grid.best_params_

{'max_depth': 6, 'max_features': 0.95, 'max_samples': 0.9, 'n_estimators': 20}

In [None]:
# Mean cross-validated score achieved by the best parameter combination.
print(rf_grid.best_score_)

0.8215990151386674


Building the Best Model :-

---



In [None]:
# Build the final model from the parameters the grid search selected, rather
# than hand-copying them from a previous run's output, so the model always
# matches the search that was actually executed. The seed makes the fitted
# model and its reported score reproducible.
RF = RandomForestRegressor(**rf_grid.best_params_, random_state=42)

In [None]:
# Train the final model, predict the held-out rows, and report the R^2 score.
y_pred = RF.fit(X_train_trans, y_train).predict(X_test_trans)
final_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(final_r2)

0.8130761088082015


Dumping The Preprocessing Pipeline and Best_Model :-

In [None]:
# Output files for the two fitted objects. Both are needed at prediction
# time: new rows go through the preprocessing pipeline first, then the model.
# `joblib` is imported in the notebook's top import cell.
PREPROCESSOR_FILE = "Preprocessing_EcoWatt_Homes.pkl"
MODEL_FILE = "Best_Model_EcoWatt_Homes.pkl"

# Saving the preprocessing Pipeline
joblib.dump(Pipe, PREPROCESSOR_FILE)

# Saving the Best Model
joblib.dump(RF, MODEL_FILE)
print("Model And Preprocessing Object Saved SuccessFully .")

Model And Preprocessing Object Saved SuccessFully .
