Importing the Necessary Libraries :-

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline , make_pipeline

Loading the Dataset :-

In [None]:
# Location of the Excel workbook holding the shop electricity data.
DATA_PATH = "/content/EcoWatt_Shops_Realistic_v2.xlsx"

# Read the workbook into a DataFrame and preview the first rows.
df = pd.read_excel(DATA_PATH)
df.head()

Unnamed: 0,State,City,Weather/Season,Shop_Type,Shop_Scale,Electricity_Tariff_Type,Avg_Working_Hours_Monthly,No_of_AC,AC_Type,Monthly_AC_Usage_Hours,...,No_of_Refrigerators_Type_2,Refrigerator_Type_2,Monthly_Refrigerator_Usage_Hours_Type_2,No_of_Lights,Lights_Type,Monthly_Lights_Usage_Hours,Billing_System/PC_Type,Monthly_PC_Usage_Hours,Monthly_Total_Kwh_Consumption,Usage_Category
0,Maharashtra,Kolhapur,Winter,Grocery,Medium,LT-2,287,1,Split AC,98,...,2,Deep Freezer (Single Lid),720,15,CFL,288,HP,223,842.76,High Usage
1,Others,Others,Summer,Medicals,Large,LT-2,211,2,,181,...,1,Deep Freezer (Double Lid),720,43,CFL,315,Dell,156,469.31,Medium Usage
2,Delhi,Mumbai,Monsoon,Clothing/Footwear,Medium,LT-2,246,1,Window AC,103,...,1,,720,24,CFL,375,,176,311.58,Low Usage
3,Others,Kolhapur,Monsoon,Medicals,Large,LT-2,285,2,Inverter AC,116,...,1,Deep Freezer (Single Lid),720,50,Tube Light,357,,159,1095.41,High Usage
4,West Bengal,Kolhapur,Summer,Grocery,Small,LT-2,215,0,Inverter AC,229,...,1,Deep Freezer (Single Lid),720,5,Tube Light,277,Lenovo,245,249.97,Low Usage


In [None]:
# Summarise each column's dtype and non-null count to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   State                                    500 non-null    object 
 1   City                                     500 non-null    object 
 2   Weather/Season                           500 non-null    object 
 3   Shop_Type                                500 non-null    object 
 4   Shop_Scale                               500 non-null    object 
 5   Electricity_Tariff_Type                  500 non-null    object 
 6   Avg_Working_Hours_Monthly                500 non-null    int64  
 7   No_of_AC                                 500 non-null    int64  
 8   AC_Type                                  361 non-null    object 
 9   Monthly_AC_Usage_Hours                   500 non-null    int64  
 10  No_of_Fans                               500 non-n

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Avg_Working_Hours_Monthly,No_of_AC,Monthly_AC_Usage_Hours,No_of_Fans,Monthly_Fan_Usage_Hours,No_of_Refrigerators_Type_1,Monthly_Refrigerator_Usage_Hours_Type_1,No_of_Refrigerators_Type_2,Monthly_Refrigerator_Usage_Hours_Type_2,No_of_Lights,Monthly_Lights_Usage_Hours,Monthly_PC_Usage_Hours,Monthly_Total_Kwh_Consumption
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,252.898,0.99,136.056,2.946,217.352,0.71,720.0,0.74,720.0,24.898,315.388,198.382,441.22996
std,28.681507,0.821736,54.824962,1.413889,38.732784,0.686179,0.0,0.719329,0.0,13.36611,37.714749,28.84455,248.475331
min,200.0,0.0,50.0,1.0,150.0,0.0,720.0,0.0,720.0,5.0,250.0,150.0,35.1
25%,229.0,0.0,90.0,2.0,187.75,0.0,720.0,0.0,720.0,13.0,285.0,173.75,249.2825
50%,254.0,1.0,129.0,3.0,217.0,1.0,720.0,1.0,720.0,24.0,315.5,195.0,410.4
75%,277.25,2.0,179.25,4.0,243.0,1.0,720.0,1.0,720.0,37.0,340.0,223.0,585.4125
max,300.0,2.0,249.0,5.0,299.0,2.0,720.0,2.0,720.0,50.0,399.0,250.0,1484.97


In [None]:
# (rows, columns) of the loaded data.
df.shape

(500, 26)

Replacing the "NaN" Values :-

In [None]:
# Label missing values in the non-numeric (categorical) columns with the string "None".
# Only non-numeric columns are touched: writing a string into a numeric column would
# turn it into an object column and break the numeric scaling later on. Any missing
# numeric value therefore stays NaN and remains visible instead of being hidden.
categorical_cols = df.select_dtypes(exclude="number").columns
df[categorical_cols] = df[categorical_cols].fillna("None")

In [None]:
# Where an appliance's type is the 'None' placeholder (appliance missing),
# set that appliance's monthly usage hours to 0.
usage_hours_by_type = {
    'AC_Type': 'Monthly_AC_Usage_Hours',
    'Fan_Type': 'Monthly_Fan_Usage_Hours',
    'Refrigerator_Type_1': 'Monthly_Refrigerator_Usage_Hours_Type_1',
    'Refrigerator_Type_2': 'Monthly_Refrigerator_Usage_Hours_Type_2',
    'Lights_Type': 'Monthly_Lights_Usage_Hours',
    'Billing_System/PC_Type': 'Monthly_PC_Usage_Hours',
}

for type_col, hours_col in usage_hours_by_type.items():
    df.loc[df[type_col] == 'None', hours_col] = 0


Checking Info After Value Replacement:-

In [None]:
# Re-check the non-null counts now that the missing values have been filled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   State                                    500 non-null    object 
 1   City                                     500 non-null    object 
 2   Weather/Season                           500 non-null    object 
 3   Shop_Type                                500 non-null    object 
 4   Shop_Scale                               500 non-null    object 
 5   Electricity_Tariff_Type                  500 non-null    object 
 6   Avg_Working_Hours_Monthly                500 non-null    int64  
 7   No_of_AC                                 500 non-null    int64  
 8   AC_Type                                  500 non-null    object 
 9   Monthly_AC_Usage_Hours                   500 non-null    int64  
 10  No_of_Fans                               500 non-n

In [None]:
# List the column names available for choosing the features and the target.
df.columns

Index(['State', 'City', 'Weather/Season', 'Shop_Type', 'Shop_Scale',
       'Electricity_Tariff_Type', 'Avg_Working_Hours_Monthly', 'No_of_AC',
       'AC_Type', 'Monthly_AC_Usage_Hours', 'No_of_Fans', 'Fan_Type',
       'Monthly_Fan_Usage_Hours', 'No_of_Refrigerators_Type_1',
       'Refrigerator_Type_1', 'Monthly_Refrigerator_Usage_Hours_Type_1',
       'No_of_Refrigerators_Type_2', 'Refrigerator_Type_2',
       'Monthly_Refrigerator_Usage_Hours_Type_2', 'No_of_Lights',
       'Lights_Type', 'Monthly_Lights_Usage_Hours', 'Billing_System/PC_Type',
       'Monthly_PC_Usage_Hours', 'Monthly_Total_Kwh_Consumption',
       'Usage_Category'],
      dtype='object')

In [None]:
# Confirm the row/column count after the value replacement.
df.shape

(500, 26)

Dividing the data into Independent & Dependent Columns :-

In [None]:
# Columns used as model inputs (order matters: it fixes the column order of X).
feature_cols = [
    'State',
    'City',
    'Weather/Season',
    'Shop_Type',
    'Shop_Scale',
    'Electricity_Tariff_Type',
    'Avg_Working_Hours_Monthly',
    'No_of_AC',
    'AC_Type',
    'Monthly_AC_Usage_Hours',
    'No_of_Fans',
    'Fan_Type',
    'Monthly_Fan_Usage_Hours',
    'No_of_Refrigerators_Type_1',
    'Refrigerator_Type_1',
    'Monthly_Refrigerator_Usage_Hours_Type_1',
    'No_of_Refrigerators_Type_2',
    'Refrigerator_Type_2',
    'Monthly_Refrigerator_Usage_Hours_Type_2',
    'No_of_Lights',
    'Lights_Type',
    'Monthly_Lights_Usage_Hours',
    'Billing_System/PC_Type',
    'Monthly_PC_Usage_Hours',
]
# Column the model is trained to predict.
target_col = 'Monthly_Total_Kwh_Consumption'

X = df[feature_cols]
y = df[target_col]

Performing Data Transformation:-

In [None]:
# Column groups, one per preprocessing step. The order inside each list is
# significant: it fixes the column order of the transformed output and must line
# up with the category lists given to the encoders.

# One-hot encoded columns.
Nominal_cols = [
    'State',
    'City',
    'Weather/Season',
]

# Appliance-type columns, integer-encoded with explicit category lists.
# NOTE: differs from `ordinal_cols` (further down) only by the capital "O".
Ordinal_cols = [
    'AC_Type',
    'Fan_Type',
    'Refrigerator_Type_1',
    'Refrigerator_Type_2',
    'Lights_Type',
    'Billing_System/PC_Type',
]

# Numeric columns that get scaled.
Numerical_cols = [
    'Avg_Working_Hours_Monthly',
    'No_of_AC',
    'Monthly_AC_Usage_Hours',
    'No_of_Fans',
    'Monthly_Fan_Usage_Hours',
    'No_of_Refrigerators_Type_1',
    'No_of_Refrigerators_Type_2',
    'Monthly_Refrigerator_Usage_Hours_Type_1',
    'Monthly_Refrigerator_Usage_Hours_Type_2',
    'No_of_Lights',
    'Monthly_Lights_Usage_Hours',
    'Monthly_PC_Usage_Hours',
]

# Shop-descriptor columns, integer-encoded with their own category lists.
ordinal_cols = [
    'Shop_Scale',
    "Shop_Type",
    'Electricity_Tariff_Type',
]

In [None]:
# Category lists handed to the OrdinalEncoders. Each inner list is encoded by
# position (first entry -> 0, second -> 1, ...), so the order inside a list is
# the encoding.

# One inner list per column of `Ordinal_cols`, in the same order.
Category1 = [
    ['Window AC', 'None', 'Inverter AC', 'Split AC'],  # AC_Type
    ['Havells', 'Crompton', 'None', 'Bajaj', 'Polycab'],  # Fan_Type
    ['Display Cooler (Double Door)', 'None', 'Display Cooler (Single Door)'],  # Refrigerator_Type_1
    ['Deep Freezer (Single Lid)', 'Deep Freezer (Double Lid)', 'None'],  # Refrigerator_Type_2
    # Lights_Type: 'None' is appended because the missing-value handling can
    # produce it for this column; without it the encoder would reject such rows.
    # Appending keeps the codes of the existing categories unchanged.
    ['CFL', 'LED', 'Tube Light', 'None'],
    ['Dell', 'HP', 'Lenovo', "None"],  # Billing_System/PC_Type
]

# One inner list per column of `ordinal_cols`, in the same order.
category2 = [
    # Shop_Scale: listed smallest to largest so the integer codes follow the
    # actual size order (Small=0 < Medium=1 < Large=2).
    ['Small', 'Medium', 'Large'],
    ['Bakery/SweetShop', 'Clothing/Footwear', 'Medicals', 'Grocery'],  # Shop_Type
    ['LT-2'],  # Electricity_Tariff_Type
]

In [None]:

# One encoder/scaler per column group.
nominal_encoder = OneHotEncoder(drop="first")
appliance_encoder = OrdinalEncoder(categories=Category1)
numeric_scaler = MinMaxScaler()
shop_encoder = OrdinalEncoder(categories=category2)

# Apply each step to its own columns; any column not listed is passed through unchanged.
transformer1 = ColumnTransformer(
    transformers=[
        ("tnf1", nominal_encoder, Nominal_cols),
        ("tnf2", appliance_encoder, Ordinal_cols),
        ("tnf3", numeric_scaler, Numerical_cols),
        ("tnf4", shop_encoder, ordinal_cols),
    ],
    remainder="passthrough",
)

In [None]:
# Single-step pipeline around the column transformer; this is the object that
# gets fitted on the training data and saved for reuse.
pipe = Pipeline(steps=[("transformer1", transformer1)])

Dividing the Data Into Train and Test Sets :-

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Fit the preprocessing pipeline on the training features only, then apply the
# already-fitted pipeline to the test features.
X_train_trans = pipe.fit_transform(X_train)
X_test_trans = pipe.transform(X_test)

Model Training & Evaluation :-

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest with default hyper-parameters.
# The seed is fixed so this score, and the grid search that reuses `rf` as its
# estimator, give the same result on every fresh run of the notebook.
rf = RandomForestRegressor(random_state=2)
rf.fit(X_train_trans, y_train)

# R² of the baseline on the held-out test set.
y_pred = rf.predict(X_test_trans)
print(r2_score(y_test, y_pred))

0.800644039898837


Performing Hyper-Parameter Tuning :-

In [None]:
# Candidate values for each random-forest hyper-parameter explored by the grid search.
n_estimators = [10, 15, 20, 25]        # number of trees
max_features = [0.75, 0.8, 0.9, 0.95]  # fraction of features considered per split
max_depth = [4, 5, 6]                  # maximum tree depth
max_samples = [0.7, 0.8, 0.9]          # fraction of rows drawn for each tree

In [None]:
# Search space: hyper-parameter name -> list of candidate values.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    max_samples=max_samples,
)
print(param_grid)

{'n_estimators': [10, 15, 20, 25], 'max_features': [0.75, 0.8, 0.9, 0.95], 'max_depth': [4, 5, 6], 'max_samples': [0.7, 0.8, 0.9]}


In [None]:
# Exhaustive search over every combination in `param_grid`, scored with 10-fold
# cross-validation; n_jobs=-1 runs the fits in parallel and verbose=2 logs progress.
rf_grid = GridSearchCV(rf, param_grid, cv=10, n_jobs=-1, verbose=2)

In [None]:
# Run the grid search on the transformed training data.
rf_grid.fit(X_train_trans , y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


In [None]:
# Hyper-parameter combination selected by the search.
rf_grid.best_params_



{'max_depth': 6, 'max_features': 0.95, 'max_samples': 0.8, 'n_estimators': 25}

In [None]:
# Cross-validated score of the selected combination
# (presumably R², the regressor's default score; confirm if a custom scorer is added).
print(rf_grid.best_score_)


0.7399316119737764


In [None]:
# R² of the search's predictions on the held-out test set
# (presumably made by the best estimator refit on the full training data; confirm `refit` is left at its default).
y_pred = rf_grid.predict(X_test_trans)
print(r2_score(y_test,y_pred))

0.7618674511071017


Building the Best Model :-

In [None]:
# Build the final model from the parameters the grid search actually selected,
# rather than from hand-copied values that go stale whenever the search is re-run.
# The seed is fixed so the fitted (and later saved) model is reproducible.
RF = RandomForestRegressor(**rf_grid.best_params_, random_state=2)

In [None]:
# Train the final model on the transformed training data and report its R² on the test set.
RF.fit(X_train_trans , y_train)
y_pred = RF.predict(X_test_trans)
print(r2_score(y_test , y_pred))

0.7638107943601558


Dumping the Preprocessing Pipeline And Best Model :-

In [None]:
import joblib

# Output files for the two saved objects.
PREPROCESSOR_FILE = "Preprocessing_EcoWatt_Shops.pkl"
MODEL_FILE = "Best_Model_EcoWatt_Shops.pkl"

# Save the fitted preprocessing pipeline and the final model as separate files.
joblib.dump(pipe, PREPROCESSOR_FILE)
joblib.dump(RF, MODEL_FILE)
print("Model And Preprocessing Object Saved SuccessFully .")


Model And Preprocessing Object Saved SuccessFully .
