<a href="https://colab.research.google.com/github/ArunodayGupta/Electricity_Bill_Predictor/blob/main/Notebook/Final_Solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Read the raw data from a CSV file.
df = pd.read_csv("electricity_bill_dataset.csv")

# Hold out 20% of the rows as the test set. The split is seeded (same seed as
# the model below) so the reported scores are reproducible on every re-run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Every column except the target is a model input. Selecting by name rather
# than by position keeps the target out of the features even if the column
# order of the CSV changes.
target_col = "ElectricityBill"
input_cols = [col for col in train_df.columns if col != target_col]

# Work on copies so the splits can be modified without touching train_df/test_df.
train_input = train_df[input_cols].copy()
train_target = train_df[target_col].copy()

test_input = test_df[input_cols].copy()
test_target = test_df[target_col].copy()

# Group the input columns by dtype: numeric columns and object (string) columns.
numeric_cols = train_input.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_input.select_dtypes('object').columns.tolist()

# Rescale the numeric features with MinMaxScaler. The scaler is fitted on the
# training split only, so the test rows' min/max do not leak into the
# preprocessing; test values outside the training range map outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(train_input[numeric_cols])

# Apply the same fitted transform to both splits.
train_input[numeric_cols] = scaler.transform(train_input[numeric_cols])
test_input[numeric_cols] = scaler.transform(test_input[numeric_cols])

# One-hot encode the categorical columns. The encoder learns its categories
# from the training split only; handle_unknown="ignore" makes a category that
# appears only in the test split encode as all zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoder.fit(train_input[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

# Append the encoded indicator columns next to the original columns.
train_input[encoded_cols] = encoder.transform(train_input[categorical_cols])
test_input[encoded_cols] = encoder.transform(test_input[categorical_cols])

# Sanity check: row counts of inputs and targets must match within each split.
print(train_input.shape)
print(train_target.shape)
print(test_input.shape)
print(test_target.shape)

# The model is trained on the numeric and one-hot columns only; the raw
# categorical columns are dropped here.
train_final = train_input[numeric_cols + encoded_cols]
test_final = test_input[numeric_cols + encoded_cols]

# Fit a random forest regressor on the prepared training features to predict
# the electricity bill. n_jobs=-1 uses all available cores; the seed keeps the
# forest itself reproducible.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(train_final, train_target)

# Score (R^2 for a regressor) on the training split, then on the test split.
train_score = model.score(train_final, train_target)
test_score = model.score(test_final, test_target)
print(train_score)
print(test_score)

# Rank the features by their importance in the fitted model, most important first.
feature_importance = (
    pd.Series(model.feature_importances_, index=train_final.columns)
    .sort_values(ascending=False)
)
print(feature_importance)

(36276, 59)
(36276,)
(9069, 59)
(9069,)
0.9999949648916105
0.999976560148595
MonthlyHours                                                                  9.201472e-01
TariffRate                                                                    7.866308e-02
City_New Delhi                                                                2.345297e-04
City_Ratnagiri                                                                2.139225e-04
City_Nagpur                                                                   1.202551e-04
City_Ahmedabad                                                                9.704391e-05
City_Dahej                                                                    7.138488e-05
City_Navi Mumbai                                                              6.337180e-05
City_Pune                                                                     5.824269e-05
City_Kolkata                                                                  5.667128e-05
City_Shimla  