# Energy Production Prediction

## Importing required packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

## Getting dataset

In [3]:
# Read the raw energy-production CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data/proj64/Energy_production.csv")
# Preview the first five rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,index,Date,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Total Actual Production
0,0,01-09-2017,Northern,624.23,484.21,30.36,35.57,273.27,320.81,927.86
1,1,01-09-2017,Western,1106.89,1024.33,25.17,3.81,72.0,21.53,1204.06
2,2,01-09-2017,Southern,576.66,578.55,62.73,49.8,111.57,64.78,750.96
3,3,01-09-2017,Eastern,441.02,429.39,,,85.94,69.36,526.96
4,4,01-09-2017,NorthEastern,29.11,15.91,,,24.64,21.21,53.75


## Cleaning dataset

In [4]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

index                                      0
Date                                       0
Region                                     0
Thermal Generation Actual (in MU)          0
Thermal Generation Estimated (in MU)       0
Nuclear Generation Actual (in MU)       1978
Nuclear Generation Estimated (in MU)    1978
Hydro Generation Actual (in MU)            0
Hydro Generation Estimated (in MU)         0
Total Actual Production                    0
dtype: int64

In [8]:
# Impute the gaps in both nuclear columns with each column's own mean.
# The result is assigned back instead of calling fillna(inplace=True) on a
# df[col] selection: that chained form warns in pandas 2.x and no longer
# updates df once Copy-on-Write is enabled.
# NOTE(review): in the preview rows with missing nuclear values the total equals
# thermal + hydro (441.02 + 85.94 = 526.96), so 0 may be the more faithful fill
# value than the mean — confirm before relying on these columns.
nuclear_cols = [
    "Nuclear Generation Actual (in MU)",
    "Nuclear Generation Estimated (in MU)",
]
df[nuclear_cols] = df[nuclear_cols].fillna(df[nuclear_cols].mean())

In [9]:
# Preview the first five rows after the imputation step.
df.head(n=5)

Unnamed: 0,index,Date,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Total Actual Production
0,0,01-09-2017,Northern,624.23,484.21,30.36,35.57,273.27,320.81,927.86
1,1,01-09-2017,Western,1106.89,1024.33,25.17,3.81,72.0,21.53,1204.06
2,2,01-09-2017,Southern,576.66,578.55,62.73,49.8,111.57,64.78,750.96
3,3,01-09-2017,Eastern,441.02,429.39,37.242208,36.987877,85.94,69.36,526.96
4,4,01-09-2017,NorthEastern,29.11,15.91,37.242208,36.987877,24.64,21.21,53.75


In [10]:

# Re-count missing values per column to verify that no gaps remain.
df.isnull().sum()

index                                   0
Date                                    0
Region                                  0
Thermal Generation Actual (in MU)       0
Thermal Generation Estimated (in MU)    0
Nuclear Generation Actual (in MU)       0
Nuclear Generation Estimated (in MU)    0
Hydro Generation Actual (in MU)         0
Hydro Generation Estimated (in MU)      0
Total Actual Production                 0
dtype: int64

## Pre-Processing

In [11]:
# Summarise column dtypes and non-null counts to see which columns still need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4945 entries, 0 to 4944
Data columns (total 10 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   index                                 4945 non-null   int64  
 1   Date                                  4945 non-null   object 
 2   Region                                4945 non-null   object 
 3   Thermal Generation Actual (in MU)     4945 non-null   float64
 4   Thermal Generation Estimated (in MU)  4945 non-null   float64
 5   Nuclear Generation Actual (in MU)     4945 non-null   float64
 6   Nuclear Generation Estimated (in MU)  4945 non-null   float64
 7   Hydro Generation Actual (in MU)       4945 non-null   float64
 8   Hydro Generation Estimated (in MU)    4945 non-null   float64
 9   Total Actual Production               4945 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 386.5+ KB


In [13]:
# Date is not used as a model feature, so remove it.
# Rebinding to a new frame (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
df = df.drop(columns="Date", errors="ignore")
df.head()

Unnamed: 0,index,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Total Actual Production
0,0,Northern,624.23,484.21,30.36,35.57,273.27,320.81,927.86
1,1,Western,1106.89,1024.33,25.17,3.81,72.0,21.53,1204.06
2,2,Southern,576.66,578.55,62.73,49.8,111.57,64.78,750.96
3,3,Eastern,441.02,429.39,37.242208,36.987877,85.94,69.36,526.96
4,4,NorthEastern,29.11,15.91,37.242208,36.987877,24.64,21.21,53.75


In [14]:
# Remove the "index" column (in the preview it simply repeats the row number,
# so it presumably carries no signal — confirm).
# Rebinding to a new frame (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
df = df.drop(columns="index", errors="ignore")
df.head()

Unnamed: 0,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Total Actual Production
0,Northern,624.23,484.21,30.36,35.57,273.27,320.81,927.86
1,Western,1106.89,1024.33,25.17,3.81,72.0,21.53,1204.06
2,Southern,576.66,578.55,62.73,49.8,111.57,64.78,750.96
3,Eastern,441.02,429.39,37.242208,36.987877,85.94,69.36,526.96
4,NorthEastern,29.11,15.91,37.242208,36.987877,24.64,21.21,53.75


In [15]:
# Check how many rows each region contributes.
df.loc[:, "Region"].value_counts()

Northern        989
Western         989
Southern        989
Eastern         989
NorthEastern    989
Name: Region, dtype: int64

In [16]:
# Turn the region names into integer codes.
# The encoder is fitted first and applied second so the learned mapping is explicit.
le = LabelEncoder()
le.fit(df["Region"])
df["Region"] = le.transform(df["Region"])
# Position in classes_ is the integer code assigned to that region.
le.classes_

array(['Eastern', 'NorthEastern', 'Northern', 'Southern', 'Western'],
      dtype=object)

In [17]:
# Preview the first five rows with Region now stored as integer codes.
df.head(n=5)

Unnamed: 0,Region,Thermal Generation Actual (in MU),Thermal Generation Estimated (in MU),Nuclear Generation Actual (in MU),Nuclear Generation Estimated (in MU),Hydro Generation Actual (in MU),Hydro Generation Estimated (in MU),Total Actual Production
0,2,624.23,484.21,30.36,35.57,273.27,320.81,927.86
1,4,1106.89,1024.33,25.17,3.81,72.0,21.53,1204.06
2,3,576.66,578.55,62.73,49.8,111.57,64.78,750.96
3,0,441.02,429.39,37.242208,36.987877,85.94,69.36,526.96
4,1,29.11,15.91,37.242208,36.987877,24.64,21.21,53.75


In [18]:
# Features are every column except the target; the target is total actual production.
# NOTE(review): the three "... Actual (in MU)" columns stay in X, and in the
# preview rows they add up to the target (624.23 + 30.36 + 273.27 = 927.86), so
# a model can largely rebuild y by addition — confirm this is intended and not
# target leakage.
X = df.drop(columns="Total Actual Production")
y = df["Total Actual Production"]

# A fixed random_state makes the 80/20 split (and every score computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
len(X_train), len(y_train)

(3956, 3956)

## Training model

In [20]:
# Baseline model: random forest with default hyper-parameters.
# random_state pins the forest's internal sampling so the reported score is
# reproducible from run to run.
rreg = RandomForestRegressor(random_state=42)
rreg.fit(X_train, y_train)
# score() reports R^2 on the held-out split.
rreg.score(X_test, y_test)

0.9999831158221425

In [40]:
# Gradient-boosted trees with default settings, trained on plain NumPy arrays
# (so the model is fitted without column names).
xgbreg = XGBRegressor()
xgbreg.fit(X_train.to_numpy(), y_train.to_numpy())
# score() reports R^2 on the held-out split.
xgbreg.score(X_test.to_numpy(), y_test.to_numpy())

0.9999777670766526

## Prediction

In [43]:
# Predict for the second held-out row; iloc[[1]] keeps the 2-D (1, n_features)
# shape that predict() expects.
xgbreg.predict(X_test.iloc[[1]].to_numpy())

array([733.7414], dtype=float32)