In [96]:
# Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sqlalchemy import create_engine

import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

import warnings
# NOTE(review): with no category or module argument this hides every warning
# raised for the rest of the session, including deprecation notices from the
# libraries imported above; narrow the filter if those should stay visible.
warnings.filterwarnings('ignore')

In [64]:
# Relative path to the SQLite database file; it is interpolated into the
# SQLAlchemy engine URL when the engine is created.
db = '../static/data/eia_electric.sqlite'

In [65]:
# Build the SQLite URL from the database path, create the engine for it,
# and open the connection used to read the table.
sqlite_url = f'sqlite:///{db}'
engine = create_engine(sqlite_url)
conn = engine.connect()

In [71]:
# Read the whole `epo` table over the open connection, converting the
# `period` column to datetimes, then preview the first rows.
epo_query = 'SELECT * FROM epo'
data = pd.read_sql(epo_query, con=conn, parse_dates=['period'])
data.head()

Unnamed: 0,period,energySource,ash-content,consumption-for-eg,consumption-for-eg-btu,consumption-uto,consumption-uto-btu,cost,cost-per-btu,generation,heat-content,receipts,receipts-btu,stocks,sulfur-content,total-consumption,total-consumption-btu
0,2008-01-01,fossil fuels,308.89,3734358.85,27878.91,594559.14,1025.72,4477.05,692.55,2854188.91,974.95,4011940.29,26408984.47,2367212.11,81.48,4328918.0,28904.63
1,2008-01-01,others,0.0,2666.2,2263.2,12391.55,7.95,0.0,0.0,214662.9,0.0,0.0,0.0,0.0,0.0,15057.75,2271.15
2,2008-01-01,renewables,254.19,51894.93,1381.14,4114.64,453.03,591.66,101.83,126059.98,101.13,4074.26,44771.59,7392.34,8.36,56009.57,1834.17
3,2008-02-01,fossil fuels,169.75,1476772.66,10823.13,321531.26,499.01,2371.16,374.65,1092655.81,617.48,1660658.05,10715695.93,1002393.72,64.86,1798303.92,11322.14
4,2008-02-01,others,0.0,913.06,13.82,8851.58,3.47,0.0,0.0,1004.15,0.0,0.0,0.0,0.0,0.0,9764.64,17.29


In [72]:
# Release the database resources now that the table is loaded into `data`.
# The connection is closed explicitly first: Engine.dispose() only closes
# connections that are checked in to the pool, so a connection that is still
# checked out would otherwise remain open.
conn.close()
engine.dispose()

In [80]:
# Build one period-indexed frame per energySource value
def _frame_for_source(source):
    '''Return the rows of `data` for one energySource, indexed by period.'''
    matches_source = data['energySource'] == source
    rows = data.loc[matches_source]
    return rows.drop(columns=['energySource']).set_index('period')


ff_df = _frame_for_source('fossil fuels')
re_df = _frame_for_source('renewables')
oth_df = _frame_for_source('others')

In [82]:
# Show the first rows of each per-source frame, in the order
# fossil fuels, renewables, others
display(ff_df.head(), re_df.head(), oth_df.head())

Unnamed: 0_level_0,ash-content,consumption-for-eg,consumption-for-eg-btu,consumption-uto,consumption-uto-btu,cost,cost-per-btu,generation,heat-content,receipts,receipts-btu,stocks,sulfur-content,total-consumption,total-consumption-btu
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2008-01-01,308.89,3734358.85,27878.91,594559.14,1025.72,4477.05,692.55,2854188.91,974.95,4011940.29,26408984.47,2367212.11,81.48,4328918.0,28904.63
2008-02-01,169.75,1476772.66,10823.13,321531.26,499.01,2371.16,374.65,1092655.81,617.48,1660658.05,10715695.93,1002393.72,64.86,1798303.92,11322.14
2008-03-01,186.0,2082011.42,18790.52,391763.24,517.57,2835.72,430.28,1896707.59,636.02,2399500.27,19884252.94,1688695.23,57.8,2473774.66,19308.09
2008-04-01,141.35,1203472.47,3568.69,292456.65,365.29,3000.42,467.37,389797.38,597.83,1418512.17,4269570.66,383229.39,57.73,1495929.12,3933.97
2008-05-01,292.46,3564108.14,20102.26,346582.24,613.08,4866.52,811.72,2047455.12,859.7,3941582.44,22080016.66,1857521.77,81.2,3910690.38,20715.35


Unnamed: 0_level_0,ash-content,consumption-for-eg,consumption-for-eg-btu,consumption-uto,consumption-uto-btu,cost,cost-per-btu,generation,heat-content,receipts,receipts-btu,stocks,sulfur-content,total-consumption,total-consumption-btu
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2008-01-01,254.19,51894.93,1381.14,4114.64,453.03,591.66,101.83,126059.98,101.13,4074.26,44771.59,7392.34,8.36,56009.57,1834.17
2008-02-01,84.5,56674.93,984.17,11392.97,286.58,79.05,10.15,92283.99,26.48,467.08,5661.45,0.31,2.22,68067.89,1270.75
2008-03-01,121.44,86340.29,1096.7,2532.69,275.74,301.47,48.14,97623.18,57.18,1673.73,19221.36,2269.99,7.06,88872.98,1372.43
2008-04-01,183.17,65583.37,955.16,3220.26,111.66,328.71,53.69,88211.84,63.21,3321.7,38998.03,3663.94,8.07,68803.63,1066.82
2008-05-01,192.42,114449.16,2110.28,10828.85,514.37,434.22,69.25,204628.12,82.69,2072.81,22865.65,2210.53,7.72,125278.01,2624.66


Unnamed: 0_level_0,ash-content,consumption-for-eg,consumption-for-eg-btu,consumption-uto,consumption-uto-btu,cost,cost-per-btu,generation,heat-content,receipts,receipts-btu,stocks,sulfur-content,total-consumption,total-consumption-btu
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2008-01-01,0.0,2666.2,2263.2,12391.55,7.95,0.0,0.0,214662.9,0.0,0.0,0.0,0.0,0.0,15057.75,2271.15
2008-02-01,0.0,913.06,13.82,8851.58,3.47,0.0,0.0,1004.15,0.0,0.0,0.0,0.0,0.0,9764.64,17.29
2008-03-01,0.0,417.94,364.76,18515.54,4.11,0.0,0.0,34789.2,0.0,0.0,0.0,0.0,0.0,18933.48,368.87
2008-04-01,0.0,88.84,1476.02,136.23,1.46,0.0,0.0,140679.82,0.0,0.0,0.0,0.0,0.0,225.07,1477.48
2008-05-01,0.0,1493.5,2067.83,8539.97,3.84,0.0,0.0,196224.27,0.0,0.0,0.0,0.0,0.0,10033.47,2071.67


In [81]:
# Spot-check the column dtypes on the fossil-fuels frame only; the other two
# frames are not inspected in this cell.
ff_df.dtypes

ash-content               float64
consumption-for-eg        float64
consumption-for-eg-btu    float64
consumption-uto           float64
consumption-uto-btu       float64
cost                      float64
cost-per-btu              float64
generation                float64
heat-content              float64
receipts                  float64
receipts-btu              float64
stocks                    float64
sulfur-content            float64
total-consumption         float64
total-consumption-btu     float64
dtype: object

In [83]:
# Print the (rows, columns) shape of each per-source frame on a single line
frame_shapes = [frame.shape for frame in (ff_df, re_df, oth_df)]
print(*frame_shapes)

(180, 15) (180, 15) (179, 15)


# FUNCTIONS

In [92]:
def set_scale_df(df, cols=None):
    '''Standardise columns of ``df`` with scikit-learn's ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale.
    cols : list of column labels, optional
        Columns to scale. When omitted, every column except the last one is
        scaled, which is what this function did before the parameter existed.
        Note that the last column is therefore absent from the default output.

    Returns
    -------
    The scaled values as returned by ``StandardScaler`` (a 2-D NumPy array
    under scikit-learn's default output configuration), with one row per row
    of ``df`` and one column per selected column. Column labels and the index
    of ``df`` are not carried over.
    '''
    if cols is None:
        # Original default: leave the final column out of the scaled set.
        cols = list(df)[:-1]
    else:
        cols = list(cols)

    # A fresh scaler is fitted on exactly the data it transforms, so the
    # fit and transform steps are done in one call.
    return StandardScaler().fit_transform(df[cols])



['ash-content', 'consumption-for-eg', 'consumption-for-eg-btu', 'consumption-uto', 'consumption-uto-btu', 'cost', 'cost-per-btu', 'generation', 'heat-content', 'receipts', 'receipts-btu', 'stocks', 'sulfur-content', 'total-consumption', 'total-consumption-btu']
['ash-content', 'consumption-for-eg', 'consumption-for-eg-btu', 'consumption-uto', 'consumption-uto-btu', 'cost', 'cost-per-btu', 'generation', 'heat-content', 'receipts', 'receipts-btu', 'stocks', 'sulfur-content', 'total-consumption', 'total-consumption-btu']


In [93]:
# Standardise each per-source frame independently (a separate scaler is
# fitted for every frame inside set_scale_df)
ff_train_scaled_df, re_train_scaled_df, oth_train_scaled_df = map(
    set_scale_df, (ff_df, re_df, oth_df)
)

In [95]:
# Print the first 5 scaled rows per energy source, with a dashed divider
# between consecutive sources
scaled_previews = (
    ('fossil fuels', ff_train_scaled_df),
    ('renewables', re_train_scaled_df),
    ('others', oth_train_scaled_df),
)
for position, (label, scaled_rows) in enumerate(scaled_previews):
    if position > 0:
        print('-' * 5)
    print(label, scaled_rows[:5])

fossil fuels [[ 1.59118429  0.69372857  3.19906015  1.35270603  2.37318514  2.34091676
   2.68075298  3.02911345  1.58777867  0.84892865  2.87387535  2.89757378
   1.09094845  0.79375917]
 [-0.02961288 -0.62308096  0.2864369   0.01310788  0.24537713  0.57226177
   0.83327554  0.16401519  0.15358062 -0.52716529  0.23967415  0.48433726
   0.4362113  -0.57731627]
 [ 0.15967815 -0.27005605  1.64703169  0.35769771  0.32035599  0.96242762
   1.15656957  1.47178929  0.22796457 -0.09475599  1.77866477  1.69783781
   0.15808589 -0.21134923]
 [-0.36043536 -0.78249206 -0.95240709 -0.1295453  -0.29482619  1.10075273
   1.37211829 -0.97916996  0.07474326 -0.66888178 -0.84234196 -0.61045307
   0.15532827 -0.74114159]
 [ 1.39979649  0.59442439  1.87103812  0.13601931  0.70619819  2.66801744
   3.37331003  1.71697712  1.12538651  0.80775154  2.14723529  1.99635252
   1.07991798  0.56716531]]
-----
renewables [[ 2.30255848 -0.95210114 -0.30106619 -1.2020454   1.43778887  3.16742772
   3.13412063 -0.517

In [None]:
# Plot ff's features
fig_ff, axes_ff = plt.subplots(nrows = 5, ncols = 3, dpi = 120)
for i, ax in enumerate(axes_ff)