# Extract, Transform, Load (ETL)
---
* Source `https://www.eia.gov/opendata/browser/`
* Main Route `Electricity`
    * Sub Route 1 - `Electric Power Operations (Annual And Monthly)`    
* Year range `2008` to `2022`
---
An API key can be obtained by signing up at `https://www.eia.gov/opendata/` and must then be assigned to the variable `api_key` in the `config.py` file. Additionally, the API URL path can be obtained after choosing the primary route and its sub-routes; it can then be copied and used here.

More information regarding EIA's API documentation can be found at `https://www.eia.gov/opendata/documentation.php`.

* Categorizing fuel types into the following bins:
    * `Fossil fuels` - anthracite coal, bituminous coal, bituminous coal and synthetic coal, 'coal, excluding waste coal', distillate fuel oil, fossil fuels, ignite coal, natural gas, natural gas & other gases, other gases, petroleum, petroleum coke, petroleum liquids, refined coal, residual fuel oil, subbituminous coal, lignite coal
    * `Renewables` - biogenic municipal solid waste, biomass, conventional hydroelectric, estimated small scale solar photovoltaic, estimated total solar, estimated total solar photovoltaic, geothermal, hydro-electric pumped storage, landfill gas, municiapl landfill gas, offshore wind turbine, onshore wind turbine, renewable, renewable waste products, solar, solar photovoltaic, solar thermal, waste coal, waste oil and other oils, wind, wood and wood wastes, other renewables
    * `Others` - other fuel types and nuclear

For location, we will use only the national-level rows (`U.S. Total`).

Binning information is based on sources from `https://www.eia.gov/tools/faqs/faq.php?id=427&t=3`, `https://www.eia.gov/electricity/data/browser/`, and `https://www.eia.gov/dnav/pet/TblDefs/pet_cons_821dst_tbldef2.asp`.

In [1]:
# Import dependencies
from config import api_key
import json
import requests
import pandas as pd

### Functions

In [2]:
def request_to_df(url, api_key, years = None):
    '''
        Function to request data from the target API once per period provided, collect the returned records
        in a list, then combine them into one DataFrame.

        Parameters:
            url: API path template containing the ||KEY||, ||START|| and ||END|| placeholders
            api_key: value substituted for ||KEY||
            years: iterable of periods (e.g. '2008-01'); each one is used as both the start and the end of a
                single request. None or an empty iterable sends no request and returns an empty DataFrame

        Returns:
            DataFrame built from the combined response['response']['data'] records of every request

        Raises:
            Exception: when a response contains a 'warning' or an 'error' key
    '''
    # None replaces the former mutable [] default, which is shared between calls
    if years is None:
        years = []

    data = []

    for year in years:
        # str() lets callers pass numeric years as well as 'YYYY-MM' strings
        period = str(year)
        api_path = url.replace('||KEY||', api_key).replace('||START||', period).replace('||END||', period)

        # Send the request
        response = requests.get(api_path).json()

        # Stop at the first bad response so that a partial data set is never returned silently
        if 'warning' in response or 'error' in response:
            raise Exception('Bad request submitted or no response received from the source API, verify that the url and/or offset provided is correct')

        data.extend(response['response']['data'])

    return pd.DataFrame(data)

def category_bin(df, check_col, list_to_bin, bin_name, new_col = ''):
    '''
        Function to bin the values of a DataFrame column: every value of check_col found in list_to_bin is
        labelled with bin_name.

        If new_col is empty or whitespace, the matching values in check_col are replaced by bin_name.
        If new_col is provided, check_col is left untouched and bin_name is written to new_col on the matching
        rows; rows that do not match keep their existing new_col value (NaN when the column is new).

        Parameters:
            df: source DataFrame (never modified, a copy is returned)
            check_col: name of the column holding the values to bin
            list_to_bin: values that belong to the bin
            bin_name: label of the bin
            new_col: optional name of the column receiving the bin label

        Returns:
            Copy of df with the binning applied
    '''
    tmp_df = df.copy()
    items = list(list_to_bin)

    # Nothing to bin: hand back the untouched copy
    if not items:
        return tmp_df

    if new_col == '' or new_col.isspace():
        # One replace call handles the whole list instead of one pass over the column per item
        tmp_df[check_col] = tmp_df[check_col].replace(items, bin_name)
        return tmp_df

    # Keep the target column as object dtype so that a string label is never written into a float (all-NaN)
    # column, which newer pandas versions warn about or reject
    if new_col in tmp_df.columns:
        current = tmp_df[new_col].astype(object)
    else:
        current = pd.Series(float('nan'), index = tmp_df.index, dtype = object)

    tmp_df[new_col] = current.mask(tmp_df[check_col].isin(items), bin_name)

    return tmp_df

def fix_nan(df, col, fill_value = 0, to_type = 'float'):
    '''
        Function to return a copy of the DataFrame in which the NaN values of the selected column(s) are
        replaced by fill_value and the column(s) are cast to to_type. The DataFrame passed in is not modified.
    '''
    cleaned = df.copy()

    # Fill first, then cast, so that the cast never has to deal with missing values
    filled = cleaned[col].fillna(fill_value)
    cleaned[col] = filled.astype(to_type)

    return cleaned

### Extract

In [3]:
# Build one 'YYYY-MM' period for every month from January 2008 through December 2022; each period becomes
# both the start and the end of one API request
epo_years = [f'{year}-{month:02d}' for year in range(2008, 2023) for month in range(1, 13)]

# API path template for retrieving the data; ||KEY||, ||START|| and ||END|| are placeholders that are
# substituted with the API key and the requested period
epo_url = 'https://api.eia.gov/v2/electricity/electric-power-operational-data/data/?api_key=||KEY||&frequency=monthly&data[0]=ash-content&data[1]=consumption-for-eg&data[2]=consumption-for-eg-btu&data[3]=consumption-uto&data[4]=consumption-uto-btu&data[5]=cost&data[6]=cost-per-btu&data[7]=generation&data[8]=heat-content&data[9]=receipts&data[10]=receipts-btu&data[11]=stocks&data[12]=sulfur-content&data[13]=total-consumption&data[14]=total-consumption-btu&start=||START||&end=||END||&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000'

In [4]:
# Request every period in epo_years from the API and combine the results into one raw DataFrame
epo_raw_df = request_to_df(epo_url, api_key, epo_years)

# Alternatively, when the raw data was already exported to csv, load it from the file instead of calling the
# API (comment out the request above and uncomment the line below)
# epo_raw_df = pd.read_csv('../static/data/epo_2012_2022_raw.csv')

In [5]:
# Export the raw data to a csv file so that it can be reloaded later without calling the API again
# NOTE(review): the file name says 2012_2022 while epo_years covers 2008 to 2022 — confirm before renaming
epo_raw_df.to_csv('../static/data/epo_2012_2022_raw.csv', index = False)

In [6]:
# List the column names of the raw DataFrame
epo_raw_df.keys()

Index(['period', 'location', 'stateDescription', 'sectorid',
       'sectorDescription', 'fueltypeid', 'fuelTypeDescription', 'ash-content',
       'ash-content-units', 'consumption-for-eg', 'consumption-for-eg-units',
       'consumption-for-eg-btu', 'consumption-for-eg-btu-units',
       'consumption-uto', 'consumption-uto-units', 'consumption-uto-btu',
       'consumption-uto-btu-units', 'cost', 'cost-units', 'cost-per-btu',
       'cost-per-btu-units', 'generation', 'generation-units', 'heat-content',
       'heat-content-units', 'receipts', 'receipts-units', 'receipts-btu',
       'receipts-btu-units', 'stocks', 'stocks-units', 'sulfur-content',
       'sulfur-content-units', 'total-consumption', 'total-consumption-units',
       'total-consumption-btu', 'total-consumption-btu-units'],
      dtype='object')

In [7]:
# Show the dtype of each column of the raw DataFrame
epo_raw_df.dtypes

period                           object
location                         object
stateDescription                 object
sectorid                          int64
sectorDescription                object
fueltypeid                       object
fuelTypeDescription              object
ash-content                     float64
ash-content-units                object
consumption-for-eg              float64
consumption-for-eg-units         object
consumption-for-eg-btu          float64
consumption-for-eg-btu-units     object
consumption-uto                 float64
consumption-uto-units            object
consumption-uto-btu             float64
consumption-uto-btu-units        object
cost                            float64
cost-units                       object
cost-per-btu                    float64
cost-per-btu-units               object
generation                      float64
generation-units                 object
heat-content                    float64
heat-content-units               object


### Transform and Load

In [8]:
# Work on a copy so that epo_raw_df keeps the data exactly as it was extracted
epo_cleaned_df = epo_raw_df.copy()

---

In [9]:
# Drop the columns that are not needed for our objective: the location/sector/fuel identifiers, every
# '-units' label column and the measures that are not expressed in BTU
epo_cleaned_df = epo_cleaned_df.drop(
    columns = [
        'location', 'sectorid', 'fueltypeid', 'sectorDescription',
        'ash-content-units', 'consumption-for-eg-units', 'consumption-for-eg-btu-units',
        'consumption-uto-units', 'consumption-uto-btu-units', 'cost-units', 'cost-per-btu-units',
        'generation-units', 'heat-content-units', 'receipts-units', 'receipts-btu-units',
        'stocks-units', 'sulfur-content-units', 'total-consumption-units', 'total-consumption-btu-units',
        'consumption-for-eg', 'consumption-uto', 'cost', 'receipts', 'total-consumption',
    ]
)
print(epo_cleaned_df.columns)

Index(['period', 'stateDescription', 'fuelTypeDescription', 'ash-content',
       'consumption-for-eg-btu', 'consumption-uto-btu', 'cost-per-btu',
       'generation', 'heat-content', 'receipts-btu', 'stocks',
       'sulfur-content', 'total-consumption-btu'],
      dtype='object')


In [10]:
# Count the rows per fuelTypeDescription value to see which fuel types are present in epo_cleaned_df
print('epo', epo_cleaned_df['fuelTypeDescription'].value_counts())

epo fuelTypeDescription
biomass                                     75729
all fuels                                   42153
fossil fuels                                40516
natural gas & other gases                   38337
renewable                                   38017
natural gas                                 37957
petroleum                                   36290
petroleum liquids                           36220
distillate fuel oil                         35446
all renewables                              35187
renewable waste products                    28506
all coal products                           26782
coal, excluding waste coal                  26699
other                                       23207
conventional hydroelectric                  23156
bituminous coal                             22200
bituminous coal and synthetic coal          22159
municiapl landfill gas                      21377
landfill gas                                20493
other renewables          

In [11]:
# Discard the rows whose fuelTypeDescription is one of the 'all ...' totals
# ('all coal products', 'all fuels', 'all renewables')
epo_cleaned_df = epo_cleaned_df.loc[
    ~epo_cleaned_df['fuelTypeDescription'].isin(['all coal products', 'all fuels', 'all renewables'])
]

# We are only interested in the national level, so keep the 'U.S. Total' rows only
epo_cleaned_df = epo_cleaned_df.loc[epo_cleaned_df['stateDescription'] == 'U.S. Total']

# Every remaining row now has the same stateDescription, so the column is no longer needed
epo_cleaned_df = epo_cleaned_df.drop(columns = 'stateDescription')

In [12]:
# Preview the first rows that remain after the row filters
epo_cleaned_df.head()

Unnamed: 0,period,fuelTypeDescription,ash-content,consumption-for-eg-btu,consumption-uto-btu,cost-per-btu,generation,heat-content,receipts-btu,stocks,sulfur-content,total-consumption-btu
35,2008-01,biogenic municipal solid waste,,0.33181,0.0,,16.22972,,0.0,0.0,,0.33181
36,2008-01,natural gas,0.0,218.25615,0.0,8.3075,25795.45248,1.0237,221807.3309,0.0,0.0,218.25615
40,2008-01,nuclear,,400.12862,0.0,,38151.089,,,,,400.12862
41,2008-01,biomass,,0.24026,0.0,,19.57995,,0.0,0.0,,0.24026
42,2008-01,other gases,0.0,0.19821,0.0,19.2457,4.848,1.302,,0.0,0.0,0.19821


In [13]:
# Bin every fuel type description of epo_cleaned_df into 'fossil fuels', 'renewables' or 'others'; the bin is
# stored in a new 'energySource' column and 'fuelTypeDescription' itself is left unchanged
# NOTE(review): 'ignite coal' looks like a misspelling of 'lignite coal' (which is also listed) — confirm
# before removing it
epo_ff_source = [
    'anthracite coal', 'bituminous coal', 'bituminous coal and synthetic coal', 'coal, excluding waste coal',
    'distillate fuel oil', 'fossil fuels', 'ignite coal', 'natural gas', 'natural gas & other gases',
    'other gases', 'petroleum', 'petroleum coke', 'petroleum liquids', 'refined coal', 'residual fuel oil',
    'subbituminous coal', 'lignite coal',
]

# 'municiapl landfill gas' is spelled exactly as it appears in the fuelTypeDescription values
epo_renew_source = [
    'biogenic municipal solid waste', 'biomass', 'conventional hydroelectric',
    'estimated small scale solar photovoltaic', 'estimated total solar', 'estimated total solar photovoltaic',
    'geothermal', 'hydro-electric pumped storage', 'landfill gas', 'municiapl landfill gas',
    'offshore wind turbine', 'onshore wind turbine', 'renewable', 'renewable waste products', 'solar',
    'solar photovoltaic', 'solar thermal', 'waste coal', 'waste oil and other oils', 'wind',
    'wood and wood wastes', 'other renewables',
]

# Every fuel type present in the data that belongs to neither list above goes into 'others'
binned_fuels = set(epo_ff_source) | set(epo_renew_source)
epo_oth_source = [
    item for item in epo_cleaned_df['fuelTypeDescription'].value_counts().index
    if item not in binned_fuels
]

for source_list, source_bin in (
    (epo_ff_source, 'fossil fuels'),
    (epo_renew_source, 'renewables'),
    (epo_oth_source, 'others'),
):
    epo_cleaned_df = category_bin(epo_cleaned_df, 'fuelTypeDescription', source_list, source_bin, 'energySource')

  tmp_df.loc[tmp_df[check_col] == item, new_col] = bin_name


In [14]:
# Count the rows per energySource bin to make sure the binning was done correctly
epo_cleaned_df['energySource'].value_counts()

energySource
renewables      13741
fossil fuels    10714
others           1115
Name: count, dtype: int64

In [15]:
# Fill the NaN values of every numeric measure column with 0, then set these columns as float
# Other units (in case more features are needed): 'consumption-for-eg', 'consumption-uto', 'cost', 'receipts',
# 'total-consumption'
epo_cols = [
    'ash-content', 'consumption-for-eg-btu', 'consumption-uto-btu', 'cost-per-btu', 'generation', 'heat-content',
    'receipts-btu', 'sulfur-content', 'total-consumption-btu', 'stocks'
]

# fix_nan is given the whole list of columns, so a single call covers all of them. The previous loop over
# epo_cols never used its loop variable and repeated this identical call once per column.
epo_cleaned_df = fix_nan(epo_cleaned_df, epo_cols)

In [16]:
# Review the first rows and the shape of the cleaned DataFrame before additional cleaning
display(epo_cleaned_df.head())
print('epo\'s shape:', epo_cleaned_df.shape)

Unnamed: 0,period,fuelTypeDescription,ash-content,consumption-for-eg-btu,consumption-uto-btu,cost-per-btu,generation,heat-content,receipts-btu,stocks,sulfur-content,total-consumption-btu,energySource
35,2008-01,biogenic municipal solid waste,0.0,0.33181,0.0,0.0,16.22972,0.0,0.0,0.0,0.0,0.33181,renewables
36,2008-01,natural gas,0.0,218.25615,0.0,8.3075,25795.45248,1.0237,221807.3309,0.0,0.0,218.25615,fossil fuels
40,2008-01,nuclear,0.0,400.12862,0.0,0.0,38151.089,0.0,0.0,0.0,0.0,400.12862,others
41,2008-01,biomass,0.0,0.24026,0.0,0.0,19.57995,0.0,0.0,0.0,0.0,0.24026,renewables
42,2008-01,other gases,0.0,0.19821,0.0,19.2457,4.848,1.302,0.0,0.0,0.0,0.19821,fossil fuels


epo's shape: (25570, 13)


In [17]:
# Drop the 'fuelTypeDescription' column since we are only looking at the high level of energy source, which
# is now carried by 'energySource'
epo_cleaned_df =  epo_cleaned_df.drop(columns = 'fuelTypeDescription')

In [18]:
# Using groupby() and mean() to merge matching rows into one row per 'period' and 'energySource', then round
# to 2 decimal places. Every remaining column is averaged within its group.
# NOTE(review): this comment used to describe sum(), a 'stateDescription' group key and UOM columns, but the
# code averages with mean() and those columns were dropped earlier — confirm that mean() is the intended
# aggregation
group_by = ['period', 'energySource']

epo_cleaned_df = epo_cleaned_df.groupby(group_by).mean().round(2).reset_index()
display(epo_cleaned_df.head())
print('epo\'s shape:', epo_cleaned_df.shape)

Unnamed: 0,period,energySource,ash-content,consumption-for-eg-btu,consumption-uto-btu,cost-per-btu,generation,heat-content,receipts-btu,stocks,sulfur-content,total-consumption-btu
0,2008-01,fossil fuels,3.12,281.61,10.36,7.0,28830.19,9.85,266757.42,23911.23,0.82,291.97
1,2008-01,others,0.0,251.47,0.88,0.0,23851.43,0.0,0.0,0.0,0.0,252.35
2,2008-01,renewables,2.65,14.39,4.72,1.06,1313.12,1.05,466.37,77.0,0.09,19.11
3,2008-02,fossil fuels,3.03,193.27,8.91,6.69,19511.71,11.03,191351.71,17899.89,1.16,202.18
4,2008-02,others,0.0,2.76,0.69,0.0,200.83,0.0,0.0,0.0,0.0,3.46


epo's shape: (539, 12)


In [19]:
# Append the first day of the month to the period ('YYYY-MM' becomes 'YYYY-MM-01') so we can transform it
# into date format
# Running this cell more than once appends '-01' again, so run it a single time per cleaned DataFrame
epo_cleaned_df['period'] += '-01'
epo_cleaned_df.head()

Unnamed: 0,period,energySource,ash-content,consumption-for-eg-btu,consumption-uto-btu,cost-per-btu,generation,heat-content,receipts-btu,stocks,sulfur-content,total-consumption-btu
0,2008-01-01,fossil fuels,3.12,281.61,10.36,7.0,28830.19,9.85,266757.42,23911.23,0.82,291.97
1,2008-01-01,others,0.0,251.47,0.88,0.0,23851.43,0.0,0.0,0.0,0.0,252.35
2,2008-01-01,renewables,2.65,14.39,4.72,1.06,1313.12,1.05,466.37,77.0,0.09,19.11
3,2008-02-01,fossil fuels,3.03,193.27,8.91,6.69,19511.71,11.03,191351.71,17899.89,1.16,202.18
4,2008-02-01,others,0.0,2.76,0.69,0.0,200.83,0.0,0.0,0.0,0.0,3.46


In [20]:
# Convert the period column from 'YYYY-MM-DD' strings to DateTime format
epo_cleaned_df['period'] = pd.to_datetime(epo_cleaned_df['period'], format='%Y-%m-%d')

In [21]:
# Check the column types after converting the period column
epo_cleaned_df.dtypes

period                    datetime64[ns]
energySource                      object
ash-content                      float64
consumption-for-eg-btu           float64
consumption-uto-btu              float64
cost-per-btu                     float64
generation                       float64
heat-content                     float64
receipts-btu                     float64
stocks                           float64
sulfur-content                   float64
total-consumption-btu            float64
dtype: object

In [22]:
# List the columns of the cleaned DataFrame
epo_cleaned_df.columns

Index(['period', 'energySource', 'ash-content', 'consumption-for-eg-btu',
       'consumption-uto-btu', 'cost-per-btu', 'generation', 'heat-content',
       'receipts-btu', 'stocks', 'sulfur-content', 'total-consumption-btu'],
      dtype='object')

In [23]:
# Export the cleaned DataFrame epo_cleaned_df into csv and json (one JSON record per row, dates in ISO format)
# NOTE(review): the file names say 2012_2022 while the periods requested cover 2008 to 2022 — confirm before
# renaming
epo_cleaned_df.to_csv('../static/data/epo_2012_2022_cleaned.csv', index = False)
epo_cleaned_df.to_json('../static/data/epo_2012_2022_cleaned.json', orient = 'records', date_format = 'iso')

### Database Storing
---

In [24]:
# Import dependencies for handling the database
from os import path, remove
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

In [25]:
# Setup the db path
db_path = '../static/data/eia_electric.sqlite'

# Delete the existing database if it exists so that every run starts from an empty file. The removal is
# attempted directly and a missing file is ignored, which avoids the gap between checking that the file
# exists and deleting it.
try:
    remove(db_path)
except FileNotFoundError:
    pass

In [26]:
# Setup the engine for the SQLite file at db_path and open a connection to the database
# NOTE(review): conn is not used by any of the cells shown here — confirm that it is needed
engine = create_engine(f'sqlite:///{db_path}')
conn = engine.connect()

In [27]:
# Create a session bound to the engine; it is used later to query the database and verify that the tables
# have been created correctly
session = Session(bind = engine)

In [28]:
# Write epo_cleaned_df to the 'epo' table of the database; if_exists = 'replace' overwrites the table when it
# already exists, and the DataFrame index is not stored
epo_cleaned_df.to_sql(name = 'epo', con = engine, if_exists = 'replace', index = False)

539

In [29]:
# Fetch the first row of the epo table to verify that the data was stored
session.execute(text('SELECT * from epo')).fetchone()

('2008-01-01 00:00:00.000000', 'fossil fuels', 3.12, 281.61, 10.36, 7.0, 28830.19, 9.85, 266757.42, 23911.23, 0.82, 291.97)

In [30]:
# Close out of the session, the connection and the engine. The connection returned by engine.connect() was
# previously never closed, so it is closed here before the engine is disposed.
session.close()
conn.close()
engine.dispose()