In [1]:
import json
import os
from datetime import datetime
from urllib.error import URLError, HTTPError
from urllib.request import urlopen

import numpy as np
import pandas as pd
from dateutil.parser import parse

In [2]:
'''
Code copied from https://quantcorner.wordpress.com/2014/11/18/downloading-eias-data-with-python/ on 8/13/2018
'''

class EIAgov(object):
    '''
    Small client that downloads one or more series from the EIA series
    endpoint and assembles them into a single pandas DataFrame.
    '''

    def __init__(self, token, series):
        '''
        Purpose:
        Initialise the EIAgov class by requesting:
        - EIA token
        - id code(s) of the series to be downloaded

        Parameters:
        - token: string
        - series: string or list of strings
        '''
        self.token = token
        self.series = series

    def Raw(self, ser):
        '''
        Purpose:
        Download one series and return the decoded JSON payload.

        Parameters:
        - ser: string, series id (upper-cased before it is put in the URL)

        Returns:
        - the parsed JSON object, or None when the request fails with an
          HTTPError/URLError (the error is printed, not raised)
        '''
        # NOTE(review): this is a plain-http URL, so the API key travels
        # unencrypted; confirm whether the endpoint can be reached over https
        # and whether this version of the EIA API is still served.
        url = 'http://api.eia.gov/series/?api_key=' + self.token + '&series_id=' + ser.upper()

        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(url) as response:
                raw_byte = response.read()
            # 'utf-8-sig' drops a leading byte-order mark when one is present.
            raw_string = str(raw_byte, 'utf-8-sig')
            return json.loads(raw_string)

        # HTTPError is a subclass of URLError, so it must be handled first.
        except HTTPError as e:
            print('HTTP error type.')
            print('Error code: ', e.code)

        except URLError as e:
            print('URL type error.')
            print('Reason: ', e.reason)

        return None

    def _series_rows(self, ser):
        '''
        Purpose:
        Fetch one series and return its list of [date, value] rows.

        Raises:
        - RuntimeError: the download failed (Raw returned None)
        - ValueError: the payload has no payload['series'][0]['data'] entry
        '''
        payload = self.Raw(ser)
        if payload is None:
            raise RuntimeError('Download failed for series ' + repr(ser) +
                               '; see the error printed above.')
        try:
            return payload['series'][0]['data']
        except (KeyError, IndexError, TypeError) as err:
            raise ValueError('Unexpected response for series ' + repr(ser) +
                             ': no series data found.') from err

    def GetData(self):
        '''
        Purpose:
        Download every requested series and combine them in one DataFrame.

        Returns:
        - DataFrame with a 'Date' column holding the dates of the FIRST
          series (in the order received) followed by one column per series
          id. Values of the other series are matched to those dates by date,
          not by position; dates a series does not report are left missing.

        Raises:
        - ValueError: no series id was supplied, or a payload is malformed
        - RuntimeError: a download failed
        '''
        # Accept a single id given as a plain string, as the constructor
        # documents; indexing the string directly would yield characters.
        if isinstance(self.series, str):
            series_ids = [self.series]
        else:
            series_ids = list(self.series)
        if not series_ids:
            raise ValueError('No series ids were supplied.')

        # The first series defines the date column; keep its rows so the
        # series is not downloaded a second time below.
        first_rows = self._series_rows(series_ids[0])
        dates = [row[0] for row in first_rows]
        df = pd.DataFrame({'Date': dates})

        for position, ser in enumerate(series_ids):
            if position == 0:
                values = [row[1] for row in first_rows]
            else:
                # Look values up by date so series with different lengths or
                # gaps stay aligned (if a date repeats, the last row wins).
                by_date = {row[0]: row[1] for row in self._series_rows(ser)}
                values = [by_date.get(day) for day in dates]
            df[ser] = values

        return df



In [3]:
# Download hourly demand data for multiple BAs and combine the series into a
# single table (df_all) with one column per series id.
# To do:
# - download all time series data per BA and do quality checks; but first
#   check if EIA will release the code they use for QC

if __name__ == '__main__':
    # Read the API key from the environment so it is never stored in the
    # notebook. NOTE(review): a key was previously hardcoded in this cell;
    # treat it as exposed and rotate it.
    tok = os.environ.get('EIA_API_KEY')
    if not tok:
        raise RuntimeError('Set the EIA_API_KEY environment variable to your EIA API token.')

    demand_list = ['EBA.AVA-ALL.D.H',
                   'EBA.AZPS-ALL.D.H',
                   'EBA.BANC-ALL.D.H',
                   'EBA.BPAT-ALL.D.H',
                   'EBA.CHPD-ALL.D.H',
                   'EBA.CISO-ALL.D.H',
                   'EBA.DOPD-ALL.D.H',
                   'EBA.EPE-ALL.D.H',
                   'EBA.GCPD-ALL.D.H',
                   'EBA.IID-ALL.D.H',
                   'EBA.IPCO-ALL.D.H',
                   'EBA.LDWP-ALL.D.H',
                   'EBA.NEVP-ALL.D.H',
                   'EBA.NWMT-ALL.D.H',
                   'EBA.PACE-ALL.D.H',
                   'EBA.PACW-ALL.D.H',
                   'EBA.PGE-ALL.D.H',
                   'EBA.PSCO-ALL.D.H',
                   'EBA.PSEI-ALL.D.H',
                   'EBA.SRP-ALL.D.H',
                   'EBA.SCL-ALL.D.H',
                   'EBA.TEPC-ALL.D.H',
                   'EBA.TIDC-ALL.D.H',
                   'EBA.TPWR-ALL.D.H',
                   'EBA.WALC-ALL.D.H',
                   'EBA.WACM-ALL.D.H',
                   'EBA.WAUW-ALL.D.H']

    # One single-column frame per series, keyed by series id.
    df = {}
    for BA in demand_list:
        print(BA)
        d = EIAgov(tok, [BA])
        frame = d.GetData()
        frame.index = pd.to_datetime(frame['Date'])
        # Put the series on a regular hourly grid; hours the series does not
        # report become NaN rows. 'H' is kept (rather than the newer 'h'
        # alias) for compatibility with older pandas releases.
        df[BA] = frame.drop(columns=['Date']).resample('H').asfreq()

    # A single column-wise concat performs an outer join, so df_all's index is
    # the union of every series' hourly index and no seed frame is needed.
    df_all = pd.concat([df[BA] for BA in demand_list], axis=1)

    # Check that df_all spans the earliest and latest dates of all series and
    # carries exactly one column per requested series.
    starts = [frame.index.min() for frame in df.values() if len(frame)]
    ends = [frame.index.max() for frame in df.values() if len(frame)]
    if starts:
        assert df_all.index.min() == min(starts)
        assert df_all.index.max() == max(ends)
    assert list(df_all.columns) == demand_list
        


EBA.AVA-ALL.D.H
EBA.AZPS-ALL.D.H
EBA.BANC-ALL.D.H
EBA.BPAT-ALL.D.H
EBA.CHPD-ALL.D.H
EBA.CISO-ALL.D.H
EBA.DOPD-ALL.D.H
EBA.EPE-ALL.D.H
EBA.GCPD-ALL.D.H
EBA.IID-ALL.D.H
EBA.IPCO-ALL.D.H
EBA.LDWP-ALL.D.H
EBA.NEVP-ALL.D.H
EBA.NWMT-ALL.D.H
EBA.PACE-ALL.D.H
EBA.PACW-ALL.D.H
EBA.PGE-ALL.D.H
EBA.PSCO-ALL.D.H
EBA.PSEI-ALL.D.H
EBA.SRP-ALL.D.H
EBA.SCL-ALL.D.H
EBA.TEPC-ALL.D.H
EBA.TIDC-ALL.D.H
EBA.TPWR-ALL.D.H
EBA.WALC-ALL.D.H
EBA.WACM-ALL.D.H
EBA.WAUW-ALL.D.H


In [4]:
# Preview the first rows of the combined table (one column per series id).
df_all.head()

Unnamed: 0_level_0,EBA.AVA-ALL.D.H,EBA.AZPS-ALL.D.H,EBA.BANC-ALL.D.H,EBA.BPAT-ALL.D.H,EBA.CHPD-ALL.D.H,EBA.CISO-ALL.D.H,EBA.DOPD-ALL.D.H,EBA.EPE-ALL.D.H,EBA.GCPD-ALL.D.H,EBA.IID-ALL.D.H,...,EBA.PSCO-ALL.D.H,EBA.PSEI-ALL.D.H,EBA.SRP-ALL.D.H,EBA.SCL-ALL.D.H,EBA.TEPC-ALL.D.H,EBA.TIDC-ALL.D.H,EBA.TPWR-ALL.D.H,EBA.WALC-ALL.D.H,EBA.WACM-ALL.D.H,EBA.WAUW-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-07-01 07:00:00,,,,,,,,,,,...,4875.0,,,,,,,,,
2015-07-01 08:00:00,1192.0,4478.0,2513.0,,434.0,31486.0,177.0,925.0,663.0,505.0,...,4618.0,3066.0,4103.0,873.0,1605.0,408.0,468.0,1119.0,,
2015-07-01 09:00:00,1108.0,4227.0,2275.0,,422.0,28989.0,169.0,856.0,654.0,482.0,...,4427.0,2865.0,3897.0,833.0,1537.0,380.0,441.0,1018.0,,
2015-07-01 10:00:00,1058.0,4016.0,2104.0,,416.0,27416.0,161.0,839.0,646.0,474.0,...,4344.0,2735.0,3694.0,802.0,1487.0,357.0,420.0,1039.0,,
2015-07-01 11:00:00,1024.0,3879.0,1988.0,,413.0,26388.0,160.0,827.0,642.0,450.0,...,4374.0,2710.0,3590.0,796.0,1470.0,342.0,417.0,1019.0,,


In [5]:
# Count missing hourly values per series. A value is missing either because
# the series has no reading for that hour or because the hour lies outside
# that series' own date range (df_all's index spans all series).
print(df_all.isnull().sum())


EBA.AVA-ALL.D.H      900
EBA.AZPS-ALL.D.H     652
EBA.BANC-ALL.D.H     744
EBA.BPAT-ALL.D.H    1952
EBA.CHPD-ALL.D.H     455
EBA.CISO-ALL.D.H     455
EBA.DOPD-ALL.D.H    2195
EBA.EPE-ALL.D.H      401
EBA.GCPD-ALL.D.H     432
EBA.IID-ALL.D.H      832
EBA.IPCO-ALL.D.H     534
EBA.LDWP-ALL.D.H    1019
EBA.NEVP-ALL.D.H    1314
EBA.NWMT-ALL.D.H     938
EBA.PACE-ALL.D.H    3144
EBA.PACW-ALL.D.H    4273
EBA.PGE-ALL.D.H      915
EBA.PSCO-ALL.D.H     820
EBA.PSEI-ALL.D.H     721
EBA.SRP-ALL.D.H     1474
EBA.SCL-ALL.D.H      535
EBA.TEPC-ALL.D.H    1687
EBA.TIDC-ALL.D.H     525
EBA.TPWR-ALL.D.H     506
EBA.WALC-ALL.D.H     673
EBA.WACM-ALL.D.H    1249
EBA.WAUW-ALL.D.H    4758
dtype: int64


In [6]:
# Preview the last rows of the combined table.
df_all.tail()

Unnamed: 0_level_0,EBA.AVA-ALL.D.H,EBA.AZPS-ALL.D.H,EBA.BANC-ALL.D.H,EBA.BPAT-ALL.D.H,EBA.CHPD-ALL.D.H,EBA.CISO-ALL.D.H,EBA.DOPD-ALL.D.H,EBA.EPE-ALL.D.H,EBA.GCPD-ALL.D.H,EBA.IID-ALL.D.H,...,EBA.PSCO-ALL.D.H,EBA.PSEI-ALL.D.H,EBA.SRP-ALL.D.H,EBA.SCL-ALL.D.H,EBA.TEPC-ALL.D.H,EBA.TIDC-ALL.D.H,EBA.TPWR-ALL.D.H,EBA.WALC-ALL.D.H,EBA.WACM-ALL.D.H,EBA.WAUW-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-08-23 18:00:00,1338.0,4532.0,2074.0,,190.0,31064.0,,,671.0,,...,5594.0,3485.0,4758.0,1059.0,,356.0,526.0,1144.0,3323.0,107.0
2018-08-23 19:00:00,1387.0,4759.0,2150.0,,195.0,31688.0,,,685.0,,...,5869.0,3521.0,5097.0,1065.0,,366.0,525.0,1194.0,3412.0,110.0
2018-08-23 20:00:00,1434.0,5037.0,2248.0,,203.0,32516.0,,,696.0,,...,6102.0,3539.0,5408.0,1068.0,,382.0,522.0,1248.0,3442.0,114.0
2018-08-23 21:00:00,1482.0,5336.0,2369.0,,201.0,33728.0,,,711.0,,...,6393.0,3570.0,5672.0,1057.0,,400.0,540.0,1263.0,3453.0,115.0
2018-08-23 22:00:00,1516.0,5539.0,2488.0,,206.0,35172.0,,,719.0,,...,6634.0,3590.0,5811.0,1054.0,,422.0,128.0,1284.0,3551.0,119.0


In [8]:
# Sanity check: expect one column per entry in demand_list (27 series ids).
len(df_all.columns)

27

In [9]:
# Summary statistics per series.
# NOTE(review): in the saved output several series show negative minimums and
# maximums far above their 75th percentile, so the raw values probably need
# outlier screening before use - confirm against the data source.
df_all.describe()

Unnamed: 0,EBA.AVA-ALL.D.H,EBA.AZPS-ALL.D.H,EBA.BANC-ALL.D.H,EBA.BPAT-ALL.D.H,EBA.CHPD-ALL.D.H,EBA.CISO-ALL.D.H,EBA.DOPD-ALL.D.H,EBA.EPE-ALL.D.H,EBA.GCPD-ALL.D.H,EBA.IID-ALL.D.H,...,EBA.PSCO-ALL.D.H,EBA.PSEI-ALL.D.H,EBA.SRP-ALL.D.H,EBA.SCL-ALL.D.H,EBA.TEPC-ALL.D.H,EBA.TIDC-ALL.D.H,EBA.TPWR-ALL.D.H,EBA.WALC-ALL.D.H,EBA.WACM-ALL.D.H,EBA.WAUW-ALL.D.H
count,26692.0,26940.0,26848.0,25640.0,27137.0,27137.0,25397.0,27191.0,27160.0,26760.0,...,26772.0,26871.0,26118.0,27057.0,25905.0,27067.0,27086.0,26919.0,26343.0,22834.0
mean,1356.463127,3742.7,2035.997206,6297.84688,237.620739,26587.435347,176.791747,977.12677,567.948012,416.737556,...,4991.98674,3406.589744,3424.255877,1110.101489,1684.020459,306.795877,555.22927,1154.497158,2942.027066,92.834019
std,466.437654,14470.26,537.410696,990.477522,102.706333,5176.54259,44.68398,283.800133,81.997369,183.090255,...,928.266512,640.678941,1848.013905,218.061683,689.682195,87.267497,112.254126,3983.630916,380.938506,19.096696
min,748.0,2023.0,0.0,2600.0,-85.0,18068.0,0.0,-7342.0,362.0,-29.0,...,0.0,679.0,-755.0,0.0,890.0,0.0,-12.0,-92392.0,-933.0,0.0
25%,1148.0,2805.0,1696.0,5620.0,159.0,22836.0,143.0,787.0,507.0,280.75,...,4381.0,2958.0,2643.0,974.0,1399.0,250.0,480.0,885.0,2691.0,79.0
50%,1327.0,3275.0,1905.0,6214.0,201.0,25617.0,167.0,901.0,558.0,357.0,...,4890.0,3379.0,3041.5,1109.0,1563.0,284.0,540.0,1016.0,2889.0,90.0
75%,1558.0,4154.0,2151.0,6886.0,294.0,28762.0,205.0,1108.0,619.0,515.0,...,5422.0,3823.0,3934.0,1236.0,1843.0,337.0,625.0,1184.0,3162.0,105.0
max,58854.0,1680538.0,4763.0,11827.0,591.0,49899.0,397.0,5150.0,982.0,1081.0,...,60576.0,5504.0,247000.0,11583.0,66155.0,653.0,998.0,401364.0,14528.0,168.0


In [11]:
# Earliest timestamp in the combined index.
min(df_all.index)

Timestamp('2015-07-01 07:00:00', freq='H')

In [12]:
# Latest timestamp in the combined index.
max(df_all.index)

Timestamp('2018-08-23 22:00:00', freq='H')