### Checking how downloaded data treats missing data: results
The series data have time stamps by the hour. 
* If an hour has missing data, there is no row entry of data for that hour.
* Using the downloaded code: if multiple data sets are downloaded together, only rows for the intersection of the timestamps with data are kept, so many hours of valid data can be thrown out when combining the downloads as is. 
* Solution: download each series separately, fill in the missing timestamps and give NA values before combining the datasets. 
* Note that the data don't really have to be combined for the simulations; combining them is more for inter-BA analyses.

In [1]:
import json
import numpy as np
import pandas as pd
from urllib.error import URLError, HTTPError
from urllib.request import urlopen

from datetime import datetime
from dateutil.parser import parse

In [2]:
'''
Code copied from https://quantcorner.wordpress.com/2014/11/18/downloading-eias-data-with-python/ on 8/13/2018
'''

class EIAgov(object):
    def __init__(self, token, series):
        '''
        Purpose:
        Initialise the EIAgov class by requesting:
        - EIA token
        - id code(s) of the series to be downloaded

        Parameters:
        - token: string
        - series: string or list of strings
        '''
        self.token = token
        self.series = series

    def Raw(self, ser):
            # Construct url
            url = 'http://api.eia.gov/series/?api_key=' + self.token + '&series_id=' + ser.upper()
            #url = 'http://api.eia.gov/category/?api_key=' + self.token + '&category_id=' + ser.id

            try:
                # URL request, URL opener, read content
                response = urlopen(url);
                raw_byte = response.read()
                raw_string = str(raw_byte, 'utf-8-sig')
                jso = json.loads(raw_string)
                return jso

            except HTTPError as e:
                print('HTTP error type.')
                print('Error code: ', e.code)

            except URLError as e:
                print('URL type error.')
                print('Reason: ', e.reason)

    def GetData(self):
            # Deal with the date series                       
            date_ = self.Raw(self.series[0])        
            date_series = date_['series'][0]['data']
            endi = len(date_series) # or len(date_['series'][0]['data'])
            date = []
            for i in range (endi):
                date.append(date_series[i][0])

            # Create dataframe
            df = pd.DataFrame(data=date)
            df.columns = ['Date']

            # Deal with data
            lenj = len(self.series)
            for j in range (lenj):
                data_ = self.Raw(self.series[j])
                data_series = data_['series'][0]['data']
                data = []
                endk = len(date_series)         
                for k in range (endk):
                    data.append(data_series[k][1])
                df[self.series[j]] = data

            return df



In [3]:
if __name__ == '__main__':
    tok = '6d666bb7097e41102ef69a35aa1edb2b'

    #demand example
    demand_all = ['EBA.AVA-ALL.D.H','EBA.BANC-ALL.D.H']
    data_all = EIAgov(tok, demand_all)
    
    print(data_all.GetData())
    
    df_all = data_all.GetData()

    demand1 = ['EBA.AVA-ALL.D.H']
    demand2 = ['EBA.BANC-ALL.D.H']
    d1 = EIAgov(tok, demand1)
    d2 = EIAgov(tok, demand2)
    
    df1 = d1.GetData()
    df2 = d2.GetData()
    
    print(df_all.shape)
    print(df1.shape)
    print(df2.shape)

               Date  EBA.AVA-ALL.D.H  EBA.BANC-ALL.D.H
0      20180823T00Z           1563.0              2880
1      20180822T23Z           1533.0              2727
2      20180822T22Z           1495.0              2558
3      20180822T21Z           1445.0              2373
4      20180822T20Z           1375.0              2152
5      20180822T19Z           1321.0              1996
6      20180822T18Z           1275.0              2049
7      20180822T17Z           1220.0              1991
8      20180822T16Z           1189.0              1913
9      20180822T15Z           1178.0              1868
10     20180822T14Z           1105.0              1803
11     20180822T13Z           1002.0              1683
12     20180822T12Z            921.0              1587
13     20180822T11Z            891.0              1559
14     20180822T10Z            906.0              1584
15     20180822T09Z            926.0              1633
16     20180822T08Z            985.0              1704
17     201

In [4]:
df1.head()

Unnamed: 0,Date,EBA.AVA-ALL.D.H
0,20180823T00Z,1563.0
1,20180822T23Z,1533.0
2,20180822T22Z,1495.0
3,20180822T21Z,1445.0
4,20180822T20Z,1375.0


In [5]:
df1['Date0'] = pd.to_datetime(df1['Date'])

In [6]:
df1.head()

Unnamed: 0,Date,EBA.AVA-ALL.D.H,Date0
0,20180823T00Z,1563.0,2018-08-23 00:00:00
1,20180822T23Z,1533.0,2018-08-22 23:00:00
2,20180822T22Z,1495.0,2018-08-22 22:00:00
3,20180822T21Z,1445.0,2018-08-22 21:00:00
4,20180822T20Z,1375.0,2018-08-22 20:00:00


In [7]:
df1.index = df1['Date0']

In [8]:
df1.head()

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,Date0
Date0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-08-23 00:00:00,20180823T00Z,1563.0,2018-08-23 00:00:00
2018-08-22 23:00:00,20180822T23Z,1533.0,2018-08-22 23:00:00
2018-08-22 22:00:00,20180822T22Z,1495.0,2018-08-22 22:00:00
2018-08-22 21:00:00,20180822T21Z,1445.0,2018-08-22 21:00:00
2018-08-22 20:00:00,20180822T20Z,1375.0,2018-08-22 20:00:00


In [9]:
df1.shape

(26670, 3)

In [10]:
type(df1.Date[0])

str

In [11]:
df2.index = pd.to_datetime(df2['Date'])

In [12]:
df2.head()

Unnamed: 0_level_0,Date,EBA.BANC-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-08-23 00:00:00,20180823T00Z,2880
2018-08-22 23:00:00,20180822T23Z,2727
2018-08-22 22:00:00,20180822T22Z,2558
2018-08-22 21:00:00,20180822T21Z,2373
2018-08-22 20:00:00,20180822T20Z,2152


In [13]:
df2.shape

(26826, 2)

#### Looking at the index differences shows where the timestamps are missing for the 2 datasets.

In [14]:
df1.index.difference(df2.index)

DatetimeIndex(['2018-06-30 08:00:00', '2018-06-30 09:00:00',
               '2018-06-30 10:00:00', '2018-06-30 11:00:00',
               '2018-06-30 12:00:00', '2018-06-30 13:00:00',
               '2018-06-30 14:00:00', '2018-06-30 15:00:00',
               '2018-06-30 16:00:00', '2018-06-30 17:00:00',
               ...
               '2018-07-15 22:00:00', '2018-07-15 23:00:00',
               '2018-07-16 00:00:00', '2018-07-16 01:00:00',
               '2018-07-16 02:00:00', '2018-07-16 03:00:00',
               '2018-07-16 04:00:00', '2018-07-16 05:00:00',
               '2018-07-16 06:00:00', '2018-07-16 07:00:00'],
              dtype='datetime64[ns]', length=312, freq=None)

In [15]:
df2.index.difference(df1.index)

DatetimeIndex(['2015-07-01 13:00:00', '2015-07-01 14:00:00',
               '2015-07-01 15:00:00', '2015-07-01 16:00:00',
               '2015-07-01 17:00:00', '2015-07-01 18:00:00',
               '2015-07-01 19:00:00', '2015-07-01 20:00:00',
               '2015-07-01 21:00:00', '2015-07-01 22:00:00',
               ...
               '2018-06-29 22:00:00', '2018-06-29 23:00:00',
               '2018-06-30 00:00:00', '2018-06-30 01:00:00',
               '2018-06-30 02:00:00', '2018-06-30 03:00:00',
               '2018-06-30 04:00:00', '2018-06-30 05:00:00',
               '2018-06-30 06:00:00', '2018-06-30 07:00:00'],
              dtype='datetime64[ns]', length=468, freq=None)

#### The isnull checks show that missing data are not included in the datasets.

In [16]:
df1.isnull().sum()

Date               0
EBA.AVA-ALL.D.H    0
Date0              0
dtype: int64

In [17]:
df2.isnull().sum()

Date                0
EBA.BANC-ALL.D.H    0
dtype: int64

In [18]:
df_all.isnull().sum()

Date                0
EBA.AVA-ALL.D.H     0
EBA.BANC-ALL.D.H    0
dtype: int64

In [19]:
df1.head(10)

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,Date0
Date0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-08-23 00:00:00,20180823T00Z,1563.0,2018-08-23 00:00:00
2018-08-22 23:00:00,20180822T23Z,1533.0,2018-08-22 23:00:00
2018-08-22 22:00:00,20180822T22Z,1495.0,2018-08-22 22:00:00
2018-08-22 21:00:00,20180822T21Z,1445.0,2018-08-22 21:00:00
2018-08-22 20:00:00,20180822T20Z,1375.0,2018-08-22 20:00:00
2018-08-22 19:00:00,20180822T19Z,1321.0,2018-08-22 19:00:00
2018-08-22 18:00:00,20180822T18Z,1275.0,2018-08-22 18:00:00
2018-08-22 17:00:00,20180822T17Z,1220.0,2018-08-22 17:00:00
2018-08-22 16:00:00,20180822T16Z,1189.0,2018-08-22 16:00:00
2018-08-22 15:00:00,20180822T15Z,1178.0,2018-08-22 15:00:00


In [20]:
df1.tail(10)

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,Date0
Date0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-07-02 05:00:00,20150702T05Z,1609.0,2015-07-02 05:00:00
2015-07-02 04:00:00,20150702T04Z,1670.0,2015-07-02 04:00:00
2015-07-02 03:00:00,20150702T03Z,1761.0,2015-07-02 03:00:00
2015-07-02 02:00:00,20150702T02Z,1800.0,2015-07-02 02:00:00
2015-07-02 01:00:00,20150702T01Z,1838.0,2015-07-02 01:00:00
2015-07-01 12:00:00,20150701T12Z,1031.0,2015-07-01 12:00:00
2015-07-01 11:00:00,20150701T11Z,1024.0,2015-07-01 11:00:00
2015-07-01 10:00:00,20150701T10Z,1058.0,2015-07-01 10:00:00
2015-07-01 09:00:00,20150701T09Z,1108.0,2015-07-01 09:00:00
2015-07-01 08:00:00,20150701T08Z,1192.0,2015-07-01 08:00:00


In [21]:
df2.head(10)

Unnamed: 0_level_0,Date,EBA.BANC-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-08-23 00:00:00,20180823T00Z,2880
2018-08-22 23:00:00,20180822T23Z,2727
2018-08-22 22:00:00,20180822T22Z,2558
2018-08-22 21:00:00,20180822T21Z,2373
2018-08-22 20:00:00,20180822T20Z,2152
2018-08-22 19:00:00,20180822T19Z,1996
2018-08-22 18:00:00,20180822T18Z,2049
2018-08-22 17:00:00,20180822T17Z,1991
2018-08-22 16:00:00,20180822T16Z,1913
2018-08-22 15:00:00,20180822T15Z,1868


In [22]:
df2.tail(10)

Unnamed: 0_level_0,Date,EBA.BANC-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-07-01 17:00:00,20150701T17Z,2599
2015-07-01 16:00:00,20150701T16Z,2372
2015-07-01 15:00:00,20150701T15Z,2209
2015-07-01 14:00:00,20150701T14Z,2076
2015-07-01 13:00:00,20150701T13Z,2006
2015-07-01 12:00:00,20150701T12Z,1958
2015-07-01 11:00:00,20150701T11Z,1988
2015-07-01 10:00:00,20150701T10Z,2104
2015-07-01 09:00:00,20150701T09Z,2275
2015-07-01 08:00:00,20150701T08Z,2513


#### These next commands explicitly show the present hours in the data.

In [23]:
df2['2015-07-01']

Unnamed: 0_level_0,Date,EBA.BANC-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-07-01 23:00:00,20150701T23Z,3752
2015-07-01 22:00:00,20150701T22Z,3667
2015-07-01 21:00:00,20150701T21Z,3506
2015-07-01 20:00:00,20150701T20Z,3297
2015-07-01 19:00:00,20150701T19Z,3127
2015-07-01 18:00:00,20150701T18Z,2880
2015-07-01 17:00:00,20150701T17Z,2599
2015-07-01 16:00:00,20150701T16Z,2372
2015-07-01 15:00:00,20150701T15Z,2209
2015-07-01 14:00:00,20150701T14Z,2076


In [24]:
df2['2018-06-30']

Unnamed: 0_level_0,Date,EBA.BANC-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-06-30 07:00:00,20180630T07Z,2307
2018-06-30 06:00:00,20180630T06Z,2612
2018-06-30 05:00:00,20180630T05Z,2916
2018-06-30 04:00:00,20180630T04Z,3111
2018-06-30 03:00:00,20180630T03Z,3326
2018-06-30 02:00:00,20180630T02Z,3454
2018-06-30 01:00:00,20180630T01Z,3502
2018-06-30 00:00:00,20180630T00Z,3437


In [25]:
df1['2018-06-30']

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,Date0
Date0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-30 23:00:00,20180630T23Z,1154.0,2018-06-30 23:00:00
2018-06-30 22:00:00,20180630T22Z,1160.0,2018-06-30 22:00:00
2018-06-30 21:00:00,20180630T21Z,1165.0,2018-06-30 21:00:00
2018-06-30 20:00:00,20180630T20Z,1175.0,2018-06-30 20:00:00
2018-06-30 19:00:00,20180630T19Z,1176.0,2018-06-30 19:00:00
2018-06-30 18:00:00,20180630T18Z,1150.0,2018-06-30 18:00:00
2018-06-30 17:00:00,20180630T17Z,1116.0,2018-06-30 17:00:00
2018-06-30 16:00:00,20180630T16Z,1074.0,2018-06-30 16:00:00
2018-06-30 15:00:00,20180630T15Z,997.0,2018-06-30 15:00:00
2018-06-30 14:00:00,20180630T14Z,924.0,2018-06-30 14:00:00


In [26]:
df1['2018-06-29']

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,Date0
Date0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-29 07:00:00,20180629T07Z,1021.0,2018-06-29 07:00:00
2018-06-29 06:00:00,20180629T06Z,1113.0,2018-06-29 06:00:00
2018-06-29 05:00:00,20180629T05Z,1192.0,2018-06-29 05:00:00
2018-06-29 04:00:00,20180629T04Z,1228.0,2018-06-29 04:00:00
2018-06-29 03:00:00,20180629T03Z,1229.0,2018-06-29 03:00:00
2018-06-29 02:00:00,20180629T02Z,1244.0,2018-06-29 02:00:00
2018-06-29 01:00:00,20180629T01Z,1230.0,2018-06-29 01:00:00
2018-06-29 00:00:00,20180629T00Z,1246.0,2018-06-29 00:00:00


In [27]:
df2['2018-06-29']

Unnamed: 0_level_0,Date,EBA.BANC-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-06-29 23:00:00,20180629T23Z,3268
2018-06-29 22:00:00,20180629T22Z,3061
2018-06-29 21:00:00,20180629T21Z,2811
2018-06-29 20:00:00,20180629T20Z,2582
2018-06-29 19:00:00,20180629T19Z,2392
2018-06-29 18:00:00,20180629T18Z,2223
2018-06-29 17:00:00,20180629T17Z,2054
2018-06-29 16:00:00,20180629T16Z,1960
2018-06-29 15:00:00,20180629T15Z,1841
2018-06-29 14:00:00,20180629T14Z,1736


In [28]:
df_all.index = pd.to_datetime(df_all['Date'])

In [29]:
df_all['2018-06-29']

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,EBA.BANC-ALL.D.H
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-29 07:00:00,20180629T07Z,1021.0,1714
2018-06-29 06:00:00,20180629T06Z,1113.0,1875
2018-06-29 05:00:00,20180629T05Z,1192.0,1996
2018-06-29 04:00:00,20180629T04Z,1228.0,2036
2018-06-29 03:00:00,20180629T03Z,1229.0,2089
2018-06-29 02:00:00,20180629T02Z,1244.0,2227
2018-06-29 01:00:00,20180629T01Z,1230.0,2315
2018-06-29 00:00:00,20180629T00Z,1246.0,2292


In [30]:
df1.shape

(26670, 3)

In [31]:
#### Add missing hours

In [32]:
df1 = df1.resample('H').asfreq()

In [33]:
df1.shape

(27569, 3)

In [34]:
df1['2018-06-29']

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,Date0
Date0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-29 00:00:00,20180629T00Z,1246.0,2018-06-29 00:00:00
2018-06-29 01:00:00,20180629T01Z,1230.0,2018-06-29 01:00:00
2018-06-29 02:00:00,20180629T02Z,1244.0,2018-06-29 02:00:00
2018-06-29 03:00:00,20180629T03Z,1229.0,2018-06-29 03:00:00
2018-06-29 04:00:00,20180629T04Z,1228.0,2018-06-29 04:00:00
2018-06-29 05:00:00,20180629T05Z,1192.0,2018-06-29 05:00:00
2018-06-29 06:00:00,20180629T06Z,1113.0,2018-06-29 06:00:00
2018-06-29 07:00:00,20180629T07Z,1021.0,2018-06-29 07:00:00
2018-06-29 08:00:00,,,NaT
2018-06-29 09:00:00,,,NaT


In [35]:
df1.isnull().sum()

Date               899
EBA.AVA-ALL.D.H    899
Date0              899
dtype: int64

In [36]:
df2 = df2.resample('H').asfreq()

In [37]:
df2.isnull().sum()

Date                743
EBA.BANC-ALL.D.H    743
dtype: int64

In [38]:
print(df1.shape)
print(df2.shape)

(27569, 3)
(27569, 2)


In [39]:
1500/27000 

0.05555555555555555

In [40]:
df1['2018-06-29']

Unnamed: 0_level_0,Date,EBA.AVA-ALL.D.H,Date0
Date0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-29 00:00:00,20180629T00Z,1246.0,2018-06-29 00:00:00
2018-06-29 01:00:00,20180629T01Z,1230.0,2018-06-29 01:00:00
2018-06-29 02:00:00,20180629T02Z,1244.0,2018-06-29 02:00:00
2018-06-29 03:00:00,20180629T03Z,1229.0,2018-06-29 03:00:00
2018-06-29 04:00:00,20180629T04Z,1228.0,2018-06-29 04:00:00
2018-06-29 05:00:00,20180629T05Z,1192.0,2018-06-29 05:00:00
2018-06-29 06:00:00,20180629T06Z,1113.0,2018-06-29 06:00:00
2018-06-29 07:00:00,20180629T07Z,1021.0,2018-06-29 07:00:00
2018-06-29 08:00:00,,,NaT
2018-06-29 09:00:00,,,NaT


#### To do: download all series into separate DFs, fill in the missing hours, then concatenate as one big file; future users can pick the time and specific columns they want as needed. Can also use this to check overall fraction of missing data and which BA's have more missing data than others.

In [None]:
if __name__ == '__main__':
    tok = '6d666bb7097e41102ef69a35aa1edb2b'

    #demand example
    demand_all = ['EBA.AVA-ALL.D.H','EBA.BANC-ALL.D.H']
    data_all = EIAgov(tok, demand_all)
    
    print(data_all.GetData())
    
    df_all = data_all.GetData()

    demand1 = ['EBA.AVA-ALL.D.H']
    demand2 = ['EBA.BANC-ALL.D.H']
    d1 = EIAgov(tok, demand1)
    d2 = EIAgov(tok, demand2)
    
    df1 = d1.GetData()
    df2 = d2.GetData()
    
    print(df_all.shape)
    print(df1.shape)
    print(df2.shape)