Import AirNow data
------------------

[AirNow hourly data file format documentation](http://www.airnowapi.org/docs/HourlyDataFactSheet.pdf)

In [None]:
import codecs, datetime, glob, os, re

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook in this module's globals.

    filename_or_url: path of a local .ipynb file, or an http(s) URL to one.

    Handles both notebook layouts: nbformat 4 (cells at the top level, code
    under 'source') and the older layout (cells inside the first worksheet,
    code under 'input').  Non-code cells are ignored.

    Raises whatever opening/reading the notebook, JSON parsing, or the
    executed notebook code itself raises.
    """
    # Imported here because the file-level import line does not provide them.
    import json

    if re.match(r'https?:', filename_or_url):
        try:
            from urllib2 import urlopen  # Python 2
        except ImportError:
            from urllib.request import urlopen  # Python 3
        response = urlopen(filename_or_url)
        try:
            nb = response.read()
        finally:
            response.close()
    else:
        with open(filename_or_url) as notebook_file:
            nb = notebook_file.read()

    jsonNb = json.loads(nb)

    # check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells = jsonNb['cells']
        source_key = 'source'
    else:
        cells = jsonNb['worksheets'][0]['cells']
        source_key = 'input'

    # A plain loop (rather than a generator expression) keeps this function
    # free of nested scopes, which older Python 2 releases reject in a
    # function that also uses exec.
    sources = []
    for cell in cells:
        if cell['cell_type'] == 'code':
            sources.append(''.join(cell[source_key]))

    # NOTE(review): this runs arbitrary code from the notebook with full
    # privileges; only point it at files/URLs you trust.
    exec('\n'.join(sources), globals())

# Run the code cells of the ESDR library notebook in this notebook's globals.
# Presumably this is what defines Esdr (used by upload_site) -- confirm against that notebook.
exec_ipynb('python-utils/esdr-library.ipynb')

In [None]:
# Accumulate data from multiple files
# Assumes accumulation in time order
# accumulated maps site id -> channel name -> list of [epoch_time, value] samples
# (filled in by accumulate_airnow_file).
accumulated = {}
# accumulated_files maps each accumulated source path -> that file's modification time.
accumulated_files = {}

def clear_accumulated():
    """Discard all accumulated samples and the record of accumulated files.

    Both module-level dictionaries are rebound to fresh empty dicts.
    """
    global accumulated, accumulated_files
    accumulated, accumulated_files = {}, {}

def accumulate_airnow_file(src):
    """Read one hourly AirNow .dat file into the module-level accumulators.

    src: path to a file whose basename is of the form YYYYMMDDHH.dat; the
        sample time is taken from that name, not from the file contents.

    Each non-blank record is split on '|' into exactly nine fields, of which
    the 3rd (site id), 6th (parameter type) and 8th (value) are used.  The
    sample is appended to accumulated[site id][type] as [epoch_time, value].
    On success the file's modification time is stored in
    accumulated_files[src].

    Raises ValueError if the file name does not match the expected pattern,
    a record does not have nine fields, or a value is not a number.
    """
    print('Accumulating airnow file %s' % src)
    src_epoch_timestamp = os.path.getmtime(src)
    dt = datetime.datetime.strptime(os.path.basename(src), '%Y%m%d%H.dat')
    # Offset epoch_time by 1800 seconds to be in middle of hour-long sample
    epoch_time = (dt - datetime.datetime(1970, 1, 1)).total_seconds() + 1800

    nsamples = 0

    with open(src, 'r') as airnow:
        for record in airnow:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the nine-field unpack below.
            if not record.strip():
                continue

            (_, _, site_id, _, _, channel, _, value, _) = record.split('|')
            # Replace non-word chars with _;  e.g. PM2.5 becomes PM2_5
            channel = re.sub(r'\W', '_', channel)

            samples = accumulated.setdefault(site_id, {}).setdefault(channel, [])
            samples.append([epoch_time, float(value)])
            nsamples += 1

    print('Read %d samples from %s' % (nsamples, src))
    accumulated_files[src] = src_epoch_timestamp

In [None]:
# Documentation for monitoring_site_locations.dat
# http://www.airnowapi.org/docs/MonitoringSiteFactSheet.pdf

def read_sites(source='AirNow/monitoring_site_locations.dat'):
    """Parse the monitoring-site locations file into a nested dictionary.

    source: path of the '|'-separated site file, decoded as code page 437.
        Defaults to the location this notebook has always read from.

    Returns a dict mapping AQSID -> parameter name -> dict of all 23 fields
    of that line, keyed by the names in field_names (values are the raw
    field strings).

    Raises Exception if a non-blank line does not have exactly as many
    fields as field_names, or if an (AQSID, parameter name) pair occurs
    more than once.
    """
    field_names = ('AQSID|parameter name|site code|site name|status|' +
                   'agency id|agency name|EPA region|latitude|longitude|' +
                   'elevation|GMT offset|country code|CMSA code|CMSA name|' +
                   'MSA code|MSA name|state code|state name|county code|' +
                   'county name|city code|city name').split('|')

    sites = {}

    # monitoring_site_locations.dat has non-ASCII characters, in the archaic Original IBM PC 8-bit charset
    # known today as Code page 437.  Translate to unicode during read
    with codecs.open(source, 'r', 'cp437') as site_file:
        data = site_file.read()
    # Test: 000050121 is PARC OCÉANIE

    for line in data.split('\n'):
        # strip() also removes the '\r' of CRLF line endings
        line = line.strip()
        if not line:
            continue
        fields = line.split('|')
        if len(field_names) != len(fields):
            raise Exception('There are %d field names but %d fields' % (len(field_names), len(fields)))
        channel_info = dict(zip(field_names, fields))
        aqsid = channel_info['AQSID']
        parameter = channel_info['parameter name']
        site_channels = sites.setdefault(aqsid, {})
        if parameter in site_channels:
            raise Exception('Duplicate in monitoring_site_locations: %s:%s' % (aqsid, parameter))
        site_channels[parameter] = channel_info

    print('Read %d sites from %s' % (len(sites), source))
    return sites

# Result of read_sites(), loaded on first use by get_site_channel_info.
sites_cached = None


def get_site_channel_info(site_id):
    """Return the per-parameter info dict for site_id, or None if unknown.

    The site table is read with read_sites() on the first call and cached
    in sites_cached for later calls.  An empty table is cached too, so it is
    not re-read on every lookup.
    """
    global sites_cached
    if sites_cached is None:
        sites_cached = read_sites()
    return sites_cached.get(site_id)

def get_site_info(site_id):
    """Return one representative info dict for site_id, or None if unknown.

    A site has one info dict per parameter; the one whose parameter name
    sorts first is used to describe the site as a whole.
    """
    per_parameter = get_site_channel_info(site_id)
    if per_parameter == None:
        return None
    first_parameter = sorted(per_parameter.keys())[0]
    return per_parameter[first_parameter]

In [None]:
# Both are set by upload_site on its first call: esdr to an Esdr instance,
# airnow_product to the result of looking up the product named 'AirNow' on it.
esdr = None
airnow_product = None

In [None]:
def upload_site(site_id):
    """Upload every accumulated channel of one site to ESDR.

    site_id: site identifier; used as the ESDR device serial number and as
        the key into the module-level accumulated dict.

    On first use this creates the Esdr client from
    'esdr-auth-airnow-uploader.json' and looks up the 'AirNow' product.
    The site's device and feed are looked up and, when missing, created from
    get_site_info(site_id).  If that information is needed but unavailable,
    a NOTIFY(WARNING) line is printed and the site is skipped.

    Raises KeyError if nothing has been accumulated for site_id.
    """
    global esdr, airnow_product
    if not airnow_product:
        esdr = Esdr('esdr-auth-airnow-uploader.json')
        # esdr.create_product('AirNow', 'AirNow', 'EPA and Sonoma Tech', 'Real-time feeds from EPA/STI AirNow')
        airnow_product = esdr.get_product_by_name('AirNow')

    device = esdr.get_device_by_serial_number(airnow_product, site_id)

    if not device:
        site_info = get_site_info(site_id)
        if not site_info:
            print('NOTIFY(WARNING): Cannot create site %s because no information can be found for it.  Skipping.' % site_id)
            return
        device = esdr.get_or_create_device(airnow_product, serial_number=site_id, name=site_info['site name'])

    feed = esdr.get_feed(device)

    if not feed:
        site_info = get_site_info(site_id)
        # The device can already exist while the site is absent from the
        # locations file; without this guard the lookups below would fail
        # on None.
        if not site_info:
            print('NOTIFY(WARNING): Cannot create feed for site %s because no information can be found for it.  Skipping.' % site_id)
            return
        feed = esdr.get_or_create_feed(device, lat=float(site_info['latitude']), lon=float(site_info['longitude']))

    channels = accumulated[site_id]

    # One upload request per channel
    for channel in channels:
        esdr.upload(feed, {
            'channel_names': [channel],
            'data': channels[channel]
        })
        print('%s/%s, %s: Uploaded %d samples.' % (site_id, device['name'], channel, len(channels[channel])))

#upload_site('000010401')
#upload_site('000051501')

In [None]:
def upload_check_path(src):
    """Return the marker-file path that records src as uploaded.

    Only the file name of src is used; its directory part is dropped.
    """
    file_name = os.path.basename(src)
    return ''.join(['upload-airnow-to-esdr/uploaded-', file_name])

def upload_accumulated():
    """Upload all accumulated sites, mark their source files done, then reset.

    Sites are uploaded in sorted id order via upload_site.  Afterwards, for
    each accumulated source file, an empty marker file is created at
    upload_check_path(src) carrying the source file's recorded modification
    time; process_all compares the two times to skip files already done.
    Finally the accumulators are cleared.

    Raises OSError if the marker directory cannot be created, plus whatever
    upload_site or the marker file operations raise.
    """
    for site_id in sorted(accumulated.keys()):
        upload_site(site_id)
    for src in sorted(accumulated_files):
        check_path = upload_check_path(src)
        check_dir = os.path.dirname(check_path)
        try:
            os.makedirs(check_dir)
        except OSError:
            # Only "directory already exists" is expected here; anything
            # else (permissions, a file in the way) is a real failure.
            if not os.path.isdir(check_dir):
                raise
        # Write under a temporary name and rename, so the marker only
        # appears under its final name once its timestamp is set.
        tmp_path = check_path + '.tmp'
        open(tmp_path, 'w').close()
        src_epoch_time = accumulated_files[src]
        os.utime(tmp_path, (src_epoch_time, src_epoch_time))
        os.rename(tmp_path, check_path)
        print('STATUS(SUCCESS): uploaded %s to ESDR' % src)
    clear_accumulated()

def process_all(batch_size=1000):
    """Accumulate and upload every AirNow file not yet marked as uploaded.

    batch_size: number of accumulated files at which an upload is triggered
        before more files are read (default 1000, the previous fixed value).

    Files matching 'AirNow/[0-9]*.dat' are visited in sorted name order.  A
    file is skipped when its marker file (see upload_check_path) exists and
    has the same modification time as the file itself.
    """
    clear_accumulated()
    for src in sorted(glob.glob('AirNow/[0-9]*.dat')):
        if len(accumulated_files) >= batch_size:
            upload_accumulated()
        try:
            already_uploaded = os.path.getmtime(src) == os.path.getmtime(upload_check_path(src))
        except OSError:
            # No marker file (or unreadable): treat as not yet uploaded
            already_uploaded = False
        if already_uploaded:
            continue

        accumulate_airnow_file(src)
    upload_accumulated()
        
#for site in sorted(accumulated.keys())[0:10]:
#    print site
#    print len(accumulated[site])

#site_id = '000051501'
#channels = accumulated[site_id]
#channels  

In [None]:
# Run the import: accumulate each AirNow file not yet marked as uploaded and upload it.
process_all()