Upload AirNow data
------------------

Processes the AirNow hourly data files located in `./AirNow` and uploads them to ESDR.

Reports to stat.createlab.org as `Airnow Hourly Data - Upload`

[Format documentation](http://www.airnowapi.org/docs/HourlyDataFactSheet.pdf)

In [0]:
import json, os, dateutil, re, requests, subprocess, datetime, glob, stat, codecs, sys

from dateutil import rrule, tz, parser

In [0]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb


def exec_ipynb(filename_or_url):
    """Execute every code cell of a Jupyter notebook in this module's globals.

    The notebook is fetched over HTTP(S) when ``filename_or_url`` starts with
    ``http:`` or ``https:``; otherwise it is read from the local filesystem.
    Both the nbformat >= 4 layout (``cells`` / ``source``) and the older layout
    (``worksheets[0].cells`` / ``input``) are supported.

    The concatenated cell source is also written to a uniquely named file under
    ``/tmp`` and compiled with that file name, so tracebacks from the executed
    code point at a real file.

    Args:
        filename_or_url: Path or URL of the ``.ipynb`` file to execute.
    """
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Notebook files are UTF-8 encoded JSON; close the handle promptly.
        with open(filename_or_url, encoding='utf-8') as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    with open(tmpname, 'w', encoding='utf-8') as tmp_file:
        tmp_file.write(src)
    code = compile(src, tmpname, 'exec')
    # Run in the module namespace so the notebook's definitions become globals here.
    exec(code, globals())


# Execute the shared helper notebooks so that their definitions land in this
# module's globals.  Names used later in this file that are not defined here
# (e.g. Stat, Esdr, AirnowCommon, sleep_until_next_period) presumably come from
# these notebooks -- TODO confirm which notebook provides which name.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./python-utils/esdr-library.ipynb')
exec_ipynb('./airnow-common.ipynb')

In [0]:
# Service name registered via Stat.set_service() and embedded in the ESDR user agent.
STAT_SERVICE_NAME = 'Airnow Hourly Data - Upload'
# Passed as host= on every Stat call made by this file.
STAT_HOSTNAME = 'hal21'
# Passed as shortname= on every Stat call made by this file.
STAT_SHORTNAME = 'airnow-hourly-data-upload'

# Passed as valid_for_secs= to Stat.up() at the end of each processing pass.
UPTIME_VALID_TIME_PERIOD_SECS = 60 * 60 * 2 # two hours

# File (under AirnowCommon.DATA_DIRECTORY) holding the cached ESDR devices and feeds per site.
ESDR_MONITORING_SITE_LOCATION_DEVICES_AND_FEEDS_JSON_FILENAME = 'esdr_monitoring_site_location_devices_and_feeds.json'

In [0]:
# Register this script's service name with the Stat helper before any status is reported.
Stat.set_service(STAT_SERVICE_NAME)

In [0]:
# Accumulate data from multiple files
# Assumes accumulation in time order
# Samples gathered so far, keyed as accumulated[site_id][channel] -> list of [epoch_time, value].
accumulated = {}
# Source files gathered so far, mapping each path to its modification time.
accumulated_files = {}


def clear_accumulated():
    """Discard all accumulated samples and the record of accumulated files.

    Both module-level dictionaries are rebound to fresh empty dicts.
    """
    global accumulated, accumulated_files
    accumulated, accumulated_files = {}, {}

def accumulate_airnow_file(src):
    """Parse one AirNow hourly data file and add its samples to ``accumulated``.

    The file name must look like ``YYYYMMDDHH.dat``; the hour it names is
    converted to epoch seconds (treating the name as a naive timestamp relative
    to 1970-01-01 -- presumably UTC, TODO confirm) and offset by 30 minutes.

    Each line is expected to hold nine ``|``-separated fields, of which the
    third is the site id, the sixth the parameter name and the eighth the
    numeric value.  Lines that do not match are reported on stderr, counted as
    errors and skipped.

    On success the file's modification time is recorded in
    ``accumulated_files`` so the caller can later mark it as uploaded.

    Args:
        src: Path of the ``.dat`` file to read.

    Raises:
        Exception: If more than five lines of the file fail to parse.  Samples
            already appended from this file stay in ``accumulated``.
    """
    print('Accumulating airnow file %s' % src)
    src_epoch_timestamp = os.path.getmtime(src)
    dt = datetime.datetime.strptime(os.path.basename(src), '%Y%m%d%H.dat')
    # Offset epoch_time by 1800 seconds to be in middle of hour-long sample
    epoch_time = (dt - datetime.datetime(1970, 1, 1)).total_seconds() + 1800

    nsamples = 0
    error_count = 0

    with open(src, 'r', encoding='cp437') as airnow:
        for lineno, record in enumerate(airnow, start=1):
            try:
                (_, _, site_id, _, _, channel, _, value, _) = record.split('|')
                sample_value = float(value)
            except ValueError:
                # Wrong field count or non-numeric value: report and really skip the
                # line, rather than re-using the fields of the previous record.
                sys.stderr.write('Problem parsing %s line %d, skipping\n' % (src, lineno))
                sys.stderr.write('Line "%s"\n' % record)
                error_count += 1
                continue

            channel = re.sub(r'\W', '_', channel) # Replace non-word chars with _;  e.g. PM2.5 becomes PM2_5
            accumulated.setdefault(site_id, {}).setdefault(channel, []).append([epoch_time, sample_value])
            nsamples += 1

    if error_count > 5:
        raise Exception('Too many parse errors (%d) reading %s, aborting' % (error_count, src))

    if error_count > 0:
        Stat.warning('Read %d records from %s (%d error(s))' % (nsamples, src, error_count), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    else:
        Stat.debug('Read %d records from %s (%d error(s))' % (nsamples, src, error_count), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    accumulated_files[src] = src_epoch_timestamp

In [0]:
# Parsed contents of monitoring_site_locations.json; loaded lazily on first use.
sites_cached = None


def refresh_site_info_cache():
    """Reload ``sites_cached`` from ``monitoring_site_locations.json``.

    The file is read from ``AirnowCommon.DATA_DIRECTORY``.
    """
    global sites_cached
    with open(AirnowCommon.DATA_DIRECTORY + '/monitoring_site_locations.json', 'r') as f:
        sites_cached = json.load(f)


def get_site_info(site_id):
    """Return the cached site record for ``site_id``, or ``None`` if unknown.

    The cache is loaded on demand when it is empty.

    Args:
        site_id: Key to look up under the cache's ``'sites'`` entry.

    Returns:
        The value stored for the site, or ``None`` when the cache has no
        ``'sites'`` entry, the site is not listed, or the cache has an
        unexpected shape.
    """
    if not sites_cached:
        refresh_site_info_cache()

    try:
        return sites_cached['sites'][site_id]
    except (LookupError, TypeError):
        # Only a failed lookup means "unknown site"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None

# print(json.dumps(get_site_info('420030008'), sort_keys=True, indent=3))  # Lawrenceville aka "BAPC 301 39TH STREET BLDG #7 AirNow"
# print(json.dumps(get_site_info('000050121'), sort_keys=True, indent=3))  # Meteorological Service of Canada"
# print(json.dumps(get_site_info('044201010'), sort_keys=True, indent=3))  # null
# print(json.dumps(get_site_info('060870007'), sort_keys=True, indent=3))  # Santa Cruz AMS
# print(json.dumps(get_site_info('033211050'), sort_keys=True, indent=3))  # null

In [0]:
# Lazily created module-level state shared by the functions below.
esdr = None                          # Esdr client instance; created on first use
airnow_product = None                # result of esdr.get_product_by_name('AirNow')
esdr_monitoring_site_devices = None  # parsed JSON cache of devices/feeds, keyed by serial number

In [0]:
def get_airnow_product():
    """Return the ESDR product named 'AirNow', creating the client if needed.

    Both the ``Esdr`` client and the product are kept in module-level globals,
    so each is only obtained once for as long as the stored value is truthy.
    """
    global esdr, airnow_product
    esdr = esdr or Esdr('esdr-auth-airnow-uploader.json', user_agent='esdr-library.py['+STAT_SERVICE_NAME+']')
    # To create the product in the first place:
    # esdr.create_product('AirNow', 'AirNow', 'EPA and Sonoma Tech', 'Real-time feeds from EPA/STI AirNow')
    airnow_product = airnow_product or esdr.get_product_by_name('AirNow')
    return airnow_product

In [0]:
def refresh_esdr_monitoring_site_device_cache():
    """Reload ``esdr_monitoring_site_devices`` from its JSON file on disk.

    The file lives in ``AirnowCommon.DATA_DIRECTORY`` and is named by
    ``ESDR_MONITORING_SITE_LOCATION_DEVICES_AND_FEEDS_JSON_FILENAME``.
    """
    global esdr_monitoring_site_devices
    cache_path = AirnowCommon.DATA_DIRECTORY + '/' + ESDR_MONITORING_SITE_LOCATION_DEVICES_AND_FEEDS_JSON_FILENAME
    with open(cache_path, 'r') as cache_file:
        esdr_monitoring_site_devices = json.load(cache_file)

def get_esdr_monitoring_site_device(serialNumber):
    """Look up a device in the on-disk device cache by serial number.

    The AirNow product and the device cache are loaded first if they have not
    been loaded yet.

    Args:
        serialNumber: Key of the device in the cache.

    Returns:
        A shallow copy of the cached device with ``'serialNumber'`` and
        ``'productId'`` added, or ``None`` when the cache has no such device.
    """
    global airnow_product, esdr_monitoring_site_devices
    if not airnow_product:
        airnow_product = get_airnow_product()
    if not esdr_monitoring_site_devices:
        refresh_esdr_monitoring_site_device_cache()

    if serialNumber not in esdr_monitoring_site_devices:
        return None

    # Work on a copy so the cached entry itself is never modified.
    found = esdr_monitoring_site_devices[serialNumber].copy()
    found.update(serialNumber=serialNumber, productId=airnow_product['id'])
    return found

def get_esdr_monitoring_site_feed(device, lat, lng):
    """Return the feed of ``device`` whose coordinates equal ``(lat, lng)``.

    The comparison is an exact float equality against each feed's ``'lat'``
    and ``'lng'`` entries, so a rounded coordinate will not match.

    Args:
        device: Device mapping that may contain a ``'feeds'`` list.
        lat: Latitude as a number or numeric string.
        lng: Longitude as a number or numeric string.

    Returns:
        The first matching feed, or ``None`` when there is no device, a
        coordinate is missing or not numeric, or no feed matches.
    """
    # Test for None explicitly: 0 / 0.0 is a legitimate coordinate.
    if not device or lat is None or lng is None:
        return None

    try:
        wanted_lat = float(lat)
        wanted_lng = float(lng)
    except (TypeError, ValueError):
        # Includes the empty string, which has no coordinate to match on.
        return None

    if 'feeds' not in device:
        return None

    for feed in device['feeds'] or ():
        if wanted_lat == feed['lat'] and wanted_lng == feed['lng']:
            return feed
    return None

# print(get_esdr_monitoring_site_device('no such site'))  # None
# print(get_esdr_monitoring_site_device('033211050'))     # None
# print(get_esdr_monitoring_site_device('010972005'))     # {'feeds': [{'id': 2264, 'lat': 30.4744, 'lng': -88.1411}], 'id': 2264, 'name': 'BAYROAD', 'serialNumber': '010972005', 'productId': 11}
# print(get_esdr_monitoring_site_device('060870007'))     # {'feeds': [{'id': 34142, 'lat': 36.98332, 'lng': -121.98822}, {'id': 2511, 'lat': 36.985802, 'lng': -121.993103}], 'id': 2511, 'name': 'Santa Cruz AMS', 'serialNumber': '060870007', 'productId': 11}
# print(get_esdr_monitoring_site_feed(get_esdr_monitoring_site_device('060870007'),36.98332,-121.98822))        # {'id': 34142, 'lat': 36.98332, 'lng': -121.98822}
# print(get_esdr_monitoring_site_feed(get_esdr_monitoring_site_device('060870007'),36.985802,-121.993103))      # {'id': 2511, 'lat': 36.985802, 'lng': -121.993103}
# print(get_esdr_monitoring_site_feed(get_esdr_monitoring_site_device('060870007'),'36.98332','-121.98822'))    # {'id': 34142, 'lat': 36.98332, 'lng': -121.98822}
# print(get_esdr_monitoring_site_feed(get_esdr_monitoring_site_device('060870007'),'36.985802','-121.993103'))  # {'id': 2511, 'lat': 36.985802, 'lng': -121.993103}
# print(get_esdr_monitoring_site_feed(get_esdr_monitoring_site_device('060870007'),'36.9833','-121.9882'))      # None
# print(get_esdr_monitoring_site_feed(get_esdr_monitoring_site_device('060870007'),'36.9858','-121.9931'))      # None

In [0]:
def _find_or_create_feed(site_id, device, site_info):
    """Return the feed for ``device`` at the site's coordinates, or ``None``.

    Find the feed, matching on lat/lon for the case where the site has moved.
    The cache is checked first; if that misses, fall back to
    ``esdr.get_feed()``, and finally to ``esdr.get_or_create_feed()``.  A
    warning is reported whenever ``None`` is returned.
    """
    lat = float(site_info['latitude']) if site_info else None
    lon = float(site_info['longitude']) if site_info else None
    feed = get_esdr_monitoring_site_feed(device, lat, lon)

    # load from ESDR if we couldn't find it in the cache
    if not feed:
        Stat.info("Failed to find cached feed for device %d with (lat,lng) of (%s, %s). Will try to fetch from ESDR." % (device['id'], lat, lon), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        feed = esdr.get_feed(device, lat=lat, lon=lon)

    # if we couldn't load from ESDR, then create it
    if not feed:
        if not site_info:
            Stat.warning('Cannot create feed for site %s because no information can be found for it.  Skipping.' % (site_id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            return None
        Stat.info("Failed to load feed for device %d with (lat,lng) of (%s, %s) from ESDR. Will try to create in ESDR." % (device['id'], lat, lon), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        feed = esdr.get_or_create_feed(device, lat=lat, lon=lon)

    if not feed:
        Stat.warning('Failed to find or create feed for site %s.  Skipping.' % (site_id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return None
    return feed


def _upload_channel(site_id, device, feed, channel, samples):
    """Upload one channel's samples to ``feed``; return how many were uploaded.

    Upload failures are reported as Stat warnings and yield 0 so that the
    remaining channels of the site are still attempted.
    """
    try:
        esdr.upload(feed, {
            'channel_names': [channel],
            'data': samples
        })
    except requests.HTTPError as e:
        Stat.warning('%s/%s, %s: Failed to upload %d samples (HTTP %d).' %
                     (site_id, device['name'], channel, len(samples), e.response.status_code), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return 0
    except Exception as e:
        # Keep going with the other channels, but say what went wrong.
        Stat.warning('%s/%s, %s: Failed to upload %d samples (%r).' %
                     (site_id, device['name'], channel, len(samples), e), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return 0

    print('%s/%s, %s: Uploaded %d samples to feed ID %d.' % (site_id, device['name'], channel, len(samples), feed['id']))
    return len(samples)


def upload_site(site_id):
    """Upload all accumulated samples for one site to its ESDR feed.

    The device is taken from the on-disk cache, or fetched/created in ESDR
    using the site's name.  The feed is then found or created for the site's
    coordinates and each accumulated channel is uploaded separately.  Every
    situation that prevents the upload is reported as a Stat warning and the
    site is skipped.

    Args:
        site_id: AirNow site id; also used as the ESDR device serial number.
    """
    # Ensures both the ESDR client and the AirNow product exist.
    get_airnow_product()

    # try to get the device from the cache
    device = get_esdr_monitoring_site_device(site_id)

    site_info = get_site_info(site_id)

    if not device:
        if not site_info:
            Stat.warning('Cannot create device for site %s because no information can be found for it.  Skipping.' % (site_id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            return
        Stat.info('Failed to find cached device for site %s. Will get from ESDR, creating if necessary.' % (site_id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        device = esdr.get_or_create_device(airnow_product, serial_number=site_id, name=site_info['site name'])

    if not device:
        Stat.warning('Failed to find or create device for site %s.  Skipping.' % (site_id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return

    feed = _find_or_create_feed(site_id, device, site_info)
    if not feed:
        return

    if site_id not in accumulated:
        Stat.warning('%s/%s: No accumulated data found. Skipping.' % (site_id, device['name']), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return

    channels = accumulated[site_id]
    channel_to_num_samples_uploaded = {}
    for channel, samples in channels.items():
        channel_to_num_samples_uploaded[channel] = _upload_channel(site_id, device, feed, channel, samples)

    # build per-channel upload stats
    per_channel_stats = ', '.join('%s:%s' % item for item in channel_to_num_samples_uploaded.items())

    Stat.info('%s/%s: Uploaded %d channels to feed ID %d (%s)' % (site_id, device['name'], len(channels), feed['id'], per_channel_stats), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

#upload_site('000010401')
#upload_site('000051501')

In [0]:
def upload_check_path(src):
    """Return the path of the marker file that records ``src`` as uploaded.

    Only the base name of ``src`` is used; the marker always lives in the
    ``upload-airnow-to-esdr`` directory.
    """
    source_name = os.path.basename(src)
    return 'upload-airnow-to-esdr/uploaded-' + source_name

def upload_accumulated():
    """Upload everything accumulated so far and mark the source files as done.

    The site info cache is refreshed, every accumulated site is uploaded in
    sorted order, and then a marker file is written for each accumulated
    source file.  The marker is given the source file's modification time so
    that a later pass can tell the file has not changed since it was uploaded.
    Finally the accumulation state is cleared.

    Returns:
        The number of sites that were passed to ``upload_site``.
    """
    refresh_site_info_cache()

    site_ids = sorted(accumulated)
    for i, site_id in enumerate(site_ids):
        print('Uploading site %d' % i)
        upload_site(site_id)

    for src in sorted(accumulated_files):
        check_path = upload_check_path(src)
        # Only an already existing directory is acceptable; real errors should surface.
        os.makedirs(os.path.dirname(check_path), exist_ok=True)

        # Build the marker under a temporary name and move it into place last,
        # so a marker is never visible before its timestamp has been set.
        tmp_path = check_path + '.tmp'
        with open(tmp_path, 'w'):
            pass
        src_epoch_time = accumulated_files[src]
        os.utime(tmp_path, (src_epoch_time, src_epoch_time))
        os.replace(tmp_path, check_path)
        Stat.debug('Uploaded %s to ESDR' % (src), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    clear_accumulated()
    return len(site_ids)

In [0]:
def process_all():
    """Run one pass: upload every AirNow data file not yet marked as uploaded.

    Files matching ``AirNow/[0-9]*.dat`` are visited in sorted order.  A file
    is skipped when its marker file exists and has the same modification time.
    Accumulated data is flushed to ESDR whenever 1000 files have been
    accumulated, and once more at the end.  Progress and the final "up" status
    are reported through Stat.

    NOTE(review): the site count in the final status message comes from the
    last flush only; earlier 1000-file flushes are not included.
    """
    # Imported here so the function does not depend on an executed helper
    # notebook having put ``time`` into the module globals.
    import time

    Stat.info('Uploading hourly Airnow data to ESDR...', host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    before = time.time()
    clear_accumulated()
    for src in sorted(glob.glob('AirNow/[0-9]*.dat')):
        if len(accumulated_files) == 1000:
            upload_accumulated()
        try:
            if os.path.getmtime(src) == os.path.getmtime(upload_check_path(src)):
                continue
        except OSError:
            # No marker yet (or the file is unreadable): treat as not uploaded.
            pass

        accumulate_airnow_file(src)
    nsites = upload_accumulated()
    after = time.time()
    Stat.up('Done uploading %d sites to ESDR' % nsites, details='Took %.1f minutes' % ((after - before) / 60), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME, valid_for_secs=UPTIME_VALID_TIME_PERIOD_SECS)

def process_all_forever():
    """Run ``process_all`` repeatedly and never return.

    After each pass, ``sleep_until_next_period`` is called with a 60-second
    period before the next pass starts (presumably it sleeps until the next
    period boundary -- TODO confirm against its definition).
    """
    restart_period_secs = 1 * 60  # start up again within 1 minute
    while True:
        process_all()
        sleep_until_next_period(restart_period_secs)

# Entry point: keep uploading in an endless loop.  To run a single pass
# instead, comment out the first call and enable the second.
process_all_forever()
#process_all()