# Mirror Reporting Area Locations

Mirrors two Airnow reporting area data files, converts each to JSON, and then also merges the two into `reporting_areas.json`. Details:

* The Airnow [Site_To_ReportingArea.csv](https://files.airnowtech.org/airnow/today/Site_To_ReportingArea.csv) is converted into `reporting_areas_to_sites.json`.
* The [reporting area locations .dat file](https://files.airnowtech.org/airnow/today/reporting_area_locations_V2.dat) is converted into `reporting_area_locations.json`.

We merge the files using the reporting area ID (e.g. `wa019`, `ak010`). When I first started mirroring, I found a few errors, such as multiple reporting areas being assigned to the same ID (see the git history for this file). I contacted Airnow; they assured me that reporting area IDs should be unique, and they fixed the problems.

The resulting merged JSON file, `reporting_areas.json`, is a dictionary mapping reporting area ID (e.g. `wa019`) to data about that reporting area, including a collection of monitoring site IDs associated with that reporting area.  Note that a monitoring site may be associated with more than one reporting area, and that a reporting area may actually have zero associated monitoring sites.

Airnow told me that `Site_To_ReportingArea.csv` gets updated twice daily. But, from what I can tell, the `reporting_area_locations_V2.dat` file gets updated twice per hour, at 25 and 55 minutes after the hour.  So this mirror runs on the hour and half hour.

The reporting area fact sheet is located at <https://docs.airnowapi.org/docs/ReportingAreaInformationFactSheet.pdf>.

Reports to stat.createlab.org as `Airnow Reporting Area Locations File - Mirror`.

In [0]:
import json, os, dateutil, re, requests, subprocess, datetime, glob, stat
import csv

from dateutil import rrule, tz, parser

In [0]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb

def exec_ipynb(filename_or_url):
    """Execute every code cell of a Jupyter notebook in this module's global namespace.

    Args:
        filename_or_url: Path to a local .ipynb file, or an http(s) URL that returns the notebook JSON.

    The code cells are concatenated (in notebook order) into a single source string, a copy of which
    is written to a uniquely named file under /tmp.  That file name is handed to compile(), presumably
    so that tracebacks can refer to real source lines.

    NOTE: this runs whatever code the notebook contains via exec(); only pass locations you trust.
    """
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Notebook files are UTF-8 encoded JSON; close the handle as soon as it's parsed
        with open(filename_or_url, encoding='utf-8') as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        # Older notebook formats keep the cells inside the first worksheet, under 'input'
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # Timestamp + PID keep the name unique across runs and across concurrent processes
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    with open(tmpname, 'w', encoding='utf-8') as tmp_file:
        tmp_file.write(src)
    code = compile(src, tmpname, 'exec')
    exec(code, globals())


# Execute the shared helper notebooks so their definitions land in this notebook's globals.
# NOTE(review): presumably these supply the Stat, AirnowCommon, and sleep_until_next_period names
# used further down -- confirm which notebook defines which, and whether the load order matters.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./airnow-common.ipynb')

In [0]:
# Interval between mirror passes; also scaled by 1.5 to set how long each "up" report stays valid
MIRROR_TIME_PERIOD_SECS = 60 * 30   # every 30 minutes

# Identity used for stat reporting: the service name is registered once via Stat.set_service(),
# while the hostname and shortname are passed along with every individual Stat call
STAT_SERVICE_NAME = 'Airnow Reporting Area Locations File - Mirror'
STAT_HOSTNAME = 'hal21'
STAT_SHORTNAME = 'airnow-mirror-reporting-area-locations-file'

# Mirrored pipe-delimited locations file, and the JSON file generated from it
REPORTING_AREA_LOCATIONS_DAT_FILENAME = 'reporting_area_locations_V2.dat'
REPORTING_AREA_LOCATIONS_JSON_FILENAME = 'reporting_area_locations.json'

# Mirrored site-to-reporting-area CSV, and the JSON file generated from it
SITE_TO_REPORTING_AREA_CSV_FILENAME = 'Site_To_ReportingArea.csv'
REPORTING_AREAS_TO_SITES_JSON_FILENAME = 'reporting_areas_to_sites.json'

# Merged output combining both of the JSON files above
REPORTING_AREAS_JSON_FILENAME = 'reporting_areas.json'

In [0]:
# Register the service name with Stat before any status messages are sent.
# NOTE(review): Stat is defined outside this file; presumably this labels all subsequent reports -- confirm.
Stat.set_service(STAT_SERVICE_NAME)

In [0]:
# Currently unused now that Airnow has fixed the duplications and errors surrounding reporting area IDs (see docs above, and in the git history)
#
# def create_reporting_area_unique_id(reporting_area_id, reporting_area_name):
#     if reporting_area_id and reporting_area_name:
#         stripped_id = reporting_area_id.strip()
#         clean_name = re.sub(r'[^a-zA-Z0-9]+', '', reporting_area_name) # Strip non-alphanumeric chars
#
#         if len(stripped_id) > 0 and len(clean_name) > 0:
#             return (stripped_id + '-' + clean_name).lower()
#
#     return None

In [0]:
def jsonify_reporting_area_locations():
    """Convert the mirrored reporting area locations .dat file into a JSON file.

    Reads the pipe-delimited REPORTING_AREA_LOCATIONS_DAT_FILENAME from AirnowCommon.DATA_DIRECTORY,
    builds a dictionary keyed by reporting area ID, and writes it to
    REPORTING_AREA_LOCATIONS_JSON_FILENAME in the same directory.  Records with the wrong number of
    fields, an empty ID, or an ID which has already been seen are skipped with a Stat warning.

    Returns:
        dict mapping reporting area ID to a dict of that area's fields, with 'lat' and 'lng' converted
        to floats and 'hasDST' converted to a bool when it is 'Yes' or 'No'.

    Raises:
        ValueError: if a record's lat or lng cannot be parsed as a float.
    """
    field_names = ('name|stateCode|countryCode|forecasts|actionDayName|lat|lng|gmtOffset|hasDST|tzLabel|dstzLabel|id|usaToday|forecastSource').split('|')

    # fields present in the .dat file which we don't carry over into the JSON
    unwanted_field_names = ('forecasts', 'actionDayName', 'usaToday', 'forecastSource')

    source = AirnowCommon.DATA_DIRECTORY + '/' + REPORTING_AREA_LOCATIONS_DAT_FILENAME
    dest = AirnowCommon.DATA_DIRECTORY + '/' + REPORTING_AREA_LOCATIONS_JSON_FILENAME

    # The file may have non-ASCII characters, in the archaic Original IBM PC 8-bit charset
    # known today as Code page 437.  Translate to unicode during read
    with open(source, 'r', encoding='cp437') as dat_file:
        data = dat_file.read()

    reporting_areas = {}

    for line in data.split('\n'):
        line = line.strip()
        if not line:
            continue

        fields = [field.strip() for field in line.split('|')]    # split on | then strip whitespace from every field
        if len(field_names) != len(fields):
            Stat.warning('Record has %d field names but %d fields. Skipping.' % (len(field_names), len(fields)), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            continue

        field_map = dict(zip(field_names, fields))
        key = field_map['id']

        if not key:
            Stat.warning('skipping reporting area [%s, %s, %s] since it has an empty ID' % (field_map['name'], field_map['stateCode'], field_map['countryCode']), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            continue

        # delete keys we don't need
        for unwanted in unwanted_field_names:
            field_map.pop(unwanted, None)

        # any value other than Yes/No is left untouched
        if field_map['hasDST'] == 'Yes':
            field_map['hasDST'] = True
        elif field_map['hasDST'] == 'No':
            field_map['hasDST'] = False
            field_map.pop('dstzLabel', None)  # no point including the daylight savings time label if they don't do DST

        # convert lat/lng from string to float
        field_map['lat'] = float(field_map['lat'])
        field_map['lng'] = float(field_map['lng'])

        # add it to the map; the first record seen for an ID wins
        if key in reporting_areas:
            Stat.warning('skipping duplicate ID [%s] for reporting area [%s, %s, %s]' % (key, field_map['name'], field_map['stateCode'], field_map['countryCode']), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        else:
            reporting_areas[key] = field_map

    Stat.debug('Read %d reporting areas from %s' % (len(reporting_areas), source), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    # write the JSON to a PID-specific temp file first, then rename it into place, so that
    # the destination path never holds a partially written file
    tmp = dest + '.tmp' + str(os.getpid())
    os.makedirs(os.path.dirname(tmp), exist_ok=True)
    with open(tmp, 'w') as json_file:
        json.dump(reporting_areas, json_file, sort_keys=True)
    os.rename(tmp, dest)

    # make the JSON file readable by everyone
    os.chmod(dest, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IROTH)

    # make the JSON file's file stat times match those of the .dat
    source_file_stat = os.stat(source)
    os.utime(dest, (source_file_stat.st_mtime, source_file_stat.st_mtime))

    Stat.info('Successfully created %s ' % REPORTING_AREA_LOCATIONS_JSON_FILENAME, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    return reporting_areas

#jsonify_reporting_area_locations()

In [0]:
def jsonify_site_to_reporting_area():
    """Convert the mirrored Site_To_ReportingArea CSV file into a JSON file.

    Reads SITE_TO_REPORTING_AREA_CSV_FILENAME from AirnowCommon.DATA_DIRECTORY, groups its rows by
    reporting area ID, and writes the result to REPORTING_AREAS_TO_SITES_JSON_FILENAME in the same
    directory.  The reporting area fields are taken from the first row seen for each ID; every row
    for that ID contributes its site ID to the area's 'siteIDs' list.  Rows with the wrong number of
    fields or an empty reporting area ID are skipped with a Stat warning.

    Returns:
        dict mapping reporting area ID to a dict with keys 'name', 'stateCode', 'id', 'lat' (float),
        'lng' (float), and 'siteIDs' (list of site ID strings).

    Raises:
        ValueError: if a reporting area's lat or lng cannot be parsed as a float.
    """
    # CSV header:   "ReportingAreaName","ReportingAreaState","ReportingAreaID","ReportingAreaLat","ReportingAreaLong","SiteID","SiteName","SiteAgencyName","SiteLat","SiteLong"
    field_names = ('name|stateCode|id|lat|lng|siteId|siteName|siteAgencyName|siteLat|siteLng').split('|')

    # per-site fields which don't belong in the per-reporting-area record
    site_field_names = ('siteId', 'siteName', 'siteAgencyName', 'siteLat', 'siteLng')

    reporting_areas = {}

    source = AirnowCommon.DATA_DIRECTORY + '/' + SITE_TO_REPORTING_AREA_CSV_FILENAME
    dest = AirnowCommon.DATA_DIRECTORY + '/' + REPORTING_AREAS_TO_SITES_JSON_FILENAME

    # newline='' lets the csv module do its own newline handling, as its documentation requires
    with open(source, 'r', encoding='cp437', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"')

        # skip the header
        next(reader, None)

        for row in reader:
            fields = [field.strip() for field in row]    # strip whitespace from every field
            if len(field_names) != len(fields):
                Stat.warning('Record has %d field names but %d fields. Skipping.' % (len(field_names), len(fields)), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
                continue

            field_map = dict(zip(field_names, fields))
            key = field_map['id']

            if not key:
                Stat.warning('skipping reporting area [%s, %s] since it has an empty ID' % (field_map['name'], field_map['stateCode']), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
                continue

            # remember the site ID so we can add to the siteIDs collection later
            site_id = field_map['siteId']

            # see whether we've already seen this reporting area, inserting into the dictionary if not
            if key not in reporting_areas:
                # delete keys we don't need
                for site_field in site_field_names:
                    field_map.pop(site_field, None)

                # convert lat/lng from string to float
                field_map['lat'] = float(field_map['lat'])
                field_map['lng'] = float(field_map['lng'])

                # add a siteIDs field
                field_map['siteIDs'] = []

                # insert into the dictionary
                reporting_areas[key] = field_map

            reporting_areas[key]['siteIDs'].append(site_id)

    Stat.debug('Read %d reporting areas from %s' % (len(reporting_areas), source), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    # write the JSON to a PID-specific temp file first, then rename it into place, so that
    # the destination path never holds a partially written file
    tmp = dest + '.tmp' + str(os.getpid())
    os.makedirs(os.path.dirname(tmp), exist_ok=True)
    with open(tmp, 'w') as json_file:
        json.dump(reporting_areas, json_file, sort_keys=True)
    os.rename(tmp, dest)

    # make the JSON file readable by everyone
    os.chmod(dest, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IROTH)

    # make the JSON file's file stat times match those of the source CSV
    source_file_stat = os.stat(source)
    os.utime(dest, (source_file_stat.st_mtime, source_file_stat.st_mtime))

    Stat.info('Successfully created %s ' % REPORTING_AREAS_TO_SITES_JSON_FILENAME, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    return reporting_areas

#jsonify_site_to_reporting_area()

In [0]:
def merge_data():
    """Regenerate both per-file JSON files, then merge them into REPORTING_AREAS_JSON_FILENAME.

    The merged dictionary starts as the output of jsonify_reporting_area_locations(), with every
    entry given a 'siteIDs' list.  Site IDs are then copied in from the output of
    jsonify_site_to_reporting_area(), matching on reporting area ID.  When the two sources disagree
    on lat, lng, or stateCode, the value from the locations file is kept and a warning is reported.
    A reporting area which appears only in the site-to-area data is inserted as-is.

    The merged dictionary is written to AirnowCommon.DATA_DIRECTORY; nothing is returned.
    """
    reporting_area_locations = jsonify_reporting_area_locations()
    reporting_areas_to_sites = jsonify_site_to_reporting_area()

    # we're going to merge reporting_areas_to_sites into reporting_area_locations, so start by giving
    # every reporting area location a siteIDs field (an empty array)
    for location in reporting_area_locations.values():
        location['siteIDs'] = []

    # (field to compare, label used in the warning message)
    compared_fields = (('lat', 'latitude'), ('lng', 'longitude'), ('stateCode', 'stateCode'))

    # now iterate over all the reporting_areas_to_sites, copying the siteIDs over. In the (rare?) case that an
    # item exists in reporting_areas_to_sites but not in reporting_area_locations, then we'll insert it and
    # keep what info we do have about it from reporting_areas_to_sites.
    for key, area in reporting_areas_to_sites.items():
        if key not in reporting_area_locations:
            print("### Reporting area [%s] not found in reporting_area_locations!" % (key))
            reporting_area_locations[key] = area
            continue

        location = reporting_area_locations[key]
        for field, label in compared_fields:
            if location[field] != area[field]:
                Stat.warning('Ignoring %s mismatch for reporting area [%s]' % (label, key), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

        location['siteIDs'] = area['siteIDs']

    # write the JSON to a PID-specific temp file first, then rename it into place, so that
    # the destination path never holds a partially written file
    dest = AirnowCommon.DATA_DIRECTORY + '/' + REPORTING_AREAS_JSON_FILENAME
    tmp = dest + '.tmp' + str(os.getpid())
    os.makedirs(os.path.dirname(tmp), exist_ok=True)
    with open(tmp, 'w') as json_file:
        json.dump(reporting_area_locations, json_file, sort_keys=True)
    os.rename(tmp, dest)

    # make the JSON file readable by everyone
    os.chmod(dest, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IROTH)

    Stat.info('Successfully created %s ' % REPORTING_AREAS_JSON_FILENAME, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

#merge_data()

In [0]:
# For mirroring files located under https://files.airnowtech.org/airnow/today/
def mirror_today_file(filename):
    """Mirror one file from Airnow's 'today' directory into AirnowCommon.DATA_DIRECTORY.

    Args:
        filename: Name of the file, relative to https://files.airnowtech.org/airnow/today/

    Returns:
        True if AirnowCommon.mirror_airnow_file() flagged the file as new, otherwise False.

    The message returned by the mirror call is reported via Stat.info, except when the file is not
    new and the status code is 400 or above, in which case it is reported via Stat.warning.
    """
    Stat.info('Mirroring https://files.airnowtech.org/airnow/today/%s' % (filename), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    remote_path = 'today' + '/' + filename
    local_path = AirnowCommon.DATA_DIRECTORY + '/' + filename
    (is_new, message, status_code) = AirnowCommon.mirror_airnow_file(remote_path, local_path)

    # Anything below 400 (which includes 304) is informational; the status code
    # is only consulted when the file is not new
    if is_new or status_code < 400:
        report = Stat.info
    else:
        report = Stat.warning
    report(message, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    return True if is_new else False

#mirror_today_file(REPORTING_AREA_LOCATIONS_DAT_FILENAME)
#mirror_today_file(SITE_TO_REPORTING_AREA_CSV_FILENAME)

In [0]:
def mirror():
    """Run one mirror pass: fetch both Airnow files and rebuild the JSON files if either is new.

    Both files are always fetched.  merge_data() runs only when at least one of them was reported as
    new; otherwise a "nothing to do" message is logged.  An "up" status, valid for 1.5 mirror
    periods, is reported at the end of the pass.
    """
    # Local import so this block does not depend on the file's import cell
    import time

    # Use a monotonic clock for the elapsed time, so that wall-clock adjustments
    # (NTP corrections, daylight saving changes) cannot skew or negate it
    started = time.monotonic()

    # Latest file is at https://files.airnowtech.org/airnow/today/reporting_area_locations_V2.dat
    is_new1 = mirror_today_file(REPORTING_AREA_LOCATIONS_DAT_FILENAME)
    is_new2 = mirror_today_file(SITE_TO_REPORTING_AREA_CSV_FILENAME)
    if is_new1 or is_new2:
        merge_data()
    else:
        Stat.info("Files unchanged, nothing to do.", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    elapsed_seconds = time.monotonic() - started
    Stat.up('Done! (elapsed time: %d seconds)' % (elapsed_seconds), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME, valid_for_secs=MIRROR_TIME_PERIOD_SECS*1.5)

def mirror_forever():
    """Run mirror() in an endless loop, pausing between passes; never returns normally."""
    while True:
        mirror()
        # NOTE(review): sleep_until_next_period is defined outside this file; presumably it blocks
        # until the next MIRROR_TIME_PERIOD_SECS boundary -- confirm.
        sleep_until_next_period(MIRROR_TIME_PERIOD_SECS)

# Start the endless mirror loop. For a single pass (e.g. when testing), comment out this call
# and uncomment the mirror() call below instead.
mirror_forever()
#mirror()