# Airnow Highest 10 Downloader

Fetches data from Airnow's [reporting area top ten API](https://airnowgovapi.com/reportingarea/get_top) to obtain and save the ten locations with the highest AQI.  Back when it was a Highest Five, Airnow said they updated it hourly, but updates often occurred more frequently than that, so this scraper runs every five minutes.

Reports to stat.createlab.org as `Airnow Highest Ten - Downloader`.

In [0]:
import json, os, dateutil, re, requests, subprocess, datetime, glob, stat
from dateutil import rrule, tz, parser
from sqlitedict import SqliteDict

In [0]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb

# Executes every code cell of a Jupyter notebook in this module's global namespace.
#
# filename_or_url: path to a local .ipynb file, or an http(s) URL (fetched with requests).
#
# The concatenated cell source is also written to a uniquely named file under /tmp, and that file name is
# handed to compile() as the code's filename.
#
# NOTE(review): this runs whatever code the notebook contains -- only point it at trusted files/URLs.
def exec_ipynb(filename_or_url):
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Notebook files are JSON; read them as UTF-8 and close the handle promptly instead of leaking it.
        with open(filename_or_url, encoding='utf-8') as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        # Older layout: cells live in the first worksheet and keep their code under 'input'.
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # Timestamp (to the microsecond) plus PID keeps the temp file name unique per invocation.
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)

    # Python source defaults to UTF-8, so write the copy explicitly as UTF-8 (and close it before compiling).
    with open(tmpname, 'w', encoding='utf-8') as tmp_file:
        tmp_file.write(src)

    code = compile(src, tmpname, 'exec')
    exec(code, globals())

# Run the shared helper notebooks so their definitions land in this module's globals.  Names used later in this
# file but not defined in it (e.g. Stat, AirnowCommon, sleep_until_next_period) presumably come from these two
# notebooks -- TODO confirm.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./airnow-common.ipynb')

In [0]:
# Identity used when reporting status: the service name is handed to Stat.set_service(), while the hostname and
# shortname are passed as host= / shortname= on every Stat call in this file.
STAT_SERVICE_NAME = 'Airnow Highest Ten - Downloader'
STAT_HOSTNAME = 'hal21'
STAT_SHORTNAME = 'airnow-highest-ten-downloader'

# Period of the download loop, in seconds; also the basis for how long each up/down status stays valid.
RUN_INTERVAL_SECONDS = 60 * 5   # every 5 minutes

# Nominal number of AQI entries in the list (not referenced elsewhere in the visible part of this file).
NUM_AQI_VALUES = 10

# Name of the reporting-areas JSON file, looked up inside AirnowCommon.DATA_DIRECTORY.
REPORTING_AREAS_JSON_FILENAME = 'reporting_areas.json'

In [0]:
# Hand this downloader's service name to Stat.  Stat is not defined in this file; presumably it is provided by
# one of the notebooks loaded via exec_ipynb -- TODO confirm.
Stat.set_service(STAT_SERVICE_NAME)

In [0]:
# Builds the lookup-dictionary key for a reporting area: the area name and the state code joined by a pipe,
# e.g. ('Eagle River', 'AK') -> 'Eagle River|AK'.
def create_reporting_area_lookup_dictionary_key(name, state_code):
    return '|'.join((name, state_code))

In [0]:
# True only when `a` is a list (or list subclass) holding at least one element; False for anything else,
# including None, tuples, strings and empty lists.
def is_non_empty_array(a):
    if not isinstance(a, list):
        return False
    return len(a) != 0

In [0]:
# Reads in the REPORTING_AREAS_JSON_FILENAME and builds an in-memory dictionary which maps name|stateCode to reporting area ID.
# For example, this item in reporting_areas.json:
#
#   "ak006" : {
#      "countryCode" : "US",
#      "dstzLabel" : "ADT",
#      "gmtOffset" : "-9",
#      "hasDST" : true,
#      "id" : "ak006",
#      "lat" : 61.3124,
#      "lng" : -149.5678,
#      "name" : "Eagle River",
#      "siteIDs" : ["020201004"],
#      "stateCode" : "AK",
#      "tzLabel" : "AKT"
#   },
#
# Will be stored in the dictionary as key "Eagle River|AK" with a value of "ak006". This dictionary is used for fast lookups
# of the reporting areas referenced in the top ten JSON we download from Airnow.  It would have been ideal if they had actually
# included the reporting area ID in their JSON, but sometimes the world isn't ideal :-\  They do include reporting area name,
# state code, and lat/long, so we use the name and state code to look up reporting area IDs in this in-memory dictionary.

# Maps "name|stateCode" keys to reporting area IDs; filled in by build_reporting_area_lookup_dictionary().
reporting_area_lookup_dictionary = {}

# Reads REPORTING_AREAS_JSON_FILENAME from AirnowCommon.DATA_DIRECTORY and adds one "name|stateCode" -> ID entry
# to reporting_area_lookup_dictionary for every reporting area in the file.  Existing entries are overwritten
# when their key reappears and are otherwise left in place (the dictionary is not cleared first).
#
# Raises whatever open()/json.load() raise for a missing or malformed file, and KeyError if a reporting area
# lacks 'name' or 'stateCode'; in either case the dictionary is left untouched.
def build_reporting_area_lookup_dictionary():
    file_path = AirnowCommon.DATA_DIRECTORY + '/' + REPORTING_AREAS_JSON_FILENAME

    # JSON text is UTF-8; don't depend on the platform's default encoding.  The file is only needed while
    # parsing, so it is closed before the entries are processed.
    with open(file_path, 'r', encoding='utf-8') as f:
        reporting_areas_by_id = json.load(f)

    # Build all entries first so a bad record cannot leave the shared dictionary half-updated.
    new_entries = {
        create_reporting_area_lookup_dictionary_key(area['name'], area['stateCode']): area_id
        for area_id, area in reporting_areas_by_id.items()
    }
    reporting_area_lookup_dictionary.update(new_entries)

#build_reporting_area_lookup_dictionary()

In [0]:
# Gets the JSON from Airnow containing the top ten AQI reporting areas. Note that "ten" may not actually
# be ten.  I've seen it return 11 records before.  Maybe they're fans of Spinal Tap?
#
# Returns the decoded JSON body when the response status is 2xx.  On any other status, or on any exception
# (connection problem, timeout, undecodable body, ...), reports a Stat warning and returns None.
def fetch_top_ten():
    try:
        page = requests.get("https://airnowgovapi.com/reportingarea/get_top", timeout=20)
        if 200 <= page.status_code < 300:
            return json.loads(page.text)

        Stat.warning("Failed to get Airnow top ten (HTTP %d)" % (page.status_code), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except Exception as e:
        # requests only raises HTTPError from raise_for_status(), which is not called here, so a single
        # catch-all handler covers every failure this block can actually produce.
        Stat.warning("Failed to get Airnow top ten (%s)" % e, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    return None

#fetch_top_ten()

In [0]:
# Looks up the reporting area with the given name and state code in the in-memory
# reporting_area_lookup_dictionary.  Returns its ID, or None when there is no such entry.
def find_reporting_area_id(name, state_code):
    lookup_key = create_reporting_area_lookup_dictionary_key(name, state_code)
    return reporting_area_lookup_dictionary.get(lookup_key)

# print(find_reporting_area_id('Birmingham', 'AL'))  # al001
# print(find_reporting_area_id('Mono Lake', 'CA'))  # ca225
# print(find_reporting_area_id('Springdale (Springdale-Fayetteville-Bentonville)', 'AR'))  # ar002
# print(find_reporting_area_id('Bogusville', 'WV'))  # None

In [0]:
# Converts the decoded top ten JSON into a list of (rank, reporting_area_id, aqi, parameter) tuples.
#
# Rank is the 1-based position of the item in top_ten.  reporting_area_id is whatever find_reporting_area_id()
# returns for the item's 'reportingArea' and 'stateCode' (so it is None when there is no match).  Returns an
# empty list when top_ten is not a non-empty list.
def parse_top_ten(top_ten):
    if not is_non_empty_array(top_ten):
        return []

    # (re)load the in-memory lookup dictionary before resolving any IDs
    build_reporting_area_lookup_dictionary()

    return [
        (position,
         find_reporting_area_id(entry['reportingArea'], entry['stateCode']),
         entry['aqi'],
         entry['parameter'])
        for position, entry in enumerate(top_ten, start=1)
    ]

# top_ten = fetch_top_ten()
# str(parse_top_ten(top_ten)) # E.g. [(1, 'ca162', 341, 'PM10'), (2, 'mo002', 262, 'PM10'), (3, 'ca126', 187, 'PM10'), ... , (8, 'il009', 120, 'PM2.5'), (9, 'wi002', 117, 'PM2.5'), (10, 'nm007', 113, 'PM10')]

In [0]:
# Comma-joins the str() of every element of t, e.g. (1, 'ca162', 341, 'PM10') -> '1,ca162,341,PM10'.
def stringify_tuple(t):
    return ','.join(str(field) for field in t)

# Stringifies every record in r with stringify_tuple and joins the results with '|'.
def stringify_records(r):
    return '|'.join(stringify_tuple(record) for record in r)

# Appends one line for this sample to the current UTC day's file (YYYYMMDD.dat) inside
# AirnowCommon.HIGHEST_TEN_AQI_DAT_DIRECTORY, creating the directory if needed.  The line has the form
#
#   <sample_timestamp>:<record>|<record>|...
#
# where each record is its fields joined by commas (see stringify_records).  The file path is printed.
#
# Returns True when the line was written, False (after printing the reason) on any ordinary failure.
def save_records(sample_timestamp, records):
    try:
        # build a file path for today's data (UTC day, based on the time of the save rather than the sample)
        filename = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d.dat')
        file_path = AirnowCommon.HIGHEST_TEN_AQI_DAT_DIRECTORY + '/' + filename
        print(file_path)

        # make sure the directories to the file exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        # build the line to append (apparently it's safe to just use \n instead of os.linesep...see https://stackoverflow.com/a/11497391/703200)
        line = str(sample_timestamp) + ':' + stringify_records(records) + '\n'

        # append to the file
        with open(file_path, "a") as data_file:
            data_file.write(line)

        return True
    except Exception as e:
        # Stay best-effort (the caller reports the failure), but let KeyboardInterrupt/SystemExit propagate
        # and leave a trace of what went wrong instead of discarding it.
        print('Failed to save records (%s: %s)' % (type(e).__name__, e))
        return False

# save_records(time.time(), [(1, 'ca162', 341, 'PM10'), (2, 'mo002', 262, 'PM10'), (3, 'ca126', 187, 'PM10'), (4, 'az012', 145, 'PM10'), (5, 'mo011', 130, 'PM10'), (6, 'al001', 123, 'PM2.5'), (7, 'il001', 120, 'PM2.5'), (8, 'il009', 120, 'PM2.5'), (9, 'wi002', 117, 'PM2.5'), (10, 'nm007', 113, 'PM10')])

In [0]:
# Performs one download pass: fetch the top ten JSON, parse it into records, append them to today's data file,
# and report the outcome through Stat.
#
#   - fetch returned nothing usable  -> Stat.down('Failed to fetch ...')
#   - parse produced no records      -> Stat.down('Failed to parse ...')
#   - otherwise                      -> Stat.info (saved) or Stat.error (failed to save), followed by Stat.up
#                                       with the elapsed time -- Stat.up is sent even if the save failed.
#
# Every up/down status is marked valid for 1.5x the run interval.
def run():
    stat_target = {'host': STAT_HOSTNAME, 'shortname': STAT_SHORTNAME}
    status_lifetime = RUN_INTERVAL_SECONDS*1.5

    Stat.info('Downloading highest ten AQI readings from Airnow...', **stat_target)
    start_time = time.time()

    top_ten = fetch_top_ten()
    if not is_non_empty_array(top_ten):
        Stat.down('Failed to fetch highest ten AQI readings from Airnow', valid_for_secs=status_lifetime, **stat_target)
        return

    records = parse_top_ten(top_ten)
    if not is_non_empty_array(records):
        Stat.down('Failed to parse highest ten AQI readings from Airnow', valid_for_secs=status_lifetime, **stat_target)
        return

    saved = save_records(start_time, records)
    summary = "%f: %s" % (start_time, records)
    if saved:
        Stat.info(summary, details="saved", **stat_target)
    else:
        Stat.error(summary, details="failed to save", **stat_target)

    elapsed = time.time() - start_time
    Stat.up('Done downloading highest ten AQI readings from Airnow', details='Took %.1f seconds' % elapsed, valid_for_secs=status_lifetime, **stat_target)

# Calls run() in an endless loop, invoking sleep_until_next_period(RUN_INTERVAL_SECONDS) after each pass.
# There is no exception handling here or in run(), so an uncaught exception ends the loop.
# sleep_until_next_period is not defined in this file; presumably it comes from utils.ipynb -- TODO confirm.
def run_forever():
    while True:
        run()
        sleep_until_next_period(RUN_INTERVAL_SECONDS)

# Start the download loop as soon as this code executes.  Swap the two lines below to do a single pass instead.
#run()
run_forever()