# Airnow Highest 5 Scraper

Scrapes the [airnow.gov](https://cfpub.epa.gov/airnow/) home page (old and now archived, but apparently still functional as of 2020-04-15) to obtain and save the five locations with the highest AQI.  Airnow says the list is updated hourly, but in practice updates often arrive more frequently, so this scraper runs every five minutes.

Reports to stat.createlab.org as `Airnow Highest Five - Scraper`.

Airnow's docs for the highest 5 are here: https://airnow.gov/index.cfm?action=airnow.news_item&newsitemid=103

In [0]:
import json, os, dateutil, re, requests, subprocess, datetime, glob, stat
from bs4 import BeautifulSoup
from dateutil import rrule, tz, parser
from sqlitedict import SqliteDict

In [0]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb


def exec_ipynb(filename_or_url):
    """Load a Jupyter notebook and execute its code cells in this module's globals.

    Args:
        filename_or_url: Path to a local notebook file, or an http(s) URL from
            which the notebook JSON is fetched.

    The code cells are joined into one source string, written to a uniquely
    named file under /tmp, compiled with that path as the code's filename, and
    executed with ``globals()`` so every name they define becomes a global here.
    """
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Use a context manager so the notebook file handle is closed promptly
        # (notebook files are JSON, which is UTF-8 encoded).
        with open(filename_or_url, encoding='utf-8') as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        # Older notebook formats keep cells under worksheets and call the source 'input'
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # Timestamp + pid keep the temp file name unique per invocation
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    # Close (and thereby flush) the file before the code that names it is compiled and run
    with open(tmpname, 'w', encoding='utf-8') as tmp_file:
        tmp_file.write(src)
    code = compile(src, tmpname, 'exec')
    exec(code, globals())


# Execute the shared notebooks so the names they define become globals here.
# NOTE(review): presumably these provide Stat, AirnowCommon, sleep_until_next_period
# and the `time` module used further down — confirm against the notebooks.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./airnow-common.ipynb')

In [0]:
# Service name handed to Stat.set_service()
STAT_SERVICE_NAME = 'Airnow Highest Five - Scraper'
# Passed as host= and shortname= on every Stat call made by this scraper
STAT_HOSTNAME = 'hal21'
STAT_SHORTNAME = 'airnow-highest-five-scraper'

# Period of the scrape loop; also used to derive valid_for_secs of the "up" report
RUN_INTERVAL_SECONDS = 60 * 5   # every 5 minutes

In [0]:
# Tell Stat which service this process reports as.
# NOTE(review): Stat is not defined in this file; presumably it comes from utils.ipynb — confirm.
Stat.set_service(STAT_SERVICE_NAME)

In [0]:
# On-disk (SqliteDict) map from Airnow city id to {"state": ..., "city": ...};
# autocommit=True is passed so no explicit commit call is made anywhere in this file.
airnow_city_id_to_city_info = SqliteDict(AirnowCommon.HIGHEST_FIVE_AQI_DIRECTORY + '/airnow_city_id_to_city_info.db', autocommit=True)

In [0]:
def fetch_airnow_home_page():
    """Download the archived Airnow home page.

    Returns:
        The page body as text when the server answers with a 2xx status,
        otherwise None. Every failure is reported through Stat.warning.
    """
    html = None
    try:
        response = requests.get("https://cfpub.epa.gov/airnow/", timeout=20)
        status = response.status_code
        if 200 <= status < 300:
            html = response.text
        else:
            Stat.warning("Failed to get Airnow home page (HTTP %d)" % (status), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except requests.HTTPError as http_error:
        Stat.warning("Failed to get Airnow home page (HTTP %d)" % (http_error.response.status_code), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except Exception as error:
        # Anything else (timeouts, connection errors, ...) is reported with its message
        Stat.warning("Failed to get Airnow home page (%s)" % error, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    return html

In [0]:
def parse_a_top_five_row(row):
    """Extract (city_id, state, city, aqi) from one row of the highest-five table.

    Args:
        row: A parsed table row whose first link has an href containing
            '&cityid=<id>' and text of the form 'City, State', and whose nested
            table's first cell holds the AQI value.

    Returns:
        A tuple ``(city_id, state, city, aqi)`` with ``city_id`` and ``aqi`` as
        ints, or None when the row cannot be parsed (a warning is reported).
    """
    try:
        link = row.a

        # pick out the city id from the href
        city_id = int(link.attrs['href'].split('&cityid=')[1])

        # pick out the city and state
        city_and_state = link.contents[0].strip()

        # split on the LAST comma, which should separate the city name from the state
        city, comma, state = city_and_state.rpartition(',')
        if comma:
            aqi = int(row.table.table.tr.td.contents[0].strip())
            return (city_id, state.strip(), city.strip(), aqi)

        Stat.warning("Failed to split city and state [%s]. Skipping." % city_and_state, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except Exception:
        # Deliberately broad: any malformed row is skipped. Unlike a bare except,
        # this lets KeyboardInterrupt / SystemExit propagate and stop the scraper.
        Stat.warning("Failed to parse row [%s]. Skipping." % row, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    return None


def parse_airnow_home_page_html(html):
    """Pull the highest-five AQI records out of the Airnow home page HTML.

    Args:
        html: The home page markup as text.

    Returns:
        A list of ``(rank, city_id, aqi)`` tuples, one per successfully parsed
        row, where rank is the 1-based position of the row in the table. The
        list is empty when the expected markup is not found. Cities not yet
        present in ``airnow_city_id_to_city_info`` are added to it.
    """
    records = []
    soup = BeautifulSoup(html, 'html.parser')

    # Locate the element(s) carrying id 'curaqi'
    matches = soup.find_all(id="curaqi")
    if len(matches) == 0:
        Stat.warning("Failed to find a div with 'curaqi' id. Skipping.", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return records

    # Only one is expected; if there are several, use the first
    container = matches[0]
    if not container:
        Stat.warning("Failed to find the div with 'curaqi' id. Skipping.", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return records

    if not container.table:
        Stat.warning("Failed to find the table under the 'curaqi' div. Skipping.", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return records

    # Direct child rows only (third argument is recursive=False), so rows of
    # nested tables are not picked up. Exactly 5 rows are expected.
    rows = container.table.find_all('tr', None, False)
    if not rows:
        Stat.warning("Failed to find any table rows. Skipping.", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return records

    if len(rows) != 5:
        Stat.warning("Expected 5 table rows, but found %d" % (len(rows)), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    for rank, row in enumerate(rows, start=1):
        parsed = parse_a_top_five_row(row)
        if not parsed:
            Stat.warning("Failed to parse row at rank %d." % (rank), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            continue

        city_id, state, city, aqi = parsed

        # Remember any city we have not seen before
        if city_id not in airnow_city_id_to_city_info:
            airnow_city_id_to_city_info[city_id] = {"state": state, "city" : city}

        records.append((rank, city_id, aqi))

    return records

In [0]:
def jsonify_airnow_city_id_database():
    """Export the city-id database to a JSON file next to the .db file.

    The JSON is written to a pid-suffixed temp file and renamed over
    ``airnow_city_id_to_city_info.json``, then made world-readable and given
    the same access/modification time as the .db file's mtime.

    Failures are reported through Stat.warning (including the error text) and
    are not raised to the caller.
    """
    tmp = None
    try:
        source = AirnowCommon.HIGHEST_FIVE_AQI_DIRECTORY + '/airnow_city_id_to_city_info.db'
        dest = AirnowCommon.HIGHEST_FIVE_AQI_DIRECTORY + '/airnow_city_id_to_city_info.json'

        # build the JSON data
        json_data = {key: airnow_city_id_to_city_info[key] for key in airnow_city_id_to_city_info.keys()}

        # write the JSON file to disk via a temp file + rename
        tmp = dest + '.tmp' + str(os.getpid())
        os.makedirs(os.path.dirname(tmp), exist_ok=True)
        with open(tmp, 'w') as json_file:
            json.dump(json_data, json_file, sort_keys=True)
        os.rename(tmp, dest)
        tmp = None  # the temp file no longer exists under its old name

        # make the JSON file readable by everyone
        os.chmod(dest, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IROTH)

        # make the JSON file's file stat times match those of the .db
        source_file_stat = os.stat(source)
        os.utime(dest, (source_file_stat.st_mtime, source_file_stat.st_mtime))
    except Exception as e:
        # Best effort: report why it failed, but let KeyboardInterrupt / SystemExit through
        Stat.warning("Failed to jsonify airnow_city_id_to_city_info.db (%s)." % e, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    finally:
        # Don't leave a stale temp file behind if we failed before the rename
        if tmp is not None:
            try:
                os.remove(tmp)
            except OSError:
                pass

In [0]:
def stringify_tuple(t):
    """Return the items of ``t``, each converted with str(), joined by commas."""
    return ','.join(str(item) for item in t)

def stringify_records(r):
    """Return every record in ``r`` rendered by stringify_tuple, joined by '|'."""
    rendered = map(stringify_tuple, r)
    return '|'.join(rendered)

def save_records(sample_timestamp, records):
    """Append one sample line to the current UTC day's .dat file.

    The line has the form ``<sample_timestamp>:<rec>|<rec>|...`` followed by a
    newline, where each record is rendered as comma-separated values.

    Args:
        sample_timestamp: Timestamp of the sample; written via str().
        records: Iterable of record tuples to serialize.

    Returns:
        True when the line was appended, False when anything went wrong.
    """
    try:
        # build a file path for today's data (named by the current UTC date);
        # a timezone-aware "now" replaces the deprecated datetime.utcnow()
        filename = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d.dat')
        file_path = AirnowCommon.HIGHEST_FIVE_AQI_DAT_DIRECTORY + '/' + filename
        print(file_path)

        # make sure the directories to the file exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        # build the line to append (apparently it's safe to just use \n instead of os.linesep...see https://stackoverflow.com/a/11497391/703200)
        line = str(sample_timestamp) + ':' + stringify_records(records) + '\n'

        # append to the file
        with open(file_path, "a") as data_file:
            data_file.write(line)

        return True
    except Exception:
        # Failure is signalled through the return value; unlike a bare except,
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return False

#save_records(time.time(), [(1, 360, 77), (2, 576, 77), (3, 91, 70), (4, 157, 68), (5, 785, 65)])

In [0]:
def run():
    """Perform one scrape pass: fetch the page, parse it, persist, report status.

    When records were parsed, the city-id database is exported to JSON and the
    records are appended to the day's data file; the outcome is reported via
    Stat.info / Stat.warning. Stat.up is reported at the end of every pass,
    valid for 1.5 run intervals.

    NOTE(review): `time` is not imported in this file's import cell; presumably
    it is provided by the exec'd utils.ipynb — confirm.
    """
    Stat.info('Scraping highest five AQI readings from Airnow...', host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    started_at = time.time()

    html = fetch_airnow_home_page()
    records = parse_airnow_home_page_html(html) if html else None

    if records:
        jsonify_airnow_city_id_database()
        summary = "%f: %s" % (started_at, records)
        if save_records(started_at, records):
            Stat.info(summary, details="saved", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        else:
            Stat.warning(summary, details="failed to save", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    finished_at = time.time()
    Stat.up('Done scraping highest five AQI readings from Airnow', details='Took %.1f seconds' % (finished_at - started_at), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME, valid_for_secs=RUN_INTERVAL_SECONDS*1.5)

def run_forever():
    """Run scrape passes in an endless loop; this function never returns."""
    while True:
        run()
        # NOTE(review): sleep_until_next_period is not defined in this file;
        # presumably it comes from utils.ipynb and blocks until the next
        # RUN_INTERVAL_SECONDS boundary — confirm.
        sleep_until_next_period(RUN_INTERVAL_SECONDS)

# Start the scraper loop (run_forever loops with `while True`, so nothing after this line runs)
run_forever()