# Process Reporting Area Forecasts

Processes the data we mirror from Airnow's Reporting Area data file (`reportingarea.dat`) and extracts the forecasts for each reporting area, saving each reporting area's forecasts to a separate JSON file.  The JSON files are served from https://airstats.createlab.org/data/reporting-area/forecasts/ and are named by reporting area ID (e.g. `pa005.json` for the Liberty-Clairton Area reporting area).

This script runs every hour, at 50 minutes after the hour.

Reports to stat.createlab.org as `Airnow Reporting Area Forecasts`.

Airnow's docs for the data files are here: https://s3-us-west-1.amazonaws.com//files.airnowtech.org/airnow/docs/ReportingAreaFactSheet.pdf


In [None]:
import json, os, dateutil, re, requests, subprocess, datetime, glob, stat, csv

from dateutil import rrule, tz, parser

In [None]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb

def exec_ipynb(filename_or_url):
    """Execute all code cells of a Jupyter notebook in this module's globals.

    If ``filename_or_url`` starts with ``http:`` or ``https:`` the notebook is
    fetched with ``requests``; otherwise it is opened as a local file.  Both
    notebook layouts are handled: nbformat >= 4 (``cells`` / ``source``) and
    the older layout (``worksheets[0]['cells']`` / ``input``).

    The code cells are joined into a single source string, which is written to
    a uniquely named file under /tmp.  That file's name is passed to
    ``compile()`` as the filename of the compiled code, and the result is run
    with ``exec`` against ``globals()``.
    """
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Context manager so the notebook's file handle is closed right away
        with open(filename_or_url) as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # Timestamp (to the microsecond) plus PID keeps the temp file name unique
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)

    # Close (and therefore flush) the temp file before compiling against its name
    with open(tmpname, 'w') as tmp_file:
        tmp_file.write(src)

    code = compile(src, tmpname, 'exec')
    exec(code, globals())

# Execute the code cells of the shared notebooks so the names they define land in this
# notebook's globals.
# NOTE(review): presumably utils.ipynb provides Stat and sleep_until_next_period, and
# airnow-common.ipynb provides AirnowCommon -- confirm against those notebooks.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./airnow-common.ipynb')

In [None]:
# Scheduling. RUN_INTERVAL_SECONDS is passed to sleep_until_next_period() in process_forever()
# and is used to compute how long a Stat.up report stays valid in process().
RUN_INTERVAL_SECONDS = 60 * 60    # every hour
# NOTE(review): despite the name, this value is in seconds (50*60 = 3000), and it is not
# referenced by any other code visible in this notebook -- confirm whether it should be used.
RUN_AT_MINUTE = 50*60             # start at 50 minutes after the hour

# Input directory (globbed for .dat files) and output directory (per-reporting-area JSON files)
DAT_DIRECTORY = AirnowCommon.REPORTING_AREA_DAT_DIRECTORY
FORECASTS_DIRECTORY = AirnowCommon.REPORTING_AREA_FORECASTS_DIRECTORY

# Name of the JSON lookup file, read from AirnowCommon.DATA_DIRECTORY
REPORTING_AREA_ID_LOOKUP_JSON_FILENAME = 'reporting_area_id_lookup.json'

# Identifiers passed to the Stat calls (service name, host=, shortname=)
STAT_SERVICE_NAME = 'Airnow Reporting Area Forecasts'
STAT_HOSTNAME = 'hal21'
STAT_SHORTNAME = 'airnow-reporting-area-forecasts'

# glob pattern appended to a YYYYMMDD prefix to match the mirrored data files
FILE_SUFFIX_PATTERN = '-[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9].dat' # -YYYYMMDDHHMMSS.dat

In [None]:
# Register this notebook's service name with Stat before any Stat.info / Stat.up calls are made
Stat.set_service(STAT_SERVICE_NAME)

In [None]:
# Globals
# Maps a 'STATE_CODE|Reporting Area Name' key to a reporting area ID; replaced wholesale by
# load_reporting_area_id_lookup()
reporting_area_id_lookup = {}
# Maps reporting area ID -> pollutant -> valid date (YYYY-MM-DD) -> forecast dict; reset by
# initialize_reporting_area_id_to_forecasts() and filled in by process_forecast()
reporting_area_id_to_forecasts = {}

In [None]:
def load_reporting_area_id_lookup():
    """Read the reporting area ID lookup JSON file into the module-level lookup.

    The file is REPORTING_AREA_ID_LOOKUP_JSON_FILENAME inside
    AirnowCommon.DATA_DIRECTORY.  Whatever ``json.load`` returns replaces the
    current value of ``reporting_area_id_lookup``.
    """
    global reporting_area_id_lookup

    lookup_path = AirnowCommon.DATA_DIRECTORY + '/' + REPORTING_AREA_ID_LOOKUP_JSON_FILENAME
    with open(lookup_path, 'r') as lookup_file:
        loaded = json.load(lookup_file)
    reporting_area_id_lookup = loaded

In [None]:
def create_reporting_area_lookup_dictionary_key(name, state_code):
    """Build the lookup key for a reporting area: the state code, a '|', then the name."""
    return '|'.join((state_code, name))

In [None]:
# Looks up a reporting area's ID in the in-memory reporting_area_id_lookup, using the key
# built from the given name and state code. Returns the ID stored under that key, or None
# when the key is not present.
def find_reporting_area_id(name, state_code):
    lookup_key = create_reporting_area_lookup_dictionary_key(name, state_code)
    return reporting_area_id_lookup.get(lookup_key)

# load_reporting_area_id_lookup()
# print(find_reporting_area_id('Birmingham', 'AL'))  # al001
# print(find_reporting_area_id('Mono Lake', 'CA'))  # ca225
# print(find_reporting_area_id('Springdale (Springdale-Fayetteville-Bentonville)', 'AR'))  # ar002
# print(find_reporting_area_id('Bogusville', 'WV'))  # None

In [None]:
# Returns a list (possibly empty) of the data files in DAT_DIRECTORY whose names match the
# YYYYMMDD-YYYYMMDDHHMMSS.dat pattern with a YYYYMMDD prefix of either today or yesterday
# (by local time). The file paths are sorted in reverse order.
def compute_files_to_check():
    now = datetime.datetime.now()
    days_to_check = (now, now - datetime.timedelta(days=1))

    matches = []
    for day in days_to_check:
        # e.g. <DAT_DIRECTORY>/YYYYMMDD-YYYYMMDDHHMMSS.dat
        pattern = DAT_DIRECTORY + '/' + day.strftime('%Y%m%d') + FILE_SUFFIX_PATTERN
        matches.extend(glob.glob(pattern))

    matches.sort(reverse=True)
    return matches

#compute_files_to_check()

In [None]:
def initialize_reporting_area_id_to_forecasts():
    """Reset the module-level reporting_area_id_to_forecasts map to a new, empty dict."""
    global reporting_area_id_to_forecasts
    reporting_area_id_to_forecasts = dict()

In [None]:
# Strips leading/trailing whitespace from the given string and returns the result. Returns
# None if the argument is not a str, or if nothing is left after stripping.
def trim_string_or_none_if_empty(s):
    if type(s) is not str:
        return None

    # An empty result (from '' or an all-whitespace string) is falsy, so it becomes None
    return s.strip() or None

# print(trim_string_or_none_if_empty([1,2,3]))
# print(trim_string_or_none_if_empty(1.2))
# print(trim_string_or_none_if_empty(True))
# print(trim_string_or_none_if_empty(False))
# print(trim_string_or_none_if_empty(None))
# print(trim_string_or_none_if_empty(''))
# print(trim_string_or_none_if_empty(' '))
# print(trim_string_or_none_if_empty('  '))
# print(trim_string_or_none_if_empty('   \t '))
# print(trim_string_or_none_if_empty(' foo '))
# print(trim_string_or_none_if_empty('   bar'))
# print(trim_string_or_none_if_empty('baz    '))
# print(trim_string_or_none_if_empty('bat'))

In [None]:
# 'issue_date|valid_date|valid_time|time_zone|record_sequence|data_type|primary|reporting_area|state_code|latitude|longitude|pollutant|aqi_value|aqi_category|action_day|discussion|forecast_source'
def process_forecast(reporting_area_id, forecast_data):
    """Record one forecast row in reporting_area_id_to_forecasts, if it should be kept.

    Forecasts are stored as reporting area ID -> pollutant -> valid date
    (YYYY-MM-DD) -> forecast dict.  A row is stored when no forecast exists yet
    for its (reporting area, pollutant, valid date), or when its issue date is
    strictly later than that of the forecast already stored there.

    Args:
        reporting_area_id: ID of the reporting area the row belongs to.
        forecast_data: mapping of field name to raw string value for one row
            (the field names are listed in the comment above this function).

    Returns:
        True if the row was stored (new or replacing an older forecast),
        False if an existing forecast was kept instead.

    Raises:
        ValueError: if issue_date / valid_date are not in MM/DD/YY format, or
            aqi_value is non-empty but not an integer.
    """
    global reporting_area_id_to_forecasts

    # convert dates to strings in YYYY-MM-DD format because MM/DD/YY is silly.
    issue_date = datetime.datetime.strptime(forecast_data['issue_date'], '%m/%d/%y')
    valid_date = datetime.datetime.strptime(forecast_data['valid_date'], '%m/%d/%y')
    issue_date_str = issue_date.strftime('%Y-%m-%d')
    valid_date_str = valid_date.strftime('%Y-%m-%d')

    # pick out the other fields we care about for forecasts
    time_zone = trim_string_or_none_if_empty(forecast_data['time_zone'])
    pollutant = trim_string_or_none_if_empty(forecast_data['pollutant'])
    aqi_value = trim_string_or_none_if_empty(forecast_data['aqi_value'])
    aqi_category = trim_string_or_none_if_empty(forecast_data['aqi_category'])
    is_action_day = trim_string_or_none_if_empty(forecast_data['action_day']) == 'Yes'
    discussion = trim_string_or_none_if_empty(forecast_data['discussion'])
    forecast_source = trim_string_or_none_if_empty(forecast_data['forecast_source'])

    # The primary flag arrives as text, so it must be compared against strings. (Comparing
    # it against the integer 1 can never be true for a str or None.) Airnow's fact sheet
    # gives the flag as Y/N; '1' is also accepted in case a numeric flag ever appears.
    primary_flag = trim_string_or_none_if_empty(forecast_data['primary'])
    is_primary = primary_flag is not None and primary_flag.upper() in ('Y', '1')

    # convert the value to an int if not None (i.e. if it's currently a string)
    if aqi_value is not None:
        aqi_value = int(aqi_value)

    # make sure this reporting area, and this pollutant within it, are in the map
    forecasts = reporting_area_id_to_forecasts.setdefault(reporting_area_id, {})
    forecasts_for_pollutant = forecasts.setdefault(pollutant, {})

    # Keep this record if the valid_date isn't in our cache yet for this pollutant, or if
    # this record's issue_date is newer than that of the cached record.
    existing_forecast = forecasts_for_pollutant.get(valid_date_str)
    if existing_forecast is None:
        will_keep_forecast = True
    else:
        existing_forecast_issue_date = datetime.datetime.strptime(existing_forecast['issueDate'], '%Y-%m-%d')
        will_keep_forecast = issue_date > existing_forecast_issue_date

    if will_keep_forecast:
        forecasts_for_pollutant[valid_date_str] = {
            'issueDate' : issue_date_str,
            'timezone' : time_zone,
            'isPrimary' : is_primary,
            'aqiValue' : aqi_value,
            'aqiCategory' : aqi_category,
            'isActionDay' : is_action_day,
            'discussion' : discussion,
            'source' : forecast_source
        }

    return will_keep_forecast


In [None]:
def process_file(file):
    """Read one pipe-delimited data file and record every forecast row it contains.

    Rows whose data_type is 'F' are treated as forecasts.  For each one, the
    reporting area ID is looked up from the row's reporting_area and state_code;
    rows with a known ID are handed to process_forecast(), and rows with no
    matching ID are reported via Stat.info and skipped.  The number of forecasts
    that process_forecast() reported as kept is printed at the end.

    Args:
        file: path of the data file to read.
    """
    num_new_forecasts = 0

    # The csv module asks for files to be opened with newline='' so that the reader itself
    # handles line endings (including any embedded in quoted fields).
    with open(file, mode='r', newline='') as data_file:
        data_file_reader = csv.DictReader(data_file, delimiter="|", fieldnames=AirnowCommon.REPORTING_AREA_DATA_FILE_FIELDNAMES)

        # run through all records, handling only the forecast ('F') ones
        for row in data_file_reader:
            if row['data_type'] != 'F':
                continue

            name = row['reporting_area']
            state_code = row['state_code']
            reporting_area_id = find_reporting_area_id(name, state_code)
            if reporting_area_id:
                if process_forecast(reporting_area_id, row):
                    num_new_forecasts += 1
            else:
                Stat.info(f"Skipping unknown reporting area [{name}|{state_code}]", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

    print(f"Found {num_new_forecasts} new forecasts in {file}")

In [None]:
def write_json_file(data, filename):
    """Write ``data`` as JSON (keys sorted) to ``filename`` via a temporary file.

    The JSON is first written to ``<filename>.tmp<pid>``, made readable by
    everyone (owner read/write, group and others read), and only then moved
    over ``filename`` with ``os.replace``.  Readers therefore never see a
    partially written file.  Missing parent directories are created.

    Args:
        data: JSON-serializable object to write.
        filename: destination path; may be a bare filename in the current directory.

    Raises:
        TypeError / ValueError: if ``data`` cannot be serialized by ``json.dump``.
        OSError: if the file cannot be written, chmod'ed, or moved into place.
    """
    tmp = filename + '.tmp' + str(os.getpid())

    # os.makedirs('') raises, so only create directories when the path actually has one
    directory = os.path.dirname(tmp)
    if directory:
        os.makedirs(directory, exist_ok=True)

    try:
        with open(tmp, 'w') as json_file:
            json.dump(data, json_file, sort_keys=True)

        # make the JSON file readable by everyone *before* it becomes visible under its
        # final name
        os.chmod(tmp, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IROTH)
        os.replace(tmp, filename)
    except BaseException:
        # don't leave a partial temp file behind if anything above failed
        if os.path.exists(tmp):
            os.remove(tmp)
        raise

In [None]:
# Writes one JSON file per reporting area in reporting_area_id_to_forecasts, named
# <id>.json under FORECASTS_DIRECTORY, then reports how many reporting areas were handled.
def write_forecast_json_files():
    for area_id, area_forecasts in reporting_area_id_to_forecasts.items():
        # Strip everything except word characters from the ID so that it can't contain
        # path tricks such as "../../"
        safe_id = re.sub(r'\W+', '', area_id)
        write_json_file(area_forecasts, f"{FORECASTS_DIRECTORY}/{safe_id}.json")

    area_count = len(reporting_area_id_to_forecasts)
    Stat.info(f"Wrote {area_count} forecast JSON files", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

In [None]:
def process():
    """Run one full pass: find the data files, extract forecasts, and write the JSON files.

    When compute_files_to_check() returns no files, only a "No data files
    found!" message is reported.  Either way, the pass ends by reporting
    Stat.up with the number of files and the elapsed time.
    """
    started_at = datetime.datetime.now().timestamp()

    files = compute_files_to_check()
    if not files:
        Stat.info("No data files found!", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    else:
        Stat.info(f"Processing {len(files)} data files for forecasts", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

        # Airnow's reporting area data file (reportingarea.dat) identifies reporting areas by
        # name and state code rather than by unique ID, so load the name/state -> ID lookup
        # into memory first.
        load_reporting_area_id_lookup()

        # Start from an empty reporting area ID -> forecasts map
        initialize_reporting_area_id_to_forecasts()

        # Collect the forecasts from every file into the in-memory map...
        for data_file in files:
            process_file(data_file)

        # ...and then write out the per-reporting-area JSON files
        write_forecast_json_files()

    elapsed_seconds = datetime.datetime.now().timestamp() - started_at
    done_message = 'Done processing %d data files (elapsed time: %d seconds)' % (len(files), elapsed_seconds)
    # The "up" report stays valid for 1.5 run intervals
    Stat.up(done_message, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME, valid_for_secs=RUN_INTERVAL_SECONDS*1.5)

In [None]:
def process_forever():
    """Call process() in an endless loop, sleeping between passes.

    After each pass this calls sleep_until_next_period(RUN_INTERVAL_SECONDS).
    That helper is not defined in this notebook (presumably it comes from
    utils.ipynb -- confirm).  This function never returns.
    """
    # NOTE(review): RUN_AT_MINUTE is not passed to sleep_until_next_period here -- confirm
    # whether the "50 minutes after the hour" schedule is applied somewhere else.
    while True:
        process()
        sleep_until_next_period(RUN_INTERVAL_SECONDS)

In [None]:
# Run a single processing pass when this cell executes (process_forever() is not called here)
process()