Mirror AirNow data
==================

Downloads AirNow's hourly data files, saving them in `./AirNow`.

Reports to stat.createlab.org as `MirrorAirnow`.

Docs for the hourly data files are here: https://docs.airnowapi.org/docs/HourlyDataFactSheet.pdf

In [0]:
import datetime, dateutil, email, ftplib, glob, json, os, re, time, requests, socket, subprocess, sys, traceback
from dateutil import rrule, tz, parser

In [0]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb


def exec_ipynb(filename_or_url):
    """Execute every code cell of a Jupyter notebook in this module's globals.

    Args:
        filename_or_url: Path to a local ``.ipynb`` file, or an ``http:`` /
            ``https:`` URL from which the notebook JSON is fetched with
            ``requests``.

    The sources of all code cells are joined into one script, which is
    written to a uniquely named file under ``/tmp``, compiled with that file
    name, and executed with ``globals()`` as its namespace.  Every name the
    notebook defines therefore becomes a global of this module.  Both the
    nbformat >= 4 layout (``cells`` / ``source``) and the older layout
    (``worksheets[0].cells`` / ``input``) are supported.
    """
    if re.match(r'https?:', filename_or_url):
        # NOTE(security): the fetched notebook is executed below, so this
        # fully trusts the remote host.  Only pass URLs you control.
        nb = requests.get(filename_or_url).json()
    else:
        # Notebooks are UTF-8 JSON; open explicitly as UTF-8 and close the
        # handle deterministically instead of relying on garbage collection.
        with open(filename_or_url, encoding='utf-8') as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # Name includes basename, timestamp and pid so concurrent runs do not
    # overwrite each other's generated script.
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    # The script is saved under the same name given to compile(); presumably
    # so tracebacks from notebook code can point at real source lines.
    with open(tmpname, 'w', encoding='utf-8') as script_file:
        script_file.write(src)
    code = compile(src, tmpname, 'exec')
    exec(code, globals())


# Run the code cells of the local utils notebook inside this module's
# globals.  NOTE(review): presumably this is what defines the `Stat` helper
# used throughout the rest of this file -- confirm against utils.ipynb.
exec_ipynb('python-utils/utils.ipynb')


In [0]:
# Service name under which this script reports its status.
STAT_SERVICE_NAME = 'MirrorAirnow'
# `host` and `shortname` values passed with every Stat.* call in this file.
STAT_AIRNOW_HOSTNAME = 'airnow'
STAT_AIRNOW_SHORTNAME = 'airnow-hourly-data'

# Register the service name once, before any status message is sent.
# NOTE(review): `Stat` is not defined in this file; presumably it comes from
# the utils notebook executed above -- confirm.
Stat.set_service(STAT_SERVICE_NAME)

In [0]:
def datetime2epoch(dt):
    """Return ``dt`` as seconds since the Unix epoch (1970-01-01 00:00 UTC).

    Args:
        dt: A ``datetime.datetime``.  Timezone-aware values are converted
            using their own UTC offset.  Naive values are interpreted as
            UTC, matching how the rest of this file uses naive datetimes
            (``utcnow()`` and timestamps parsed from file names); previously
            a naive value raised ``TypeError``.

    Returns:
        The offset from the epoch as a float number of seconds.
    """
    utc = datetime.timezone.utc
    if dt.tzinfo is None or dt.utcoffset() is None:
        # Naive datetimes cannot be subtracted from an aware epoch.
        dt = dt.replace(tzinfo=utc)
    return (dt - datetime.datetime(1970, 1, 1, tzinfo=utc)).total_seconds()

def directory_from_date(dt):
    """Return the ``YYYY/YYYYMMDD`` directory path fragment for ``dt``."""
    layout = '%Y/%Y%m%d'
    return dt.strftime(layout)

def mirror_file_using_modtime(src_url, dest):
    """Download ``src_url`` to ``dest`` unless the local copy is already current.

    If ``dest`` exists, its modification time is sent as ``If-Modified-Since``
    so the server can answer 304 instead of resending the file.  On a 200
    response the body is written to a temporary sibling file, moved over
    ``dest``, and ``dest``'s mtime is set to the server's ``Last-Modified``
    time (when the server supplies one) so the next conditional request
    compares against the server's clock rather than the download time.

    Args:
        src_url: URL to fetch.
        dest: Local path to write.  Missing parent directories are created.

    Returns:
        True if the file was actually mirrored (i.e. the server sent a new
        copy and it was written successfully), False otherwise (304, 404 or
        any other status; the latter two are reported via ``Stat.warning``).

    Raises:
        Network errors from ``requests.get`` and ``OSError`` from writing the
        file are not caught here and propagate to the caller.
    """
    # Local import: the file only does `import email`, which does not
    # guarantee that the `email.utils` submodule is loaded.
    from email.utils import formatdate

    headers = {}
    # If destination already exists, mirror only if newer
    try:
        # formatdate() always emits English day/month names, as HTTP
        # requires; time.strftime('%a ... %b') would follow the locale.
        headers['If-Modified-Since'] = formatdate(os.stat(dest).st_mtime, usegmt=True)
    except (OSError, OverflowError, ValueError):
        # No usable local copy (or an unformattable mtime): fetch unconditionally.
        pass

    response = requests.get(src_url, headers=headers)
    if response.status_code == 200:
        data = response.content

        # Last-Modified is optional; without it keep the download-time mtime
        # instead of failing the whole mirror run with a KeyError.
        last_modified = response.headers.get('Last-Modified')
        server_modtime_epoch = None
        if last_modified is not None:
            server_modtime_epoch = datetime2epoch(dateutil.parser.parse(last_modified))

        # Write to a pid-suffixed sibling first so readers never see a
        # partially written `dest`.
        tmp = dest + '.tmp' + str(os.getpid())
        dest_dir = os.path.dirname(tmp)
        if dest_dir:
            # os.makedirs('') raises, so skip it for bare file names.
            os.makedirs(dest_dir, exist_ok=True)
        try:
            with open(tmp, 'wb') as out:
                out.write(data)
            os.replace(tmp, dest)
        except Exception:
            # Do not leave a stray temp file behind on a failed write/move.
            try:
                os.remove(tmp)
            except OSError:
                pass
            raise
        if server_modtime_epoch is not None:
            os.utime(dest, (server_modtime_epoch, server_modtime_epoch))
        print('Wrote %d bytes to %s' % (len(data), dest))
        Stat.info('Successfully mirrored %s to %s (%d bytes)' % (src_url, dest, len(data)), host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)
        return True
    elif response.status_code == 304:
        # Up-to-date files are only printed, not reported to Stat.
        print('Local mirror of %s is up to date.  Skipping.' % (src_url))
    elif response.status_code == 404:
        Stat.warning('File %s not found (HTTP %d). Skipping.' % (src_url, response.status_code), host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)
    else:
        Stat.warning('Received status code %d while fetching %s.  Skipping.' % (response.status_code, src_url), host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)
    return False


def mirror_airnow_file(src, dest):
    """Mirror one file from the AirNow file server to the local path ``dest``.

    ``src`` is the path of the file relative to the server's ``airnow/``
    root.  Returns True if the file was actually mirrored (i.e. both newer
    than the current version, and successfully downloaded), False otherwise.
    """
    airnow_root = 'https://files.airnowtech.org/airnow/'
    full_url = airnow_root + src
    return mirror_file_using_modtime(full_url, dest)

#mirror_airnow_file('2016/20161231/monitoring_site_locations.dat', 'AirNowTest/monitoring_site_locations.dat')

In [0]:
def mirror_timestamp(timestamp):
    """Mirror the hourly data file for the hour identified by ``timestamp``.

    The remote file ``<YYYY>/<YYYYMMDD>/HourlyData_<YYYYMMDDHH>.dat`` is
    mirrored to ``AirNow/<YYYYMMDDHH>.dat``.  The result of the underlying
    mirror call is discarded; this function returns None.
    """
    hourly_name = timestamp.strftime('%Y%m%d%H.dat')
    local_path = 'AirNow/' + hourly_name
    remote_path = directory_from_date(timestamp) + '/HourlyData_' + hourly_name
    mirror_airnow_file(remote_path, local_path)

#mirror_timestamp(dateutil.parser.parse('2016-12-31 00:00'))

In [0]:
def compute_first_date_to_check():
    """Return the first hour that should be (re)checked against the server.

    Looks at the hourly files already present in ``AirNow/`` (named
    ``YYYYMMDDHH.dat``).  If there are none, returns 2013-08-01 00:00.
    Otherwise returns the timestamp of the most recent mirrored file minus
    30 days, so files from that window are re-checked for updates.

    Files that match the glob but are not named like an hourly data file
    (for example ``2016.dat``) are ignored instead of aborting the run with
    a ``ValueError``.

    Returns:
        A naive ``datetime.datetime``.
    """
    mirrored = []
    for path in glob.glob('AirNow/[0-9]*.dat'):
        try:
            stamp = datetime.datetime.strptime(os.path.basename(path), '%Y%m%d%H.dat')
        except ValueError:
            # Starts with a digit but is not an hourly data file; skip it.
            continue
        mirrored.append((stamp, path))

    if not mirrored:
        return datetime.datetime(2013, 8, 1)

    last_date, last_file = max(mirrored)
    Stat.debug('Most recently mirrored data file is %s (%s)' % (last_file, last_date), host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)
    first_date_to_check = last_date - datetime.timedelta(days=30)
    Stat.debug('Checking for updates starting with date %s' % (first_date_to_check), host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)
    return first_date_to_check

In [0]:
def mirror():
    """Mirror the site-locations file and all hourly files from the start date to now.

    The start date comes from ``compute_first_date_to_check()``.  First the
    current day's ``monitoring_site_locations.dat`` is mirrored to
    ``AirNow/monitoring_site_locations.dat``; then one hourly timestamp per
    hour from the start date through the current UTC time is passed to
    ``mirror_timestamp``.  Exceptions from any download propagate to the
    caller and stop the run.
    """
    start = compute_first_date_to_check()
    # Naive UTC "now", equivalent to the deprecated datetime.utcnow().  It is
    # kept naive because `start` is naive and rrule needs both ends to match.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    mirror_airnow_file(directory_from_date(now) + '/monitoring_site_locations.dat', 'AirNow/monitoring_site_locations.dat')

    # Materialized as a list because the count is reported before mirroring.
    timestamps_to_mirror = list(rrule.rrule(rrule.HOURLY, dtstart=start, until=now))
    Stat.info('Mirroring %d data files, starting with %s... (up-to-date files will not be logged here)' % (len(timestamps_to_mirror), start), host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)

    for timestamp in timestamps_to_mirror:
        mirror_timestamp(timestamp)

# Script entry point: run one full mirror pass and report the outcome.
try:
    mirror()
    # Reached only when mirror() returned without raising.
    Stat.up('Mirror completed successfully', host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)
except Exception as e:
    # Any exception from mirror() -- or from the Stat.up call above -- is
    # reported as a "down" status; only str(e) is included, not the traceback.
    Stat.down('Exception recorded %s' % e, host=STAT_AIRNOW_HOSTNAME, shortname=STAT_AIRNOW_SHORTNAME)