Mirroring AirNow data
=====================



In [1]:
import datetime, dateutil.parser, email, ftplib, glob, json, os, re, time, socket, subprocess, sys, traceback, urllib2
from dateutil import rrule, tz

In [2]:
def mirror_airnow_file(src, dest):
    """Download one AirNow file to ``dest`` unless the local copy is current.

    When ``dest`` already exists its modification time is sent as an
    ``If-Modified-Since`` header, so the server can answer 304 instead of
    resending the file.  The payload is written to a per-process temporary
    file next to ``dest`` and then renamed into place, and the file's
    timestamps are set to the server's ``Last-Modified`` value when the
    server supplies one.

    Args:
        src: Path of the file relative to
            ``https://files.airnowtech.org/airnow/``.
        dest: Local path to write the file to.

    Returns:
        None.  A 304 (destination up to date) or 404 (source missing)
        response is reported on stdout and otherwise ignored.

    Raises:
        urllib2.HTTPError: For any HTTP error status other than 304 or 404.
        Other network, parsing and file-system errors propagate unchanged.
    """
    headers = {}
    # If destination already exists, mirror only if newer
    try:
        local_mtime = os.stat(dest).st_mtime
    except OSError:
        # No readable local copy yet: send an unconditional request.
        pass
    else:
        headers['If-Modified-Since'] = time.strftime(
            '%a, %d %b %Y %H:%M:%S GMT', time.gmtime(local_mtime))

    src_url = 'https://files.airnowtech.org/airnow/' + src
    request = urllib2.Request(src_url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        try:
            data = response.read()
            last_modified = response.headers.get('Last-Modified')
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except urllib2.HTTPError as e:
        if e.code == 304:
            print('Not copying %s to %s because destination is up-to-date' % (src, dest))
            return
        if e.code == 404:
            print('Source %s does not exist, skipping' % src_url)
            return
        print('When requesting %s, HTTPError code %d' % (src_url, e.code))
        raise

    # Parse the server timestamp before touching the disk, so a malformed
    # header aborts without writing anything.  A missing header just means
    # the file keeps the local write time.
    server_modtime_epoch = None
    if last_modified:
        server_modtime = dateutil.parser.parse(last_modified)
        server_modtime_epoch = datetime2epoch(server_modtime)

    tmp = dest + '.tmp' + str(os.getpid())
    renamed = False
    try:
        # Binary mode: the payload must be stored byte-for-byte.
        with open(tmp, 'wb') as tmp_file:
            tmp_file.write(data)
        os.rename(tmp, dest)
        renamed = True
    finally:
        if not renamed:
            # Do not leave a stray temporary file behind after a failure.
            try:
                os.remove(tmp)
            except OSError:
                pass

    if server_modtime_epoch is not None:
        os.utime(dest, (server_modtime_epoch, server_modtime_epoch))
    print('Wrote %d bytes to %s' % (len(data), dest))
        
#mirror_airnow_file('2016/20161231/monitoring_site_locations.dat', 'AirNowTest/monitoring_site_locations.dat')

In [12]:
def datetime2epoch(dt):
    """Return the number of seconds from the Unix epoch (UTC) to ``dt``.

    ``dt`` must be a timezone-aware datetime; subtracting the aware epoch
    from a naive datetime raises ``TypeError``.  The result is a float.
    """
    unix_epoch = datetime.datetime(1970, 1, 1, tzinfo=tz.tzutc())
    elapsed = dt - unix_epoch
    return elapsed.total_seconds()

def directory_from_date(dt):
    """Return the ``YYYY/YYYYMMDD`` directory path for the date of ``dt``."""
    directory_format = '%Y/%Y%m%d'
    return dt.strftime(directory_format)

def mirror_timestamp(timestamp):
    """Mirror the hourly data file for ``timestamp`` into ``AirNow/``.

    The remote file is ``<YYYY/YYYYMMDD>/HourlyData_<YYYYMMDDHH>.dat`` and
    the local copy is stored as ``AirNow/<YYYYMMDDHH>.dat``.
    """
    hourly_name = timestamp.strftime('%Y%m%d%H.dat')
    remote_path = directory_from_date(timestamp) + '/HourlyData_' + hourly_name
    local_path = 'AirNow/' + hourly_name
    mirror_airnow_file(remote_path, local_path)

#mirror_timestamp(dateutil.parser.parse('2016-12-31 00:00'))

In [13]:
def compute_first_date_to_check():
    """Return the first hour that should be (re)checked for updates.

    Looks in ``AirNow/`` for hourly files named ``YYYYMMDDHH.dat``.  If none
    are present, returns 2013-08-01 00:00.  Otherwise returns the timestamp
    of the newest hourly file minus 30 days, so that files from the last 30
    days are checked again.

    Files whose names are not exactly ten digits plus ``.dat``, or whose
    digits do not form a valid date and hour, are ignored instead of
    aborting the run.

    Returns:
        A naive ``datetime.datetime``.
    """
    hourly_files = []
    for path in glob.glob('AirNow/[0-9]*.dat'):
        # Parse the basename only, so the result does not depend on which
        # path separator glob used when building the path.
        name = os.path.basename(path)
        if not re.match(r'\d{10}\.dat$', name):
            continue
        try:
            stamp = datetime.datetime.strptime(name, '%Y%m%d%H.dat')
        except ValueError:
            continue
        hourly_files.append((stamp, path))

    if not hourly_files:
        return datetime.datetime(2013, 8, 1)

    # Pick the newest file by its parsed timestamp.
    last_date, last_file = max(hourly_files)
    print('Last mirrored file is %s, date %s' % (last_file, last_date))
    sys.stdout.flush()
    first_date_to_check = last_date - datetime.timedelta(days=30)
    print('Check for updates starting at date %s' % first_date_to_check)
    sys.stdout.flush()
    return first_date_to_check

In [14]:
def mirror():
    """Mirror the site-locations file and every hourly file that is due.

    The starting hour comes from ``compute_first_date_to_check()`` and the
    final hour is the current UTC time.  The site-locations file is taken
    from the directory for the current UTC date.
    """
    first_hour = compute_first_date_to_check()
    utc_now = datetime.datetime.utcnow()

    locations_src = directory_from_date(utc_now) + '/monitoring_site_locations.dat'
    mirror_airnow_file(locations_src, 'AirNow/monitoring_site_locations.dat')

    # Build the full list first so the count can be reported before any
    # download starts.
    hourly_rule = rrule.rrule(rrule.HOURLY, dtstart=first_hour, until=utc_now)
    pending_hours = list(hourly_rule)
    print('Mirroring %s timestamps, starting %s' % (len(pending_hours), first_hour))

    for hour in pending_hours:
        mirror_timestamp(hour)
        
# Run the mirror when this cell executes; progress is printed below.
mirror()

Last mirrored file is AirNow/2016123120.dat, date 2016-12-31 20:00:00
Check for updates starting at date 2016-12-01 20:00:00
Source https://files.airnowtech.org/airnow/2017/20170101/monitoring_site_locations.dat does not exist, skipping
Mirroring 725 timestamps, starting 2016-12-01 20:00:00
Wrote 586603 bytes to AirNow/2016120120.dat
Wrote 588963 bytes to AirNow/2016120121.dat
Wrote 590319 bytes to AirNow/2016120122.dat
Wrote 577169 bytes to AirNow/2016120123.dat
Wrote 596145 bytes to AirNow/2016120200.dat
Wrote 598680 bytes to AirNow/2016120201.dat
Wrote 598637 bytes to AirNow/2016120202.dat
Wrote 595939 bytes to AirNow/2016120203.dat
Wrote 590879 bytes to AirNow/2016120204.dat
Wrote 589419 bytes to AirNow/2016120205.dat
Wrote 591502 bytes to AirNow/2016120206.dat
Wrote 583998 bytes to AirNow/2016120207.dat
Wrote 591462 bytes to AirNow/2016120208.dat
Wrote 592367 bytes to AirNow/2016120209.dat
Wrote 595197 bytes to AirNow/2016120210.dat
Wrote 587900 bytes to AirNow/2016120211.dat
Wrot