Mirroring AirNow data
=====================



In [3]:
import datetime, dateutil.parser, email, ftplib, glob, json, os, re, time, requests, socket, subprocess, sys, traceback
from dateutil import rrule, tz

In [26]:
def datetime2epoch(dt):
    """Convert a datetime to seconds since the Unix epoch (1970-01-01 00:00:00 UTC).

    Args:
        dt: A ``datetime.datetime``.  Timezone-aware values are converted
            using their own UTC offset.  Naive values (no usable tzinfo) are
            interpreted as UTC instead of raising ``TypeError``.

    Returns:
        float: Seconds elapsed since the epoch; negative for earlier dates.
    """
    utc = datetime.timezone.utc
    # Subtracting an aware epoch from a naive datetime raises TypeError, so
    # pin naive inputs to UTC before doing the arithmetic.
    if dt.tzinfo is None or dt.utcoffset() is None:
        dt = dt.replace(tzinfo=utc)
    epoch = datetime.datetime(1970, 1, 1, tzinfo=utc)
    return (dt - epoch).total_seconds()

def directory_from_date(dt):
    """Build the relative ``YYYY/YYYYMMDD`` directory name for the date ``dt``.

    ``dt`` only needs to provide ``strftime``; the result is a string such as
    ``'2016/20161231'``.
    """
    # Year folder first, then the full date folder nested inside it.
    layout = '%Y/%Y%m%d'
    directory = dt.strftime(layout)
    return directory

def mirror_file_using_modtime(src_url, dest, timeout=60):
    """Download ``src_url`` to ``dest`` unless the local copy is already current.

    If ``dest`` exists, its modification time is sent as ``If-Modified-Since``
    so the server can answer without a body when nothing changed.  On a 200
    response the body is written to a temporary file next to ``dest``, moved
    into place, and the file's mtime is set to the server's ``Last-Modified``
    time so the next conditional request compares against the server's clock.
    Any other status code is reported and the local file is left untouched.

    Args:
        src_url: URL to fetch.
        dest: Local path to write.  Missing parent directories are created.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the mirror indefinitely.

    Returns:
        None.  Progress is reported with ``print``.

    Raises:
        requests.exceptions.RequestException: On connection errors/timeouts.
        OSError: If the destination cannot be written.
    """
    # Imported here because the notebook imports ``email`` but not the
    # ``email.utils`` submodule explicitly.
    from email.utils import formatdate

    headers = {}
    # If destination already exists, mirror only if newer
    try:
        local_mtime = os.stat(dest).st_mtime
    except OSError:
        # No readable local copy: fall through to an unconditional download.
        local_mtime = None
    if local_mtime is not None:
        # formatdate always emits English day/month names, unlike
        # time.strftime, whose output depends on the process locale.
        headers['If-Modified-Since'] = formatdate(local_mtime, usegmt=True)

    response = requests.get(src_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print('Received status code %d while fetching %s.  Skipping' % (response.status_code, src_url))
        return

    data = response.content

    # Last-Modified is optional; when it is absent or unparseable the file is
    # still saved and simply keeps the mtime of the local write.
    server_modtime_epoch = None
    last_modified = response.headers.get('Last-Modified')
    if last_modified:
        try:
            server_modtime = dateutil.parser.parse(last_modified)
        except (ValueError, OverflowError):
            server_modtime = None
        if server_modtime is not None:
            if server_modtime.tzinfo is None:
                # HTTP dates are expressed in GMT; make that explicit when the
                # parser returned a naive value.
                server_modtime = server_modtime.replace(tzinfo=tz.tzutc())
            server_modtime_epoch = datetime2epoch(server_modtime)

    tmp = dest + '.tmp' + str(os.getpid())
    dest_dir = os.path.dirname(tmp)
    if dest_dir:
        # os.makedirs('') raises, so only create a directory when dest has one.
        os.makedirs(dest_dir, exist_ok=True)
    try:
        with open(tmp, 'wb') as out:
            out.write(data)
        # os.replace overwrites an existing destination on every platform.
        os.replace(tmp, dest)
    finally:
        # After a successful replace the temporary name is gone; it only
        # still exists here if the write or the replace failed.
        if os.path.exists(tmp):
            try:
                os.remove(tmp)
            except OSError:
                pass  # best-effort cleanup; the original error still propagates
    if server_modtime_epoch is not None:
        os.utime(dest, (server_modtime_epoch, server_modtime_epoch))
    print('Wrote %d bytes to %s' % (len(data), dest))

        
def mirror_airnow_file(src, dest):
    """Mirror one file from the AirNow file server.

    ``src`` is the path relative to the server's ``airnow/`` root and ``dest``
    is the local path handed on to ``mirror_file_using_modtime``.
    """
    airnow_url = 'https://files.airnowtech.org/airnow/' + src
    mirror_file_using_modtime(airnow_url, dest)
    
#mirror_airnow_file('2016/20161231/monitoring_site_locations.dat', 'AirNowTest/monitoring_site_locations.dat')

Received status code 304 while fetching https://files.airnowtech.org/airnow/2016/20161231/monitoring_site_locations.dat.  Skipping


In [27]:

def mirror_timestamp(timestamp):
    """Mirror the hourly data file that corresponds to ``timestamp``.

    The remote file is ``<YYYY>/<YYYYMMDD>/HourlyData_<YYYYMMDDHH>.dat`` and is
    stored locally as ``AirNow/<YYYYMMDDHH>.dat``.
    """
    hourly_name = timestamp.strftime('%Y%m%d%H.dat')
    mirror_airnow_file(
        directory_from_date(timestamp) + '/HourlyData_' + hourly_name,
        'AirNow/' + hourly_name,
    )

#mirror_timestamp(dateutil.parser.parse('2016-12-31 00:00'))

In [28]:
def compute_first_date_to_check(mirror_dir='AirNow', lookback_days=30):
    """Decide the earliest hour that the next mirroring pass should re-check.

    Looks for already-mirrored hourly files named ``YYYYMMDDHH.dat`` inside
    ``mirror_dir``.  Files whose names start with a digit but are not a valid
    ten-digit hour stamp are ignored rather than aborting with ``ValueError``.

    Args:
        mirror_dir: Directory holding the mirrored hourly files.
        lookback_days: How many days before the newest mirrored file to start
            re-checking from.

    Returns:
        datetime.datetime: Naive datetime.  ``2013-08-01 00:00`` when no
        hourly file has been mirrored yet, otherwise the date of the newest
        mirrored file minus ``lookback_days`` days.
    """
    dated_files = []
    for path in glob.glob(os.path.join(mirror_dir, '[0-9]*.dat')):
        # Parse only the basename so the result does not depend on the path
        # separator or on the directory name itself.
        name = os.path.basename(path)
        # Require exactly ten digits: strptime alone accepts shorter digit
        # runs and would silently mis-read them as a different date.
        if not re.fullmatch(r'\d{10}\.dat', name):
            continue
        try:
            file_date = datetime.datetime.strptime(name, '%Y%m%d%H.dat')
        except ValueError:
            # Ten digits that do not form a real date/hour (e.g. month 13).
            continue
        dated_files.append((file_date, path))

    if not dated_files:
        return datetime.datetime(2013, 8, 1)

    # Newest by parsed date, not by lexicographic order of the path.
    last_date, last_file = max(dated_files)
    print('Last mirrored file is %s, date %s' % (last_file, last_date))
    sys.stdout.flush()
    first_date_to_check = last_date - datetime.timedelta(days=lookback_days)
    print('Check for updates starting at date %s' % first_date_to_check)
    sys.stdout.flush()
    return first_date_to_check

In [29]:
def mirror():
    """Run one mirroring pass over the AirNow hourly data.

    Fetches the current day's ``monitoring_site_locations.dat`` and then
    requests every hourly file from the start date chosen by
    ``compute_first_date_to_check`` up to the current UTC time.  Files that
    are unchanged on the server are skipped by the conditional download.
    """
    start = compute_first_date_to_check()
    # ``start`` is a naive datetime, so ``now`` must be naive too for rrule to
    # compare them.  datetime.utcnow() is deprecated since Python 3.12; take
    # an aware UTC "now" and drop the tzinfo to get the same naive value.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    mirror_airnow_file(directory_from_date(now) + '/monitoring_site_locations.dat', 'AirNow/monitoring_site_locations.dat')

    # Materialised as a list because the count is reported before mirroring.
    timestamps_to_mirror = list(rrule.rrule(rrule.HOURLY, dtstart=start, until=now))
    print('Mirroring %s timestamps, starting %s' % (len(timestamps_to_mirror), start))

    for timestamp in timestamps_to_mirror:
        mirror_timestamp(timestamp)
        
# Run a mirroring pass when this cell is executed.
mirror()

Last mirrored file is AirNow/2016123100.dat, date 2016-12-31 00:00:00
Check for updates starting at date 2016-12-01 00:00:00
Received status code 304 while fetching https://files.airnowtech.org/airnow/2019/20190724/monitoring_site_locations.dat.  Skipping
Mirroring 23181 timestamps, starting 2016-12-01 00:00:00
Received status code 304 while fetching https://files.airnowtech.org/airnow/2016/20161201/HourlyData_2016120100.dat.  Skipping
Received status code 304 while fetching https://files.airnowtech.org/airnow/2016/20161201/HourlyData_2016120101.dat.  Skipping
Received status code 304 while fetching https://files.airnowtech.org/airnow/2016/20161201/HourlyData_2016120102.dat.  Skipping
Received status code 304 while fetching https://files.airnowtech.org/airnow/2016/20161201/HourlyData_2016120103.dat.  Skipping
Received status code 304 while fetching https://files.airnowtech.org/airnow/2016/20161201/HourlyData_2016120104.dat.  Skipping
Received status code 304 while fetching https://file

KeyboardInterrupt: 

In [24]:
# Shell escape: long-format listing of the directory two levels above the kernel's working directory.
!ls -l ../..

total 36281644
drwxr-xr-x 2 rsargent users           4096 Jul 24 15:42 AirNow
drwx------ 2 root     root            4096 Jul 24 15:43 AirNowTest
drwx------ 2 root     root            4096 Jul 24 15:43 EPA-AirData
drwxrwxr-x 3 rsargent rsargent        4096 Jul 24 15:46 foo
-rw-r--r-- 1 rsargent users          70792 Jul 24 15:12 Mirror-Airnow.ipynb
-rw-r--r-- 1 rsargent users              0 Jul 24 15:40 Mirror-Airnow.ipynb.lock
-rw-r--r-- 1 rsargent users    37124618856 Jul 24 15:42 Mirror-Airnow.ipynb.log
-rw-r--r-- 1 rsargent users           5408 Jul 24 15:10 Mirror-EPA-AirData.ipynb
-rw-r--r-- 1 rsargent users              0 Jul 24 01:05 Mirror-EPA-AirData.ipynb.lock
-rw-r--r-- 1 rsargent users       27529897 Jul 24 01:05 Mirror-EPA-AirData.ipynb.log
drwx------ 2 root     root            4096 Jul 24 15:43 python-utils
-rw-r--r-- 1 rsargent users              0 Dec 21  2014 Untitled0.ipynb.lock
-rw-r--r-- 1 rsargent users           4759 Dec 21  2014 Untitled0.ipynb.log
-r