### Directories

- mirror:  Old mirrored PDF files
- mirror-csv:  New (2017) mirrored CSV files

In [None]:
import glob, os, subprocess, tempfile, time, re, json, datetime

In [None]:
# URL of the ACHD daily-summary PDF. NOTE(review): not referenced by any code
# visible in this notebook -- presumably kept from the older PDF mirror.
source_url = "http://www.achd.net/airqual/DailySummary.PDF"
# Directory that receives the timestamped CSV snapshots.
dest_dir = "mirror-csv"
# Scratch directory where each download lands before it is moved into dest_dir.
tmp_dir = "tmp"

# Show the absolute mirror location so the run log is unambiguous about
# which working directory the notebook was started from.
print("Mirror directory is %s" % os.path.abspath(dest_dir))

In [None]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb

def exec_ipynb(filename_or_url):
    """Execute all code cells of a Jupyter notebook in this module's globals.

    The notebook is loaded from a local path, or fetched over HTTP(S) when
    the argument starts with ``http:`` / ``https:``. The code cells are
    concatenated, written to a uniquely named ``.py`` file under ``/tmp``
    (which is the filename handed to ``compile``), and then executed with
    ``exec`` against ``globals()``, so every name the notebook defines
    becomes available here.

    Args:
        filename_or_url: Path or http(s) URL of the ``.ipynb`` file.

    Raises:
        OSError: If the notebook cannot be read or the ``.py`` copy cannot
            be written (``urllib.error.URLError`` is an ``OSError``).
        ValueError: If the notebook is not valid JSON.
        KeyError: If the JSON lacks the expected notebook structure.

    NOTE(review): this runs arbitrary code from the notebook; only point it
    at sources you trust.
    """
    if re.match(r'https?:', filename_or_url):
        # Fetch with the standard library: `requests` is not imported by
        # this file, so relying on it raised NameError for URLs.
        import urllib.request
        with urllib.request.urlopen(filename_or_url) as response:
            nb = json.load(response)
    else:
        # Read bytes and let json detect the Unicode encoding, instead of
        # depending on the locale's default text encoding.
        with open(filename_or_url, 'rb') as nb_file:
            nb = json.load(nb_file)

    # nbformat >= 4 keeps cells at the top level under 'source'; older
    # notebooks nest them in the first worksheet under 'input'.
    if nb['nbformat'] >= 4:
        cells = nb['cells']
        source_key = 'source'
    else:
        cells = nb['worksheets'][0]['cells']
        source_key = 'input'
    src = '\n\n\n'.join(''.join(cell[source_key])
                        for cell in cells if cell['cell_type'] == 'code')

    # Timestamp + pid keep the filename unique across runs and processes.
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    # Python source files default to UTF-8, so write the copy as UTF-8.
    with open(tmpname, 'w', encoding='utf-8') as src_file:
        src_file.write(src)
    code = compile(src, tmpname, 'exec')
    exec(code, globals())

# Pull the helpers defined in the local python-utils checkout into this
# notebook's namespace. NOTE(review): subprocess_check, used further down,
# is not defined in this file -- presumably it is provided by utils.ipynb.
exec_ipynb('./python-utils/utils.ipynb')

In [None]:
def now():
    """Return the current local time as ``YYYY-MM-DD-HH:MM:SS`` plus ``%z``.

    The ``%z`` directive appends the UTC offset as reported by the platform.
    """
    timestamp_format = '%Y-%m-%d-%H:%M:%S%z'
    return time.strftime(timestamp_format, time.localtime())

def find_most_recent_path(dest_dir):
    """Return the lexicographically last mirrored CSV in *dest_dir*.

    Only files named ``AirQualityDataSummary-<timestamp>*.csv`` are
    considered, where the timestamp has the ``????-??-??-??:??:??`` shape.

    Args:
        dest_dir: Directory holding the mirrored snapshots.

    Returns:
        The matching path (prefixed with ``dest_dir + "/"``) that sorts
        last as a string, or ``None`` when nothing matches.
    """
    pattern = dest_dir + "/AirQualityDataSummary-????-??-??-??:??:??*.csv"
    # The names embed a fixed-width timestamp, so the greatest string is the
    # one with the latest timestamp.
    return max(glob.glob(pattern), default=None)

def mirror_achd_sftp(dest_dir, tmp_dir):
    """Download the ACHD daily-summary CSV over SFTP and archive it if new.

    The file is fetched into a fresh temporary file inside *tmp_dir*. If its
    bytes match the most recent snapshot in *dest_dir*, the download is
    discarded. Otherwise it is made world-readable (mode 0644) and renamed
    to ``<dest_dir>/AirQualityDataSummary-<timestamp>.csv``.

    Args:
        dest_dir: Directory holding the timestamped snapshots; created if
            missing.
        tmp_dir: Scratch directory for the download; created if missing.
            The download is moved with ``os.rename``, so it needs to be on
            the same filesystem as *dest_dir*.

    Raises:
        OSError: If a directory cannot be created or a file cannot be read,
            removed or renamed.
        Whatever ``subprocess_check`` raises when the transfer fails; the
        temporary file is removed before re-raising.
    """
    # exist_ok makes reruns harmless while still surfacing real problems
    # (permissions, a regular file in the way) instead of swallowing every
    # OSError.
    os.makedirs(dest_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Only the unique name is needed; close the handle right away so the
    # descriptor is not leaked while sftp writes to the path.
    with tempfile.NamedTemporaryFile(dir=tmp_dir, delete=False) as tmp_file:
        tmp_filename = tmp_file.name

    # Raw string: the backslashes before the spaces are part of the remote
    # path as sent to sftp, not Python escape sequences.
    src = r'pdille@moveit.alleghenycounty.us:/WPRDC/Health\ Department/Air\ Quality\ Daily\ Summary.CSV'
    # sshpass reads the password from a file in the current working directory.
    cmd = ['/projects/sshpass/sshpass',
           '-f%s/achd-ftp-passwd-do-not-check-in.txt' % os.getcwd(),
           'sftp',
           src,
           tmp_filename]
    print(' '.join(cmd))

    try:
        # subprocess_check is defined outside this block -- presumably it
        # raises when the command fails (TODO confirm).
        subprocess_check(cmd)
    except Exception:
        # Do not leave an orphaned temp file behind after a failed transfer.
        if os.path.exists(tmp_filename):
            os.unlink(tmp_filename)
        raise

    # Read as bytes: the messages report byte counts, and the comparison
    # must not depend on the locale's text decoding.
    with open(tmp_filename, 'rb') as fetched:
        data = fetched.read()
    print('%s: Fetched %d bytes from %s to %s\n' % (now(), len(data), src, tmp_filename))

    most_recent_path = find_most_recent_path(dest_dir)
    previous = None
    if most_recent_path:
        with open(most_recent_path, 'rb') as previous_file:
            previous = previous_file.read()

    if most_recent_path and previous == data:
        print("%s: Not recording %d bytes read from %s because identical to previous file %s\n" % (now(), len(data), src, most_recent_path))
        os.unlink(tmp_filename)
    else:
        dest = "%s/AirQualityDataSummary-%s.csv" % (dest_dir, now())
        # NamedTemporaryFile creates the file owner-only; open it up for
        # readers of the mirror before publishing it.
        os.chmod(tmp_filename, 0o0644)
        os.rename(tmp_filename, dest)
        print("%s: Stored %d bytes read from %s at path %s\n" % (now(), len(data), src, dest))


In [None]:
# Run one mirror pass with the dest_dir / tmp_dir values configured above.
mirror_achd_sftp(dest_dir, tmp_dir)