# Download PurpleAir VOC

Downloads VOC data from public PurpleAir devices within a specific lat/long bounding box (defined in `BOUNDING_BOX`, below).

Data is downloaded from PurpleAir every 2.5 minutes. Each download is saved as a JSON file named `HHMMSSutc.json` (the UTC time of the request) in a `YYYYMMDD` subdirectory of `/esdr-connectors/mirror-purpleair-to-esdr/mirror-voc`.

In [None]:
import os, re, json, datetime, requests, html, time

In [None]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb

def exec_ipynb(filename_or_url):
    """Execute every code cell of a Jupyter notebook in this module's globals.

    Args:
        filename_or_url: Path of a local notebook file, or an http(s) URL from
            which the notebook JSON is fetched.

    The code cells are concatenated (in notebook order), written to a uniquely
    named file under /tmp, compiled with that file name, and executed with
    ``globals()`` as the namespace, so every name the notebook defines becomes
    a global of this module.
    """
    # NOTE(review): this exec()s whatever the notebook contains. Only pass
    # paths/URLs whose content is trusted.
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Notebook files are UTF-8 JSON; close the handle as soon as it is parsed.
        with open(filename_or_url, encoding='utf-8') as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        # Older notebook formats keep cells inside worksheets and call the source 'input'.
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # The timestamp and pid keep concurrent/repeated loads from clobbering each other's file.
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    with open(tmpname, 'w', encoding='utf-8') as src_file:
        src_file.write(src)
    # Compiling with the temp file's name associates the executed code with that file.
    code = compile(src, tmpname, 'exec')
    exec(code, globals())

# Run the code cells of the shared notebooks in this module's globals, so the names
# they define become available here.
# NOTE(review): Stat, sleep_until_next_period and PurpleAirCommon are used later but
# never defined in this notebook; presumably they come from these two notebooks — confirm.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./purpleair-common.ipynb')

In [None]:
# Identifiers for status reporting: the service name is registered once here, while
# the hostname and shortname are passed as host=/shortname= on each Stat call in this file.
STAT_SERVICE_NAME = 'PurpleAir Download VOC'
STAT_HOSTNAME = 'hal21'
STAT_SHORTNAME = 'purpleair-download-voc'

# NOTE(review): Stat is not defined in this notebook; presumably loaded via exec_ipynb — confirm.
Stat.set_service(STAT_SERVICE_NAME)

In [None]:
# Period passed to sleep_until_next_period() between download cycles; also used
# (times 1.5) as the validity window of each Stat.up/Stat.down report.
RUN_INTERVAL_SECONDS = 60 * 2.5    # every 2.5 minutes

# Seconds fetch_json() sleeps before retrying, unless a rate-limit response
# supplies its own wait time.
DEFAULT_HTTP_RETRY_SECONDS = 20

# Corners of the lat/long box sent with the request: 'nw' is the north-west
# corner, 'se' the south-east corner.
BOUNDING_BOX = {
    'nw': {
        'lat': 45.41701469532239,
        'lon': -84.8975200699811
        },
    'se': {
        'lat': 34.884595781069194,
        'lon': -77.4944799300174
        }
    }

# Request URL: the PurpleAir root URL plus a query string carrying the bounding box
# corners and fields=voc. PurpleAirCommon is not defined in this notebook.
URL = PurpleAirCommon.ROOT_URL + f"?opt=1/e/mVOC/a10/cC0&fetch=true&nwlat={BOUNDING_BOX['nw']['lat']}&selat={BOUNDING_BOX['se']['lat']}&nwlng={BOUNDING_BOX['nw']['lon']}&selng={BOUNDING_BOX['se']['lon']}&fields=voc"

In [None]:
# Repeatedly attempts to fetch the JSON from PurpleAir. Upon success, returns a tuple of
# the JSON, epoch timestamp of the request, and a suggested filename path (constructed from
# date and time of the request). Upon failure, it waits for some amount of time and then retries.
def fetch_json(url):
    """Fetch and parse JSON from `url`, retrying until it succeeds.

    Args:
        url: The URL to GET.

    Returns:
        A ``(json, timestamp, dest_path)`` tuple: the parsed response body, the
        epoch time in seconds at which the request was issued, and a file path
        under ``PurpleAirCommon.VOC_DATA_DIRECTORY`` built from the UTC date and
        time of the request (``/YYYYMMDD/HHMMSSutc.json``).

    Network errors, non-200 responses and unparseable bodies are reported with
    ``Stat.warning`` and retried after a delay; the function only returns once a
    200 response has been parsed.
    """
    while True:
        sleep_time_secs = DEFAULT_HTTP_RETRY_SECONDS

        # A single clock reading feeds both the filename and the epoch timestamp,
        # so the two can never disagree.
        request_time = datetime.datetime.now(datetime.timezone.utc)
        dest_path = request_time.strftime(PurpleAirCommon.VOC_DATA_DIRECTORY + '/%Y%m%d/%H%M%Sutc.json')
        timestamp = request_time.timestamp()

        try:
            response = requests.get(url, timeout=120)
        except requests.exceptions.RequestException as ex:
            # Connection failures and timeouts raise rather than return a response;
            # treat them like any other failed attempt instead of ending the retry loop.
            Stat.warning(f"Exception {ex} fetching {url}", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            time.sleep(sleep_time_secs)
            continue

        if response.status_code == 200:
            try:
                return response.json(), timestamp, dest_path
            except ValueError as ex:
                Stat.warning(f"Exception {ex} parsing result from {url}", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        elif response.status_code == 429:
            # The body may not be JSON, or may lack a 'message'; neither should abort the loop.
            try:
                message = response.json()['message']
            except (ValueError, KeyError, TypeError):
                message = None

            if isinstance(message, str):
                # use regex to try to pick out the suggested wait time
                search_result = re.search(r'^Rate limit exceeded. Try again in (\d+) milli seconds.$', message)
                if search_result:
                    sleep_time_secs = int(search_result.group(1)) / 1000
                Stat.warning(f"Rate limit exceeded, sleeping for {sleep_time_secs} seconds.", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            else:
                Stat.warning(f"Rate limit exceeded, but could not determine suggested wait time.  Sleeping for {sleep_time_secs} seconds.", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

        else:
            Stat.warning('Response code %d from %s: %s' % (response.status_code, url, html.escape(response.text)), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

        time.sleep(sleep_time_secs)

In [None]:
def extract_voc_records(data):
    """Pick out the monitors that report a VOC value.

    Args:
        data: Parsed response containing a 'fields' list of column names and a
            'data' list of rows, each row holding values in the same order as
            'fields'.

    Returns:
        A dict keyed by each row's 'ID' value, mapping to
        ``{'lat': ..., 'lon': ..., 'voc': ...}``. Rows whose 'Voc' value is None
        are omitted. Rows that cannot be read (missing 'Voc'/'ID'/'Lat'/'Lon',
        or not a sequence) are skipped with a Stat warning.
    """
    fields = data['fields']
    voc_records = {}
    for r in data['data']:
        try:
            # zip() stops at the shorter of the two, so a short row simply lacks
            # its trailing columns and fails the key lookups below.
            record = dict(zip(fields, r))
            if record['Voc'] is not None:
                voc_records[record['ID']] = {
                    'lat': record['Lat'],
                    'lon': record['Lon'],
                    'voc': record['Voc']
                }
        except (KeyError, TypeError):
            Stat.warning(f"Skipping record with unexpected number of fields: {r}", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    return voc_records

In [None]:
def write_json_file(data, filename):
    """Serialize `data` as JSON to `filename` without exposing a partial file.

    The JSON is first written to a sibling temp file (``<filename>.tmp<pid>``)
    and then moved over `filename`, so readers see either the previous file or
    the complete new one.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path. Missing parent directories are created.

    Raises:
        TypeError / ValueError: if `data` cannot be serialized.
        OSError: if the file cannot be written or moved into place.
    """
    tmp = filename + '.tmp' + str(os.getpid())

    # A bare filename has no directory component, and os.makedirs('') would raise.
    directory = os.path.dirname(tmp)
    if directory:
        os.makedirs(directory, exist_ok=True)

    try:
        with open(tmp, 'w', encoding='utf8') as json_file:
            json.dump(data, json_file, sort_keys=True, ensure_ascii=False)
        # os.replace overwrites an existing destination on every platform.
        os.replace(tmp, filename)
    except BaseException:
        # Don't leave a half-written temp file behind; the original error still propagates.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise

In [None]:
def mirror_once():
    """Run one download cycle: fetch, keep only VOC monitors, save, report status.

    Reports Stat.up with the monitor count, destination path and elapsed time
    when at least one VOC record was saved; reports Stat.down otherwise.
    """
    started_at = datetime.datetime.now().timestamp()

    # fetch_json() blocks (retrying) until it has a parsed response
    payload, request_timestamp, dest_path = fetch_json(URL)

    # reduce the payload to ID -> lat/lon/VOC for monitors that report a VOC value
    monitors = extract_voc_records(payload)

    # nothing to save: report it and stop
    if not monitors:
        Stat.down("No VOC data found", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME, valid_for_secs=RUN_INTERVAL_SECONDS * 1.5)
        return

    # store the request time alongside the monitors, then write to disk
    write_json_file({'epoch_time': request_timestamp, 'monitors': monitors}, dest_path)

    elapsed_seconds = datetime.datetime.now().timestamp() - started_at
    Stat.up(
        f"Downloaded VOC data for {len(monitors)} monitors, saved to {dest_path}",
        details='Elapsed time: %.1f s' % (elapsed_seconds),
        host=STAT_HOSTNAME,
        shortname=STAT_SHORTNAME,
        valid_for_secs=RUN_INTERVAL_SECONDS * 1.5,
    )

In [None]:
def mirror_forever():
    """Run mirror_once() in an endless loop, pausing between cycles.

    The loop has no exit condition and no exception handling, so it ends only
    if mirror_once() or the sleep helper raises.
    """
    while True:
        mirror_once()
        # NOTE(review): sleep_until_next_period is not defined in this notebook;
        # presumably it waits for the next RUN_INTERVAL_SECONDS boundary — confirm.
        sleep_until_next_period(RUN_INTERVAL_SECONDS)

In [None]:
# Start mirroring. mirror_forever() loops unconditionally, so this call is not
# expected to return.
mirror_forever()