# Upload PurpleAir VOC

Processes PurpleAir VOC data files, aggregating by monitor, and uploads to ESDR.  Runs every 5 minutes.

In [None]:
import os, re, json, datetime, html, subprocess, sys, dateutil, datetime, glob, stat, time

from dateutil import rrule, tz, parser
from sqlitedict import SqliteDict
from collections import defaultdict
import pandas as pd

In [None]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb

def exec_ipynb(filename_or_url):
    """Execute all code cells of a Jupyter notebook in this module's globals.

    Args:
        filename_or_url: Local path of the notebook, or an http(s) URL. URLs
            are fetched with ``requests``, which must already be in scope.

    The code cells are concatenated (separated by blank lines), written to a
    uniquely named file under /tmp, compiled against that filename, and then
    executed with ``globals()`` as the namespace.
    """
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Use a context manager so the notebook file handle is closed promptly
        with open(filename_or_url) as nb_file:
            nb = json.load(nb_file)

    # Notebook format 4+ keeps cells at the top level; older formats nest them
    # inside the first worksheet and call the cell source 'input'.
    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    # Write (and flush/close) the generated source before compiling against its filename
    with open(tmpname, 'w') as tmp_file:
        tmp_file.write(src)
    code = compile(src, tmpname, 'exec')
    exec(code, globals())

# Execute the shared helper notebooks so their definitions land in this module's
# globals. Names used later in this file but not defined here (e.g. Stat, Esdr,
# PurpleAirCommon, sleep_until_next_period) presumably come from these
# notebooks -- TODO confirm which notebook provides which name.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./python-utils/esdr-library.ipynb')
exec_ipynb('./purpleair-common.ipynb')

In [None]:
# Identity used for status reporting: the service name is registered with Stat
# below and embedded in the ESDR client's user agent; the hostname and shortname
# are passed as host=/shortname= on every Stat.info/warning/up call in this file.
STAT_SERVICE_NAME = 'PurpleAir Upload VOC'
STAT_HOSTNAME = 'hal21'
STAT_SHORTNAME = 'purpleair-upload-voc'

Stat.set_service(STAT_SERVICE_NAME)

In [None]:
# Length of one run period: the main loop sleeps until the next period
# whenever it finds no new files to process.
RUN_INTERVAL_MINUTES = 5
RUN_INTERVAL_SECONDS = RUN_INTERVAL_MINUTES * 60

# Upper bound on the number of JSON data files consumed by a single batch
NUM_FILES_PER_BATCH = 500

# Number of main-loop iterations before the script exits (one day's worth of
# run periods), so that a restart picks up a fresh ESDR OAuth token.
NUM_ITERATIONS_BETWEEN_RESTARTS = (24 * 60) // RUN_INTERVAL_MINUTES

# SQLite file that persists upload progress between runs
SQLITE_PROGRESS_FILE = 'purpleair-upload-voc.sqlite'

In [None]:
# Persistent key/value store for upload progress; process_one_batch_of_files()
# reads and writes its 'lastUploaded' key. autocommit=True is requested so that
# assignments are persisted without an explicit commit (per sqlitedict's docs).
progress = SqliteDict(SQLITE_PROGRESS_FILE, autocommit=True)

In [None]:
# ESDR client built from the PurpleAir uploader auth file; the user agent
# string embeds this service's name.
esdr = Esdr('esdr-auth-purpleair-uploader.json', user_agent='esdr-library.py['+STAT_SERVICE_NAME+']')

In [None]:
# Per-batch store of readings keyed by ESDR ID. Replaced with a fresh nested
# defaultdict by accumulate_reset() and filled in by accumulate_json().
accumulator = None

In [None]:
# Cached ESDR product record for 'PurpleAir'; filled in lazily by get_purpleair_product()
purpleair_product = None

def get_purpleair_product():
    """Return the ESDR 'PurpleAir' product, fetching and caching it on first use.

    If the module-level ``esdr`` client is falsy it is re-created first. The
    product is requested from ESDR only while the cached ``purpleair_product``
    is falsy; later calls return the cached value.

    Returns:
        Whatever ``esdr.get_product_by_name('PurpleAir')`` returned, or None
        if the request raised ``requests.HTTPError`` (a warning is reported
        through Stat in that case).
    """
    global esdr, purpleair_product
    try:
        if not esdr:
            # Re-create the client with the same PurpleAir uploader credentials
            # used for the module-level client (the previous fallback pointed
            # at the AirNow uploader's auth file).
            esdr = Esdr('esdr-auth-purpleair-uploader.json', user_agent='esdr-library.py['+STAT_SERVICE_NAME+']')
        if not purpleair_product:
            purpleair_product = esdr.get_product_by_name('PurpleAir')
        return purpleair_product
    except requests.HTTPError as e:
        Stat.warning(f"Failed to get PurpleAir ESDR product due to error: {str(e)}", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        return None

#get_purpleair_product()

In [None]:
def path_top_dir(path):
    """Return the part of a '/'-separated path before its first '/'."""
    head, _sep, _rest = path.partition('/')
    return head


def path_sans_top_dir(path):
    """Return the part of a '/'-separated path after its first '/' ('' if none)."""
    _head, _sep, rest = path.partition('/')
    return rest


def get_newer_files(dir, newer_than, num_subdir_levels):
    """List files under dir whose relative path sorts after newer_than.

    Args:
        dir: Directory to scan.
        newer_than: Relative '/'-separated path acting as an exclusive lower
            bound (plain string comparison); a falsy value returns everything.
        num_subdir_levels: Number of directory levels between dir and the files.

    Returns:
        Relative paths ('subdir/.../file') in sorted order.
    """
    # Leaf level: entries are the files themselves
    if not num_subdir_levels:
        return [name for name in sorted(os.listdir(dir))
                if not newer_than or name > newer_than]

    bound_top = path_top_dir(newer_than) if newer_than else None
    found = []
    for subdir in sorted(os.listdir(dir)):
        if not newer_than or subdir > bound_top:
            # Entire subdirectory lies after the bound: take all of it
            inner_bound = None
        elif subdir == bound_top:
            # Same subdirectory as the bound: apply the rest of the bound inside
            inner_bound = path_sans_top_dir(newer_than)
        else:
            # Subdirectory sorts before the bound: nothing new in there
            continue
        inner = get_newer_files('/'.join((dir, subdir)), inner_bound,
                                num_subdir_levels - 1)
        found.extend('/'.join((subdir, name)) for name in inner)
    return found

In [None]:
def compute_lat_lon(record):
    """Return the record's 'lat' and 'lon' values as a (float, float) tuple."""
    latitude = float(record['lat'])
    longitude = float(record['lon'])
    return (latitude, longitude)

def compute_esdr_id(monitor_id, record):
    """Build the ESDR ID string for a monitor from its ID and coordinates.

    The result is the monitor ID, an underscore, then latitude and longitude
    each as a zero-padded 6-digit count of thousandths of a degree followed by
    a hemisphere letter (N/S, E/W). Any '.' in the result becomes '_'.
    """
    lat, lon = compute_lat_lon(record)

    lat_hemisphere = 'S' if lat < 0 else 'N'
    lon_hemisphere = 'W' if lon < 0 else 'E'
    esdr_id = "{}_{:06d}{}{:06d}{}".format(
        monitor_id,
        round(1000 * abs(lat)), lat_hemisphere,
        round(1000 * abs(lon)), lon_hemisphere,
    )
    return esdr_id.replace('.', '_')

#compute_esdr_id(50087, {"lat" : 40.10513, "lon" : -80.713005, "voc" : 92.44})

In [None]:
def accumulate_reset():
    """Replace the global accumulator with a fresh, empty one.

    The new accumulator is a defaultdict whose missing keys yield
    {'records': defaultdict(dict)}.
    """
    global accumulator

    def _new_entry():
        # One entry per ESDR ID; its 'records' map auto-creates empty dicts
        return {'records': defaultdict(dict)}

    accumulator = defaultdict(_new_entry)

In [None]:
def accumulate_json(path):
    """Merge one JSON data file into the global accumulator.

    The file must contain 'epoch_time' and a 'monitors' mapping of monitor ID
    to a record with 'lat', 'lon' and 'voc'. For each monitor, the reading is
    stored under its ESDR ID keyed by the file's epoch time, and the entry's
    lat/lon are overwritten with this file's values.
    """
    with open(path) as json_file:
        snapshot = json.load(json_file)

    epoch = snapshot['epoch_time']
    for monitor_id, reading in snapshot['monitors'].items():
        entry = accumulator[compute_esdr_id(monitor_id, reading)]
        entry['records'][epoch] = {'time': epoch, 'voc': float(reading['voc'])}
        entry['lat'], entry['lon'] = compute_lat_lon(reading)

    Stat.info(f"After merge of {path}, total of {len(accumulator)} ESDR IDs", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

In [None]:
def load_esdr_feed_by_serial_number(serial_number):
    """Load metadata for the PurpleAir feed named '<serial_number> PurpleAir'.

    For example, load_esdr_feed_by_serial_number('94497_040689N080292W') is
    expected to return something like
    {'id': 60360, 'name': '94497_040689N080292W PurpleAir'}.

    Args:
        serial_number: Feed-name prefix, e.g. '94497_040689N080292W'.

    Returns:
        The result of esdr.query_first() for the matching feed (requested
        fields: id, name, deviceId), or None if the lookup raised; failures
        are reported through Stat.warning rather than propagated.
    """
    try:
        # On the off chance there are multiple feeds with the same name, we want the most recently
        # created one, so we order by id in descending order. The name and device ID are requested
        # as well because esdr.upload_dicts() uses them in its print statement after uploading.
        feed_name = serial_number + ' PurpleAir'
        return esdr.query_first('/api/v1/feeds', {
            'whereAnd': 'productId=%d,name=%s' % (purpleair_product['id'], feed_name),
            'fields': 'id,name,deviceId',
            'limit': 1,
            'orderBy': '-id',
        })
    except requests.HTTPError as e:
        # Report the serial number that was looked up (the old message interpolated the builtin `id`)
        Stat.warning(f"Failed to look up feed for serial number {serial_number} (HTTP {e.response.status_code})", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except Exception as ex:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate
        Stat.warning(f"Failed to look up feed for serial number {serial_number}: {ex}", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    return None

# get_purpleair_product()
# print(load_esdr_feed_by_serial_number('119450_038680N121082W')) # {'id': 87520, 'name': '119450_038680N121082W PurpleAir'}
# print(load_esdr_feed_by_serial_number('94497_040689N080292W')) # {'id': 60360, 'name': '94497_040689N080292W PurpleAir'}
# print(load_esdr_feed_by_serial_number('bogus')) # None

In [None]:
def sorted_dict(dict):
    """Return a new plain dict with the same items, inserted in sorted key order."""
    ordered = {}
    for key in sorted(dict.keys()):
        ordered[key] = dict[key]
    return ordered


def upload(id):
    """Upload the accumulated records for one ESDR ID to its ESDR feed.

    Args:
        id: ESDR ID (key into the global accumulator), which is also the
            serial-number prefix of the feed name.

    Returns:
        True if the records were uploaded; False if the entry has no lat/lon,
        no matching feed was found, or an error occurred (each case is
        reported through Stat.warning).
    """
    try:
        entry = accumulator[id]

        if not ('lat' in entry and 'lon' in entry):
            Stat.warning(f"Skipping upload for ID {id} because it has no lat/lon", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            return False

        # The feed is found by name rather than by location: the lat/lon reported by the API
        # this notebook uses can't be relied on to match the lat/lon reported by the API the
        # main PurpleAir data mirror uses. Since a feed name is the serial number (e.g.
        # "94497_040689N080292W") plus " PurpleAir", in MOST cases a query for a feed of the
        # PurpleAir product with a name like "94497_040689N080292W PurpleAir" will find it.
        feed = load_esdr_feed_by_serial_number(id)
        if feed is None:
            Stat.warning(f"Skipping ID {id} because no matching ESDR feed could be found", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
            return False

        # Records are sent in ascending key (timestamp) order
        records_in_order = list(sorted_dict(entry['records']).values())

        print(f"Uploading to feed {feed['id']} which is {feed['name']}")
        esdr.upload_dicts(feed, records_in_order)
        return True
    except requests.HTTPError as e:
        Stat.warning(f"Failed to upload to feed corresponding to ID {id} (HTTP {e.response.status_code})", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except Exception as ex:
        Stat.warning(f"Failed to upload to feed corresponding to ID {id}: " + str(ex), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    return False

In [None]:
def process_one_batch_of_files():
    """Accumulate and upload the next batch of not-yet-processed data files.

    Takes up to NUM_FILES_PER_BATCH '.json' files that sort after the stored
    'lastUploaded' progress marker, merges them into a fresh accumulator,
    uploads each accumulated ESDR ID, and then advances the marker to the last
    file of the batch (whether or not every upload succeeded).

    Returns:
        The number of files in the batch, or 0 if there was nothing to do.
    """
    checkpoint = progress.get('lastUploaded', None)
    candidates = get_newer_files(PurpleAirCommon.VOC_DATA_DIRECTORY, checkpoint, num_subdir_levels=1)

    # Keep only JSON files, and no more than NUM_FILES_PER_BATCH of them
    is_json = re.compile(r'\.json$').search
    batch = [name for name in candidates if is_json(name)][:NUM_FILES_PER_BATCH]

    if not batch:
        return 0

    accumulate_reset()

    Stat.up(f"Processing {len(batch)} data files", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME, valid_for_secs=RUN_INTERVAL_SECONDS * 1.5)
    for name in batch:
        accumulate_json(PurpleAirCommon.VOC_DATA_DIRECTORY + '/' + name)

    uploaded = 0
    for esdr_id in sorted(accumulator.keys()):
        uploaded += 1 if upload(esdr_id) else 0
        # Pause between uploads to go easy on ESDR
        time.sleep(0.5)

    Stat.up(f"Successfully uploaded {uploaded} of {len(accumulator)} devices, with data from {len(batch)} files [{batch[0]} ... {batch[-1]}]", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME, valid_for_secs=RUN_INTERVAL_SECONDS * 1.5)
    progress['lastUploaded'] = batch[-1]
    return len(batch)


In [None]:
# Fetch (and cache) the PurpleAir product record before any feed lookups
get_purpleair_product()

# Run a fixed number of iterations and then fall off the end of the script, relying on the
# cron job to start it again. Restarting is a simple way to get a fresh ESDR OAuth2 token
# without adding code that catches HTTP 401/403 errors on upload and refreshes the token.
for _iteration in range(NUM_ITERATIONS_BETWEEN_RESTARTS):
    # Only wait when the backlog is empty; otherwise go straight on to the next batch
    if not process_one_batch_of_files():
        Stat.info("No files remaining to process, sleeping until next run period", host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        sleep_until_next_period(RUN_INTERVAL_SECONDS)