In [None]:
import bz2, html, json, os, re, requests, subprocess, sys, dateutil, datetime, glob, stat

from dateutil import rrule, tz, parser
from sqlitedict import SqliteDict
from collections import defaultdict
import pandas as pd

# if not os.path.exists('python-utils'):
#     subprocess.check_output('git clone https://github.com/CMU-CREATE-Lab/python-utils.git', shell=True)

# def exec_ipynb(filename_or_url):
#     nb = (requests.get(filename_or_url).json() if re.match(r'https?:', filename_or_url) else json.load(open(filename_or_url)))
#     if(nb['nbformat'] >= 4):
#         src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
#     else:
#         src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']
#     exec('\n'.join(src), globals())

# os.chdir('/t/esdr-connectors/mirror-purpleair-to-esdr/')
# notebook_wide_display()

In [None]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb


def exec_ipynb(filename_or_url):
    """Execute all code cells of a Jupyter notebook in this module's global namespace.

    Args:
        filename_or_url: Path to a local .ipynb file, or an http(s) URL that is
            fetched with ``requests``.

    The source of every code cell is joined (three newlines between cells), written
    to a uniquely named .py file under /tmp, compiled with that file name as the
    ``filename`` argument of ``compile()``, and executed with ``globals()`` so any
    names the notebook defines become available to the caller's notebook.
    """
    if re.match(r'https?:', filename_or_url):
        nb = requests.get(filename_or_url).json()
    else:
        # Use a context manager so the notebook file handle is closed promptly
        # rather than leaked until garbage collection.
        with open(filename_or_url) as nb_file:
            nb = json.load(nb_file)

    # nbformat >= 4 keeps cells at the top level under 'source'; older formats
    # nest them in the first worksheet under 'input'.
    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # Timestamp (to the microsecond) plus pid keeps the temp file name unique per call.
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    # Flush and close the temp file before compiling so the on-disk copy is complete.
    with open(tmpname, 'w') as tmp_file:
        tmp_file.write(src)
    code = compile(src, tmpname, 'exec')
    exec(code, globals())


# Run the code cells of the shared helper notebooks; exec_ipynb() executes them with
# globals(), so whatever they define lands in this notebook's namespace.
# NOTE(review): Stat and Esdr, used in later cells, are presumably defined by these
# notebooks -- confirm against python-utils.
exec_ipynb('./python-utils/utils.ipynb')
exec_ipynb('./python-utils/esdr-library.ipynb')

In [None]:
# Identifiers used for status reporting through Stat in the cells below.
STAT_SERVICE_NAME = 'Purpleair upload to ESDR realtime'  # passed to Stat.set_service()
STAT_HOSTNAME = 'hal21'  # passed as host= to Stat.info/warning/up
STAT_SHORTNAME = 'purpleair-upload-to-esdr-realtime'  # passed as shortname= to Stat.info/warning/up

In [None]:
# Register the service name for the Stat reports emitted by later cells.
Stat.set_service(STAT_SERVICE_NAME)
# Persistent dict backed by a local SQLite file. The upload loop reads and writes
# progress['lastUploaded'] so a restarted run continues after the last processed file.
# NOTE(review): autocommit=True presumably persists each write immediately -- confirm in sqlitedict docs.
progress = SqliteDict('upload-purpleair-realtime.sqlite', autocommit=True)

In [None]:
# print(progress['lastUploaded'])
#
# On 2020-10-26, CPB fast-forwarded the uploader by manually setting the progress['lastUploaded'] flag to '20201021/161500utc.json.bz2', like this:
#
#    progress['lastUploaded'] = '20201021/161500utc.json.bz2'
#
# The previous value before fast-forwarding was '20200802/113000utc.json.bz2'.

In [None]:
# First time uploading, create a new client like so:

# Esdr.save_client('esdr-auth-purpleair-uploader.json', 'PurpleAir uploader for timemachine1')

# and then follow the directions it prints, which include visiting esdr.cmucreatelab.org and creating
# a client with the given parameters, and also editing esdr-auth-purpleair-uploader.json to include your
# username and password

# Do not add esdr-auth-*.json to the git repo
# !echo 'esdr-auth-*.json' >>.gitignore

# ESDR client built from the local auth file; uploadId() uses it to look up or create
# feeds and to upload records.
esdr = Esdr('esdr-auth-purpleair-uploader.json')

In [None]:
############################################
#
# Parse and convert device records to ESDR
#

def computeLatLon(deviceRecord):
    """Return the device position as a ``(latitude, longitude)`` tuple of floats.

    Reads the 'Lat' and 'Lon' entries of ``deviceRecord`` and converts each one
    with ``float()``.
    """
    latitude = float(deviceRecord['Lat'])
    longitude = float(deviceRecord['Lon'])
    return (latitude, longitude)

def computeEsdrId(deviceRecord):
    """Build the ESDR device identifier for a PurpleAir device record.

    The identifier is the record's 'ID', an underscore, then the absolute latitude
    and longitude each multiplied by 1000, rounded and zero-padded to six digits,
    followed by a hemisphere letter (N/S for latitude, E/W for longitude).
    Any '.' in the result is replaced with '_'.
    """
    lat, lon = computeLatLon(deviceRecord)

    # Negative coordinates are the southern / western hemispheres.
    latHemisphere = 'S' if lat < 0 else 'N'
    lonHemisphere = 'W' if lon < 0 else 'E'

    latMillis = round(1000 * abs(lat))
    lonMillis = round(1000 * abs(lon))

    esdrId = "%s_%06d%s%06d%s" % (deviceRecord['ID'], latMillis, latHemisphere, lonMillis, lonHemisphere)
    return esdrId.replace('.', '_')

def computeEpochTimestamp(deviceRecord):
    """Return the record's last-modified time in epoch seconds, or None.

    The 'Stats' entry of ``deviceRecord`` is parsed as JSON and its 'lastModified'
    value is divided by 1000.0.

    Returns:
        The timestamp as a float, or None when 'Stats' is missing, is not valid
        JSON, or does not yield a usable 'lastModified' value.
    """
    try:
        return json.loads(deviceRecord['Stats'])['lastModified'] / 1000.0
    except Exception:
        # Any data problem means "no timestamp". Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit still stop the run.
        return None

def computeEsdrRecord(deviceRecord):
    """Convert one PurpleAir device record into the dict uploaded to ESDR.

    Returns:
        A dict with 'time' (the result of computeEpochTimestamp) plus a float for
        each of 'PM2_5', 'humidity', 'pressure' and 'temp_f' whose source value is
        present and convertible with float(). Channels that are missing or not
        numeric are simply left out.
    """
    data = {'time': computeEpochTimestamp(deviceRecord)}

    # As of 2021-01-06, we're only mirroring PM2_5, humidity, pressure, and temp_f
    # (source key in the PurpleAir record -> channel name used in ESDR)
    channels = (
        ('PM2_5Value', 'PM2_5'),
        ('humidity', 'humidity'),
        ('pressure', 'pressure'),
        ('temp_f', 'temp_f'),
    )
    for key, translated_key in channels:
        try:
            data[translated_key] = float(deviceRecord[key])
        except Exception:
            # Best effort: skip a missing or non-numeric channel. Catching Exception
            # (not a bare except) keeps KeyboardInterrupt / SystemExit working.
            pass

    # As of 2021-01-06, we're no longer mirroring the stats
    # try:
    #     stats = json.loads(deviceRecord['Stats'])
    #     for key in stats.keys():
    #         if key == 'lastModified' or key == 'timeSinceModified':
    #             continue
    #         data['stats_' + key] = stats[key]
    # except:
    #     # Stats stopped being reported Jan 2018
    #     pass

    return data

def computeEsdrName(deviceRecord):
    """Return the device's label with surrounding whitespace removed, or None.

    Returns None when the record has no 'Label' entry or when the entry is None
    (JSON null), so the caller can treat both as "no name" instead of failing on
    ``None.strip()``.
    """
    label = deviceRecord.get('Label')
    if label is None:
        return None
    return label.strip()

###########################################################
#
# Accumulate deviceRecords from multiple JSON input files
#

def accumulateReset():
    """Throw away everything accumulated so far and start empty.

    Rebinds the module-level ``accumulator`` to a new defaultdict. Looking up an
    unseen key creates ``{'records': <defaultdict>}``, and looking up an unseen key
    in that 'records' mapping creates an empty dict.
    """
    global accumulator

    def newDeviceEntry():
        return {'records': defaultdict(dict)}

    accumulator = defaultdict(newDeviceEntry)

def accumulateJson(path):
    """Merge the device records of one bz2-compressed JSON file into ``accumulator``.

    Args:
        path: Path of a bz2-compressed JSON file whose top-level 'results' entry
            is a list of device records.

    A record is skipped when it has no 'ID', no 'Lat'/'Lon', no usable timestamp,
    a DEVICE_LOCATIONTYPE of 'inside', or no name. Every other record is stored in
    ``accumulator[esdrId]['records']`` keyed by its epoch timestamp, and the
    entry's 'name', 'lat' and 'lon' are set from that record.

    Per-file counts are reported through Stat.info, plus a Stat.warning when some
    records had no name.
    """
    nUploads = 0
    nonameCount = 0
    locationCounts = defaultdict(int)

    # Close the compressed file as soon as it has been parsed instead of leaking the handle.
    with bz2.open(path, 'r') as compressed:
        js = json.load(compressed)
    records = js['results']

    for deviceRecord in records:
        if 'ID' not in deviceRecord:
            locationCounts['noID'] += 1
            continue

        if 'Lat' not in deviceRecord or 'Lon' not in deviceRecord:
            locationCounts['noLatLon'] += 1
            continue

        epochTimestamp = computeEpochTimestamp(deviceRecord)
        if not epochTimestamp:
            locationCounts['noTimestamp'] += 1
            continue

        location = deviceRecord.get('DEVICE_LOCATIONTYPE', 'unspecified')
        locationCounts[location] += 1
        if location == 'inside':
            continue

        # Check the name BEFORE indexing the accumulator: it is a defaultdict, so
        # merely looking up an ID creates an entry. Doing the lookup first left
        # empty phantom entries (no lat/lon, no records) for nameless devices.
        name = computeEsdrName(deviceRecord)
        if not name:
            nonameCount += 1
            continue

        #if 'name' in a and a['name'] != name:
        #    raise Exception('Trying to add %s to %s, but name %s is different from %s' %
        #                    (record, a, name, a['name']))
        record = computeEsdrRecord(deviceRecord)
        lat, lon = computeLatLon(deviceRecord)
        a = accumulator[computeEsdrId(deviceRecord)]
        a['records'][epochTimestamp] = record
        a['name'] = name
        a['lat'] = lat
        a['lon'] = lon
        nUploads += 1

    Stat.info('%s: using %d of %d records %s' % (path, nUploads, len(records), json.dumps(locationCounts)), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    Stat.info('After merge, total of %d ESDR IDs' % len(accumulator), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    if nonameCount:
        Stat.warning('%d purpleair records had no Label field and could not be merged' % nonameCount, host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)

############################################
#
# Find JSON files we haven't processed yet
#

def pathTopDir(path):
    """Return the part of ``path`` before its first '/' (all of it when there is none)."""
    head, _separator, _rest = path.partition('/')
    return head

def pathSansTopDir(path):
    """Return what follows the first '/' in ``path`` ('' when there is no '/')."""
    _head, _separator, remainder = path.partition('/')
    return remainder

def getNewerFiles(dir, newerThan, nSubdirLevels):
    """List entries under ``dir`` that sort after ``newerThan``.

    Args:
        dir: Directory to scan.
        newerThan: Relative path ('subdir/.../file') of the last entry already
            handled; a falsy value selects everything.
        nSubdirLevels: Number of directory levels between ``dir`` and the files.

    Returns:
        Relative paths in sorted order. Names are compared as plain strings, one
        path component at a time.
    """
    # Leaf level: compare file names directly.
    if not nSubdirLevels:
        return [name for name in sorted(os.listdir(dir))
                if not newerThan or name > newerThan]

    found = []
    for subdir in sorted(os.listdir(dir)):
        if newerThan:
            top = pathTopDir(newerThan)
            if subdir > top:
                # Entirely newer directory: take everything inside it.
                remainder = None
            elif subdir == top:
                # Same directory as the marker: only entries after the marker's tail.
                remainder = pathSansTopDir(newerThan)
            else:
                continue
        else:
            remainder = None

        children = getNewerFiles(dir + '/' + subdir, remainder, nSubdirLevels - 1)
        found.extend(subdir + '/' + child for child in children)
    return found

###############################################
#
# Upload to ESDR
#

def sortedDict(dict):
    """Return a new plain dict holding the same items, inserted in ascending key order."""
    ordered = {}
    for key in sorted(dict):
        ordered[key] = dict[key]
    return ordered

def uploadId(id):
    """Upload all accumulated records of one ESDR device id.

    Args:
        id: Key into the module-level ``accumulator``.

    The feed is looked up (or created) through
    ``esdr.cached_get_or_create_product_device_feed`` for product 'PurpleAir' and
    the records are uploaded in ascending timestamp order. Entries without
    'lat'/'lon' are skipped with a warning. Upload failures are reported through
    Stat.warning and otherwise ignored, so one bad device does not stop the batch.
    """
    try:
        a = accumulator[id]
        if 'lat' in a and 'lon' in a:
            feed = esdr.cached_get_or_create_product_device_feed('PurpleAir', id, a['lat'], a['lon'])
            # Records are keyed by epoch timestamp; upload them oldest first.
            dicts = list(sortedDict(a['records']).values())
            esdr.upload_dicts(feed, dicts)
            #Stat.info('Successfully uploaded to feed %d' % (id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        else:
            Stat.warning('Skipping ID %s because it has no lat/lon' % (id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except requests.HTTPError as e:
        Stat.warning('Failed to upload to feed corresponding to ID %s (HTTP %d)' % (id, e.response.status_code), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit still stop the run.
        Stat.warning('Failed to upload to feed corresponding to ID %s' % (id), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
    sys.stdout.flush()
    sys.stderr.flush()

In [None]:
# Do this 20 times and then exit, relying on the cron job to start it up again. We do this as a simple way to deal with
# refreshing the ESDR OAuth2 token, rather than adding code to catch the HTTP 401/403 error upon upload, yada yada. The
# drawback of doing it this way is that the esdr.cached_get_or_create_product_device_feed() method has to start fresh
# with a new cache, but doing so once every 20 runs is fine.
for i in range(20):
    lastUploaded = progress.get('lastUploaded', None)

    files = getNewerFiles('mirror', lastUploaded, nSubdirLevels=1)
    # Only json.bz2 files
    files = list(filter(re.compile(r'\.json\.bz2$').search, files))

    # A maximum of the first 500 files
    files = files[0:500]

    if not files:
        # Caught up: there is nothing to upload, and files[0] / files[-1] below would
        # raise IndexError. Deliberately do NOT report Stat.up here, so the status
        # still goes stale if no new files ever arrive.
        Stat.info('No new files to upload', host=STAT_HOSTNAME, shortname=STAT_SHORTNAME)
        break

    accumulateReset()

    for file in files:
        accumulateJson('mirror/' + file)

    for id in sorted(accumulator.keys()):
        uploadId(id)

    Stat.up('Uploaded %d devices from %d files (%s ... %s)' % (len(accumulator), len(files), files[0], files[-1]), host=STAT_HOSTNAME, shortname=STAT_SHORTNAME,
            valid_for_secs=3600 * 4)
    # Record progress only after the whole batch was attempted, so a crash mid-batch
    # makes the next run retry the same files.
    progress['lastUploaded'] = files[-1]