In [1]:
import bz2, html, json, os, re, requests, subprocess

from sqlitedict import SqliteDict
from collections import defaultdict
import pandas as pd

# One-time bootstrap: fetch the shared CMU CREATE Lab helper notebooks if a
# 'python-utils' entry does not already exist in the current working directory.
# NOTE(review): this check runs relative to the kernel's starting directory, while
# the helper notebooks are loaded further down after an os.chdir() -- confirm the
# notebook is always launched from that same directory.
if not os.path.exists('python-utils'):
    subprocess.check_output('git clone https://github.com/CMU-CREATE-Lab/python-utils.git', shell=True)

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook inside this module's globals.

    Args:
        filename_or_url: Path to a local .ipynb file, or an http(s) URL that
            returns the notebook JSON.

    Raises:
        requests.HTTPError: The URL fetch returned a 4xx/5xx status.
        OSError, ValueError: The local file could not be opened or parsed.

    NOTE: the notebook source is handed to exec() and runs with this
    process's full privileges -- only load notebooks you trust.
    """
    if re.match(r'https?:', filename_or_url):
        response = requests.get(filename_or_url)
        # Fail loudly on an HTTP error instead of parsing an error page as a notebook
        response.raise_for_status()
        nb = response.json()
    else:
        # Context manager so the file handle is closed as soon as parsing is done
        with open(filename_or_url) as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        # nbformat 4+: cells live at the top level, code is under 'source'
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        # Older nbformat: cells live in the first worksheet, code is under 'input'
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']
    exec('\n'.join(src), globals())

# Work from the connector's directory: the relative paths used in this file
# ('python-utils/...', 'mirror', the sqlite and auth JSON files) resolve against it.
os.chdir('/t/esdr-connectors/mirror-purpleair-to-esdr/') 
# Execute the shared helper notebooks into this namespace. Names used later in
# this file but not defined in it (Stat, Esdr, notebook_wide_display) presumably
# come from these two notebooks -- TODO confirm.
exec_ipynb('python-utils/utils.ipynb')
exec_ipynb('python-utils/esdr-library.ipynb')
notebook_wide_display()

In [2]:
# Register the service name with Stat; the Stat.info / Stat.up calls later in
# this file are made after this. (Stat is not defined in this file.)
Stat.set_service('Purpleair upload to ESDR realtime')
# Persistent key/value store backed by a local sqlite file. The upload loop
# reads and writes its 'lastUploaded' key to remember the newest file handled.
# autocommit=True: presumably each write is committed immediately -- see sqlitedict docs.
progress = SqliteDict('upload-purpleair-realtime.sqlite', autocommit=True)

In [3]:
# First time uploading, create a new client like so:

# Esdr.save_client('esdr-auth-purpleair-uploader.json', 'PurpleAir uploader for timemachine1')

# and then follow the directions it prints, which include visiting esdr.cmucreatelab.org and creating
# a client with the given parameters, and also editing esdr-auth-purpleair-uploader.json to include
# your username and password

# Do not add esdr-auth-*.json to the git repo
# !echo 'esdr-auth-*.json' >>.gitignore

# ESDR client built from the saved auth file; uploadId() uses it to get or
# create each device feed and to upload that device's records.
esdr = Esdr('esdr-auth-purpleair-uploader.json')

In [4]:
############################################
#
# Parse and convert device records to ESDR
#

def computeLatLon(deviceRecord):
    """Return the record's 'Lat' and 'Lon' fields as a (lat, lon) tuple of floats."""
    latitude = float(deviceRecord['Lat'])
    longitude = float(deviceRecord['Lon'])
    return (latitude, longitude)

def computeEsdrId(deviceRecord):
    """Build the ESDR device identifier for a device record.

    The identifier is the record's 'ID' followed by the absolute latitude and
    longitude in thousandths of a degree (zero-padded to six digits), each
    tagged with its hemisphere letter. Any '.' in the result becomes '_'.
    """
    latitude, longitude = computeLatLon(deviceRecord)

    deviceId = deviceRecord['ID']
    milliLat = round(1000 * abs(latitude))
    northSouth = 'S' if latitude < 0 else 'N'
    milliLon = round(1000 * abs(longitude))
    eastWest = 'W' if longitude < 0 else 'E'

    rawId = "%s_%06d%s%06d%s" % (deviceId, milliLat, northSouth, milliLon, eastWest)
    return rawId.replace('.', '_')

def computeEpochTimestamp(deviceRecord):
    """Return the record's last-modified time in epoch seconds, or None.

    The value is read from the 'lastModified' field of the JSON string stored
    under deviceRecord['Stats'] and divided by 1000.0.

    Returns:
        A float, or None when 'Stats' is missing, is not parseable JSON, or
        does not yield a usable 'lastModified' value.
    """
    try:
        return json.loads(deviceRecord['Stats'])['lastModified'] / 1000.0
    except Exception:
        # Best-effort parse: any data problem means "no timestamp". Unlike a
        # bare except, this lets KeyboardInterrupt / SystemExit propagate.
        return None
    
def computeEsdrRecord(deviceRecord):
    """Convert one device record into the dict of values uploaded to ESDR.

    The result always has a 'time' key (from computeEpochTimestamp, may be
    None). Each of the sensor fields below is included, converted to float,
    only when it is present and numeric. Every entry of the 'Stats' JSON other
    than 'lastModified' and 'timeSinceModified' is copied as 'stats_<key>'.

    Returns:
        dict mapping channel name to value.
    """
    data = {'time': computeEpochTimestamp(deviceRecord)}

    # (field name in the device record, channel name in the output)
    channels = (
        ('PM2_5Value', 'PM2_5'),
        ('RSSI', 'RSSI'),
        ('Uptime', 'Uptime'),
        ('humidity', 'humidity'),
        ('pressure', 'pressure'),
        ('temp_f', 'temp_f'),
    )
    for key, translated_key in channels:
        try:
            data[translated_key] = float(deviceRecord[key])
        except Exception:
            # Best-effort: a missing or non-numeric field is simply omitted.
            # Not a bare except, so KeyboardInterrupt / SystemExit propagate.
            pass

    try:
        stats = json.loads(deviceRecord['Stats'])
        for key, value in stats.items():
            # The modification times are represented by 'time', not as channels
            if key in ('lastModified', 'timeSinceModified'):
                continue
            data['stats_' + key] = value
    except Exception:
        # Stats stopped being reported Jan 2018
        pass

    return data
    
def computeEsdrName(deviceRecord):
    """Return the record's 'Label' with leading and trailing whitespace removed."""
    label = deviceRecord['Label']
    return label.strip()


###########################################################
#
# Accumulate deviceRecords from multiple JSON input files
#

def accumulateReset():
    """Replace the global `accumulator` with a fresh, empty one.

    The accumulator auto-creates an entry for any new key; each entry starts
    as {'records': <mapping that auto-creates an empty dict per key>}.
    """
    global accumulator

    def newDeviceEntry():
        return {'records': defaultdict(dict)}

    accumulator = defaultdict(newDeviceEntry)

def accumulateJson(path):
    """Merge the device records of one bz2-compressed JSON file into `accumulator`.

    Records are skipped (and counted) when they lack 'Lat' or 'Lon', have no
    usable timestamp, or report DEVICE_LOCATIONTYPE 'inside'. Every other
    record is stored under its ESDR id, keyed by its epoch timestamp, and the
    entry's name, lat and lon are refreshed from it.

    Args:
        path: Path to a bz2-compressed JSON file with a top-level 'results' list.

    Raises:
        AssertionError: a record's name differs from the name already
            accumulated for the same ESDR id.
    """
    nUploads = 0
    locationCounts = defaultdict(int)
    # Context manager so the compressed file is closed once it has been parsed
    with bz2.open(path, 'r') as jsonFile:
        js = json.load(jsonFile)
    records = js['results']
    for deviceRecord in records:
        if 'Lat' not in deviceRecord or 'Lon' not in deviceRecord:
            locationCounts['noLatLon'] += 1
            continue

        epochTimestamp = computeEpochTimestamp(deviceRecord)
        if not epochTimestamp:
            locationCounts['noTimestamp'] += 1
            continue

        location = deviceRecord.get('DEVICE_LOCATIONTYPE', 'unspecified')
        locationCounts[location] += 1
        if location == 'inside':
            continue

        esdrId = computeEsdrId(deviceRecord)

        a = accumulator[esdrId]
        a['records'][epochTimestamp] = computeEsdrRecord(deviceRecord)
        name = computeEsdrName(deviceRecord)
        if 'name' in a:
            assert a['name'] == name, (
                'ESDR id %s seen with two different names: %r and %r' % (esdrId, a['name'], name))
        a['name'] = name
        a['lat'], a['lon'] = computeLatLon(deviceRecord)
        nUploads += 1

    Stat.info('%s: using %d of %d records %s' % (path, nUploads, len(records), json.dumps(locationCounts)))
    Stat.info('After merge, total of %d ESDR IDs' % len(accumulator))

############################################
#
# Find JSON files we haven't processed yet
#
    
def pathTopDir(path):
    """Return the part of `path` before its first '/' (all of it if there is none)."""
    head, _, _ = path.partition('/')
    return head

def pathSansTopDir(path):
    """Return `path` with everything up to and including its first '/' removed.

    A path with no '/' yields the empty string.
    """
    _, _, remainder = path.partition('/')
    return remainder

def getNewerFiles(dir, newerThan, nSubdirLevels):
    """List entries under `dir` that sort after `newerThan`, in sorted order.

    Args:
        dir: Directory to scan.
        newerThan: '/'-separated path relative to `dir`; only entries that
            compare greater (string comparison, level by level) are returned.
            A falsy value returns everything.
        nSubdirLevels: Number of directory levels to descend before the level
            whose entries are returned.

    Returns:
        List of '/'-joined paths relative to `dir`.
    """
    if not nSubdirLevels:
        # Leaf level: plain name comparison against the remaining path component
        return [name for name in sorted(os.listdir(dir))
                if not newerThan or name > newerThan]

    found = []
    for subdir in sorted(os.listdir(dir)):
        if not newerThan:
            innerNewerThan = None
        else:
            cutoffDir = pathTopDir(newerThan)
            if subdir > cutoffDir:
                # Entirely past the cutoff: take the whole subtree
                innerNewerThan = None
            elif subdir == cutoffDir:
                # The cutoff lies inside this subdirectory: filter within it
                innerNewerThan = pathSansTopDir(newerThan)
            else:
                continue
        for file in getNewerFiles(dir + '/' + subdir, innerNewerThan, nSubdirLevels - 1):
            found.append(subdir + '/' + file)
    return found

###############################################
#
# Upload to ESDR
#

def sortedDict(dict):
    """Return a new plain dict holding the same items, inserted in sorted-key order."""
    ordered = {}
    for key in sorted(dict):
        ordered[key] = dict[key]
    return ordered

def uploadId(id):
    """Upload all accumulated records for one ESDR device id.

    Looks up (or creates) the 'PurpleAir' feed for the device at its
    accumulated lat/lon, then uploads the device's records ordered by
    timestamp key.

    Args:
        id: Key into the global `accumulator`.
    """
    a = accumulator[id]
    feed = esdr.cached_get_or_create_product_device_feed('PurpleAir', id, a['lat'], a['lon'])
    # Records are keyed by epoch timestamp; upload them oldest first.
    dicts = list(sortedDict(a['records']).values())
    # (A DataFrame used to be built from `dicts` here but was never used.)
    esdr.upload_dicts(feed, dicts)
    


In [5]:
# Only mirrored files whose names end in .json.bz2 are uploaded
JSON_BZ2_RE = re.compile(r'\.json\.bz2$')
# Upper bound on how many files are merged and uploaded per pass
MAX_FILES_PER_BATCH = 100

# Process the mirror in batches, oldest first, recording the newest file handled
# after each batch so that a restart resumes from where the last batch ended.
while True:
    lastUploaded = progress.get('lastUploaded', None)

    files = getNewerFiles('mirror', lastUploaded, nSubdirLevels=1)
    files = [file for file in files if JSON_BZ2_RE.search(file)]
    files = files[:MAX_FILES_PER_BATCH]

    if not files:
        # Caught up. Previously this case fell through to files[0] below and
        # ended the loop with an IndexError; stop cleanly instead.
        Stat.info('No new files to upload after %s' % lastUploaded)
        break

    accumulateReset()

    for file in files:
        accumulateJson('mirror/' + file)

    for id in sorted(accumulator.keys()):
        uploadId(id)

    Stat.up('Uploaded %d devices from %d files (%s ... %s)' % (len(accumulator), len(files), files[0], files[-1]))
    # Persist progress only after every device in the batch has been uploaded
    progress['lastUploaded'] = files[-1]
    

