In [None]:
import json, os, re, requests, subprocess, sys, datetime, math

In [None]:
# Boilerplate to load utils.ipynb
# See https://github.com/CMU-CREATE-Lab/python-utils/blob/master/utils.ipynb


def exec_ipynb(filename_or_url):
    """Run every code cell of a notebook inside this module's global namespace.

    Args:
        filename_or_url: Path to a local ``.ipynb`` file, or an ``http(s)`` URL
            that serves the notebook JSON.

    The code cells are concatenated (three newlines between cells), written to
    a uniquely named ``.py`` file under ``/tmp`` and compiled with that file
    name, so tracebacks raised by the notebook code point at readable source.
    The temporary file is intentionally left in place.

    Raises:
        Whatever reading/parsing the notebook raises (I/O, HTTP, JSON errors),
        plus any exception raised while compiling or executing its code.
    """
    if re.match(r'https?:', filename_or_url):
        # Bounded wait so a stalled server cannot hang the notebook forever.
        nb = requests.get(filename_or_url, timeout=120).json()
    else:
        # Context manager closes the handle promptly instead of leaking it.
        with open(filename_or_url, encoding='utf-8') as nb_file:
            nb = json.load(nb_file)

    if nb['nbformat'] >= 4:
        src = [''.join(cell['source']) for cell in nb['cells'] if cell['cell_type'] == 'code']
    else:
        # Older notebook layout: cells live in the first worksheet and code is under 'input'.
        src = [''.join(cell['input']) for cell in nb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']

    # Timestamp + pid keep the file name unique across repeated and concurrent runs.
    tmpname = '/tmp/%s-%s-%d.py' % (os.path.basename(filename_or_url),
                                    datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'),
                                    os.getpid())
    src = '\n\n\n'.join(src)
    with open(tmpname, 'w', encoding='utf-8') as tmp_file:
        tmp_file.write(src)
    code = compile(src, tmpname, 'exec')
    # Executes into this module's globals so the notebook's definitions become available here.
    exec(code, globals())


# Execute the shared helper notebooks, in this order, so their definitions
# are available to the cells below.
for helper_notebook in ('./python-utils/utils.ipynb', './python-utils/esdr-library.ipynb'):
    exec_ipynb(helper_notebook)

In [None]:
# ESDR client setup for the PurpleAir v2 product.
PURPLE_AIR_V2_ESDR_PRODUCT_NAME = 'purpleair_v2' # https://esdr.cmucreatelab.org/api/v1/products/purpleair_v2

# First time uploading, create a new client like so:

# Esdr.save_client('esdr-auth-purpleair-uploader.json', 'PurpleAir uploader for timemachine1')

# and then follow the directions it prints, which include visiting esdr.cmucreatelab.org and creating
# a client with the given parameters, and also editing esdr-auth-purpleair-uploader.json to include your
# username and password.

# Do not add esdr-auth-*.json to the git repo (the file holds credentials):
# !echo 'esdr-auth-*.json' >>.gitignore

# Esdr is not defined in this notebook; presumably it comes from the helper
# notebooks executed above (esdr-library.ipynb) - confirm.
esdr = Esdr('esdr-auth-purpleair-uploader.json')

# Load the PurpleAir v2 product.
# NOTE(review): purpleair_product is not read by any later cell visible here;
# the v2 lookup below hard-codes productId 101 instead - confirm they agree.
purpleair_product = esdr.get_product_by_name(PURPLE_AIR_V2_ESDR_PRODUCT_NAME)

# NOTE(review): feed_cache is not referenced by the cells visible here;
# it may be used by the helper library - confirm before removing.
feed_cache = {}

In [None]:
# Find the older PurpleAir feeds
#
# Collects into `sensors` every ESDR feed of product 69 whose minTimeSecs is
# <= the cutoff. The cutoff 1569888000 is 2019-10-01 00:00:00 UTC, so this
# selects feeds whose data starts before October 2019 (not strictly "pre-2019").
# Results are paged 1000 rows at a time.
# NOTE(review): assumes ESDR returns at most 1000 rows per request - confirm.

epoch = "1569888000"
# %3C= is the URL-encoded form of "<=".
url = f"https://esdr.cmucreatelab.org/api/v1/feeds?whereAnd=productId=69,minTimeSecs%3C={epoch}"
response = requests.get(url, timeout=120)
# Fail with a clear HTTP error instead of a confusing KeyError/TypeError further down.
response.raise_for_status()
response_data = response.json()
count = response_data['data']['totalCount']
num_offsets = math.ceil(count / 1000)
sensors = response_data['data']['rows']
# The request above already returned the first page (offset 0), so only the
# remaining pages are fetched. Starting at 1 avoids the extra request past the
# end of the result set that the previous range(num_offsets) loop issued.
for offset_num in range(1, num_offsets):
    url = f"https://esdr.cmucreatelab.org/api/v1/feeds?whereAnd=productId=69,minTimeSecs%3C={epoch}&offset={offset_num * 1000}"
    print(url)
    response = requests.get(url, timeout=120)
    response.raise_for_status()
    response_data = response.json()
    sensors += response_data['data']['rows']

In [None]:
# Link pre-2019 A&B PurpleAirs together
#
# Feeds that report exactly the same latitude/longitude are grouped: the first
# feed seen at a location becomes the primary entry of `dict` (keyed by its feed
# id) and every later feed at that location is recorded under its 'matches'.
# Only the channels listed in `fixed_channels` are remembered for each feed.
#
# NOTE: the name `dict` shadows the Python builtin for the rest of the notebook;
# it is kept because later cells read the result under this name.

dict = {}
fixed_channels = set(['PM2_5', 'PM2_5_a', 'PM2_5_b', 'humidity', 'pressure', 'temp_f'])
# Feed ids that are known to be garbage and must never be linked.
garbage_ids = {25740, 25657, 24741, 24614, 32844, 21831, 53721, 53722, 15007, 15798}
# (longitude, latitude) -> primary entry, so each feed is matched with one
# lookup instead of a scan over every entry collected so far.
primary_by_location = {}
for sensor in sensors:
    # Skip garbage ids
    if sensor['id'] in garbage_ids:
        continue

    filtered_channels = list(fixed_channels & set(sensor['channelBounds']['channels'].keys()))
    location = (sensor['longitude'], sensor['latitude'])
    primary = primary_by_location.get(location)

    if primary is not None:
        # Another feed already owns this location: attach this one as its match.
        primary['matches'][sensor['id']] = {'name' : sensor['name'], 'channels' : filtered_channels}
    else:
        primary = {'matches' : {}, 'id': sensor['id'], 'name': sensor['name'], 'channels' : filtered_channels, 'latitude': sensor['latitude'], 'longitude' : sensor['longitude'], 'exposure' : sensor['exposure']}
        dict[sensor['id']] = primary
        primary_by_location[location] = primary


In [None]:
# Drop every pairing whose primary feed collected more than 2 matches at the
# same lat/lon (~130 are removed). The entry count is printed before and after.
# NOTE(review): the threshold counts matches only, so a location with a primary
# plus exactly two matches (three feeds in total) is kept - confirm that is intended.
print(len(dict.keys()))
overmatched_ids = [entry['id'] for entry in dict.values() if len(entry['matches']) > 2]
for overmatched_id in overmatched_ids:
    del dict[overmatched_id]
print(len(dict.keys()))

In [None]:
# Match v1 ids to new v2 ids
#
# For each linked v1 entry, ask ESDR for product-101 feeds at the same
# latitude/longitude. When at least one exists, the first row's id becomes the
# key of `v2_pairings` and the value is the list of v1 feed ids
# (primary first, then its matches).
#
# Only 1 match means just channelA.
# Matches may not have equal channels amongst each other, so we need to do the
# intersection of channel names.

v2_pairings = {}

for sensor in dict.values():
    feed_query = {'whereAnd': 'latitude=%s,longitude=%s,productId=%s' % (sensor['latitude'], sensor['longitude'], '101')}
    # Alternative lookup by name instead of location (kept for reference):
    # feed_query = {'whereAnd': 'name=%s,productId=%s' % (re.sub(r"\s+B\s+", " ", sensor['name']), '101')}
    response = esdr.api('GET', '/api/v1/feeds', feed_query)
    v2_rows = response['data']['rows']
    if len(v2_rows) == 0:
        continue
    v2_feed_id = v2_rows[0]['id']
    v1_feed_ids = [sensor['id']] + list(sensor['matches'].keys())
    v2_pairings[v2_feed_id] = v1_feed_ids
    print({v2_feed_id : v1_feed_ids})


In [None]:
# One flat list holding every v1 feed id that has a v2 counterpart.
v2_pairings_flattened = []
for paired_v1_ids in v2_pairings.values():
    v2_pairings_flattened.extend(paired_v1_ids)

In [None]:
# Export data
#
# For every linked v1 entry that has a v2 counterpart, download each feed's
# history up to the cutoff from ESDR and file it under 'a' or 'b':
#   * a feed whose name contains ' B ' goes to 'b';
#   * otherwise the first feed goes to 'a' and a later one goes to 'b' once 'a' is taken.
# 'avg' starts out empty. Entries that end up with no data at all are not
# added to `data_list`.

data_list = []
epoch = '1569888000'
# Set membership instead of scanning the flattened list for every sensor.
v2_paired_ids = set(v2_pairings_flattened)
for sensor in dict.values():
    if sensor['id'] not in v2_paired_ids:
        continue
    data = {'a' : {}, 'b' : {}, 'avg': []}
    ids = [sensor['id']] + list(sensor['matches'].keys())

    # Only single feeds and A/B pairs can be assigned to channels. Larger groups
    # used to fail with a KeyError on the second feed; skip them explicitly.
    if len(ids) > 2:
        print(f"Skipping sensor {sensor['id']}: expected at most 2 feeds at one location but found {len(ids)}")
        continue

    channels = {sensor['id'] : sensor['channels']}
    for match_id, match in sensor['matches'].items():
        channels[match_id] = match['channels']

    for idx, feed_id in enumerate(ids):
        # Without channels the export URL would be malformed, so there is nothing to request.
        if len(channels[feed_id]) == 0:
            print(f"No exportable channels for sensor {feed_id}")
            continue

        channels_lookup = ",".join(f"{feed_id}.{channel}" for channel in channels[feed_id])
        url = f"https://esdr.cmucreatelab.org/api/v1/feeds/export/{channels_lookup}?to={epoch}&format=json"
        print(url)
        response = requests.get(url, timeout=120)
        # Parse the body once; it was previously re-parsed on every access.
        export = response.json()

        # Ignore ones with < 20 data points
        if len(export['data']) < 20:
            print(f"Found less than 20 data points for sensor {feed_id}")
            continue

        if feed_id == sensor['id']:
            name = sensor['name']
        else:
            name = sensor['matches'][feed_id]['name']

        goes_to_b = ' B ' in name or (idx > 0 and len(data['a']) > 0)
        data['b' if goes_to_b else 'a'][feed_id] = export

    if len(data['a']) > 0 or len(data['b']) > 0:
        data_list.append(data)
    else:
        print("skipping add")

In [None]:
import pandas as pd
import numpy as np

# Reduce every export in data_list to [epoch, PM2_5] rows, sorted by epoch.
#
# * Two feeds (one under 'a', one under 'b'): each is reduced separately and
#   relabelled 'PM2_5_a' / 'PM2_5_b'. Only columns present in both exports are
#   kept, so PM2_5 survives only if both feeds have it. 'avg' is not touched.
# * One feed: it is stored under 'a' as 'PM2_5_a' and again under 'avg' as
#   'PM2_5', and 'b' is emptied, regardless of which side it arrived on.
#
# NOTE: the entries of data_list are rewritten in place, so re-running this cell
# without re-running the export cell operates on already-reduced entries.
# NOTE(review): rows are passed through as-is; missing readings are not filtered here.


def _sorted_export_frame(export):
    """Turn one ESDR export into a DataFrame ordered by ascending epoch.

    Column names are 'epoch' followed by the last dot-separated part of each
    entry in export['channel_names'].
    """
    column_names = ['epoch'] + [full_name.split(".")[-1] for full_name in export['channel_names']]
    return pd.DataFrame(export['data'], columns=column_names).sort_values(by=['epoch'], ascending=True)


def _keep_columns(frame, wanted):
    """Return `frame` without the columns that are not listed in `wanted`."""
    unwanted = [column for column in frame.columns if column not in wanted]
    return frame.drop(unwanted, axis=1)


for idx, exports in enumerate(data_list):
    print(f"Processing {idx+1} out of {len(data_list)}")
    # First feed id on each side that has data, 'a' before 'b'.
    pairs = [list(exports[side].keys())[0] for side in ('a', 'b') if len(exports[side].keys()) > 0]

    if len(pairs) == 2:
        frame_a = _sorted_export_frame(exports['a'][pairs[0]])
        frame_b = _sorted_export_frame(exports['b'][pairs[1]])

        # Find intersection of channels
        # Not relevant if you ensure channels pulled from ESDR will have data,
        # otherwise ESDR will create the union and fill in with Nones
        shared_columns = set(frame_a.columns) & set(frame_b.columns)
        pm25_columns = [column for column in ['epoch', 'PM2_5'] if column in shared_columns]

        rows_a = _keep_columns(frame_a, pm25_columns).values.tolist()
        rows_b = _keep_columns(frame_b, pm25_columns).values.tolist()

        exports['a'] = {pairs[0] : {"channel_names":['PM2_5_a'],"data":rows_a}}
        exports['b'] = {pairs[1] : {"channel_names":['PM2_5_b'],"data":rows_b}}
    else:
        print(f"Exports length was not 2 but was instead {len(pairs)} for {exports['a'].keys(), exports['b'].keys()}")

        # Somehow we sometimes get something shoved in b when it should be in a....
        lone_side = exports['b'] if len(exports['a'].keys()) == 0 else exports['a']
        lone_frame = _keep_columns(_sorted_export_frame(lone_side[pairs[0]]), ['epoch', 'PM2_5'])

        exports['a'] = {pairs[0] : {"channel_names":['PM2_5_a'],"data":lone_frame.values.tolist()}}
        exports['b'] = {}
        exports['avg'] = {pairs[0] : {"channel_names":['PM2_5'],"data":lone_frame.values.tolist()}}


In [None]:
def find_v2_id(id):
    """Return the v2 feed id whose v1 id list in `v2_pairings` contains `id`.

    Pairings are checked in the dictionary's iteration order and the first hit
    wins; None is returned when no pairing lists `id`.
    """
    hits = (v2_id for v2_id, v1_ids in v2_pairings.items() if id in v1_ids)
    return next(hits, None)


# Upload each reduced export to the v2 feed paired with its v1 feed.
for upload in data_list:
    # 'a' and 'b' are already paired, so looking at 'a' alone is enough to find the v2 id.
    old_id = list(upload['a'].keys())[0]
    v2_id = find_v2_id(old_id)
    v2_feed = esdr.get_feed_by_id(v2_id)
    print(old_id, v2_id)
    # Sections are 'a', 'b' and 'avg'; empty ones carry nothing to upload.
    for section in upload.values():
        if len(section) == 0:
            continue
        feed_payload = list(section.values())[0]
        esdr.upload(v2_feed, feed_payload)
