In [1]:
# Reference: https://jupyterbook.org/interactive/hiding.html
# Use {hide, remove}-{input, output, cell} tags to hide content

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

# Apply seaborn's default theme with the larger 'talk' plotting context
sns.set(context='talk')
# Keep printed arrays and dataframes compact: truncate long output and
# show two digits after the decimal point.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified option name. The abbreviated 'precision' pattern
# is rejected with an OptionError by newer pandas releases.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    """Display `df` with temporary limits on the rows and columns shown.

    Parameters
    ----------
    df : object to pass to `display` (typically a dataframe)
    rows : value used for pandas' 'display.max_rows' while displaying
    cols : value used for pandas' 'display.max_columns' while displaying

    The defaults are the pandas option values in effect when this function
    is defined. The overrides apply only inside the `with` block.
    """
    overrides = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*overrides):
        display(df)

In [9]:
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df

    Each slider picks the offset of the first row (or column) of a window
    that is `nrows` rows by `ncols` columns. A slider is only created for
    an axis that is longer than its window; otherwise the offset is fixed
    at 0. After the widget, prints the total shape of df.
    '''
    def peek(row=0, col=0):
        # Slice the window that starts at the selected offsets
        return df.iloc[row:row + nrows, col:col + ncols]

    # The slider maximum is the last valid offset (length - 1). Using the
    # length itself lets the slider reach an offset past the end whenever
    # the length is an exact multiple of the step, showing an empty frame.
    row_arg = (0, len(df) - 1, nrows) if len(df) > nrows else fixed(0)
    col_arg = ((0, len(df.columns) - 1, ncols)
               if len(df.columns) > ncols else fixed(0))

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))

# Finding Collocated Sensors

When starting an analysis, we like to begin with a research question.
In this case, our research question might be:
"How do we correct PurpleAir sensor readings so that they match AQS sensor readings?"

Our analysis begins by finding collocated pairs of AQS and PurpleAir sensors---sensors that are placed immediately next to each other.
This step is important because it lets us reduce the effects of other variables that might cause differences in sensor readings.
Consider what would happen if we compared an AQS sensor placed inside a building with a PurpleAir sensor placed outside the building. 
The two sensors would have different readings, but some of these differences
would happen because the sensors are exposed to different environments.
Ensuring that sensors are truly collocated lets us say that the differences in sensor
readings are caused by the different ways the sensors are built, rather than
other potential confounding variables.

Barkjohn et al.'s analysis found pairs of AQS and PurpleAir sensors that
are installed within 50 meters of each other.
Then, they contacted each AQS site to see whether the people maintaining the site
also maintained a PurpleAir sensor.
This extra effort gave them confidence that their sensor pairs were truly collocated.

In this section, we'll explore and clean data from the AQS and PurpleAir.
Then, we'll perform a similar join to construct a list of potentially collocated sensors.
We won't contact AQS sites ourselves;
instead, we'll proceed with the analysis by reusing Barkjohn et al.'s list of truly
collocated sensors.

We've downloaded a list of AQS and PurpleAir sensors in `data/list_of_aqs_sites.csv`
and `data/list_of_purpleair_sensors.json`.
Let's begin by reading these files into `pandas`.
First, we check file sizes to see whether they are reasonable to load into memory.

In [23]:
# -l: long listing, -L: follow symbolic links, -h: human-readable file sizes
!ls -lLh data/list_of*

-rw-r--r--  1 sam  staff   4.8M Oct 27 16:54 data/list_of_aqs_sites.csv
-rw-r--r--  1 sam  staff   3.8M Oct 22 16:10 data/list_of_purpleair_sensors.json


Both files are relatively small. We'll start with the list of AQS sites.

## Wrangling and Exploring the List of AQS Sites

Let's load the CSV file into `pandas`.

In [29]:
# Read the list of AQS sites from the CSV file into a dataframe
aqs_sites = pd.read_csv('data/list_of_aqs_sites.csv')

# The table output is fairly large, so display 2 rows
aqs_sites.head(2)

Unnamed: 0,AQS_Site_ID,POC,State,City,...,Reporting_Agency,Parameter_Name,Annual_URLs,Daily_URLs
0,01-003-0010,1,Alabama,Fairhope,...,Al Dept Of Env Mgt,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...
1,01-027-0001,1,Alabama,Ashland,...,Al Dept Of Env Mgt,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...


In [32]:
# Shape of the table as a tuple: (number of rows, number of columns)
aqs_sites.shape

(1333, 28)

There are 28 columns in the table.
Jupyter hides the middle columns to avoid overfilling the screen.
However, we want to see what the columns are so that we can remove columns that
aren't useful to us.
To do this, we'll use a trick. We'll slice out the first row of `aqs_sites`, then
convert it to a dataframe.

In [33]:
# iloc[0] selects the first row as a Series indexed by the column names;
# to_frame() turns that Series into a one-column dataframe for display
aqs_sites.iloc[0].to_frame()

Unnamed: 0,0
AQS_Site_ID,01-003-0010
POC,1
State,Alabama
...,...
Parameter_Name,PM2.5
Annual_URLs,<a href='https://www3.epa.gov/cgi-bin/broker?_...
Daily_URLs,<a href='https://www3.epa.gov/cgi-bin/broker?_...


This converts the columns to the index of the dataframe for display purposes.
Next, we'll tell `pandas` to display more rows so that we can browse all the
columns at once.

In [34]:
from IPython.display import display

# aqs_sites has 28 columns, so the first row shown as a one-column frame
# has 28 rows; raise the row limit enough to show all of them
rows_to_show = 28
# option_context applies the option only inside the with block
with pd.option_context('display.max_rows', rows_to_show):
    display(aqs_sites.iloc[0].to_frame())

Unnamed: 0,0
AQS_Site_ID,01-003-0010
POC,1
State,Alabama
City,Fairhope
CBSA,"Daphne-Fairhope-Foley, AL"
Local_Site_Name,"FAIRHOPE, Alabama"
Address,"FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE..."
Datum,NAD83
Latitude,30.5
Longitude,-87.88


This 

In [20]:
from pathlib import Path

# Relative path of the directory that holds the data files
data_dir = Path('data')

# List the data directory (-l long format, -L follow symlinks, -h readable sizes)
!ls -lLh data/

total 19376
-rw-r--r--   1 sam  staff   867K Oct 22 16:10 aqs_06-067-0010.csv
drwxr-xr-x  10 sam  staff   320B Oct 22 16:10 [34mcleaned_purpleair_aqs[m[m
-rw-r--r--   1 sam  staff   4.8M Oct 27 16:54 list_of_aqs_sites.csv
-rw-r--r--   1 sam  staff   3.8M Oct 22 16:10 list_of_purpleair_sensors.json
-rw-r--r--   1 sam  staff   2.5K Oct 22 16:10 matched_pa_aqs.csv
drwxr-xr-x   7 sam  staff   224B Oct 22 16:10 [34mpurpleair_AMTS[m[m


In [11]:
# NOTE(review): aqs_sites was already read from this same file in an earlier
# cell; this reload looks redundant. Confirm before removing.
aqs_sites = pd.read_csv(data_dir / 'list_of_aqs_sites.csv')
aqs_sites

Unnamed: 0,AQS_Site_ID,POC,State,City,...,Reporting_Agency,Parameter_Name,Annual_URLs,Daily_URLs
0,01-003-0010,1,Alabama,Fairhope,...,Al Dept Of Env Mgt,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...
1,01-027-0001,1,Alabama,Ashland,...,Al Dept Of Env Mgt,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...
2,01-049-1003,1,Alabama,Crossville,...,Al Dept Of Env Mgt,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...
...,...,...,...,...,...,...,...,...,...
1330,80-002-0012,3,Country Of Mexico,Mexicali,...,Tracer Technologies,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...
1331,80-002-0012,4,Country Of Mexico,Mexicali,...,Tracer Technologies,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...
1332,80-002-0014,3,Country Of Mexico,Mexicali,...,Tracer Technologies,PM2.5,<a href='https://www3.epa.gov/cgi-bin/broker?_...,<a href='https://www3.epa.gov/cgi-bin/broker?_...


In [12]:
# Browse aqs_sites interactively, one window of rows and columns at a time
df_interact(aqs_sites)

interactive(children=(IntSlider(value=0, description='row', max=1333, step=7), IntSlider(value=0, description=…

(1333 rows, 28 columns) total


In [4]:
# Peek at the start of the JSON file (head prints the first 10 lines),
# keeping only the first 70 characters of each line
!head data/list_of_purpleair_sensors.json | cut -c 1-70

{"version":"7.0.30",
"fields":
["ID","pm","pm_cf_1","pm_atm","age","pm_0","pm_1","pm_2","pm_3","pm_4"
"data":[
[20,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97,0.0,0.0,0.0,0.0,0.0,0
[47,null,null,null,4951,null,null,null,null,null,null,null,96,null,nul
[53,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.2,5.2,6.0,97,0.0,0.5,702.3,57.5,6.
[74,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97,0.0,0.0,0.0,0.0,0.0,0
[77,9.8,9.8,9.8,1,9.8,10.7,11.0,11.2,13.8,15.1,15.5,97,9.7,9.8,6523.5,
[81,6.5,6.5,6.5,0,6.5,6.1,6.1,6.6,8.1,8.3,9.7,97,5.9,6.8,4058.9,346.1,


In [5]:
import json

# Read the file as UTF-8 explicitly so it decodes the same way on every
# platform instead of depending on the default locale encoding
with open(data_dir / 'list_of_purpleair_sensors.json', encoding='utf-8') as f:
    pa_json = json.load(f)

# Top-level keys of the parsed JSON object
list(pa_json.keys())

['version', 'fields', 'data', 'count']

In [6]:
# Each entry of pa_json['data'] becomes one row; pa_json['fields'] supplies
# the column names
pa_sites = pd.DataFrame(pa_json['data'], columns=pa_json['fields'])
pa_sites

Unnamed: 0,ID,pm,pm_cf_1,pm_atm,...,Voc,Ozone1,Adc,CH
0,20,0.0,0.0,0.0,...,,,0.01,1
1,47,,,,...,,0.72,0.72,0
2,53,0.0,0.0,0.0,...,,,0.00,1
...,...,...,...,...,...,...,...,...,...
23135,132237,5.3,5.3,5.3,...,,,0.00,3
23136,132431,3.2,3.2,3.2,...,,,0.03,3
23137,132471,0.5,0.5,0.5,...,,,0.05,3


Here, we would do a join on latitude and longitude to find PA and AQS sensors
that are within 50 meters of each other. The paper authors then contacted each
AQS site to see whether the sensors are actually co-located. We'll use their
list of co-located sensors.

Final list of co-located sensors:

In [7]:
# Load the list of collocated PurpleAir/AQS sensor pairs
matched = pd.read_csv(data_dir / 'matched_pa_aqs.csv')
matched

Unnamed: 0,PA ID,PA name (not included for private sensors),AQS site,Start Date,End Date
0,DE1,Private,10-030-2004,7/27/2019,11/18/2019
1,AK2,Ncore 2,02-090-0034,11/7/2018,1/12/2019
2,AK4,Ncore 3,02-090-0034,1/15/2019,6/16/2019
...,...,...,...,...,...
47,WI2,Private,55-087-0009,1/6/2019,10/24/2019
48,WI3,Private,55-087-0009,3/30/2019,4/4/2019
49,WI6,Private,55-133-0027,1/1/2019,3/27/2019
