In [115]:
# Reference: https://jupyterbook.org/interactive/hiding.html
# Use {hide, remove}-{input, output, cell} tags to hide content

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

# Apply seaborn's default theme to plots created after this point.
sns.set()
# Switch to the 'talk' plotting context preset
# (presumably for larger, presentation-sized plot elements -- confirm).
sns.set_context('talk')
# Keep printed numpy arrays short: summarize arrays with more than 20
# elements, show 2 decimal places, and suppress scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)

# Keep displayed DataFrames compact.
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified option name: the bare pattern 'precision' is
# ambiguous on pandas versions that also define 'styler.format.precision',
# and pandas raises OptionError when a pattern matches multiple keys.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    '''
    Displays df with temporary limits on the number of rows and columns.

    The limits apply only while df is being displayed. The defaults for
    rows and cols are the pandas display options in effect when this
    function is defined.
    '''
    overrides = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*overrides):
        display(df)

In [116]:
def df_interact(df):
    '''
    Outputs a slider for paging through the rows of df, five at a time.

    The slider value is the position of the first row shown; every column
    of the selected rows is returned for display. After creating the
    widget, prints the total number of rows and columns in df.
    '''
    page_size = 5

    def peek(row=0):
        # Positional slice, so paging does not depend on the index labels.
        return df.iloc[row:row + page_size]

    # Cap the slider at the last valid row position. With len(df) as the
    # maximum, the final stop could select an empty slice whenever the
    # length is a multiple of the page size.
    last_row = max(len(df) - 1, 0)
    interact(peek, row=(0, last_row, page_size))
    print('({} rows, {} columns) total'.format(df.shape[0], df.shape[1]))

In [19]:
import requests

# Outline for PurpleAir Case Study

Introduction

- wildfires, esp in california
  - want rapid response for evacuation and safety
- us EPA has sensors
  - highly accurate, but expensive, sparse and slow
- purpleair sensors
  - cheap, ubiquitous, fast. but not as accurate
- the AQI readings from purpleair often differ from those of US EPA sensors
  - figure: two screenshots comparing purpleair with us EPA maps
- is it possible to combine the readings to get accurate, timely, and spatially
  close readings?
- barkjohn et al. investigated this question and found that yes, you can adjust
  the purpleair data so that it closely follows the more accurate us epa data.
- illustrates an important application of data science: using large, inaccurate
  data to amplify usefulness of small, accurate data.
  - the results of this work are currently used in the official US AQI websites
    and on purpleair as the "EPA correction".
- in this case study, we replicate parts of barkjohn's analysis.
  - serves as a real-world, extended example of EDA, data cleaning, and
    visualization.
  - also has ties to subsequent chapters on web data and modeling

Overview of analysis

- goal of analysis is to find a correction for purpleair data to make it match
  us epa data.
- this involves the following steps, which we recreate from the barkjohn study.
  - get purpleair and epa data
  - find co-located purpleair and EPA sensors.
  - explore, clean, and visualize each dataset
  - join datasets together so that they can be directly compared
  - construct a model to minimize the difference between purpleair and epa
    data.

## Co-located Sensors