In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Air Quality datasets 
# https://aqs.epa.gov/aqsweb/airdata/download_files.html#Raw
!wget https://aqs.epa.gov/aqsweb/airdata/hourly_42602_2017.zip
!unzip hourly_42602_2017.zip

In [None]:
# List the contents of the current working directory
# (e.g. to check which files the download cell left behind).
!ls

In [None]:
# Load the hourly measurements into a DataFrame.
# The path is relative to the notebook's working directory, which is where the
# download cell extracts hourly_42602_2017.csv. An absolute path into one
# user's home directory would not exist on any other machine.
aq_data = pd.read_csv('./hourly_42602_2017.csv')

In [None]:
# Preview the top of the table: the first 5 rows
aq_data.head(5)

In [None]:
# Show the labels of every column in the dataset
aq_data.columns

In [None]:
# Summary statistics of the 'Sample Measurement' column
# (for numeric data describe() reports count, mean, std, min, quartiles and max)
aq_data['Sample Measurement'].describe()

In [None]:
# Show each distinct monitoring site number once, in order of first appearance
pd.unique(aq_data['Site Num'])

In [None]:
# Count the air quality monitoring sites in a single state, "California".
# In the EPA files a 'Site Num' is only unique within its county, so a site is
# identified by the (County Code, Site Num) pair. Counting unique 'Site Num'
# values alone would merge sites from different counties that share a number.
(
    aq_data.loc[aq_data['State Name'] == "California", ['County Code', 'Site Num']]
    .drop_duplicates()
    .shape[0]
)

In [None]:
# Select the measurements taken inside a longitude/latitude bounding box
# around Los Angeles, then make sure the derived 'Timestamp' and 'Weekday'
# columns exist on the result.
in_la_box = (
    (aq_data['Longitude'] > -118.455937)
    & (aq_data['Longitude'] < -117.842111)
    & (aq_data['Latitude'] > 33.764836)
    & (aq_data['Latitude'] < 34.173729)
)
# .copy() so that adding columns below never writes into a slice of aq_data
los_angeles_aq = aq_data.loc[in_la_box].copy()

# The plotting cells read 'Timestamp' and 'Weekday'. Derive them when the
# loaded file does not already provide them.
# Assumes the EPA raw-file columns 'Date Local' (YYYY-MM-DD) and
# 'Time Local' (HH:MM) -- TODO confirm against aq_data.columns.
if 'Timestamp' not in los_angeles_aq.columns:
    los_angeles_aq['Timestamp'] = pd.to_datetime(
        los_angeles_aq['Date Local'] + ' ' + los_angeles_aq['Time Local']
    )
if 'Weekday' not in los_angeles_aq.columns:
    # day_name() yields 'Monday' ... 'Sunday'
    los_angeles_aq['Weekday'] = pd.to_datetime(los_angeles_aq['Timestamp']).dt.day_name()

los_angeles_aq

In [None]:
# Plot the measurements of every Los Angeles monitoring site on one shared
# axes, one line per site.
# NOTE(review): the original cell read its data from `los_angeles_aq_trimmed`,
# which no visible cell defines. `los_angeles_aq` is used instead so the cell
# runs on a fresh kernel -- confirm no trimming step was intended.
# The frame must carry a 'Timestamp' column.
site_nums = los_angeles_aq['Site Num'].unique()

# Create the axes explicitly; the original used `ax` without defining it.
fig, ax = plt.subplots()

if len(site_nums):
    # One distinct colour per site. `plt.cm.spectral` no longer exists in
    # matplotlib; the same colormap is available as `nipy_spectral`.
    colors = [plt.cm.nipy_spectral(i) for i in np.linspace(0, 1, site_nums.shape[0])]
    ax.set_prop_cycle('color', colors)

for site in site_nums:
    site_rows = los_angeles_aq.loc[los_angeles_aq['Site Num'] == site]
    ax.plot(site_rows['Timestamp'], site_rows['Sample Measurement'], label=str(site))

ax.set_xlabel('Timestamp')
ax.set_ylabel('Air Quality Level')
# The legend is built after plotting so it can pick up the labelled lines
if len(site_nums):
    ax.legend(loc='upper left')


In [None]:
# Plot aq changes throughout the year, with a separate chart for each air
# quality monitoring site. Requires a 'Timestamp' column on los_angeles_aq.
site_nums = los_angeles_aq['Site Num'].unique()
ncols = 3
# Enough rows for every site (at least one so the grid is always valid)
nrows = max(1, int(np.ceil(len(site_nums) / ncols)))
# Height scales with the row count; 4 rows gives the original 15x15 figure
fig, axes = plt.subplots(nrows, ncols, figsize=(15, 3.75 * nrows), squeeze=False)
panels = axes.ravel()

# Shared y-range across all charts: 25% headroom above the overall maximum
data_summary = los_angeles_aq['Sample Measurement'].describe()
y_max = data_summary['max'] * 1.25

for ax, site in zip(panels, site_nums):
    site_rows = los_angeles_aq.loc[los_angeles_aq['Site Num'] == site]
    ax.plot(site_rows['Timestamp'], site_rows['Sample Measurement'])
    # Set the limit on this site's own axes. Calling plt.ylim before selecting
    # the subplot applied it to the previously active chart instead.
    if np.isfinite(y_max):
        ax.set_ylim(0, y_max)
    ax.set_title("site " + str(site))

# Hide grid cells that have no site to show
for ax in panels[len(site_nums):]:
    ax.set_visible(False)

In [None]:
# Plot aq changes throughout the day, hour by hour from 00:00 to 23:00,
# in a separate chart per weekday.
# Requires a 'Weekday' column holding day names and a 'Time Local' column in
# "HH:MM" form on los_angeles_aq.
nrows = 4
ncols = 2
fig, axes = plt.subplots(nrows, ncols, figsize=(15, 15))
panels = axes.ravel()
weekdays = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]

# 'Time Local' labels for all 24 hours: "00:00" ... "23:00".
# range(24) includes the 23:00 hour, which range(0, 23) left out.
hour_labels = ['{:02d}:00'.format(hour) for hour in range(24)]

# weekday name -> list of 24 hourly totals (index 0 is 00:00)
values_by_weekday = {}
for ax, day in zip(panels, weekdays):
    day_rows = los_angeles_aq.loc[los_angeles_aq['Weekday'] == day]
    # Sum of all measurements for each hour of this weekday. Hours with no
    # rows get 0, matching what summing an empty selection returns.
    times = (
        day_rows.groupby('Time Local')['Sample Measurement']
        .sum()
        .reindex(hour_labels, fill_value=0.0)
        .tolist()
    )
    values_by_weekday[day] = times
    ax.plot(times)
    ax.set_title(day)

# The 4x2 grid has one more cell than there are weekdays; hide the spare one
for ax in panels[len(weekdays):]:
    ax.set_visible(False)