In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium

#### Data Information

In [2]:
# Read the Houston daily max 8-hour CO concentration CSV into a DataFrame
# and preview its first rows.
csv_path = './raw_csvs/daily_max_8_hour_CO_concentration_houston.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 8-hour CO Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,1/1/2023,AQS,482011035,1,0.4,ppm,5,Clinton,19,79,...,Carbon monoxide,593,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.733726,-95.257593
1,1/2/2023,AQS,482011035,1,0.3,ppm,3,Clinton,24,100,...,Carbon monoxide,593,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.733726,-95.257593
2,1/3/2023,AQS,482011035,1,0.5,ppm,6,Clinton,24,100,...,Carbon monoxide,593,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.733726,-95.257593
3,1/4/2023,AQS,482011035,1,0.8,ppm,9,Clinton,9,38,...,Carbon monoxide,593,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.733726,-95.257593
4,1/5/2023,AQS,482011035,1,0.9,ppm,10,Clinton,24,100,...,Carbon monoxide,593,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.733726,-95.257593


In [3]:
# Parse the 'Date' strings into datetimes, then show every column's dtype.
# The format is spelled out because values such as '1/2/2023' are ambiguous
# (2 January vs. 1 February); the consecutive daily rows shown by data.head()
# establish that the file uses month/day/year. With an explicit format a
# malformed entry raises instead of being parsed by inference.
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data.dtypes

Date                                 datetime64[ns]
Source                                       object
Site ID                                       int64
POC                                           int64
Daily Max 8-hour CO Concentration           float64
Units                                        object
Daily AQI Value                               int64
Local Site Name                              object
Daily Obs Count                               int64
Percent Complete                              int64
AQS Parameter Code                            int64
AQS Parameter Description                    object
Method Code                                   int64
CBSA Code                                     int64
CBSA Name                                    object
State FIPS Code                               int64
State                                        object
County FIPS Code                              int64
County                                       object
Site Latitud

In [4]:
# Print the distinct values found in every column except the first one.
for column_name in data.columns[1:]:
    distinct_values = data[column_name].unique()
    print(column_name, distinct_values)

Source ['AQS']
Site ID [482011035 482011039 482011052]
POC [1 2]
Daily Max 8-hour CO Concentration [0.4 0.3 0.5 0.8 0.9 0.7 0.6 0.2 0.1 1.2 2.4 1.3 1.4 1.1 1.  1.5 1.7 1.6]
Units ['ppm']
Daily AQI Value [ 5  3  6  9 10  8  7  2  1 14 27 15 16 13 11 17 19 18]
Local Site Name ['Clinton' 'Houston Deer Park #2' 'Houston North Loop']
Daily Obs Count [19 24  9 13  3 16  7 17  8 11 22 20 21 23 10 15  1 18  2 14  4  6  5]
Percent Complete [ 79 100  38  54  13  67  29  71  33  46  92  83  88  96  42  63   4  75
   8  58  17  25  21]
AQS Parameter Code [42101]
AQS Parameter Description ['Carbon monoxide']
Method Code [593  93]
CBSA Code [26420]
CBSA Name ['Houston-The Woodlands-Sugar Land, TX']
State FIPS Code [48]
State ['Texas']
County FIPS Code [201]
County ['Harris']
Site Latitude [29.733726 29.670025 29.81453 ]
Site Longitude [-95.257593 -95.128508 -95.38769 ]


In [5]:
# Use 'Date' as the index and order the rows by it.
data = data.set_index('Date').sort_index()

#### Data Preprocessing

In [6]:
# Drop every column that holds a single distinct value: it carries no
# information that distinguishes one row from another.
constant_columns = [
    name for name in data.columns
    if data[name].unique().size == 1
]
data = data.drop(columns=constant_columns)

In [7]:
# For each column other than the CO concentration itself, print how many rows
# carry each distinct value, in order of first appearance.
skipped_column = 'Daily Max 8-hour CO Concentration'
for column_name in data.columns:
    if column_name != skipped_column:
        column = data[column_name]
        for value in column.unique():
            # Summing the boolean mask counts the rows equal to this value.
            occurrences = int((column == value).sum())
            print(f"{column_name}: {value!s} {occurrences!s}")

Site ID: 482011035 338
Site ID: 482011039 360
Site ID: 482011052 364
POC: 1 702
POC: 2 360
Daily AQI Value: 5 99
Daily AQI Value: 2 306
Daily AQI Value: 8 101
Daily AQI Value: 7 74
Daily AQI Value: 3 214
Daily AQI Value: 6 80
Daily AQI Value: 10 40
Daily AQI Value: 9 82
Daily AQI Value: 15 3
Daily AQI Value: 27 1
Daily AQI Value: 16 2
Daily AQI Value: 1 23
Daily AQI Value: 13 8
Daily AQI Value: 11 20
Daily AQI Value: 14 6
Daily AQI Value: 17 1
Daily AQI Value: 19 1
Daily AQI Value: 18 1
Local Site Name: Clinton 338
Local Site Name: Houston Deer Park #2 360
Local Site Name: Houston North Loop 364
Daily Obs Count: 19 4
Daily Obs Count: 24 956
Daily Obs Count: 9 3
Daily Obs Count: 18 32
Daily Obs Count: 13 3
Daily Obs Count: 3 2
Daily Obs Count: 16 10
Daily Obs Count: 7 5
Daily Obs Count: 17 12
Daily Obs Count: 8 4
Daily Obs Count: 11 1
Daily Obs Count: 22 3
Daily Obs Count: 20 5
Daily Obs Count: 21 2
Daily Obs Count: 23 4
Daily Obs Count: 10 2
Daily Obs Count: 15 4
Daily Obs Count: 1 1
D

In [60]:
# One row per monitoring site as (name, longitude, latitude).
# The sites are taken from the distinct combinations in the whole frame rather
# than from the rows of a single day, so a station with no reading on one
# particular date is still drawn. Coordinates stay numeric (object array)
# instead of being coerced to strings alongside the site names.
sites = (
    data[['Local Site Name', 'Site Longitude', 'Site Latitude']]
    .drop_duplicates()
    .to_numpy()
)

# Bounding box the map is restricted to, and the point it is centred on.
# NOTE(review): values presumably frame the Houston area / downtown — confirm.
min_lat, max_lat = 29.4, 30.1
min_long, max_long = -95.85, -94.9
map_center = [29.7604, -95.3698]

# Create a map only showing Houston: max_bounds=True keeps panning inside the
# min/max latitude-longitude box, and min_zoom limits how far out one can zoom.
m = folium.Map(
    location=map_center,
    tiles='cartodb positron',
    zoom_start=11,
    min_zoom=10,
    max_bounds=True,
    min_lat=min_lat,
    max_lat=max_lat,
    min_lon=min_long,
    max_lon=max_long,
)

# Drop one marker per site; folium expects [latitude, longitude].
for site_name, site_lon, site_lat in sites:
    folium.Marker(
        location=[float(site_lat), float(site_lon)],
        popup=site_name,
    ).add_to(m)
m