# Project#1 -- Air Quality Analysis - UNMC
Cameron Skeels - 
2/2/2026 - 
AirQuality_Daily_StudentVersion

# Imports

In [3]:
# Import libraries

import pandas as pd
import numpy as np

# Load Data

In [5]:
# Load PurpleAir csv
df = pd.read_csv("AirQuality_Daily_StudentVersion.csv")

# Preview data
df.head()

Unnamed: 0,date,monitor_index,humidity,pressure,temperature,voc,analog_input,pm2.5_alt,pm1.0_atm,pm2.5_atm,pm10.0_atm,sensor.latitude,sensor.longitude,sensor.altitude,sensor.name
0,02/23/24,195089,14.377667,912.884333,62.266667,51.998667,0.051333,0.1,0.0,0.0025,0.039667,40.050922,-101.53357,3005,Swnphd-Benklemen
1,02/23/24,195365,12.2236,926.403,71.1934,64.9208,0.0,0.18,0.0048,0.02,0.176,40.20033,-100.639885,2576,Swnphd-mccook
2,02/23/24,195541,20.09575,905.67075,61.00825,68.307,0.02,0.1625,0.004125,0.014812,0.063937,41.128284,-101.72022,3220,Swnphd-ogallala
3,02/24/24,195089,25.368,911.708833,51.462458,91.17675,0.052667,0.4375,0.099542,0.170667,0.355208,40.050922,-101.53357,3005,Swnphd-Benklemen
4,02/24/24,195365,23.703083,925.282125,56.818208,107.863708,0.0,0.475,0.099208,0.231687,0.548583,40.20033,-100.639885,2576,Swnphd-mccook


# Data Clean and Formatting

In [9]:
# Convert date column to datetime format
df["date"] = pd.to_datetime(df["date"])

# Standardize column names
df.columns = df.columns.str.lower().str.strip()

# Remove rows with missing critical values and create a clean copy
df = df.dropna(
    subset=["sensor.name", "pm2.5_atm", "pm10.0_atm", "voc"]
).copy()

# Humidity Classification

In [13]:
# Define humidity category function
def humidity_category(h):
    if h < 50:
        return "Low"
    elif 50 <= h <= 80:
        return "High"
    else:
        return "Very High"

# Apply humidity categories
df["humidity_category"] = df["humidity"].apply(humidity_category)

# Temperature Classification

In [20]:
# Define temperature category function
def temperature_category(t):
    if t < 32:
        return "Below Freezing"
    elif 32 <= t <= 50:
        return "Cool"
    elif 51 <= t <= 70:
        return "Warm"
    else:
        return "Hot"

# Apply temperature categories
df["temperature_category"] = df["temperature"].apply(temperature_category)

# Mean & Median Pollutant Concentrations

In [21]:
# Calculate mean and median pollutant concentrations by sensor
summary_stats = (
    df.groupby("sensor.name")[["voc", "pm2.5_atm", "pm10.0_atm"]]
    .agg(["mean", "median"])
)

summary_stats

Unnamed: 0_level_0,voc,voc,pm2.5_atm,pm2.5_atm,pm10.0_atm,pm10.0_atm
Unnamed: 0_level_1,mean,median,mean,median,mean,median
sensor.name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
#16 - Richardson County Courthouse,86.75865,84.4705,37.84601,7.730917,39.210342,8.857729
#17 - Otoe County,251.427296,234.953542,9.787754,7.68875,11.709849,9.414708
#18 - Southeast District Health Department- Tecumseh,196.150771,138.582583,613.175352,10.322875,614.227248,11.433729
Ainsworth Public School #9,219.200274,217.8975,10.697058,8.38974,12.390708,9.449781
Broken Bow,158.285807,138.559646,928.710593,36.05024,929.678512,43.179094
Buffalo County TRPHD #26,328.835391,343.764625,8.479442,5.6985,9.596038,6.656625
ELVPHD Norfolk HD 4,360.833744,368.5805,13.65942,10.044583,16.232581,11.722896
ELVPHD Tekamah HD 3,221.091595,145.515479,9.482822,6.418302,10.683828,7.612552
ELVPHD Wisner HD 5,102.291126,100.709271,11.993665,8.876021,14.323023,10.357156
FCHD-YPS,372.46272,375.3835,9.167842,5.838188,11.138241,7.44275


# Top 5 sensors (PM2.5 Mean and Median)

In [22]:
# Identify top 5 sensors by mean PM2.5
top5_pm25_mean = (
    df.groupby("sensor.name")["pm2.5_atm"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

# Identify top 5 sensors by median PM2.5
top5_pm25_median = (
    df.groupby("sensor.name")["pm2.5_atm"]
    .median()
    .sort_values(ascending=False)
    .head(5)
)

top5_pm25_mean, top5_pm25_median


(sensor.name
 Broken Bow                                              928.710593
 #18 - Southeast District Health Department- Tecumseh    613.175352
 NCDHD O'Neill #11                                       164.495078
 Swnphd-mccook                                           123.011622
 #16 - Richardson County Courthouse                       37.846010
 Name: pm2.5_atm, dtype: float64,
 sensor.name
 Broken Bow                                              36.050240
 #18 - Southeast District Health Department- Tecumseh    10.322875
 ELVPHD Norfolk HD 4                                     10.044583
 ELVPHD Wisner HD 5                                       8.876021
 Ainsworth Public School #9                               8.389740
 Name: pm2.5_atm, dtype: float64)

# Top 5 Sensors (PM10.0 Mean and Median)

In [23]:
# Identify top 5 sensors by mean PM10.0
top5_pm10_mean = (
    df.groupby("sensor.name")["pm10.0_atm"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

# Identify top 5 sensors by median PM10.0
top5_pm10_median = (
    df.groupby("sensor.name")["pm10.0_atm"]
    .median()
    .sort_values(ascending=False)
    .head(5)
)

top5_pm10_mean, top5_pm10_median


(sensor.name
 Broken Bow                                              929.678512
 #18 - Southeast District Health Department- Tecumseh    614.227248
 NCDHD O'Neill #11                                       166.132578
 Swnphd-mccook                                           124.227336
 #16 - Richardson County Courthouse                       39.210342
 Name: pm10.0_atm, dtype: float64,
 sensor.name
 Broken Bow                                              43.179094
 ELVPHD Norfolk HD 4                                     11.722896
 #18 - Southeast District Health Department- Tecumseh    11.433729
 ELVPHD Wisner HD 5                                      10.357156
 Ainsworth Public School #9                               9.449781
 Name: pm10.0_atm, dtype: float64)

# Top 5 Sensors (VOC Mean and Median)

In [24]:
# Identify top 5 sensors by mean VOC
top5_voc_mean = (
    df.groupby("sensor.name")["voc"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

# Identify top 5 sensors by median PM2.5
top5_voc_median = (
    df.groupby("sensor.name")["voc"]
    .median()
    .sort_values(ascending=False)
    .head(5)
)

top5_voc_mean, top5_voc_median


(sensor.name
 Swnphd-ogallala                          399.434240
 FCHD-YPS                                 372.462720
 Three Rivers Public Health Department    370.216208
 ELVPHD Norfolk HD 4                      360.833744
 Swnphd-mccook                            353.941581
 Name: voc, dtype: float64,
 sensor.name
 Swnphd-ogallala                          423.082292
 Swnphd-mccook                            381.468479
 Three Rivers Public Health Department    376.810167
 FCHD-YPS                                 375.383500
 ELVPHD Norfolk HD 4                      368.580500
 Name: voc, dtype: float64)

# Maximum Pollution Events

In [25]:
# Identify maximum pollutant values and occurrence dates
max_values = df.loc[
    df.groupby("sensor.name")[["pm2.5_atm", "pm10.0_atm", "voc"]]
    .idxmax()
    .stack()
]

max_values[["sensor.name", "date", "pm2.5_atm", "pm10.0_atm", "voc"]]

Unnamed: 0,sensor.name,date,pm2.5_atm,pm10.0_atm,voc
3241,#16 - Richardson County Courthouse,2024-07-29,1671.833458,1672.959229,95.506250
3241,#16 - Richardson County Courthouse,2024-07-29,1671.833458,1672.959229,95.506250
5273,#16 - Richardson County Courthouse,2024-10-28,21.183458,23.369896,184.185417
3781,#17 - Otoe County,2024-08-20,37.250062,45.739812,125.517667
3781,#17 - Otoe County,2024-08-20,37.250062,45.739812,125.517667
...,...,...,...,...,...
1360,WCDHD City Building,2024-05-14,32.589521,40.104833,109.362167
8010,WCDHD City Building,2025-03-10,2.594146,3.583500,612.052292
7498,WCDHD Thedford Library 29,2025-02-15,36.138729,48.852063,251.385875
7498,WCDHD Thedford Library 29,2025-02-15,36.138729,48.852063,251.385875


# Humidity vs Air Quality

In [26]:
# Evaluate effect of humidity categories on air quality
humidity_effect = (
    df.groupby("humidity_category")[["pm2.5_atm", "pm10.0_atm"]]
    .mean()
)

humidity_effect

Unnamed: 0_level_0,pm2.5_atm,pm10.0_atm
humidity_category,Unnamed: 1_level_1,Unnamed: 2_level_1
High,80.874444,82.561133
Low,76.524933,77.891712
Very High,4.558496,5.418232


# Temperature vs Air Quality

In [27]:
# Evaluate effect of temperature categories on air quality
temperature_effect = (
    df.groupby("temperature_category")[["pm2.5_atm", "pm10.0_atm"]]
    .mean()
)

temperature_effect

Unnamed: 0_level_0,pm2.5_atm,pm10.0_atm
temperature_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Below Freezing,274.235903,276.586581
Cool,141.731525,143.142123
Hot,22.658487,24.194328
Warm,83.102238,84.021411


# AQI Health Risk Events (Proxy)

In [30]:
# Identify potential unhealthy conditions using PM thresholds
aqi_proxy = df[
    (df["pm2.5_atm"] >= 35.5) | (df["pm10.0_atm"] >= 155)
]

aqi_proxy[["sensor.name", "date", "pm2.5_atm", "pm10.0_atm"]]

Unnamed: 0,sensor.name,date,pm2.5_atm,pm10.0_atm
370,ELVPHD Wisner HD 5,2024-03-29,39.761896,50.163333
377,ELVPHD Norfolk HD 4,2024-03-29,38.425396,43.370375
529,TRPHD Harlan Co. Courthouse 24,2024-04-06,37.819938,43.793979
544,Swnphd-ogallala,2024-04-06,39.169042,46.152625
1341,ELVPHD Wisner HD 5,2024-05-14,48.140771,60.299813
...,...,...,...,...
8274,Swnphd-mccook,2025-03-23,981.919875,982.185500
8291,#18 - Southeast District Health Department- Te...,2025-03-24,1692.498333,1692.793896
8293,NCDHD O'Neill #11,2025-03-24,1714.236813,1714.360500
8294,Broken Bow,2025-03-24,1713.840375,1713.964083


# Altitude Impact Analysis

In [28]:
# Summarize air quality metrics by sensor altitude
altitude_summary = (
    df.groupby("sensor.name")[["sensor.altitude", "pm2.5_atm", "pm10.0_atm"]]
    .mean()
    .sort_values("sensor.altitude", ascending=False)
)

altitude_summary

Unnamed: 0_level_0,sensor.altitude,pm2.5_atm,pm10.0_atm
sensor.name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
WCDHD Arthur High School 28,3661.0,6.658474,7.712941
SWNPHD-Imerial,3269.0,6.329091,7.281414
Swnphd-ogallala,3220.0,10.396579,11.347889
Swnphd-Benklemen,3005.0,7.501374,9.04144
WCDHD Thedford Library 29,2850.0,6.21971,7.509269
WCDHD City Building,2800.0,3.971512,4.817597
Broken Bow,2582.0,928.710593,929.678512
Swnphd-mccook,2576.0,123.011622,124.227336
Ainsworth Public School #9,2519.0,10.697058,12.390708
TRPHD Dawson Co. Courthouse 25,2392.0,18.080629,19.548148


# Altitude Correlation

In [29]:
# Compute correlation between altitude and particulate matter
altitude_summary.corr()

Unnamed: 0,sensor.altitude,pm2.5_atm,pm10.0_atm
sensor.altitude,1.0,-0.014693,-0.015633
pm2.5_atm,-0.014693,1.0,0.999998
pm10.0_atm,-0.015633,0.999998,1.0
