In [1]:
import pandas as pd

In [2]:
air_data = pd.read_csv('AirQuality_Daily_StudentVersion.csv')
## Turn data into a dataframe
air_data = pd.DataFrame(air_data)

In [3]:
air_data_grouped = air_data.groupby("sensor.name")

air_data_aggregated = (
    air_data.groupby("sensor.name")
    .agg(
        pm25_mean=("pm2.5_atm", "mean"),
        pm25_median=("pm2.5_atm", "median"),
        pm10_mean=("pm10.0_atm", "mean"),
        pm10_median=("pm10.0_atm", "median"),
        voc_mean=("voc", "mean"),
        voc_median=("voc", "median"),
    )
)

top5_pm25 = air_data_aggregated.nlargest(5, "pm25_mean")
top5_pm10 = air_data_aggregated.nlargest(5, "pm10_mean")
top5_voc  = air_data_aggregated.nlargest(5, "voc_mean")




display(top5_pm25[["pm25_mean", "pm25_median"]])
display(top5_pm10[["pm10_mean", "pm10_median"]])
display(top5_voc[["voc_mean", "voc_median"]])

Unnamed: 0_level_0,pm25_mean,pm25_median
sensor.name,Unnamed: 1_level_1,Unnamed: 2_level_1
Broken Bow,928.710593,36.05024
#16 - Richardson County Courthouse,700.127342,11.977344
#18 - Southeast District Health Department- Tecumseh,613.175352,10.322875
NCDHD O'Neill #11,164.495078,7.251208
Swnphd-mccook,123.011622,4.582281


Unnamed: 0_level_0,pm10_mean,pm10_median
sensor.name,Unnamed: 1_level_1,Unnamed: 2_level_1
Broken Bow,929.678512,43.179094
#16 - Richardson County Courthouse,701.632446,13.305615
#18 - Southeast District Health Department- Tecumseh,614.227248,11.433729
NCDHD O'Neill #11,166.132578,8.427437
Swnphd-mccook,124.227336,5.372073


Unnamed: 0_level_0,voc_mean,voc_median
sensor.name,Unnamed: 1_level_1,Unnamed: 2_level_1
Swnphd-ogallala,399.43424,423.082292
FCHD-YPS,372.46272,375.3835
Three Rivers Public Health Department,370.216208,376.810167
ELVPHD Norfolk HD 4,360.833744,368.5805
Swnphd-mccook,353.941581,381.468479


In [4]:
df = air_data.copy()


df["date"] = pd.to_datetime(df["date"], format="%m/%d/%y", errors="coerce")

df = df.dropna(subset=["date"])

pollutants = {
    "PM2.5": "pm2.5_atm",
    "PM10":  "pm10.0_atm",   
    "VOC":   "voc",        
}


results = []

for pol_name, col in pollutants.items():
    s = pd.to_numeric(df[col], errors="coerce")
    max_val = s.max()

    max_rows = df.loc[s == max_val, ["date", "sensor.name", col]].copy()
    max_rows["pollutant"] = pol_name
    max_rows.rename(columns={"sensor.name": "location", col: "max_value"}, inplace=True)

    results.append(max_rows)

max_occurrences = (
    pd.concat(results, ignore_index=True)
      .sort_values(["pollutant", "date", "location"])
)

max_occurrences

Unnamed: 0,date,location,max_value,pollutant
1,2025-02-18,#16 - Richardson County Courthouse,3784.682542,PM10
0,2025-02-18,#16 - Richardson County Courthouse,3782.823313,PM2.5
2,2024-06-24,Swnphd-ogallala,1209.931571,VOC


In [5]:
def assign_temperature(temp_f):
    if temp_f < 32:
        return "Below Freezing"
    elif 32 <= temp_f <= 50:
        return "Cool"
    elif 51 <= temp_f <= 70:
        return "Warm"
    else:
        return "Hot"

def assign_humidity(rh):
    if rh < 50:
        return "Low"
    elif 50 <= rh <= 80:
        return "High"
    else:
        return "Very High"
        
air_data["temp_category"] = air_data["temperature"].apply(assign_temperature)
air_data["humidity_category"] = air_data["humidity"].apply(assign_humidity)

air_data[["date", "sensor.name", "temperature", "temp_category", "humidity", "humidity_category"]].head(25)




sampled_combo = (
    air_data.groupby(["humidity_category", "temp_category"], group_keys=True)
            .apply(lambda g: g.sample(n=min(5, len(g)), random_state=0),
                   include_groups=False)
)

# Bring group keys back as columns
sampled_combo = sampled_combo.reset_index(level=["humidity_category", "temp_category"])

sampled_combo[[
    "date",
    "sensor.name",
    "temperature",
    "temp_category",
    "humidity",
    "humidity_category",
    "pm2.5_atm",
    "pm10.0_atm",
    "voc"
]]

Unnamed: 0,date,sensor.name,temperature,temp_category,humidity,humidity_category,pm2.5_atm,pm10.0_atm,voc
6639,01/05/25,TRPHD Dawson Co. Courthouse 25,19.902667,Below Freezing,52.716625,High,1.617104,3.001167,109.402042
6632,01/05/25,SWNPHD-Imerial,19.348333,Below Freezing,54.309,High,1.249,1.499104,120.184375
6645,01/05/25,Broken Bow,15.786042,Below Freezing,54.240333,High,2671.969104,2672.387917,82.279125
306,03/25/24,NCDHD O'Neill #11,31.276042,Below Freezing,72.717333,High,6.922625,8.0405,55.35125
321,03/26/24,TRPHD Dawson Co. Courthouse 25,30.533667,Below Freezing,56.603833,High,2.485333,3.342583,70.592542
298,03/25/24,Laurel High School,40.042318,Cool,66.737182,High,3.800909,4.173886,62.867273
7873,03/04/25,TRPHD Dawson Co. Courthouse 25,46.733,Cool,64.983042,High,13.421833,15.339604,366.664167
467,04/03/24,ELVPHD Wisner HD 5,43.623542,Cool,53.525042,High,6.118583,7.012771,128.259792
7887,03/04/25,Swnphd-ogallala,46.540333,Cool,55.924792,High,18.552146,20.132875,343.14525
5491,11/08/24,Swnphd-Benklemen,47.504917,Cool,63.992417,High,15.141125,17.726688,223.195167


In [6]:
def pm25_health_category(pm):
    if pd.isna(pm):
        return None
    if pm <= 12.0:
        return "Good"
    elif pm <= 35.4:
        return "Moderate"
    elif pm <= 55.4:
        return "Unhealthy for Sensitive Groups"
    elif pm <= 150.4:
        return "Unhealthy"
    elif pm <= 250.4:
        return "Very Unhealthy"
    else:
        return "Hazardous"


def pm10_health_category(pm):
    if pd.isna(pm):
        return None
    if pm <= 54:
        return "Good"
    elif pm <= 154:
        return "Moderate"
    elif pm <= 254:
        return "Unhealthy for Sensitive Groups"
    elif pm <= 354:
        return "Unhealthy"
    elif pm <= 424:
        return "Very Unhealthy"
    else:
        return "Hazardous"


air_data["pm2.5_atm"] = pd.to_numeric(air_data["pm2.5_atm"], errors="coerce")
air_data["pm10.0_atm"] = pd.to_numeric(air_data["pm10.0_atm"], errors="coerce")


air_data["PM25_category"] = air_data["pm2.5_atm"].apply(pm25_health_category)
air_data["PM10_category"] = air_data["pm10.0_atm"].apply(pm10_health_category)


risk_levels = [
    "Unhealthy for Sensitive Groups",
    "Unhealthy",
    "Very Unhealthy",
    "Hazardous"
]


pm25_events = air_data[air_data["PM25_category"].isin(risk_levels)].copy()
pm10_events = air_data[air_data["PM10_category"].isin(risk_levels)].copy()


pm25_events_out = pm25_events[["date", "sensor.name", "pm2.5_atm", "PM25_category"]] \
    .sort_values(["date", "sensor.name"])

pm10_events_out = pm10_events[["date", "sensor.name", "pm10.0_atm", "PM10_category"]] \
    .sort_values(["date", "sensor.name"])

print("PM2.5 — Unhealthy for Sensitive Groups or worse:")
display(pm25_events_out)

print("PM10 — Unhealthy for Sensitive Groups or worse:")
display(pm10_events_out)

PM2.5 — Unhealthy for Sensitive Groups or worse:


Unnamed: 0,date,sensor.name,pm2.5_atm,PM25_category
6556,01/01/25,#16 - Richardson County Courthouse,2114.623250,Hazardous
6565,01/01/25,Broken Bow,2107.029354,Hazardous
6576,01/02/25,#16 - Richardson County Courthouse,2135.601729,Hazardous
6585,01/02/25,Broken Bow,1968.824813,Hazardous
6596,01/03/25,#16 - Richardson County Courthouse,2357.012438,Hazardous
...,...,...,...,...
6505,12/29/24,Broken Bow,1720.250604,Hazardous
6516,12/30/24,#16 - Richardson County Courthouse,1946.299250,Hazardous
6525,12/30/24,Broken Bow,1722.934604,Hazardous
6536,12/31/24,#16 - Richardson County Courthouse,2204.763619,Hazardous


PM10 — Unhealthy for Sensitive Groups or worse:


Unnamed: 0,date,sensor.name,pm10.0_atm,PM10_category
6556,01/01/25,#16 - Richardson County Courthouse,2114.860313,Hazardous
6565,01/01/25,Broken Bow,2107.215604,Hazardous
6576,01/02/25,#16 - Richardson County Courthouse,2136.544792,Hazardous
6585,01/02/25,Broken Bow,1969.514229,Hazardous
6596,01/03/25,#16 - Richardson County Courthouse,2359.675646,Hazardous
...,...,...,...,...
6505,12/29/24,Broken Bow,1720.299958,Hazardous
6516,12/30/24,#16 - Richardson County Courthouse,1948.080313,Hazardous
6525,12/30/24,Broken Bow,1723.087500,Hazardous
6536,12/31/24,#16 - Richardson County Courthouse,2205.778262,Hazardous


In [7]:
alt_col = "sensor.altitude"   

sensor_summary = (
    air_data.groupby("sensor.name", as_index=False)
    .agg(
        altitude=(alt_col, "mean"),
        pm25_mean=("pm2.5_atm", "mean"),
        pm10_mean=("pm10.0_atm", "mean"),
        voc_mean=("voc", "mean"),
        n=("pm2.5_atm", "count"),
    )
)

sensor_summary = sensor_summary.dropna(subset=["altitude"])

sensor_summary.head()

Unnamed: 0,sensor.name,altitude,pm25_mean,pm10_mean,voc_mean,n
0,#16 - Richardson County Courthouse,1023.0,700.127342,701.632446,86.75865,318
1,#17 - Otoe County,1055.0,9.755345,11.671141,251.427296,302
2,#18 - Southeast District Health Department- Te...,1143.0,613.175352,614.227248,196.150771,183
3,Ainsworth Public School #9,2519.0,10.697058,12.390708,219.200274,368
4,Broken Bow,2582.0,928.710593,929.678512,158.285807,368
