In [79]:
import numpy as np
import pandas as pd
import os
import glob

In [80]:
# Destination CSV for the final per-station, per-day AQI table (written by the
# last cell of the notebook).
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
OUT_DAILY_AQI = "/Users/liyanwang/Desktop/2026winterProject/stat946/Air-Quality-Data-main/data/NeatData/daily_AQI_EPA.csv"

# AQI breakpoint table, keyed by pollutant name.
# Each tuple is (C_lo, C_hi, I_lo, I_hi): a concentration range and the AQI
# index range it maps onto; calc_sub_aqi interpolates linearly inside a range.
#
# Adjacent ranges leave a gap (e.g. 9.0 -> 9.1) on purpose: concentrations are
# expected to be truncated by truncate_conc first (3 decimals for O3, 1 decimal
# for PM2.5 and CO, integers for SO2 and NO2), so truncated values never fall
# between ranges. An untruncated value that lands in a gap gets NaN from
# calc_sub_aqi.
#
# The last tuple of every pollutant doubles as the extrapolation segment for
# concentrations above its C_hi.
#
# NOTE(review): units are not stated in the code. Presumably ppm for O3 and CO,
# ug/m3 for PM2.5 and ppb for SO2 and NO2 — confirm against the EPA table this
# was copied from. The loader divides hourly O3 readings by 1000 before use.
AQI_BREAKPOINTS = {
    "O3": [
        (0.000, 0.054,   0,  50),
        (0.055, 0.070,  51, 100),
        (0.071, 0.085, 101, 150),
        (0.086, 0.105, 151, 200),
        (0.106, 0.200, 201, 300),
        (0.201, 0.604, 301, 500),
    ],
    "PM2.5": [
        (0.0,   9.0,    0,  50),
        (9.1,  35.4,   51, 100),
        (35.5, 55.4,  101, 150),
        (55.5, 125.4, 151, 200),
        (125.5,225.4, 201, 300),
        (225.5,325.4, 301, 500),
    ],
    "CO": [
        (0.0,  4.4,    0,  50),
        (4.5,  9.4,   51, 100),
        (9.5, 12.4,  101, 150),
        (12.5,15.4,  151, 200),
        (15.5,30.4,  201, 300),
        (30.5,50.4,  301, 500),
    ],
    "SO2": [
        (0,   35,    0,  50),
        (36,  75,   51, 100),
        (76,  185, 101, 150),
        (186, 304, 151, 200),
        (305, 604, 201, 300),
        (605,1004, 301, 500),
    ],
    "NO2": [
        (0,    53,    0,  50),
        (54,   100,  51, 100),
        (101,  360, 101, 150),
        (361,  649, 151, 200),
        (650, 1249, 201, 300),
        (1250,2049, 301, 500),
    ],
}

def truncate_conc(pollutant: str, x: float) -> float:
    """Truncate a concentration to the precision used by the AQI breakpoints.

    Precision per pollutant: 3 decimals for O3, 1 decimal for PM2.5 and CO,
    whole numbers for SO2 and NO2. Any other pollutant is returned unchanged.

    Parameters
    ----------
    pollutant : str
        Pollutant key, e.g. "O3", "PM2.5", "CO", "SO2", "NO2".
    x : float
        Concentration to truncate; NaN/None yields NaN.

    Returns
    -------
    float
        The truncated concentration (rounded toward negative infinity at the
        pollutant's precision), NaN for missing input, or ``x`` itself for an
        unrecognised pollutant.
    """
    if pd.isna(x):
        return np.nan

    decimals = {"O3": 3, "PM2.5": 1, "CO": 1, "SO2": 0, "NO2": 0}.get(pollutant)
    if decimals is None:
        return x

    scale = 10 ** decimals
    # Averages computed in binary floating point can land a hair below the
    # value they represent (e.g. 0.069999999999999 for 0.070). Flooring that
    # directly would drop a whole step, so snap the scaled value to 9 decimals
    # first; genuine measurements are never that close to a step by accident.
    scaled = np.round(x * scale, 9)
    return np.floor(scaled) / scale

def calc_sub_aqi(pollutant: str, conc: float) -> float:
    """
    Map a concentration onto the AQI scale for one pollutant.

    The matching breakpoint range is looked up in AQI_BREAKPOINTS and the
    index is obtained by linear interpolation inside that range. Values above
    the highest range are extrapolated along the highest range's line. A
    missing concentration, or one that matches no range, gives NaN.
    """
    if pd.isna(conc):
        return np.nan

    table = AQI_BREAKPOINTS[pollutant]
    top_segment = table[-1]

    if conc > top_segment[1]:
        # Beyond the table: keep using the slope of the last range.
        segment = top_segment
    else:
        segment = next((row for row in table if row[0] <= conc <= row[1]), None)
        if segment is None:
            return np.nan

    conc_lo, conc_hi, idx_lo, idx_hi = segment
    return (idx_hi - idx_lo) / (conc_hi - conc_lo) * (conc - conc_lo) + idx_lo


# 4) Compute EPA "daily design concentration" from hourly data
def daily_metric_from_hourly(vals, pollutant, min_valid_hours=0):
    """Reduce one day's hourly readings to the single value used for the AQI.

    Parameters
    ----------
    vals : sequence of float
        Hourly concentrations for one day; missing hours are NaN.
    pollutant : str
        Pollutant key. Determines the reduction:
        "O3"/"CO" -> highest 8-hour rolling mean (a window needs at least 6
        valid hours; windows are formed only inside ``vals``),
        "NO2"/"SO2" -> highest single hourly value,
        "PM2.5" -> mean of the valid hours.
    min_valid_hours : int, optional
        Minimum number of non-missing hours required; with fewer, NaN is
        returned. The default of 0 applies no completeness requirement.

    Returns
    -------
    float
        The daily value, or NaN when it cannot be computed or the pollutant
        is not recognised.
    """
    s = pd.Series(vals, dtype="float64")

    if s.notna().sum() < min_valid_hours:
        return np.nan

    if pollutant in ("O3", "CO"):   # max of the 8-hour rolling means
        return s.rolling(8, min_periods=6).mean().max()

    if pollutant in ("NO2", "SO2"): # 1-hour max
        return s.max()

    if pollutant == "PM2.5":        # mean over the valid hours
        return s.mean()

    return np.nan

In [81]:
# Folder holding the raw station CSV exports (one pollutant per file).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = '/Users/liyanwang/Desktop/2026winterProject/stat946/Air-Quality-Data-main/data/air_quality_csv'

# Pollutant label parsed from the file header -> key used in AQI_BREAKPOINTS.
map_to_epa = {
    "CO": "CO",
    "O3": "O3",
    "NO2": "NO2",
    "SO2": "SO2",
    "PM2.5": "PM2.5",
    "PM25": "PM2.5",
}

all_dataframes = []
# Sorted so the processing order does not depend on the file system.
for file_path in sorted(glob.glob(os.path.join(folder_path, "*.csv"))):
    # The first 10 lines of each file are a key/value header block.
    metadata = pd.read_csv(file_path, nrows=10, header=None)
    station_id = metadata.iloc[1, 1].split('(')[-1].replace(')', '') # Extract "12008"
    lat = metadata.iloc[3, 1]
    lon = metadata.iloc[4, 1]
    pollutant_type = metadata.iloc[7, 1].split('(')[-1].split(' ')[-1].replace(')', '')

    pollutant = map_to_epa.get(pollutant_type)
    if pollutant is None:
        # Without breakpoints no sub-index can be computed; skipping avoids
        # emitting an all-NaN "None_subAQI" column.
        print(f"Skipping {os.path.basename(file_path)}: unrecognised pollutant {pollutant_type!r}")
        continue

    df = pd.read_csv(file_path, skiprows=10, index_col=False)
    # Normalise the header before any column is looked up by name.
    df.columns = df.columns.str.strip()
    # Keep only rows whose Date looks like YYYY-MM-DD, then restrict to the study window.
    df = df[df['Date'].astype(str).str.contains(r'\d{4}-\d{2}-\d{2}', na=False)].copy()
    df = df[pd.to_datetime(df['Date']).between('2022-04-26', '2024-09-26')].copy()

    hourly_cols = [col for col in df.columns if col.startswith('H')]
    if df.empty or not hourly_cols:
        print(f"Skipping {os.path.basename(file_path)}: no usable hourly data in the study window")
        continue

    # Work on a separate numeric frame: non-numeric cells become NaN, and
    # readings with |value| >= 999 are treated as missing-data sentinels.
    hourly = df[hourly_cols].apply(pd.to_numeric, errors="coerce")
    hourly = hourly.mask(hourly.abs() >= 999)

    if pollutant == "O3":
        # Rescale to match the three-decimal O3 breakpoints
        # (presumably ppb -> ppm; confirm against the data source).
        hourly = hourly / 1000.0

    if pollutant == "SO2":
        # Special EPA rule for SO2: if 1-hr max >=305 ppb, use 24-hr average
        # NOTE(review): when the 24-hr average is itself below 305 the value is
        # scored against the lower ranges; check whether the AQI should instead
        # be held at 200 in that case.
        max1 = hourly.max(axis=1)
        avg24 = hourly.mean(axis=1)
        daily_conc = pd.Series(np.where(max1 >= 305, avg24, max1), index=df.index)
    else:
        daily_conc = hourly.apply(lambda r: daily_metric_from_hourly(r.values, pollutant), axis=1)
        daily_conc = pd.Series(daily_conc, index=df.index)

    daily_conc_trunc = daily_conc.apply(lambda x: truncate_conc(pollutant, x))
    sub_aqi = daily_conc_trunc.apply(lambda x: calc_sub_aqi(pollutant, x)).round()

    temp_df = pd.DataFrame({
            "Station ID": station_id,
            "Latitude": lat,
            "Longitude": lon,
            "Date": df["Date"],
            f"{pollutant}_subAQI": sub_aqi
        })

    all_dataframes.append(temp_df)


In [86]:
if not all_dataframes:
    raise ValueError("all_dataframes is empty: no station files were loaded, nothing to export")

# Columns shared by every per-file frame; the one remaining column holds the
# pollutant's sub-index and its name identifies the pollutant.
core_cols = ['Date', 'Station ID', 'Latitude', 'Longitude']

standardized_list = []
for df in all_dataframes:
    extra_cols = [c for c in df.columns if c not in core_cols]
    if len(extra_cols) != 1:
        raise ValueError(f"expected exactly one sub-index column, found {extra_cols}")
    pollutant_col = extra_cols[0]
    # Long format: the sub-index goes to 'Value', its column name to 'Pollutant_Type'.
    temp_df = df.rename(columns={pollutant_col: 'Value'}).assign(Pollutant_Type=pollutant_col)
    standardized_list.append(temp_df[core_cols + ['Pollutant_Type', 'Value']])

big_df = pd.concat(standardized_list, ignore_index=True)

# One row per (date, station), one column per pollutant sub-index.
# aggfunc is explicit: if the same station/date/pollutant occurs more than once,
# the highest sub-index is kept rather than pivot_table's default mean, which
# would yield fractional indices and disagree with "AQI = max of sub-indices".
final_df = big_df.pivot_table(
    index=core_cols,
    columns='Pollutant_Type',
    values='Value',
    aggfunc='max'
).reset_index()
final_df.columns.name = None

# Overall AQI is the largest available sub-index for the day.
subaqi_cols = [c for c in final_df.columns if c.endswith("_subAQI")]
final_df["AQI"] = final_df[subaqi_cols].max(axis=1, skipna=True)

# Make sure the destination folder exists before writing.
out_dir = os.path.dirname(OUT_DAILY_AQI)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
final_df.to_csv(OUT_DAILY_AQI, index=False)