# Urban Air Quality Risk Prediction

Air pollution poses a serious health risk in urban areas.
This project analyzes air quality and environmental factors
to identify and classify high-risk pollution days.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to all subsequent matplotlib figures.
# `sns.set()` is only a legacy alias of `set_theme()`; call the preferred name directly.
sns.set_theme()


In [3]:
# Read the raw measurements from the CSV next to the notebook and preview the first rows.
DATA_PATH = "UrbanAirPollutionDataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,Station_ID,DateTime,PM2.5,PM10,NO₂,SO₂,CO,O₃,Temp_C,Humidity_%,Wind_Speed_mps,Wind_Direction_deg,Pressure_hPa,Rain_mm,AQI_Target
0,1,2020-01-01 00:00:00,86.397213,111.814972,26.599649,3.875088,0.570793,42.903768,31.491409,45.704988,3.114026,166.148254,1012.641964,1.0,59.759255
1,1,2020-01-01 01:00:00,73.311679,110.919391,18.585412,11.820056,0.562121,15.127661,24.191965,41.544655,3.480094,282.15806,1011.779447,0.0,45.256996
2,1,2020-01-01 02:00:00,61.359818,47.063317,20.605215,16.531417,0.466105,32.752213,30.719383,73.849227,4.94946,109.450898,1004.118484,0.0,43.411916
3,1,2020-01-01 03:00:00,54.081632,122.981322,14.682654,14.453442,0.607025,18.910033,18.24315,42.086443,1.834147,229.796006,1009.154244,0.0,35.227619
4,1,2020-01-01 04:00:00,43.221175,102.259959,41.515463,17.49567,0.537119,39.582884,33.949777,42.393921,1.987593,159.50137,1021.100094,0.0,41.981803


In [4]:
# Normalise headers: trim surrounding whitespace, lower-case, and turn spaces into underscores
# (e.g. "Station_ID" -> "station_id").
df.columns = [header.strip().lower().replace(" ", "_") for header in df.columns]
df.columns


Index(['station_id', 'datetime', 'pm2.5', 'pm10', 'no₂', 'so₂', 'co', 'o₃',
       'temp_c', 'humidity_%', 'wind_speed_mps', 'wind_direction_deg',
       'pressure_hpa', 'rain_mm', 'aqi_target'],
      dtype='str')

In [6]:
# Parse the timestamp column, then derive calendar features from it.
df["datetime"] = pd.to_datetime(df["datetime"])
timestamps = df["datetime"]
df["month"] = timestamps.dt.month
df["year"] = timestamps.dt.year

# Season definition, grouped by season and flattened into a month -> season lookup.
season_months = {
    "Winter": (12, 1, 2),
    "Summer": (3, 4, 5),
    "Monsoon": (6, 7, 8),
    "Post-Monsoon": (9, 10, 11),
}
month_to_season = {
    month: season
    for season, months in season_months.items()
    for month in months
}
df["season"] = df["month"].map(month_to_season)

df.head()


Unnamed: 0,station_id,datetime,pm2.5,pm10,no₂,so₂,co,o₃,temp_c,humidity_%,wind_speed_mps,wind_direction_deg,pressure_hpa,rain_mm,aqi_target,month,year,season
0,1,2020-01-01 00:00:00,86.397213,111.814972,26.599649,3.875088,0.570793,42.903768,31.491409,45.704988,3.114026,166.148254,1012.641964,1.0,59.759255,1,2020,Winter
1,1,2020-01-01 01:00:00,73.311679,110.919391,18.585412,11.820056,0.562121,15.127661,24.191965,41.544655,3.480094,282.15806,1011.779447,0.0,45.256996,1,2020,Winter
2,1,2020-01-01 02:00:00,61.359818,47.063317,20.605215,16.531417,0.466105,32.752213,30.719383,73.849227,4.94946,109.450898,1004.118484,0.0,43.411916,1,2020,Winter
3,1,2020-01-01 03:00:00,54.081632,122.981322,14.682654,14.453442,0.607025,18.910033,18.24315,42.086443,1.834147,229.796006,1009.154244,0.0,35.227619,1,2020,Winter
4,1,2020-01-01 04:00:00,43.221175,102.259959,41.515463,17.49567,0.537119,39.582884,33.949777,42.393921,1.987593,159.50137,1021.100094,0.0,41.981803,1,2020,Winter


In [9]:
# Expose the parsed timestamp under the name "date" as well.
# NOTE(review): this is a plain copy of the full timestamp, not a calendar date.
# If a day-level key was intended, confirm and derive it explicitly.
df["date"] = df["datetime"]

# month / year / season were already derived from this same timestamp when "datetime"
# was parsed. This cell used to recompute them from the alias with a copy-pasted season
# mapping, which produced identical columns, so the duplicate is dropped. The assertion
# makes an out-of-order run fail loudly instead of silently missing features.
assert {"month", "year", "season"} <= set(df.columns), "run the datetime feature cell first"

df.head()


Unnamed: 0,station_id,datetime,pm2.5,pm10,no₂,so₂,co,o₃,temp_c,humidity_%,wind_speed_mps,wind_direction_deg,pressure_hpa,rain_mm,aqi_target,month,year,season,date
0,1,2020-01-01 00:00:00,86.397213,111.814972,26.599649,3.875088,0.570793,42.903768,31.491409,45.704988,3.114026,166.148254,1012.641964,1.0,59.759255,1,2020,Winter,2020-01-01 00:00:00
1,1,2020-01-01 01:00:00,73.311679,110.919391,18.585412,11.820056,0.562121,15.127661,24.191965,41.544655,3.480094,282.15806,1011.779447,0.0,45.256996,1,2020,Winter,2020-01-01 01:00:00
2,1,2020-01-01 02:00:00,61.359818,47.063317,20.605215,16.531417,0.466105,32.752213,30.719383,73.849227,4.94946,109.450898,1004.118484,0.0,43.411916,1,2020,Winter,2020-01-01 02:00:00
3,1,2020-01-01 03:00:00,54.081632,122.981322,14.682654,14.453442,0.607025,18.910033,18.24315,42.086443,1.834147,229.796006,1009.154244,0.0,35.227619,1,2020,Winter,2020-01-01 03:00:00
4,1,2020-01-01 04:00:00,43.221175,102.259959,41.515463,17.49567,0.537119,39.582884,33.949777,42.393921,1.987593,159.50137,1021.100094,0.0,41.981803,1,2020,Winter,2020-01-01 04:00:00


In [10]:
# Mean-impute missing values in the numeric measurement columns.
# `num_cols` still lists every numeric column, so any later use of it is unaffected.
num_cols = df.select_dtypes(include=np.number).columns

# Identifiers and calendar fields are numeric by dtype but are not measurements:
# filling them with a column mean would invent values such as a fractional station id.
NON_MEASUREMENT_COLS = {"station_id", "month", "year"}
impute_cols = [col for col in num_cols if col not in NON_MEASUREMENT_COLS]

# NOTE(review): "aqi_target" is still mean-imputed here, as before. Filling the target
# fabricates labels; consider dropping rows with a missing target instead. TODO confirm.
df[impute_cols] = df[impute_cols].fillna(df[impute_cols].mean())


In [None]:
def risk_label(aqi, low_max=100, medium_max=200):
    """Map a single AQI value to a categorical risk label.

    Parameters
    ----------
    aqi : float
        AQI value for one observation.
    low_max : float, default 100
        Inclusive upper bound of the "Low Risk" band.
    medium_max : float, default 200
        Inclusive upper bound of the "Medium Risk" band.

    Returns
    -------
    str or None
        "Low Risk" when ``aqi <= low_max``, "Medium Risk" when
        ``low_max < aqi <= medium_max``, "High Risk" above that, and
        ``None`` when ``aqi`` is missing (NaN / None).
    """
    # NaN fails both `<=` comparisons below, so without this guard a missing
    # reading would fall through to the final branch and be labelled "High Risk".
    if pd.isna(aqi):
        return None
    if aqi <= low_max:
        return "Low Risk"
    if aqi <= medium_max:
        return "Medium Risk"
    return "High Risk"

# Label every row from its AQI target, then show how many rows fall in each risk class.
df["risk_level"] = df["aqi_target"].map(risk_label)
df["risk_level"].value_counts()


KeyError: 'aqi'