In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,timedelta
import warnings
warnings.filterwarnings('ignore')

plt.style.use('default')
sns.set_palette('husl')


In [7]:
weather_df = pd.read_csv("../data/karachi_weather_1year.csv")
weather_df.head()

Unnamed: 0,datetime,temperature,humidity,pressure,wind_speed,wind_direction,precipitation
0,2024-10-10 00:00:00+00:00,25.8065,82.28609,1008.3,8.538149,341.56494,0.0
1,2024-10-10 01:00:00+00:00,25.956501,86.123695,1009.0,8.93859,334.98312,0.0
2,2024-10-10 02:00:00+00:00,26.0065,83.566086,1009.4,8.244708,323.88055,0.0
3,2024-10-10 03:00:00+00:00,28.1065,74.54487,1009.8,9.039514,305.2725,0.0
4,2024-10-10 04:00:00+00:00,30.706501,64.15833,1009.9,13.783817,310.76352,0.0


In [8]:
air_quality_df = pd.read_csv("../data/karachi_air_quality_1year.csv")
air_quality_df.head()

Unnamed: 0,datetime,pm10,pm25,co,no2,o3,so2
0,2024-10-10 00:00:00+00:00,44.1,26.5,269.0,14.0,49.0,6.8
1,2024-10-10 01:00:00+00:00,45.2,27.1,430.0,24.7,40.0,8.0
2,2024-10-10 02:00:00+00:00,50.4,30.9,645.0,39.0,29.0,9.6
3,2024-10-10 03:00:00+00:00,57.5,35.6,760.0,46.1,31.0,11.0
4,2024-10-10 04:00:00+00:00,56.0,33.8,677.0,39.0,59.0,12.1


In [10]:
weather_df.isnull().sum()

datetime          0
temperature       0
humidity          0
pressure          0
wind_speed        0
wind_direction    0
precipitation     0
dtype: int64

In [11]:
air_quality_df.isnull().sum()

datetime    0
pm10        0
pm25        0
co          0
no2         0
o3          0
so2         0
dtype: int64

In [12]:
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
air_quality_df['datetime'] = pd.to_datetime(air_quality_df['datetime'])

merged_df = pd.merge(weather_df,air_quality_df,on='datetime',how='inner')
print(f"Merged dataset shape: {merged_df.shape}")
print(f"Date range: {merged_df['datetime'].min()} to {merged_df['datetime'].max()}")

Merged dataset shape: (8784, 13)
Date range: 2024-10-10 00:00:00+00:00 to 2025-10-10 23:00:00+00:00


In [13]:
merged_df.isnull().sum()

datetime          0
temperature       0
humidity          0
pressure          0
wind_speed        0
wind_direction    0
precipitation     0
pm10              0
pm25              0
co                0
no2               0
o3                0
so2               0
dtype: int64

In [14]:
merged_df.sample(5)

Unnamed: 0,datetime,temperature,humidity,pressure,wind_speed,wind_direction,precipitation,pm10,pm25,co,no2,o3,so2
6512,2025-07-08 08:00:00+00:00,33.7065,54.806416,998.8,12.991212,183.97241,0.0,94.8,37.1,325.0,4.5,132.0,22.5
826,2024-11-13 10:00:00+00:00,31.206501,34.3736,1010.1,10.587918,215.3113,0.0,38.2,29.3,336.0,6.4,162.0,25.0
7196,2025-08-05 20:00:00+00:00,26.706501,88.285225,1004.0,14.913952,246.52899,0.1,39.1,19.0,174.0,9.5,47.0,7.4
6113,2025-06-21 17:00:00+00:00,29.2565,88.491325,1000.4,12.669238,247.44272,0.0,35.7,21.1,272.0,17.3,47.0,11.0
1264,2024-12-01 16:00:00+00:00,25.1565,38.04964,1011.7,6.746999,350.78906,0.0,73.8,71.5,2170.0,102.4,10.0,33.6


### Calculating AQI from pollutant Concentrations

In [15]:
def calculate_aqi(pm25,pm10,o3,no2,co,so2):
    def calculate_individual_aqi(conc,breakpoints):
        for i, (c_low,c_high,aqi_low,aqi_high) in enumerate(breakpoints):
            if conc<=c_high:
                if conc >= c_low:
                    aqi =  ((aqi_high - aqi_low) / (c_high - c_low)) * (conc - c_low) + aqi_low
                    return round(aqi)
        return 500
    pm25_breakpoints = [
        (0, 12, 0, 50), (12.1, 35.4, 51, 100), (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200), (150.5, 250.4, 201, 300), (250.5, 500, 301, 500)
    ]
    
    pm10_breakpoints = [
        (0, 54, 0, 50), (55, 154, 51, 100), (155, 254, 101, 150),
        (255, 354, 151, 200), (355, 424, 201, 300), (425, 604, 301, 500)
    ]
    
    o3_breakpoints = [ 
        (0, 54, 0, 50), (55, 70, 51, 100), (71, 85, 101, 150),
        (86, 105, 151, 200), (106, 200, 201, 300)
    ]
    
    no2_breakpoints = [ 
        (0, 53, 0, 50), 
        (54, 100, 51, 100), 
        (101, 360, 101, 150),
        (361, 649, 151, 200), 
        (650, 1249, 201, 300), 
        (1250, 2049, 301, 500)
    ]
    

    co_breakpoints = [
        (0, 4.4, 0, 50),      # 0-4.4 ppm
        (4.5, 9.4, 51, 100),  # 4.5-9.4 ppm
        (9.5, 12.4, 101, 150), # 9.5-12.4 ppm
        (12.5, 15.4, 151, 200), # 12.5-15.4 ppm
        (15.5, 30.4, 201, 300), # 15.5-30.4 ppm
        (30.5, 50.4, 301, 500)  # 30.5-50.4 ppm
    ]
    
    so2_breakpoints = [
        (0, 35, 0, 50), 
        (36, 75, 51, 100), 
        (76, 185, 101, 150),
        (186, 304, 151, 200), 
        (305, 604, 201, 300), 
        (605, 1004, 301, 500)
    ]
    
    aqi_pm25 = calculate_individual_aqi(pm25,pm25_breakpoints)
    aqi_pm10 = calculate_individual_aqi(pm10,pm10_breakpoints)
    aqi_o3 = calculate_individual_aqi(o3,o3_breakpoints)
    aqi_no2 = calculate_individual_aqi(no2,no2_breakpoints)
    co_ppm = co/1145
    aqi_co = calculate_individual_aqi(co_ppm,co_breakpoints)
    aqi_so2 = calculate_individual_aqi(so2,so2_breakpoints)
    all_aqi = [aqi_pm25,aqi_pm10,aqi_o3,aqi_no2,aqi_co,aqi_so2]
    return max(all_aqi)


print("Including: PM2.5, PM10, O3, NO2, CO, SO2")

merged_df['aqi'] = merged_df.apply(
    lambda row:calculate_aqi(
        row['pm25'],row['pm10'],row['o3'],row['no2'],row['co'],row['so2']
    ),axis = 1
)

print(f"AQI calculated! Range: {merged_df['aqi'].min()} - {merged_df['aqi'].max()}")
print(f"Average AQI: {merged_df['aqi'].mean():.1f}")

    

Including: PM2.5, PM10, O3, NO2, CO, SO2
AQI calculated! Range: 38 - 500
Average AQI: 135.8


In [16]:
print(merged_df['aqi'].describe())

count    8784.000000
mean      135.797700
std        78.540203
min        38.000000
25%        78.000000
50%       108.000000
75%       182.000000
max       500.000000
Name: aqi, dtype: float64


In [17]:
def get_aqi_category(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"
    
merged_df['aqi_category'] = merged_df['aqi'].apply(get_aqi_category)
print(merged_df['aqi_category'].value_counts())


aqi_category
Moderate                          3984
Unhealthy for Sensitive Groups    1763
Very Unhealthy                    1718
Unhealthy                         1108
Hazardous                          162
Good                                49
Name: count, dtype: int64


In [18]:
merged_df[['datetime', 'pm25', 'pm10', 'o3', 'no2', 'co', 'so2', 'aqi', 'aqi_category']].head(5)

Unnamed: 0,datetime,pm25,pm10,o3,no2,co,so2,aqi,aqi_category
0,2024-10-10 00:00:00+00:00,26.5,44.1,49.0,14.0,269.0,6.8,81,Moderate
1,2024-10-10 01:00:00+00:00,27.1,45.2,40.0,24.7,430.0,8.0,83,Moderate
2,2024-10-10 02:00:00+00:00,30.9,50.4,29.0,39.0,645.0,9.6,91,Moderate
3,2024-10-10 03:00:00+00:00,35.6,57.5,31.0,46.1,760.0,11.0,101,Unhealthy for Sensitive Groups
4,2024-10-10 04:00:00+00:00,33.8,56.0,59.0,39.0,677.0,12.1,97,Moderate


### Time Based Feature Engineering

In [22]:
merged_df['hour'] = merged_df['datetime'].dt.hour
merged_df['day'] = merged_df['datetime'].dt.day
merged_df['month'] = merged_df['datetime'].dt.month
merged_df['weekday'] = merged_df['datetime'].dt.weekday
merged_df['is_weekend'] = (merged_df['weekday'] >= 5).astype(int)

merged_df['season'] = merged_df['month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
})

merged_df['hour_sin'] = np.sin(2 * np.pi * merged_df['hour'] / 24)
merged_df['hour_cos'] = np.cos(2 * np.pi * merged_df['hour'] / 24)
merged_df['month_sin'] = np.sin(2 * np.pi * merged_df['month'] / 12)
merged_df['month_cos'] = np.cos(2 * np.pi * merged_df['month'] / 12)


print("Time-based features created!")
print(f"New dataset shape: {merged_df.shape}")

merged_df[['datetime','hour','weekday','season','hour_sin','hour_cos']].head(5)


Time-based features created!
New dataset shape: (8784, 25)


Unnamed: 0,datetime,hour,weekday,season,hour_sin,hour_cos
0,2024-10-10 00:00:00+00:00,0,3,Autumn,0.0,1.0
1,2024-10-10 01:00:00+00:00,1,3,Autumn,0.258819,0.965926
2,2024-10-10 02:00:00+00:00,2,3,Autumn,0.5,0.866025
3,2024-10-10 03:00:00+00:00,3,3,Autumn,0.707107,0.707107
4,2024-10-10 04:00:00+00:00,4,3,Autumn,0.866025,0.5


### Derived Features

In [23]:
merged_df = merged_df.sort_values(by='datetime').reset_index(drop=True)
merged_df['aqi_change_1h'] = merged_df['aqi'].diff()
merged_df['aqi_change_3h'] = merged_df['aqi'].diff(3)
merged_df['aqi_change_6h'] = merged_df['aqi'].diff(6)
merged_df['aqi_ma_3h'] = merged_df['aqi'].rolling(window=3).mean()
merged_df['aqi_ma_6h'] = merged_df['aqi'].rolling(window=6).mean()
merged_df['aqi_ma_12h'] = merged_df['aqi'].rolling(window=12).mean()
merged_df['aqi_ma_24h'] = merged_df['aqi'].rolling(window=24).mean()
merged_df['aqi_lag_1h'] = merged_df['aqi'].shift(1)
merged_df['aqi_lag_3h'] = merged_df['aqi'].shift(3)
merged_df['aqi_lag_6h'] = merged_df['aqi'].shift(6)


print(f"Derived features created New shape: {merged_df.shape}")
merged_df[['datetime', 'aqi', 'aqi_change_1h', 'aqi_ma_3h', 'aqi_lag_1h']].head(10)


Derived features created New shape: (8784, 35)


Unnamed: 0,datetime,aqi,aqi_change_1h,aqi_ma_3h,aqi_lag_1h
0,2024-10-10 00:00:00+00:00,81,,,
1,2024-10-10 01:00:00+00:00,83,2.0,,81.0
2,2024-10-10 02:00:00+00:00,91,8.0,85.0,83.0
3,2024-10-10 03:00:00+00:00,101,10.0,91.666667,91.0
4,2024-10-10 04:00:00+00:00,97,-4.0,96.333333,101.0
5,2024-10-10 05:00:00+00:00,187,90.0,128.333333,97.0
6,2024-10-10 06:00:00+00:00,228,41.0,170.666667,187.0
7,2024-10-10 07:00:00+00:00,500,272.0,305.0,228.0
8,2024-10-10 08:00:00+00:00,246,-254.0,324.666667,500.0
9,2024-10-10 09:00:00+00:00,244,-2.0,330.0,246.0


### Weather pollution interaction features

In [25]:
merged_df['temp_humidity_interaction'] = merged_df['temperature'] * merged_df['humidity']
merged_df['wind_pollution_ratio'] = merged_df['wind_speed'] / (merged_df['pm25'] + 1)
merged_df['pressure_stability'] = merged_df['pressure'].rolling(window=6).std()


In [27]:
merged_df.shape

(8784, 38)