In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,timedelta
import warnings
warnings.filterwarnings('ignore')

plt.style.use('default')
sns.set_palette('husl')


In [71]:
weather_df = pd.read_csv("../data/karachi_weather_1year.csv")
weather_df.head()

Unnamed: 0,datetime,temperature,humidity,pressure,wind_speed,wind_direction,precipitation
0,2024-10-10 00:00:00+00:00,25.8065,82.28609,1008.3,8.538149,341.56494,0.0
1,2024-10-10 01:00:00+00:00,25.956501,86.123695,1009.0,8.93859,334.98312,0.0
2,2024-10-10 02:00:00+00:00,26.0065,83.566086,1009.4,8.244708,323.88055,0.0
3,2024-10-10 03:00:00+00:00,28.1065,74.54487,1009.8,9.039514,305.2725,0.0
4,2024-10-10 04:00:00+00:00,30.706501,64.15833,1009.9,13.783817,310.76352,0.0


In [72]:
air_quality_df = pd.read_csv("../data/karachi_air_quality_1year.csv")
air_quality_df.head()

Unnamed: 0,datetime,pm10,pm25,co,no2,o3,so2
0,2024-10-10 00:00:00+00:00,44.1,26.5,269.0,14.0,49.0,6.8
1,2024-10-10 01:00:00+00:00,45.2,27.1,430.0,24.7,40.0,8.0
2,2024-10-10 02:00:00+00:00,50.4,30.9,645.0,39.0,29.0,9.6
3,2024-10-10 03:00:00+00:00,57.5,35.6,760.0,46.1,31.0,11.0
4,2024-10-10 04:00:00+00:00,56.0,33.8,677.0,39.0,59.0,12.1


In [73]:
weather_df.isnull().sum()

datetime          0
temperature       0
humidity          0
pressure          0
wind_speed        0
wind_direction    0
precipitation     0
dtype: int64

In [74]:
air_quality_df.isnull().sum()

datetime    0
pm10        0
pm25        0
co          0
no2         0
o3          0
so2         0
dtype: int64

In [75]:
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
air_quality_df['datetime'] = pd.to_datetime(air_quality_df['datetime'])

merged_df = pd.merge(weather_df,air_quality_df,on='datetime',how='inner')
print(f"Merged dataset shape: {merged_df.shape}")
print(f"Date range: {merged_df['datetime'].min()} to {merged_df['datetime'].max()}")

Merged dataset shape: (8784, 13)
Date range: 2024-10-10 00:00:00+00:00 to 2025-10-10 23:00:00+00:00


In [76]:
merged_df.isnull().sum()

datetime          0
temperature       0
humidity          0
pressure          0
wind_speed        0
wind_direction    0
precipitation     0
pm10              0
pm25              0
co                0
no2               0
o3                0
so2               0
dtype: int64

In [77]:
merged_df.sample(5)

Unnamed: 0,datetime,temperature,humidity,pressure,wind_speed,wind_direction,precipitation,pm10,pm25,co,no2,o3,so2
4139,2025-03-31 11:00:00+00:00,31.6065,34.933777,1008.9,11.854062,221.92245,0.0,65.2,21.8,420.0,10.1,149.0,22.5
3036,2025-02-13 12:00:00+00:00,26.1065,36.448517,1014.0,9.334003,230.4774,0.0,90.6,39.9,593.0,15.8,141.0,30.6
5487,2025-05-26 15:00:00+00:00,29.6065,85.20739,999.6,11.722133,266.4786,0.0,65.0,31.3,265.0,22.4,38.0,11.6
2824,2025-02-04 16:00:00+00:00,20.206501,51.55081,1016.3,8.227004,280.08054,0.0,46.4,22.1,956.0,33.3,63.0,12.4
6237,2025-06-26 21:00:00+00:00,29.5565,88.256454,998.2,7.010763,299.1974,0.0,58.3,26.1,196.0,21.4,47.0,13.6


### Calculating AQI from pollutant Concentrations

In [78]:
def calculate_aqi(pm25,pm10,o3,no2,co,so2):
    def calculate_individual_aqi(conc,breakpoints):
        for i, (c_low,c_high,aqi_low,aqi_high) in enumerate(breakpoints):
            if conc<=c_high:
                if conc >= c_low:
                    aqi =  ((aqi_high - aqi_low) / (c_high - c_low)) * (conc - c_low) + aqi_low
                    return round(aqi)
        return 500
    pm25_breakpoints = [
        (0, 12, 0, 50), (12.1, 35.4, 51, 100), (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200), (150.5, 250.4, 201, 300), (250.5, 500, 301, 500)
    ]
    
    pm10_breakpoints = [
        (0, 54, 0, 50), (55, 154, 51, 100), (155, 254, 101, 150),
        (255, 354, 151, 200), (355, 424, 201, 300), (425, 604, 301, 500)
    ]
    
    o3_breakpoints = [ 
        (0, 54, 0, 50), (55, 70, 51, 100), (71, 85, 101, 150),
        (86, 105, 151, 200), (106, 200, 201, 300)
    ]
    
    no2_breakpoints = [ 
        (0, 53, 0, 50), 
        (54, 100, 51, 100), 
        (101, 360, 101, 150),
        (361, 649, 151, 200), 
        (650, 1249, 201, 300), 
        (1250, 2049, 301, 500)
    ]
    

    co_breakpoints = [
        (0, 4.4, 0, 50),      # 0-4.4 ppm
        (4.5, 9.4, 51, 100),  # 4.5-9.4 ppm
        (9.5, 12.4, 101, 150), # 9.5-12.4 ppm
        (12.5, 15.4, 151, 200), # 12.5-15.4 ppm
        (15.5, 30.4, 201, 300), # 15.5-30.4 ppm
        (30.5, 50.4, 301, 500)  # 30.5-50.4 ppm
    ]
    
    so2_breakpoints = [
        (0, 35, 0, 50), 
        (36, 75, 51, 100), 
        (76, 185, 101, 150),
        (186, 304, 151, 200), 
        (305, 604, 201, 300), 
        (605, 1004, 301, 500)
    ]
    
    aqi_pm25 = calculate_individual_aqi(pm25,pm25_breakpoints)
    aqi_pm10 = calculate_individual_aqi(pm10,pm10_breakpoints)
    aqi_o3 = calculate_individual_aqi(o3,o3_breakpoints)
    aqi_no2 = calculate_individual_aqi(no2,no2_breakpoints)
    co_ppm = co/1145
    aqi_co = calculate_individual_aqi(co_ppm,co_breakpoints)
    aqi_so2 = calculate_individual_aqi(so2,so2_breakpoints)
    all_aqi = [aqi_pm25,aqi_pm10,aqi_o3,aqi_no2,aqi_co,aqi_so2]
    return max(all_aqi)


print("Including: PM2.5, PM10, O3, NO2, CO, SO2")

merged_df['aqi'] = merged_df.apply(
    lambda row:calculate_aqi(
        row['pm25'],row['pm10'],row['o3'],row['no2'],row['co'],row['so2']
    ),axis = 1
)

print(f"AQI calculated! Range: {merged_df['aqi'].min()} - {merged_df['aqi'].max()}")
print(f"Average AQI: {merged_df['aqi'].mean():.1f}")

    

Including: PM2.5, PM10, O3, NO2, CO, SO2
AQI calculated! Range: 38 - 500
Average AQI: 135.8


In [79]:
print(merged_df['aqi'].describe())

count    8784.000000
mean      135.797700
std        78.540203
min        38.000000
25%        78.000000
50%       108.000000
75%       182.000000
max       500.000000
Name: aqi, dtype: float64


In [80]:
def get_aqi_category(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"
    
merged_df['aqi_category'] = merged_df['aqi'].apply(get_aqi_category)
print(merged_df['aqi_category'].value_counts())


aqi_category
Moderate                          3984
Unhealthy for Sensitive Groups    1763
Very Unhealthy                    1718
Unhealthy                         1108
Hazardous                          162
Good                                49
Name: count, dtype: int64


In [81]:
merged_df[['datetime', 'pm25', 'pm10', 'o3', 'no2', 'co', 'so2', 'aqi', 'aqi_category']].head(5)

Unnamed: 0,datetime,pm25,pm10,o3,no2,co,so2,aqi,aqi_category
0,2024-10-10 00:00:00+00:00,26.5,44.1,49.0,14.0,269.0,6.8,81,Moderate
1,2024-10-10 01:00:00+00:00,27.1,45.2,40.0,24.7,430.0,8.0,83,Moderate
2,2024-10-10 02:00:00+00:00,30.9,50.4,29.0,39.0,645.0,9.6,91,Moderate
3,2024-10-10 03:00:00+00:00,35.6,57.5,31.0,46.1,760.0,11.0,101,Unhealthy for Sensitive Groups
4,2024-10-10 04:00:00+00:00,33.8,56.0,59.0,39.0,677.0,12.1,97,Moderate


### Time Based Feature Engineering

In [82]:
merged_df['hour'] = merged_df['datetime'].dt.hour
merged_df['day'] = merged_df['datetime'].dt.day
merged_df['month'] = merged_df['datetime'].dt.month
merged_df['weekday'] = merged_df['datetime'].dt.weekday
merged_df['is_weekend'] = (merged_df['weekday'] >= 5).astype(int)

merged_df['season'] = merged_df['month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
})

merged_df['hour_sin'] = np.sin(2 * np.pi * merged_df['hour'] / 24)
merged_df['hour_cos'] = np.cos(2 * np.pi * merged_df['hour'] / 24)
merged_df['month_sin'] = np.sin(2 * np.pi * merged_df['month'] / 12)
merged_df['month_cos'] = np.cos(2 * np.pi * merged_df['month'] / 12)


print("Time-based features created!")
print(f"New dataset shape: {merged_df.shape}")

merged_df[['datetime','hour','weekday','season','hour_sin','hour_cos']].head(5)


Time-based features created!
New dataset shape: (8784, 25)


Unnamed: 0,datetime,hour,weekday,season,hour_sin,hour_cos
0,2024-10-10 00:00:00+00:00,0,3,Autumn,0.0,1.0
1,2024-10-10 01:00:00+00:00,1,3,Autumn,0.258819,0.965926
2,2024-10-10 02:00:00+00:00,2,3,Autumn,0.5,0.866025
3,2024-10-10 03:00:00+00:00,3,3,Autumn,0.707107,0.707107
4,2024-10-10 04:00:00+00:00,4,3,Autumn,0.866025,0.5


### Derived Features

In [83]:
merged_df = merged_df.sort_values(by='datetime').reset_index(drop=True)
merged_df['aqi_change_1h'] = merged_df['aqi'].diff()
merged_df['aqi_change_3h'] = merged_df['aqi'].diff(3)
merged_df['aqi_change_6h'] = merged_df['aqi'].diff(6)
merged_df['aqi_ma_3h'] = merged_df['aqi'].rolling(window=3).mean()
merged_df['aqi_ma_6h'] = merged_df['aqi'].rolling(window=6).mean()
merged_df['aqi_ma_12h'] = merged_df['aqi'].rolling(window=12).mean()
merged_df['aqi_ma_24h'] = merged_df['aqi'].rolling(window=24).mean()
merged_df['aqi_lag_1h'] = merged_df['aqi'].shift(1)
merged_df['aqi_lag_3h'] = merged_df['aqi'].shift(3)
merged_df['aqi_lag_6h'] = merged_df['aqi'].shift(6)


print(f"Derived features created New shape: {merged_df.shape}")
merged_df[['datetime', 'aqi', 'aqi_change_1h', 'aqi_ma_3h', 'aqi_lag_1h']].head(10)


Derived features created New shape: (8784, 35)


Unnamed: 0,datetime,aqi,aqi_change_1h,aqi_ma_3h,aqi_lag_1h
0,2024-10-10 00:00:00+00:00,81,,,
1,2024-10-10 01:00:00+00:00,83,2.0,,81.0
2,2024-10-10 02:00:00+00:00,91,8.0,85.0,83.0
3,2024-10-10 03:00:00+00:00,101,10.0,91.666667,91.0
4,2024-10-10 04:00:00+00:00,97,-4.0,96.333333,101.0
5,2024-10-10 05:00:00+00:00,187,90.0,128.333333,97.0
6,2024-10-10 06:00:00+00:00,228,41.0,170.666667,187.0
7,2024-10-10 07:00:00+00:00,500,272.0,305.0,228.0
8,2024-10-10 08:00:00+00:00,246,-254.0,324.666667,500.0
9,2024-10-10 09:00:00+00:00,244,-2.0,330.0,246.0


### Weather pollution interaction features

In [84]:
merged_df['temp_humidity_interaction'] = merged_df['temperature'] * merged_df['humidity']
merged_df['wind_pollution_ratio'] = merged_df['wind_speed'] / (merged_df['pm25'] + 1)
merged_df['pressure_stability'] = merged_df['pressure'].rolling(window=6).std()


In [85]:
merged_df.shape

(8784, 38)

In [86]:
merged_df = merged_df.sort_values(by='datetime').reset_index(drop=True)

lag_rolling_cols = [
    'aqi_change_1h','aqi_change_3h','aqi_change_6h',
    'aqi_ma_3h','aqi_ma_6h','aqi_ma_12h','aqi_ma_24h',
    'aqi_lag_1h','aqi_lag_3h','aqi_lag_6h',
    'pressure_stability'
]

merged_df = merged_df.dropna(subset=[c for c in lag_rolling_cols if c in merged_df.columns])



In [87]:
print(f"Original dataset shape: {merged_df.shape}")
print(f"Columns: {list(merged_df.columns)}")

feature_df = merged_df.copy()

feature_df['timestamp'] = pd.to_datetime(feature_df['datetime'])
feature_df = feature_df.drop('datetime',axis=1)

# Handling categorical features

feature_df['season_encoded'] = feature_df['season'].map(
    {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Autumn': 3}
)

feature_df = feature_df.drop('season',axis=1)

feature_df['aqi_category_encoded'] = feature_df['aqi_category'].map({
    'Good': 0, 'Moderate': 1, 'Unhealthy for Sensitive Groups': 2,
    'Unhealthy': 3, 'Very Unhealthy': 4, 'Hazardous': 5
})

feature_df = feature_df.drop('aqi_category',axis=1)


Original dataset shape: (8761, 38)
Columns: ['datetime', 'temperature', 'humidity', 'pressure', 'wind_speed', 'wind_direction', 'precipitation', 'pm10', 'pm25', 'co', 'no2', 'o3', 'so2', 'aqi', 'aqi_category', 'hour', 'day', 'month', 'weekday', 'is_weekend', 'season', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'aqi_change_1h', 'aqi_change_3h', 'aqi_change_6h', 'aqi_ma_3h', 'aqi_ma_6h', 'aqi_ma_12h', 'aqi_ma_24h', 'aqi_lag_1h', 'aqi_lag_3h', 'aqi_lag_6h', 'temp_humidity_interaction', 'wind_pollution_ratio', 'pressure_stability']


In [88]:
# Ensuring all columns are numeric
y = feature_df['aqi'].copy()
X = feature_df.drop(columns=['aqi'])

ts = pd.to_datetime(X['timestamp'], utc=True, errors='coerce')
X['timestamp'] = ts.dt.tz_convert('UTC').dt.tz_localize(None)

# Creating integer primary key
X['ts_epoch_ms'] = (ts.view('int64') // 10**6).astype('int64')

X.columns = [c.strip().lower().replace(' ', '_') for c in X.columns]
for col in X.columns:
    if col not in ('timestamp','ts_epoch_ms') and X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')


X = X.dropna()
X = X.sort_values('timestamp').drop_duplicates(subset=['ts_epoch_ms'],keep='last').reset_index(drop=True)
        
print(f"Final features shape for upload: {X.shape}")
print(f"Label length (not uploaded): {len(y)}")


Final features shape for upload: (8761, 38)
Label length (not uploaded): 8761


In [89]:
labels_df = feature_df[['timestamp','aqi']].copy()
labels_df['timestamp'] = pd.to_datetime(labels_df['timestamp'], utc=True, errors='coerce').dt.tz_convert('UTC').dt.tz_localize(None)
labels_df = labels_df.dropna().sort_values('timestamp').drop_duplicates(subset=['timestamp'], keep='last').reset_index(drop=True)

In [90]:
X.sample(5)

Unnamed: 0,temperature,humidity,pressure,wind_speed,wind_direction,precipitation,pm10,pm25,co,no2,...,aqi_lag_1h,aqi_lag_3h,aqi_lag_6h,temp_humidity_interaction,wind_pollution_ratio,pressure_stability,timestamp,season_encoded,aqi_category_encoded,ts_epoch_ms
1191,26.1565,68.72016,1014.1,7.467289,285.3763,0.0,38.5,32.2,750.0,42.7,...,200.0,244.0,238.0,1797.478865,0.224918,0.626099,2024-11-29 14:00:00,3,1,1732888800000
7509,27.3565,91.54208,999.4,10.601679,310.17917,0.0,40.4,27.5,232.0,32.4,...,84.0,87.0,500.0,2504.270912,0.371989,0.469042,2025-08-19 20:00:00,2,1,1755633600000
5849,29.1565,81.22429,1002.3,14.892213,262.35968,0.0,44.8,24.2,216.0,14.1,...,77.0,108.0,154.0,2368.216011,0.590961,0.488535,2025-06-11 16:00:00,2,1,1749657600000
7787,30.5065,66.48252,1001.6,18.06199,240.11356,0.0,95.9,32.2,230.0,4.4,...,161.0,151.0,89.0,2028.148996,0.544036,1.493542,2025-08-31 10:00:00,2,2,1756634400000
410,25.456501,89.52172,1007.0,9.56713,281.94415,0.0,52.4,22.5,681.0,20.1,...,500.0,82.0,104.0,2278.909755,0.407112,0.354495,2024-10-28 01:00:00,3,1,1730077200000


### Hopsworks Feature Store Integration

In [91]:
import hopsworks

project = hopsworks.login(
    project = 'aqi_prediction72',
    api_key_file = "hopsworks.key"
)

print("Connected to Project: ",project.name)
fs = project.get_feature_store()
print("Connected to Feature Store: ",fs.name)



2025-10-13 16:07:18,166 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-10-13 16:07:18,174 INFO: Initializing external client
2025-10-13 16:07:18,176 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-10-13 16:07:21,396 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1257622
Connected to Project:  aqi_prediction72
Connected to Feature Store:  aqi_prediction72_featurestore


### Creating Feature Group Schema

In [97]:
FG_NAME = "aqi_features_on"
FG_VER = 1


aqi_fg = fs.create_feature_group(
    name=FG_NAME,
    version=FG_VER,
    description="AQI features with int PK for online serving",
    primary_key=["ts_epoch_ms"],     
    event_time="timestamp",          
    online_enabled=True
)
print("Feature Group created:",aqi_fg.name, aqi_fg.version)


Feature Group created: aqi_features_on 1


In [98]:
job = aqi_fg.insert(X,write_options={"wait_for_job": True})
print(f"Uploaded {len(X)} rows to {FG_NAME} v{FG_VER}")

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1257622/fs/1245251/fg/1551695


Uploading Dataframe: 100.00% |██████████| Rows 8761/8761 | Elapsed Time: 00:04 | Remaining Time: 00:00


Launching job: aqi_features_on_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1257622/jobs/named/aqi_features_on_1_offline_fg_materialization/executions
2025-10-13 16:11:33,525 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-10-13 16:11:36,885 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-10-13 16:13:49,804 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-10-13 16:13:50,071 INFO: Waiting for log aggregation to finish.
2025-10-13 16:14:10,165 INFO: Execution finished successfully.
Uploaded 8761 rows to aqi_features_on v1


In [99]:
LABEL_FG_NAME = "aqi_labels_on"
LABEL_FG_VER = 1

aqi_labels = fs.create_feature_group(
    name=LABEL_FG_NAME,
    version=LABEL_FG_VER,
    description="AQI labels aligned by timestamp",
    primary_key=["timestamp"],
    event_time="timestamp",
    online_enabled=False
)

print("Labels FG created")

aqi_labels.insert(labels_df,write_options={"wait_for_job": True})
print(f"Uploaded {len(labels_df)} labels")

Labels FG created
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1257622/fs/1245251/fg/1551696


Uploading Dataframe: 100.00% |██████████| Rows 8761/8761 | Elapsed Time: 00:07 | Remaining Time: 00:00


Launching job: aqi_labels_on_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1257622/jobs/named/aqi_labels_on_1_offline_fg_materialization/executions
2025-10-13 16:14:45,216 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-10-13 16:14:48,584 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-10-13 16:16:38,053 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-10-13 16:16:38,361 INFO: Waiting for log aggregation to finish.
2025-10-13 16:17:09,081 INFO: Execution finished successfully.
Uploaded 8761 labels


### Creating feature view for ML Training

In [101]:
fv_query = aqi_fg.select_all().join(
    aqi_labels.select(['timestamp','aqi']),
    on=['timestamp']
)

FV_NAME = "aqi_prediction_online"
FV_VER = 1

feature_view = fs.create_feature_view(
    name=FV_NAME,
    version=FV_VER,
    description="Online features (int PK) joined with offline labels on timestamp",
    query=fv_query,
    labels=["aqi"]
)
print("Feature View created:", feature_view.name, feature_view.version)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1257622/fs/1245251/fv/aqi_prediction_online/version/1
Feature View created: aqi_prediction_online 1
