In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,timedelta
import warnings
warnings.filterwarnings('ignore')

plt.style.use('default')
sns.set_palette('husl')


In [4]:
weather_df = pd.read_csv("../data/karachi_weather_1year.csv")
weather_df.head()

Unnamed: 0,datetime,temperature,humidity,pressure,wind_speed,wind_direction,precipitation
0,2024-10-10 00:00:00+00:00,25.8065,82.28609,1008.3,8.538149,341.56494,0.0
1,2024-10-10 01:00:00+00:00,25.956501,86.123695,1009.0,8.93859,334.98312,0.0
2,2024-10-10 02:00:00+00:00,26.0065,83.566086,1009.4,8.244708,323.88055,0.0
3,2024-10-10 03:00:00+00:00,28.1065,74.54487,1009.8,9.039514,305.2725,0.0
4,2024-10-10 04:00:00+00:00,30.706501,64.15833,1009.9,13.783817,310.76352,0.0


In [5]:
air_quality_df = pd.read_csv("../data/karachi_air_quality_1year.csv")
air_quality_df.head()

Unnamed: 0,datetime,pm10,pm25,co,no2,o3,so2
0,2024-10-10 00:00:00+00:00,44.1,26.5,269.0,14.0,49.0,6.8
1,2024-10-10 01:00:00+00:00,45.2,27.1,430.0,24.7,40.0,8.0
2,2024-10-10 02:00:00+00:00,50.4,30.9,645.0,39.0,29.0,9.6
3,2024-10-10 03:00:00+00:00,57.5,35.6,760.0,46.1,31.0,11.0
4,2024-10-10 04:00:00+00:00,56.0,33.8,677.0,39.0,59.0,12.1


In [6]:
weather_df.isnull().sum()

datetime          0
temperature       0
humidity          0
pressure          0
wind_speed        0
wind_direction    0
precipitation     0
dtype: int64

In [7]:
air_quality_df.isnull().sum()

datetime    0
pm10        0
pm25        0
co          0
no2         0
o3          0
so2         0
dtype: int64

In [8]:
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
air_quality_df['datetime'] = pd.to_datetime(air_quality_df['datetime'])

merged_df = pd.merge(weather_df,air_quality_df,on='datetime',how='inner')
print(f"Merged dataset shape: {merged_df.shape}")
print(f"Date range: {merged_df['datetime'].min()} to {merged_df['datetime'].max()}")

Merged dataset shape: (8784, 13)
Date range: 2024-10-10 00:00:00+00:00 to 2025-10-10 23:00:00+00:00


In [9]:
merged_df.isnull().sum()

datetime          0
temperature       0
humidity          0
pressure          0
wind_speed        0
wind_direction    0
precipitation     0
pm10              0
pm25              0
co                0
no2               0
o3                0
so2               0
dtype: int64

In [10]:
merged_df.sample(5)

Unnamed: 0,datetime,temperature,humidity,pressure,wind_speed,wind_direction,precipitation,pm10,pm25,co,no2,o3,so2
3836,2025-03-18 20:00:00+00:00,22.7565,82.681015,1012.2,7.657937,293.55228,0.0,27.0,16.5,561.0,26.9,59.0,9.3
3039,2025-02-13 15:00:00+00:00,22.9065,67.00851,1015.7,8.101999,268.727,0.0,88.9,47.6,1192.0,55.9,53.0,19.8
694,2024-11-07 22:00:00+00:00,25.6565,92.81976,1012.5,8.951267,285.1541,0.0,35.7,19.5,165.0,19.4,47.0,5.2
1056,2024-11-23 00:00:00+00:00,24.0065,75.02335,1013.2,6.775751,309.61066,0.0,18.8,15.6,277.0,11.3,66.0,9.2
1124,2024-11-25 20:00:00+00:00,23.6065,61.037457,1015.0,4.278551,14.620926,0.0,66.1,60.8,1379.0,82.7,15.0,25.9


### Calculating AQI from pollutant Concentrations

In [11]:
def calculate_aqi(pm25,pm10,o3,no2,co,so2):
    def calculate_individual_aqi(conc,breakpoints):
        for i, (c_low,c_high,aqi_low,aqi_high) in enumerate(breakpoints):
            if conc<=c_high:
                if conc >= c_low:
                    aqi =  ((aqi_high - aqi_low) / (c_high - c_low)) * (conc - c_low) + aqi_low
                    return round(aqi)
        return 500
    pm25_breakpoints = [
        (0, 12, 0, 50), (12.1, 35.4, 51, 100), (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200), (150.5, 250.4, 201, 300), (250.5, 500, 301, 500)
    ]
    
    pm10_breakpoints = [
        (0, 54, 0, 50), (55, 154, 51, 100), (155, 254, 101, 150),
        (255, 354, 151, 200), (355, 424, 201, 300), (425, 604, 301, 500)
    ]
    
    o3_breakpoints = [ 
        (0, 54, 0, 50), (55, 70, 51, 100), (71, 85, 101, 150),
        (86, 105, 151, 200), (106, 200, 201, 300)
    ]
    
    no2_breakpoints = [ 
        (0, 53, 0, 50), 
        (54, 100, 51, 100), 
        (101, 360, 101, 150),
        (361, 649, 151, 200), 
        (650, 1249, 201, 300), 
        (1250, 2049, 301, 500)
    ]
    

    co_breakpoints = [
        (0, 4.4, 0, 50),      # 0-4.4 ppm
        (4.5, 9.4, 51, 100),  # 4.5-9.4 ppm
        (9.5, 12.4, 101, 150), # 9.5-12.4 ppm
        (12.5, 15.4, 151, 200), # 12.5-15.4 ppm
        (15.5, 30.4, 201, 300), # 15.5-30.4 ppm
        (30.5, 50.4, 301, 500)  # 30.5-50.4 ppm
    ]
    
    so2_breakpoints = [
        (0, 35, 0, 50), 
        (36, 75, 51, 100), 
        (76, 185, 101, 150),
        (186, 304, 151, 200), 
        (305, 604, 201, 300), 
        (605, 1004, 301, 500)
    ]
    
    aqi_pm25 = calculate_individual_aqi(pm25,pm25_breakpoints)
    aqi_pm10 = calculate_individual_aqi(pm10,pm10_breakpoints)
    aqi_o3 = calculate_individual_aqi(o3,o3_breakpoints)
    aqi_no2 = calculate_individual_aqi(no2,no2_breakpoints)
    co_ppm = co/1145
    aqi_co = calculate_individual_aqi(co_ppm,co_breakpoints)
    aqi_so2 = calculate_individual_aqi(so2,so2_breakpoints)
    all_aqi = [aqi_pm25,aqi_pm10,aqi_o3,aqi_no2,aqi_co,aqi_so2]
    return max(all_aqi)


print("Including: PM2.5, PM10, O3, NO2, CO, SO2")

merged_df['aqi'] = merged_df.apply(
    lambda row:calculate_aqi(
        row['pm25'],row['pm10'],row['o3'],row['no2'],row['co'],row['so2']
    ),axis = 1
)

print(f"AQI calculated! Range: {merged_df['aqi'].min()} - {merged_df['aqi'].max()}")
print(f"Average AQI: {merged_df['aqi'].mean():.1f}")

    

Including: PM2.5, PM10, O3, NO2, CO, SO2
AQI calculated! Range: 38 - 500
Average AQI: 135.8


In [12]:
print(merged_df['aqi'].describe())

count    8784.000000
mean      135.797700
std        78.540203
min        38.000000
25%        78.000000
50%       108.000000
75%       182.000000
max       500.000000
Name: aqi, dtype: float64


In [14]:
def get_aqi_category(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"
    
merged_df['aqi_category'] = merged_df['aqi'].apply(get_aqi_category)
print(merged_df['aqi_category'].value_counts())


aqi_category
Moderate                          3984
Unhealthy for Sensitive Groups    1763
Very Unhealthy                    1718
Unhealthy                         1108
Hazardous                          162
Good                                49
Name: count, dtype: int64


In [15]:
merged_df[['datetime', 'pm25', 'pm10', 'o3', 'no2', 'co', 'so2', 'aqi', 'aqi_category']].head(5)

Unnamed: 0,datetime,pm25,pm10,o3,no2,co,so2,aqi,aqi_category
0,2024-10-10 00:00:00+00:00,26.5,44.1,49.0,14.0,269.0,6.8,81,Moderate
1,2024-10-10 01:00:00+00:00,27.1,45.2,40.0,24.7,430.0,8.0,83,Moderate
2,2024-10-10 02:00:00+00:00,30.9,50.4,29.0,39.0,645.0,9.6,91,Moderate
3,2024-10-10 03:00:00+00:00,35.6,57.5,31.0,46.1,760.0,11.0,101,Unhealthy for Sensitive Groups
4,2024-10-10 04:00:00+00:00,33.8,56.0,59.0,39.0,677.0,12.1,97,Moderate


### Time Based Feature Engineering

In [16]:
merged_df['hour'] = merged_df['datetime'].dt.hour
merged_df['day'] = merged_df['datetime'].dt.day
merged_df['month'] = merged_df['datetime'].dt.month
merged_df['weekday'] = merged_df['datetime'].dt.weekday
merged_df['is_weekend'] = (merged_df['weekday'] >= 5).astype(int)

merged_df['season'] = merged_df['month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
})

merged_df['hour_sin'] = np.sin(2 * np.pi * merged_df['hour'] / 24)
merged_df['hour_cos'] = np.cos(2 * np.pi * merged_df['hour'] / 24)
merged_df['month_sin'] = np.sin(2 * np.pi * merged_df['month'] / 12)
merged_df['month_cos'] = np.cos(2 * np.pi * merged_df['month'] / 12)


print("Time-based features created!")
print(f"New dataset shape: {merged_df.shape}")

merged_df[['datetime','hour','weekday','season','hour_sin','hour_cos']].head(5)


Time-based features created!
New dataset shape: (8784, 25)


Unnamed: 0,datetime,hour,weekday,season,hour_sin,hour_cos
0,2024-10-10 00:00:00+00:00,0,3,Autumn,0.0,1.0
1,2024-10-10 01:00:00+00:00,1,3,Autumn,0.258819,0.965926
2,2024-10-10 02:00:00+00:00,2,3,Autumn,0.5,0.866025
3,2024-10-10 03:00:00+00:00,3,3,Autumn,0.707107,0.707107
4,2024-10-10 04:00:00+00:00,4,3,Autumn,0.866025,0.5


### Derived Features

In [17]:
merged_df = merged_df.sort_values(by='datetime').reset_index(drop=True)
merged_df['aqi_change_1h'] = merged_df['aqi'].diff()
merged_df['aqi_change_3h'] = merged_df['aqi'].diff(3)
merged_df['aqi_change_6h'] = merged_df['aqi'].diff(6)
merged_df['aqi_ma_3h'] = merged_df['aqi'].rolling(window=3).mean()
merged_df['aqi_ma_6h'] = merged_df['aqi'].rolling(window=6).mean()
merged_df['aqi_ma_12h'] = merged_df['aqi'].rolling(window=12).mean()
merged_df['aqi_ma_24h'] = merged_df['aqi'].rolling(window=24).mean()
merged_df['aqi_lag_1h'] = merged_df['aqi'].shift(1)
merged_df['aqi_lag_3h'] = merged_df['aqi'].shift(3)
merged_df['aqi_lag_6h'] = merged_df['aqi'].shift(6)


print(f"Derived features created New shape: {merged_df.shape}")
merged_df[['datetime', 'aqi', 'aqi_change_1h', 'aqi_ma_3h', 'aqi_lag_1h']].head(10)


Derived features created New shape: (8784, 35)


Unnamed: 0,datetime,aqi,aqi_change_1h,aqi_ma_3h,aqi_lag_1h
0,2024-10-10 00:00:00+00:00,81,,,
1,2024-10-10 01:00:00+00:00,83,2.0,,81.0
2,2024-10-10 02:00:00+00:00,91,8.0,85.0,83.0
3,2024-10-10 03:00:00+00:00,101,10.0,91.666667,91.0
4,2024-10-10 04:00:00+00:00,97,-4.0,96.333333,101.0
5,2024-10-10 05:00:00+00:00,187,90.0,128.333333,97.0
6,2024-10-10 06:00:00+00:00,228,41.0,170.666667,187.0
7,2024-10-10 07:00:00+00:00,500,272.0,305.0,228.0
8,2024-10-10 08:00:00+00:00,246,-254.0,324.666667,500.0
9,2024-10-10 09:00:00+00:00,244,-2.0,330.0,246.0


### Weather pollution interaction features

In [18]:
merged_df['temp_humidity_interaction'] = merged_df['temperature'] * merged_df['humidity']
merged_df['wind_pollution_ratio'] = merged_df['wind_speed'] / (merged_df['pm25'] + 1)
merged_df['pressure_stability'] = merged_df['pressure'].rolling(window=6).std()


In [19]:
merged_df.shape

(8784, 38)

### Hopsworks Feature Store Integration

In [1]:
import hopsworks

project = hopsworks.login(
    project = 'aqi_prediction72',
    api_key_file = "hopsworks.key"
)

print("Connected to Project: ",project.name)
fs = project.get_feature_store()
print("Connected to Feature Store: ",fs.name)



2025-10-11 22:35:43,346 INFO: Initializing external client
2025-10-11 22:35:43,346 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-10-11 22:35:47,238 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1257622
Connected to Project:  aqi_prediction72
Connected to Feature Store:  aqi_prediction72_featurestore


In [21]:
print(f"Original dataset shape: {merged_df.shape}")
print(f"Columns: {list(merged_df.columns)}")

feature_df = merged_df.copy()

feature_df['timestamp'] = pd.to_datetime(feature_df['datetime'])
feature_df = feature_df.drop('datetime',axis=1)

# Handling categorical features

feature_df['season_encoded'] = feature_df['season'].map(
    {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Autumn': 3}
)

feature_df = feature_df.drop('season',axis=1)

feature_df['aqi_category_encoded'] = feature_df['aqi_category'].map({
    'Good': 0, 'Moderate': 1, 'Unhealthy for Sensitive Groups': 2,
    'Unhealthy': 3, 'Very Unhealthy': 4, 'Hazardous': 5
})

feature_df = feature_df.drop('aqi_category',axis=1)


Original dataset shape: (8784, 38)
Columns: ['datetime', 'temperature', 'humidity', 'pressure', 'wind_speed', 'wind_direction', 'precipitation', 'pm10', 'pm25', 'co', 'no2', 'o3', 'so2', 'aqi', 'aqi_category', 'hour', 'day', 'month', 'weekday', 'is_weekend', 'season', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'aqi_change_1h', 'aqi_change_3h', 'aqi_change_6h', 'aqi_ma_3h', 'aqi_ma_6h', 'aqi_ma_12h', 'aqi_ma_24h', 'aqi_lag_1h', 'aqi_lag_3h', 'aqi_lag_6h', 'temp_humidity_interaction', 'wind_pollution_ratio', 'pressure_stability']


In [22]:
# Ensuring all columns are numeric

print(feature_df.dtypes.value_counts())

for col in feature_df.columns:
    if feature_df[col].dtype == 'object':
        print(f"Column {col} is non-numeric")
        feature_df[col] = pd.to_numeric(feature_df[col],errors='coerce')
        
print(f"\nPrepared feature dataset shape: {feature_df.shape}")
print(f"All columns are numeric: {feature_df.dtypes.value_counts()}")

float64                29
int32                   5
int64                   3
datetime64[ns, UTC]     1
Name: count, dtype: int64

Prepared feature dataset shape: (8784, 38)
All columns are numeric: float64                29
int32                   5
int64                   3
datetime64[ns, UTC]     1
Name: count, dtype: int64


In [27]:
feature_df.sample(5)

Unnamed: 0,temperature,humidity,pressure,wind_speed,wind_direction,precipitation,pm10,pm25,co,no2,...,aqi_ma_24h,aqi_lag_1h,aqi_lag_3h,aqi_lag_6h,temp_humidity_interaction,wind_pollution_ratio,pressure_stability,timestamp,season_encoded,aqi_category_encoded
6368,31.5065,74.42955,997.9,15.680675,228.25731,0.1,91.7,27.5,219.0,3.9,...,103.166667,136.0,90.0,500.0,2345.014617,0.550199,0.496655,2025-07-02 08:00:00+00:00,2,2
126,35.9565,26.347225,1011.3,12.601285,89.18157,0.0,35.9,22.7,300.0,9.9,...,160.958333,213.0,82.0,146.0,947.353996,0.5317,0.598052,2024-10-15 06:00:00+00:00,3,4
6547,29.4065,83.693794,999.9,12.145665,230.4119,0.1,85.2,33.3,203.0,28.6,...,113.833333,98.0,95.0,97.0,2461.141553,0.354101,1.017186,2025-07-09 19:00:00+00:00,2,1
7023,28.456501,80.65368,998.0,18.777029,240.7321,0.1,43.9,19.7,245.0,13.6,...,96.125,68.0,84.0,112.0,2295.121526,0.907103,0.480278,2025-07-29 15:00:00+00:00,2,1
2897,22.7565,18.284933,1016.8,10.685391,57.380775,0.0,31.4,27.3,858.0,47.4,...,145.541667,88.0,95.0,229.0,416.101078,0.377576,0.983192,2025-02-07 17:00:00+00:00,0,1


### Creating Feature Group Schema

In [30]:
try:
    feature_descriptions = {
        'timestamp': 'Event timestamp',
        'temperature': 'Temperature in Celsius',
        'humidity': 'Relative humidity percentage',
        'pressure': 'Atmospheric pressure in hPa',
        'wind_speed': 'Wind speed in m/s',
        'wind_direction': 'Wind direction in degrees',
        'precipitation': 'Precipitation in mm',
        'pm10': 'PM10 concentration in µg/m³',
        'pm25': 'PM2.5 concentration in µg/m³',
        'co': 'Carbon monoxide concentration in µg/m³',
        'no2': 'Nitrogen dioxide concentration in ppb',
        'o3': 'Ozone concentration in ppb',
        'so2': 'Sulfur dioxide concentration in ppb',
        'aqi': 'Air Quality Index (target variable)',
        'hour': 'Hour of day (0-23)',
        'day': 'Day of month',
        'month': 'Month of year (1-12)',
        'weekday': 'Day of week (0=Monday, 6=Sunday)',
        'is_weekend': 'Weekend indicator (0=weekday, 1=weekend)',
        'season_encoded': 'Season encoded (0=Winter, 1=Spring, 2=Summer, 3=Autumn)',
        'aqi_category_encoded': 'AQI category encoded (0=Good to 5=Hazardous)'
    }
    
    aqi_fg = fs.create_feature_group(
        name = "aqi_features",
        version = 1,
        description = "AQI prediction features with weather data, pollutants, and engineered time-based features",
        primary_key = ['timestamp'],
        event_time = 'timestamp',
        online_enabled = True,
        validation_type = "STRICT",
        expectation_suite = None
    )
    
    print(f"Feature Group aqi_features created successfully!")
    
except Exception as e:
    print(f"Error creating feature group: {e}")

try:
    aqi_fg = fs.get_feature_group("aqi_features",version = 1)
    print(f"Feature Group aqi_features already exists!")
except Exception as e:
    print(f"Could not find feature group aqi_features!")
    print("Trying with simpler config..")
    aqi_fg = fs.create_feature_group(
        name="aqi_features_simple",
            version=1,
            description="AQI prediction features",
            primary_key=["timestamp"],
            event_time="timestamp"
        )
    print(f"Simplified Feature Group aqi_features_simple created successfully!")
        
        
        

Error creating feature group: FeatureStore.create_feature_group() got an unexpected keyword argument 'validation_type'
Feature Group aqi_features already exists!


### Uploading features to feature store

In [35]:
try:
    aqi_fg = fs.get_feature_group("aqi_features",version=1)
    print(f"✅ Successfully retrieved Feature Group: {aqi_fg.name}")
    print(f"   Version: {aqi_fg.version}")
    print(f"   Features: {len(aqi_fg.features)}")
    
except Exception as e:
    print(f"Could not retrieve Feature Group: {e}")
    try:
        aqi_fg = fs.create_feature_group(
            name="aqi_features",
            version=1,
            description="AQI prediction features",
            primary_key=["timestamp"],
            event_time="timestamp"
        )
        print("New Feature Group created!")
        
    except Exception as e2:
        if "already exists" in str(e2).lower():
            print("Feature Group exists but couldn't retrieve it.")
            try:
                aqi_fg = fs.create_feature_group(
                    name="aqi_features",
                    version=2,
                    description="AQI prediction features v2",
                    primary_key=["timestamp"],
                    event_time="timestamp"
                )
                print("Feature Group v2 created!")
            except Exception as e3:
                print(f"Still failed: {e3}")
        else:
            print(f"Creation failed: {e2}")
            


✅ Successfully retrieved Feature Group: aqi_features
   Version: 1
   Features: 38


In [36]:
if aqi_fg is not None:
    print(f"Dataset shape: {feature_df.shape}")
    
    try:
        job = aqi_fg.insert(
            feature_df,
            write_options={"wait_for_job": True}
        )
        
        print(f"Uploaded {len(feature_df)} records")
        
    except Exception as e:
        print(f"Upload failed: {e}")
        
else:
    print("Cannot upload")

Dataset shape: (8784, 38)


Uploading Dataframe: 100.00% |██████████| Rows 8784/8784 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: aqi_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1257622/jobs/named/aqi_features_1_offline_fg_materialization/executions
2025-10-11 23:08:46,588 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-10-11 23:08:49,966 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-10-11 23:10:20,594 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-10-11 23:10:20,899 INFO: Waiting for log aggregation to finish.
2025-10-11 23:10:30,196 INFO: Execution finished successfully.
Uploaded 8784 records


### Creating feature view for ML Training

In [38]:
try:
    feature_view = fs.create_feature_view(
        name = "aqi_prediction_fv",
        version = 1,
        description = "Feature view for AQI prediction ML Training",
        query = aqi_fg.select_all(),
        labels = ["aqi"] # -> Target variable
    )
    print(f"Feature View aqi_prediction_fv created successfully!")
except Exception as e:
    if "already exists" in str(e).lower():
        print(f"Feature View aqi_prediction_fv already exists!")
        feature_view = fs.get_feature_view(
            name = "aqi_prediction_fv",
            version = 1
        )
    else:
        print(f"Error creating feature view: {e}")
        
print(f"Feature Group: {aqi_fg.name}")
print(f"Records in Feature Group: 8784")
print(f"Features in Feature Group: 38")
print(f"Target: AQI")
print("Ready for ML Training!")
        
    


Feature View aqi_prediction_fv already exists!
Feature Group: aqi_features
Records in Feature Group: 8784
Features in Feature Group: 38
Target: AQI
Ready for ML Training!
