In [72]:
!pip install openmeteo-requests==1.3.0 requests-cache==1.2.1 retry-requests==2.0.0 numpy==1.26.4 pandas geopy folium streamlit-folium geopy --q

import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, timezone
import openmeteo_requests 
from openmeteo_requests import Client
import requests_cache
from retry_requests import retry

In [74]:
import os 
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [76]:
# API Key and City Name
API_KEY = '5bdede423214bc661bc2c20f401a0e14'
CITY = 'karachi'

url = f"http://api.openweathermap.org/geo/1.0/direct?q={CITY},PK&limit=5&appid={API_KEY}"
# Make the API request
response = requests.get(url)
raw = response.json()

In [78]:
# Extract latitude and longitude
latitude = raw[0]["lat"]
longitude = raw[0]["lon"]
print(f"Latitude: {latitude}, Longitude: {longitude}")

Latitude: 24.8546842, Longitude: 67.0207055


In [80]:
#Set time range for air pollution data
current_datetime = datetime.now(timezone.utc)  # Current UTC time as timezone-aware
current_unix_time = int(current_datetime.timestamp())  # Current time in UNIX format

start_datetime = current_datetime - timedelta(days=365)  # Start time 1 year ago
start_unix_time = int(start_datetime.timestamp())


In [82]:
#Fetch air pollution data
pollution_url = (
        f"http://api.openweathermap.org/data/2.5/air_pollution/history?"
        f"lat={latitude}&lon={longitude}&start={start_unix_time}&end={current_unix_time}&appid={API_KEY}"
)
pollution_response = requests.get(pollution_url)
pollution_data = pollution_response.json()

In [83]:
karachi = pd.json_normalize(pollution_data["list"])
print(karachi)

              dt  main.aqi  components.co  components.no  components.no2  \
0     1706025600         5        2216.34           2.32           90.48   
1     1706029200         5        4165.65          33.53          111.04   
2     1706032800         5        8117.68         103.71          139.83   
3     1706036400         5       13031.01         173.45          197.41   
4     1706040000         5       14312.74         184.18          222.09   
...          ...       ...            ...            ...             ...   
8635  1737543600         3         397.21           1.02            7.28   
8636  1737547200         3         407.22           0.79            9.17   
8637  1737550800         3         907.90           0.58           34.62   
8638  1737554400         4        1428.60           0.00           56.21   
8639  1737558000         5        1655.58           0.01           58.95   

      components.o3  components.so2  components.pm2_5  components.pm10  \
0            

In [84]:
#Convert Unix Timestamps to Readable Date/Time
karachi['dt'] = pd.to_datetime(karachi['dt'], unit='s')
karachi.set_index('dt', inplace=True)

karachi.sort_values(by='dt', ascending=True)

Unnamed: 0_level_0,main.aqi,components.co,components.no,components.no2,components.o3,components.so2,components.pm2_5,components.pm10,components.nh3
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-01-23 16:00:00,5,2216.34,2.32,90.48,20.38,38.62,109.09,179.84,29.39
2024-01-23 17:00:00,5,4165.65,33.53,111.04,0.06,46.25,208.32,295.13,57.76
2024-01-23 18:00:00,5,8117.68,103.71,139.83,0.00,66.76,439.62,561.32,106.39
2024-01-23 19:00:00,5,13031.01,173.45,197.41,0.00,86.78,737.33,907.86,151.99
2024-01-23 20:00:00,5,14312.74,184.18,222.09,0.00,87.74,827.94,1013.23,151.99
...,...,...,...,...,...,...,...,...,...
2025-01-22 11:00:00,3,397.21,1.02,7.28,124.45,6.97,19.63,41.86,2.95
2025-01-22 12:00:00,3,407.22,0.79,9.17,117.30,7.45,18.36,38.50,2.88
2025-01-22 13:00:00,3,907.90,0.58,34.62,82.97,11.09,33.28,54.82,10.77
2025-01-22 14:00:00,4,1428.60,0.00,56.21,48.64,13.71,58.19,84.51,20.52


In [88]:
current = datetime.now(timezone.utc).date() - timedelta(days=365 + 5)
end = datetime.now(timezone.utc).date() - timedelta(days=5)

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = Client(session=retry_session)

# Define the API parameters
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 24.8546842,
    "longitude": 67.0207055,
    "start_date": current,
    "end_date": end,
    "hourly": ["temperature_2m", "relative_humidity_2m"]
}

# Make API call
responses = openmeteo.weather_api(url, params=params)

# Process the first response
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process hourly data
hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()

hourly_data = {
    "date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left"
    ),
    "temperature_2m": hourly_temperature_2m,
    "relative_humidity_2m": hourly_relative_humidity_2m
}

hourly_dataframe = pd.DataFrame(data=hourly_data)
hourly_dataframe["date"] = pd.to_datetime(hourly_dataframe["date"]).dt.tz_localize(None)
print(hourly_dataframe)

# Forecasting setup
start_date = datetime.now(timezone.utc) - timedelta(days=5)
end_date = datetime.now(timezone.utc)

url = "https://api.open-meteo.com/v1/forecast"
params = {
    "latitude": 24.8546842,
    "longitude": 67.0207055,
    "hourly": ["temperature_2m", "relative_humidity_2m"],
    "start_date": start_date.strftime("%Y-%m-%d"),
    "end_date": end_date.strftime("%Y-%m-%d")
}

responses = openmeteo.weather_api(url, params=params)

# Process forecast response
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()

hourly_data = {
    "date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left"
    ),
    "temperature_2m": hourly_temperature_2m,
    "relative_humidity_2m": hourly_relative_humidity_2m
}

dataframe = pd.DataFrame(data=hourly_data)
print(dataframe)


Coordinates 24.850614547729492°N 66.99248504638672°E
Elevation 8.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
                    date  temperature_2m  relative_humidity_2m
0    2024-01-18 00:00:00       16.500000             63.928204
1    2024-01-18 01:00:00       15.700000             67.955162
2    2024-01-18 02:00:00       15.150000             69.456673
3    2024-01-18 03:00:00       15.450000             66.997360
4    2024-01-18 04:00:00       17.450001             58.008617
...                  ...             ...                   ...
8779 2025-01-17 19:00:00       17.500000             44.433994
8780 2025-01-17 20:00:00       17.350000             42.423756
8781 2025-01-17 21:00:00       17.200001             41.063839
8782 2025-01-17 22:00:00       16.850000             42.280239
8783 2025-01-17 23:00:00       16.400000             46.011971

[8784 rows x 3 columns]
Coordinates 24.875°N 67.0°E
Elevation 8.0 m asl
Timezone None None
Timezone difference to GMT+

In [90]:
dataframe["date"] = pd.to_datetime(dataframe["date"]).dt.tz_localize(None)

dataframe

df = pd.concat([hourly_dataframe, dataframe], axis = 0)
df

final_df = pd.merge(karachi, df, right_on="date", left_on= "dt", how="inner")

#change the . in columns names to _
final_df.columns = final_df.columns.str.replace('.', '_')

final_df.sort_values(by="date", ascending=True).tail()

final_df["index"] = final_df.index

In [92]:
final_df.tail()

Unnamed: 0,main_aqi,components_co,components_no,components_no2,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,date,temperature_2m,relative_humidity_2m,index
8659,3,397.21,1.02,7.28,124.45,6.97,19.63,41.86,2.95,2025-01-22 11:00:00,25.8085,34.0,8659
8660,3,407.22,0.79,9.17,117.3,7.45,18.36,38.5,2.88,2025-01-22 12:00:00,24.658501,42.0,8660
8661,3,907.9,0.58,34.62,82.97,11.09,33.28,54.82,10.77,2025-01-22 13:00:00,23.3085,46.0,8661
8662,4,1428.6,0.0,56.21,48.64,13.71,58.19,84.51,20.52,2025-01-22 14:00:00,22.008501,52.0,8662
8663,5,1655.58,0.01,58.95,38.62,13.95,76.86,107.71,26.35,2025-01-22 15:00:00,21.008501,59.0,8663


In [94]:
# Check for missing values
print(final_df.isnull().sum())

main_aqi                0
components_co           0
components_no           0
components_no2          0
components_o3           0
components_so2          0
components_pm2_5        0
components_pm10         0
components_nh3          0
date                    0
temperature_2m          0
relative_humidity_2m    0
index                   0
dtype: int64


In [96]:
# Ensure the 'date' column is in datetime format
final_df['date'] = pd.to_datetime(final_df['date']).dt.tz_localize(None)

# Extract additional columns from the 'date' column
final_df['time'] = final_df['date'].dt.time
final_df['day'] = final_df['date'].dt.day
final_df['month'] = final_df['date'].dt.month
final_df['year'] = final_df['date'].dt.year
final_df['hour'] = final_df['date'].dt.hour
final_df['day_of_week'] = final_df['date'].dt.dayofweek

final_df['time'] = final_df['time'].apply(lambda x: x.strftime('%H:%M:%S') if isinstance(x, pd.Timestamp) else str(x))

# Determine whether the day is a weekend (Saturday = 5, Sunday = 6)
final_df['is_weekend'] = final_df['day_of_week'].apply(lambda x: 1 if x in [5, 6] else 0)

# Display the last few rows sorted by date
final_df.sort_values(by="date", ascending=True).tail()


Unnamed: 0,main_aqi,components_co,components_no,components_no2,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,date,temperature_2m,relative_humidity_2m,index,time,day,month,year,hour,day_of_week,is_weekend
8659,3,397.21,1.02,7.28,124.45,6.97,19.63,41.86,2.95,2025-01-22 11:00:00,25.8085,34.0,8659,11:00:00,22,1,2025,11,2,0
8660,3,407.22,0.79,9.17,117.3,7.45,18.36,38.5,2.88,2025-01-22 12:00:00,24.658501,42.0,8660,12:00:00,22,1,2025,12,2,0
8661,3,907.9,0.58,34.62,82.97,11.09,33.28,54.82,10.77,2025-01-22 13:00:00,23.3085,46.0,8661,13:00:00,22,1,2025,13,2,0
8662,4,1428.6,0.0,56.21,48.64,13.71,58.19,84.51,20.52,2025-01-22 14:00:00,22.008501,52.0,8662,14:00:00,22,1,2025,14,2,0
8663,5,1655.58,0.01,58.95,38.62,13.95,76.86,107.71,26.35,2025-01-22 15:00:00,21.008501,59.0,8663,15:00:00,22,1,2025,15,2,0


In [98]:
print(final_df.columns)

Index(['main_aqi', 'components_co', 'components_no', 'components_no2',
       'components_o3', 'components_so2', 'components_pm2_5',
       'components_pm10', 'components_nh3', 'date', 'temperature_2m',
       'relative_humidity_2m', 'index', 'time', 'day', 'month', 'year', 'hour',
       'day_of_week', 'is_weekend'],
      dtype='object')


In [100]:
aqi_mapping = {
    1: 'Good',
    2: 'Fair',
    3: 'Moderate',
    4: 'Poor',
    5: 'Very Poor'
}
final_df['aqi_category'] = final_df['main_aqi'].map(aqi_mapping)

In [102]:
# Adding lag features
final_df['aqi_lag_1'] = final_df['main_aqi'].shift(1)
final_df['aqi_lag_2'] = final_df['main_aqi'].shift(2)
final_df['aqi_lag_3'] = final_df['main_aqi'].shift(3)

In [104]:
# Rolling averages for AQI
final_df['aqi_rolling_mean_3'] = final_df['main_aqi'].rolling(window=3).mean()
final_df['aqi_rolling_mean_7'] = final_df['main_aqi'].rolling(window=7).mean()

In [106]:
# Interaction terms
final_df['co_pm2_5_interaction'] = final_df['components_co'] * final_df['components_pm2_5']
final_df['temperature_humidity_interaction'] = final_df['temperature_2m'] * final_df['relative_humidity_2m']

In [108]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_columns = ['main_aqi', 'components_co', 'components_pm2_5', 'temperature_2m', 'relative_humidity_2m']
final_df[scaled_columns] = scaler.fit_transform(final_df[scaled_columns])

In [110]:
print(final_df['temperature_humidity_interaction'].dtype)

float32


In [112]:
final_df['temperature_humidity_interaction'] = final_df['temperature_humidity_interaction'].astype('float64')


In [114]:
final_df['date'] = pd.to_datetime(final_df['date'])

In [116]:
final_df['year'] = final_df['date'].dt.year
final_df['month'] = final_df['date'].dt.month
final_df['day'] = final_df['date'].dt.day
final_df['weekday'] = final_df['date'].dt.weekday

In [118]:
final_df['aqi_category'] = final_df['aqi_category'].astype('category').cat.codes

In [120]:
final_df = final_df.drop(columns=['date', 'time'])

In [122]:
print(final_df.columns)

Index(['main_aqi', 'components_co', 'components_no', 'components_no2',
       'components_o3', 'components_so2', 'components_pm2_5',
       'components_pm10', 'components_nh3', 'temperature_2m',
       'relative_humidity_2m', 'index', 'day', 'month', 'year', 'hour',
       'day_of_week', 'is_weekend', 'aqi_category', 'aqi_lag_1', 'aqi_lag_2',
       'aqi_lag_3', 'aqi_rolling_mean_3', 'aqi_rolling_mean_7',
       'co_pm2_5_interaction', 'temperature_humidity_interaction', 'weekday'],
      dtype='object')


In [124]:
print(final_df.dtypes)

main_aqi                            float64
components_co                       float64
components_no                       float64
components_no2                      float64
components_o3                       float64
components_so2                      float64
components_pm2_5                    float64
components_pm10                     float64
components_nh3                      float64
temperature_2m                      float64
relative_humidity_2m                float64
index                                 int64
day                                   int32
month                                 int32
year                                  int32
hour                                  int32
day_of_week                           int32
is_weekend                            int64
aqi_category                           int8
aqi_lag_1                           float64
aqi_lag_2                           float64
aqi_lag_3                           float64
aqi_rolling_mean_3              

In [126]:
final_df = final_df.bfill()

In [128]:
print(final_df.isnull().sum())

main_aqi                            0
components_co                       0
components_no                       0
components_no2                      0
components_o3                       0
components_so2                      0
components_pm2_5                    0
components_pm10                     0
components_nh3                      0
temperature_2m                      0
relative_humidity_2m                0
index                               0
day                                 0
month                               0
year                                0
hour                                0
day_of_week                         0
is_weekend                          0
aqi_category                        0
aqi_lag_1                           0
aqi_lag_2                           0
aqi_lag_3                           0
aqi_rolling_mean_3                  0
aqi_rolling_mean_7                  0
co_pm2_5_interaction                0
temperature_humidity_interaction    0
weekday     

In [130]:
pip install hopsworks[python]

Note: you may need to restart the kernel to use updated packages.


In [132]:
import warnings
warnings.filterwarnings("ignore")

hopsworks_api = "ir5PKrvMxVGQtr4I.OJAzB9b685t2LvfMHguGosCsipkeOyV0XSRsiz5ia81FyxNkSlgHW5eGY6b3W99O"

import hopsworks

project = hopsworks.login()

fs = project.get_feature_store()

2025-01-22 20:34:07,204 INFO: Closing external client and cleaning up certificates.
2025-01-22 20:34:07,208 INFO: Initializing external client
2025-01-22 20:34:07,209 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-22 20:34:22,369 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1210516


In [134]:
# Define the feature group
air_quality_fg = fs.get_or_create_feature_group(
    name='aqi_featuregroup',
    description='Air Quality characteristics of each day',
    version=2,
    primary_key=["index"],
    online_enabled=True
)

# Insert the data into the feature group
air_quality_fg.insert(final_df, write_options={"wait_for_job": False})


Uploading Dataframe: 100.00% |███████████████████████████| Rows 8664/8664 | Elapsed Time: 00:22 | Remaining Time: 00:00


Launching job: aqi_featuregroup_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1210516/jobs/named/aqi_featuregroup_2_offline_fg_materialization/executions


(Job('aqi_featuregroup_2_offline_fg_materialization', 'SPARK'), None)