In [1]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether the active IPython shell is a Google Colab shell.

    The check looks for the substring "google.colab" in the string form of
    the object returned by ``get_ipython()``.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description

def clone_repository() -> None:
    # Clone the mlfs-book repository from GitHub and make the checkout the
    # notebook's working directory.
    # Both lines use IPython syntax (`!` shell escape, `%cd` line magic), so this
    # function only works when the cell is run by an IPython/Jupyter kernel.
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    # Upgrade/install `uv` with pip, then let `uv pip install` resolve the
    # requirements listed in pyproject.toml (all extras, `--system` target).
    # pyproject.toml is given as a relative path, so it is read from the current
    # working directory. Uses the IPython `!` shell escape.
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

running_in_colab = is_google_colab()
if running_in_colab:
    # Colab: fetch the repository, install its dependencies, and use the
    # resulting working directory as the project root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    # Local run: step out of `airquality` and then `notebooks` when the notebook
    # was started from inside one of those subdirectories.
    project_path = Path().absolute()
    for nested_dir in ("airquality", "notebooks"):
        if project_path.name == nested_dir:
            project_path = project_path.parent
    root_dir = str(project_path)
    print("Local environment")

# Put the project root on the import path so `mlfs` can be imported below.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Build the settings object from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Added the following directory to the PYTHONPATH: /Users/alexanderdahm/Documents/GitHub/mlfs-book-proj
HopsworksSettings initialized!


<span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 02: Daily Feature Pipeline for Air Quality (aqicn.org) and weather (openmeteo)</span>

## 🗒️ This notebook is divided into the following sections:
1. Download and Parse Data
2. Feature Group Insertion


__This notebook should be scheduled to run daily__

In the book, we use a GitHub Action stored here:
[.github/workflows/air-quality-daily.yml](https://github.com/featurestorebook/mlfs-book/blob/main/.github/workflows/air-quality-daily.yml)

However, you are free to use any Python Orchestration tool to schedule this program to run daily.

### <span style='color:#ff5f27'> 📝 Imports </span>

In [2]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from mlfs.airquality import util
from mlfs import config
import json
import os
import warnings
warnings.filterwarnings("ignore")






In [3]:
class Sensor:
    """Plain record describing one sensor location.

    Args:
        name: Identifier of the sensor.
        city: City name.
        lat: Latitude; anything accepted by ``float()``.
        lon: Longitude; anything accepted by ``float()``.
        csv: Path of an associated CSV file; empty string when there is none.
        country: Country name. Defaults to "sweden".
        street: Street name. When omitted (``None``) the city name is used.
        url: Sensor URL. Defaults to "" (no explicit AQICN URL provided).

    Raises:
        ValueError / TypeError: if ``lat`` or ``lon`` cannot be converted by ``float()``.
    """

    def __init__(self, name, city, lat, lon, csv="", country="sweden", street=None, url=""):
        self.name = name
        self.country = country
        self.city = city
        # Fall back to the city name when no street is given.
        self.street = city if street is None else street
        self.url = url
        # Coordinates are always stored as floats, whatever type was passed in.
        self.lat = float(lat)
        self.lon = float(lon)
        self.csv = csv

# Cities handled by this notebook, each with its latitude and longitude.
cities = [
    dict(name="flasjon", lat=62.760350390111626, lon=13.715986496712969),
    dict(name="hudiksvall", lat=61.790862930411194, lon=17.15754858778168),
    dict(name="ange", lat=62.54989082316923, lon=15.751547550392734),
    dict(name="solleftea", lat=63.159587742988755, lon=17.2655114712721),
    dict(name="umea", lat=63.81702480736613, lon=20.18691175826482),
]

# One Sensor per city, named sensor0, sensor1, ... in list order, with an empty csv path.
sensorList = [
    Sensor(f"sensor{position}", place["name"], place["lat"], place["lon"], "")
    for position, place in enumerate(cities)
]

# Echo the configured sensors.
for sensor in sensorList:
    print(
        f"Processing sensor: {sensor.name} located at {sensor.street}, {sensor.city}, {sensor.country} "
        f"with coordinates ({sensor.lat}, {sensor.lon}), csv path {sensor.csv}"
    )


Processing sensor: sensor0 located at flasjon, flasjon, sweden with coordinates (62.760350390111626, 13.715986496712969), csv path 
Processing sensor: sensor1 located at hudiksvall, hudiksvall, sweden with coordinates (61.790862930411194, 17.15754858778168), csv path 
Processing sensor: sensor2 located at ange, ange, sweden with coordinates (62.54989082316923, 15.751547550392734), csv path 
Processing sensor: sensor3 located at solleftea, solleftea, sweden with coordinates (63.159587742988755, 17.2655114712721), csv path 
Processing sensor: sensor4 located at umea, umea, sweden with coordinates (63.81702480736613, 20.18691175826482), csv path 


## <span style='color:#ff5f27'> 🌍 Get the Sensor URL, Country, City, Street names from Hopsworks </span>

__Update the values in the cell below.__

__These should be the same values as in notebook 1 - the feature backfill notebook__


In [4]:
# Log in to Hopsworks using the Python engine and keep handles to the
# feature store and the secrets API.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()

# Date of this run.
today = datetime.date.today()

2025-12-29 16:16:14,655 INFO: Initializing external client
2025-12-29 16:16:14,656 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-29 16:16:16,156 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1290388


### <span style="color:#ff5f27;"> 🔮 Get references to the Feature Groups </span>

In [5]:
# Look up version 1 of the `energy_price` and `weather` feature groups.
energy_price_fg = fs.get_feature_group(name="energy_price", version=1)
weather_fg = fs.get_feature_group(name="weather", version=1)


In [6]:
# Per-city daily weather frames; merged on "date" into one wide frame further down.
all_weather_data = []

# Build a one-row frame with today's energy price (zone is fixed to "SE2").
price_value = util.get_energy_price(date=today)
energy_price_today_df = pd.DataFrame({"date": [today], "sek": [price_value]})
energy_price_today_df['sek'] = energy_price_today_df['sek'].astype('float32')
energy_price_today_df['zone'] = "SE2"

# Parse the date as UTC, then drop the timezone so the column is a naive datetime.
energy_price_today_df["date"] = pd.to_datetime(energy_price_today_df["date"], utc=True).dt.tz_convert(None)

print(energy_price_today_df)
print(energy_price_today_df.dtypes)

# Fetch the forecast for every sensor and reduce it to one row per day.
for sensor in sensorList:
    city = sensor.city
    latitude = sensor.lat
    longitude = sensor.lon

    # NOTE(review): presumably returns one row per forecast hour with a "date" column — confirm in mlfs.airquality.util
    hourly_df = util.get_hourly_weather_forecast(city, latitude, longitude)
    hourly_df = hourly_df.set_index("date")

    # Keep only the rows whose time of day falls between 11:59 and 12:01,
    # i.e. a single (noon) forecast per day.
    daily_df = hourly_df.between_time('11:59', '12:01')
    daily_df = daily_df.reset_index()
    # Drop the time-of-day component but keep a datetime column.
    daily_df['date'] = pd.to_datetime(daily_df['date']).dt.date
    daily_df['date'] = pd.to_datetime(daily_df['date'])
    # Suffix every column except "date" with the city name so the per-city
    # frames can be merged side by side.
    daily_df = daily_df.rename(columns={col: f"{col}_{city}" for col in daily_df.columns if col != "date"})
    all_weather_data.append(daily_df)

energy_price_fg.insert(energy_price_today_df)

# Bind the name up front so the final display cannot raise NameError when
# no per-city frame was collected.
combined_weather_df = pd.DataFrame()
if all_weather_data:
    combined_weather_df = all_weather_data[0]
    for city_weather_df in all_weather_data[1:]:
        combined_weather_df = pd.merge(combined_weather_df, city_weather_df, on="date", how="outer")
    weather_fg.insert(combined_weather_df, wait=True)

combined_weather_df


        date   sek zone
0 2025-12-29  2.71  SE2
date    datetime64[ns]
sek            float32
zone            object
dtype: object
Coordinates 62.75¬∞N 13.75¬∞E
Elevation 478.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 61.75¬∞N 17.25¬∞E
Elevation 65.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 62.5¬∞N 15.75¬∞E
Elevation 165.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 63.25¬∞N 17.25¬∞E
Elevation 66.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Coordinates 63.75¬∞N 20.25¬∞E
Elevation 18.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
2025-12-29 16:16:18,331 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1290388/fs/1279043/fg/1878498


Uploading Dataframe: 100.00% |‚ñà| Rows 1/1 | Elapsed Time: 00:00 | Remaining Time


Launching job: energy_price_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1290388/jobs/named/energy_price_1_offline_fg_materialization/executions


Uploading Dataframe: 100.00% |‚ñà| Rows 7/7 | Elapsed Time: 00:00 | Remaining Time


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1290388/jobs/named/weather_1_offline_fg_materialization/executions
2025-12-29 16:16:45,740 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-12-29 16:16:52,132 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-29 16:18:47,486 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-29 16:18:47,665 INFO: Waiting for log aggregation to finish.
2025-12-29 16:19:09,799 INFO: Execution finished successfully.


Unnamed: 0,date,temperature_2m_mean_flasjon,precipitation_sum_flasjon,wind_speed_10m_max_flasjon,wind_direction_10m_dominant_flasjon,temperature_2m_mean_hudiksvall,precipitation_sum_hudiksvall,wind_speed_10m_max_hudiksvall,wind_direction_10m_dominant_hudiksvall,temperature_2m_mean_ange,...,wind_speed_10m_max_ange,wind_direction_10m_dominant_ange,temperature_2m_mean_solleftea,precipitation_sum_solleftea,wind_speed_10m_max_solleftea,wind_direction_10m_dominant_solleftea,temperature_2m_mean_umea,precipitation_sum_umea,wind_speed_10m_max_umea,wind_direction_10m_dominant_umea
0,2025-12-29,-6.35,0.0,24.627789,307.875031,-2.05,0.0,22.657131,315.643677,-3.55,...,18.775303,327.528839,-5.2,0.0,16.79314,329.036316,-5.7,0.0,22.73707,336.682251
1,2025-12-30,-5.5,0.0,15.408671,307.405426,-3.7,0.0,18.204042,335.46228,-5.15,...,13.830749,321.340179,-8.1,0.0,11.200571,315.000092,-8.15,0.0,16.119801,330.572632
2,2025-12-31,-6.85,0.0,3.893995,213.690094,-6.6,0.0,9.178235,318.179901,-10.0,...,2.968636,255.96373,-15.35,0.0,3.6,323.130005,-13.5,0.0,9.693296,328.671356
3,2026-01-01,-6.85,0.6,19.130875,70.201042,-0.5,0.2,22.539564,116.564987,-3.6,...,21.267441,87.089226,-4.25,0.6,21.986542,92.815498,-6.45,0.0,16.23596,86.186005
4,2026-01-02,-15.0,0.0,13.584932,32.005356,-4.35,0.1,32.682762,49.467251,-11.25,...,16.394829,19.23077,-12.3,0.0,17.287498,31.372961,-12.1,0.0,17.518356,9.462248
5,2026-01-03,-21.049999,0.0,9.957108,282.528809,-9.75,0.1,24.724951,343.939941,-15.9,...,13.104198,344.054535,-14.65,0.0,12.496719,348.366394,-11.35,0.1,21.746504,6.65433
6,2026-01-04,-14.55,0.2,11.966954,15.708701,-5.35,0.9,31.92654,352.875061,-11.6,...,17.643673,1.169115,-9.5,0.1,13.708391,13.671325,-6.5,0.5,22.545315,38.516956


---