In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Return True when the active IPython shell is a Google Colab shell.

    Detection is a substring check for ``"google.colab"`` in the string form
    of the object returned by ``get_ipython()``.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description

def clone_repository() -> None:
    """Clone the mlfs-book repository and change into its directory.

    Uses IPython shell/magic syntax (``!git`` and ``%cd``), so it only runs
    inside an IPython/Jupyter session.
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install ``uv`` with pip, then install the project's dependencies.

    The second command uses ``uv pip`` to install everything required by
    ``pyproject.toml`` (including all extras) into the system environment.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Resolve the project root, depending on where the notebook is running.
if is_google_colab():
    # On Colab the repository is cloned first; clone_repository() also changes
    # into it, so the current working directory is the project root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_path = Path().absolute()
    # If the notebook was started from <root>/notebooks/energy_price or from
    # <root>/notebooks, drop those trailing components to reach <root>.
    for trailing_dir in ("energy_price", "notebooks"):
        if root_path.parts[-1:] == (trailing_dir,):
            root_path = Path(*root_path.parts[:-1])
    root_dir = str(root_path)
    print("Local environment")

print(f"Root dir: {root_dir}")

# Put the project root on sys.path so the project's `mlfs` package can be imported.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Load the settings from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/alexanderdahm/Documents/GitHub/mlfs-book-proj
Added the following directory to the PYTHONPATH: /Users/alexanderdahm/Documents/GitHub/mlfs-book-proj
HopsworksSettings initialized!


### <span style='color:#ff5f27'> 📝 Imports</span>

In [2]:
import datetime
import requests
import pandas as pd
import hopsworks
from mlfs.energy_price import util
import datetime
from pathlib import Path
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")






---

In [3]:
# Log in to Hopsworks; the returned project handle is used later to obtain the feature store.
project = hopsworks.login(engine="python")

2026-01-03 20:03:39,699 INFO: Initializing external client
2026-01-03 20:03:39,699 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-03 20:03:41,223 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1290388


In [4]:

# Reference date for this run and location of the historical day-ahead price CSV.
today = datetime.date.today()
csv_file=f"{root_dir}/data/Day-ahead_SE2_SEK_2023-2025ytd.csv"
util.check_file_path(csv_file)

# The key comes from `settings`, which was loaded from <root_dir>/.env.
# You can also replace settings.AQICN_API_KEY with the api key value as a string "...."
if settings.AQICN_API_KEY is None:
    print("You need to set AQICN_API_KEY either in this cell or in ~/.env")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Never echo the key itself: cell output is saved with the notebook and would
# leak the secret to anyone who can read the file.
print("Found AQICN_API_KEY (value hidden)")

secrets = hopsworks.get_secrets_api()
# Replace any existing secret with the new value
secret = secrets.get_secret("AQICN_API_KEY")
if secret is not None:
    secret.delete()
    print("Replacing existing AQICN_API_KEY")

# Last expression of the cell: the created secret is shown as the cell output.
secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

File successfully found at the path: /Users/alexanderdahm/Documents/GitHub/mlfs-book-proj/data/Day-ahead_SE2_SEK_2023-2025ytd.csv
Found AQICN_API_KEY: [REDACTED]
Replacing existing AQICN_API_KEY
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

In [5]:
# Read the semicolon-separated price file; the `date` column is parsed as datetime.
df = pd.read_csv(
    csv_file,
    sep=';',
    decimal=',',
    skipinitialspace=True,
    parse_dates=['date'],
)

# Normalise `sek` to float: go through str, strip spaces, turn a decimal comma into a dot.
sek_text = df['sek'].astype(str)
sek_text = sek_text.str.replace(' ', '', regex=False)
sek_text = sek_text.str.replace(',', '.', regex=False)
df['sek'] = sek_text.astype(float)

print(df.dtypes)

date    datetime64[ns]
sek            float64
dtype: object


## Check the data types for the columns in your DataFrame

 * `date` should be of type   datetime64[ns] 
 * `sek` should be of type float64

In [None]:
# Keep only the columns that go into the feature group and downcast the price to float32.
# Selecting and casting in one expression yields an independent DataFrame, so no
# column is ever assigned on a slice of `df` (which raises SettingWithCopyWarning).
df_ep = df[['date', 'sek']].astype({'sek': 'float32'})
df_ep.dtypes


date    datetime64[ns]
sek            float32
dtype: object

In [None]:
# Summarise df_ep: row count, non-null counts per column, dtypes and memory usage
df_ep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1089 entries, 0 to 1088
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1089 non-null   datetime64[ns]
 1   sek     1089 non-null   float32       
dtypes: datetime64[ns](1), float32(1)
memory usage: 12.9 KB


In [None]:
# Drop rows with missing values, then order the rows chronologically with a fresh index.
# The chained, non-inplace form avoids mutating a frame that may still be a
# slice of `df` and keeps the whole step in a single assignment.
df_ep = df_ep.dropna().sort_values("date").reset_index(drop=True)

df_ep.dtypes

date    datetime64[ns]
sek            float32
dtype: object

In [None]:
# Tag every row with the zone these prices belong to.
df_ep = df_ep.assign(zone="SE2")
df_ep.dtypes

date    datetime64[ns]
sek            float32
zone            object
dtype: object

In [None]:
# Re-check the schema now that the zone column has been added
df_ep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1089 entries, 0 to 1088
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1089 non-null   datetime64[ns]
 1   sek     1089 non-null   float32       
 2   zone    1089 non-null   object        
dtypes: datetime64[ns](1), float32(1), object(1)
memory usage: 21.4+ KB


---


Load weather data from 5 different cities located in our energy zone. Then concat all values into a single dataframe. There will therefore be 5 distinct weather values on each date.

The weather features we will download are:

 * `temperature (average over the day)`
 * `precipitation (the total over the day)`
 * `wind speed (maximum over the day)`
 * `wind direction (the most dominant direction over the day)`


In [None]:
# The weather history starts on the earliest date present in the price data.
earliest_aq_date = df_ep['date'].min().strftime('%Y-%m-%d')

# 5 different weather locations, each with a name and coordinates
cities = [
    {"name": "flasjon", "lat": 62.760350390111626, "lon": 13.715986496712969},
    {"name": "hudiksvall", "lat": 61.790862930411194, "lon": 17.15754858778168},
    {"name": "ange", "lat": 62.54989082316923, "lon": 15.751547550392734},
    {"name": "solleftea", "lat": 63.159587742988755, "lon": 17.2655114712721},
    {"name": "umea", "lat": 63.81702480736613, "lon": 20.18691175826482},
]

# Store the individual city data frames
all_weather_data = []

for city in cities:
    weather_df = util.get_historical_weather(city["name"], earliest_aq_date, str(today), city["lat"], city["lon"])
    # Drop the city column and suffix every feature column with the city name,
    # so the per-city frames can be joined side by side on `date`.
    weather_df = weather_df.drop(columns=["city"])
    weather_df = weather_df.rename(columns={col: f"{col}_{city['name']}" for col in weather_df.columns if col != "date"})
    print(weather_df)
    all_weather_data.append(weather_df)

# Merge all dataframes on date.
# The loop variable is deliberately not called `df`: that name holds the price
# DataFrame, and overwriting it would break re-running the earlier cells.
combined_weather_df = all_weather_data[0]
for city_weather_df in all_weather_data[1:]:
    combined_weather_df = pd.merge(combined_weather_df, city_weather_df, on="date", how="outer")



Coordinates 62.74164962768555°N 13.77550983428955°E
Elevation 478.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
           date  temperature_2m_mean_flasjon  precipitation_sum_flasjon  \
0    2023-01-01                    -8.666417                   0.000000   
1    2023-01-02                   -14.641418                   0.000000   
2    2023-01-03                    -9.841416                   0.000000   
3    2023-01-04                   -11.476833                   1.900000   
4    2023-01-05                    -9.262250                   2.400000   
...         ...                          ...                        ...   
1094 2025-12-30                    -5.051834                   3.400000   
1095 2025-12-31                    -5.476833                   4.100000   
1096 2026-01-01                    -6.776834                  17.299999   
1097 2026-01-02                   -12.337251                   2.100000   
1098 2026-01-03                   -15.456000   

In [12]:
# Check column names, dtypes and non-null counts of the merged weather frame
combined_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099 entries, 0 to 1098
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   date                                    1099 non-null   datetime64[ns]
 1   temperature_2m_mean_flasjon             1099 non-null   float32       
 2   precipitation_sum_flasjon               1099 non-null   float32       
 3   wind_speed_10m_max_flasjon              1099 non-null   float32       
 4   wind_direction_10m_dominant_flasjon     1099 non-null   float32       
 5   temperature_2m_mean_hudiksvall          1099 non-null   float32       
 6   precipitation_sum_hudiksvall            1099 non-null   float32       
 7   wind_speed_10m_max_hudiksvall           1099 non-null   float32       
 8   wind_direction_10m_dominant_hudiksvall  1099 non-null   float32       
 9   temperature_2m_mean_ange                1099 non-nul

In [13]:
import great_expectations as ge

# Validation suite that is attached to the energy price feature group.
aq_expectation_suite = ge.core.ExpectationSuite(expectation_suite_name="aq_expectation_suite")

# Bounds for the minimum of the `sek` column; strict_min makes the lower bound exclusive.
sek_min_bounds = {
    "column": "sek",
    "min_value": -5000,
    "max_value": 10000,
    "strict_min": True,
}
sek_min_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs=sek_min_bounds,
)
aq_expectation_suite.add_expectation(sek_min_expectation)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "sek", "min_value": -5000, "max_value": 10000, "strict_min": true}, "meta": {}}

In [14]:
# NOTE(review): the weather expectation suite below is disabled by wrapping it in a
# string literal, so none of it runs; the cell only echoes the string.
# The column names it references ("precipitation_sum", "wind_speed_10m_max") carry no
# city suffix, unlike the columns of combined_weather_df -- adjust before re-enabling.
"""import great_expectations as ge
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

def expect_greater_than_zero(col):
    weather_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column":col,
                "min_value":-0.1,
                "max_value":1000.0,
                "strict_min":True
            }
        )
    )
expect_greater_than_zero("precipitation_sum")
expect_greater_than_zero("wind_speed_10m_max")"""

'import great_expectations as ge\nweather_expectation_suite = ge.core.ExpectationSuite(\n    expectation_suite_name="weather_expectation_suite"\n)\n\ndef expect_greater_than_zero(col):\n    weather_expectation_suite.add_expectation(\n        ge.core.ExpectationConfiguration(\n            expectation_type="expect_column_min_to_be_between",\n            kwargs={\n                "column":col,\n                "min_value":-0.1,\n                "max_value":1000.0,\n                "strict_min":True\n            }\n        )\n    )\nexpect_greater_than_zero("precipitation_sum")\nexpect_greater_than_zero("wind_speed_10m_max")'

---

In [15]:
# Handle to the project's feature store, used to create the feature groups below
fs = project.get_feature_store() 

In [16]:
# Get or create the feature group for the daily energy price, keyed and
# time-indexed by `date`, with aq_expectation_suite attached for validation.
energy_price_fg = fs.get_or_create_feature_group(
    name="energy_price",
    version=1,
    description='Energy price of each day',
    primary_key=["date"],
    event_time="date",
    expectation_suite=aq_expectation_suite,
    time_travel_format="HUDI",
    stream=False,
)

#### Insert the DataFrame into the Feature Group

In [None]:
# Upload the price rows; the returned (materialization job, validation report) pair is the cell output
energy_price_fg.insert(df_ep)

2026-01-03 20:03:44,092 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1290388/fs/1279043/fg/1878498


Uploading Dataframe: 100.00% |█| Rows 1089/1089 | Elapsed Time: 00:00 | Remainin


Launching job: energy_price_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1290388/jobs/named/energy_price_1_offline_fg_materialization/executions


(Job('energy_price_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "sek",
           "min_value": -5000,
           "max_value": 10000,
           "strict_min": true
         },
         "meta": {
           "expectationId": 800785
         }
       },
       "result": {
         "observed_value": -95.55000305175781,
         "element_count": 1089,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2026-01-03T07:03:44.000091Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
     "s

#### Enter a description for each feature in the Feature Group

In [18]:
# Description to attach to each feature of the energy price feature group.
energy_price_feature_descriptions = {
    "date": "Date of measurement of energy price",
    "zone": "Zone where measurement are taken",
    "sek": "Energy price in SEK",
}
for feature_name, feature_description in energy_price_feature_descriptions.items():
    updated_fg = energy_price_fg.update_feature_description(feature_name, feature_description)

# Show the value returned by the final call as the cell output.
updated_fg

<hsfs.feature_group.FeatureGroup at 0x1180af850>

In [19]:
# Get or create the feature group for the daily weather features, keyed and time-indexed by `date`
weather_fg = fs.get_or_create_feature_group(
    name="weather",
    version=1,
    description='Weather characteristics of each day',
    primary_key=["date"],
    event_time="date",
    time_travel_format="HUDI",
    stream=False,
    # No expectation suite is attached to this feature group for now.
    #expectation_suite=weather_expectation_suite
)

#### Insert the DataFrame into the Feature Group

In [20]:
# Insert data; wait=True makes the call wait for the materialization job to finish
weather_fg.insert(combined_weather_df, wait=True)

Uploading Dataframe: 100.00% |█| Rows 1099/1099 | Elapsed Time: 00:00 | Remainin


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1290388/jobs/named/weather_1_offline_fg_materialization/executions
2026-01-03 20:04:14,769 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-03 20:04:17,981 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-03 20:06:07,181 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2026-01-03 20:06:07,353 INFO: Waiting for log aggregation to finish.
2026-01-03 20:06:16,020 INFO: Execution finished successfully.


(Job('weather_1_offline_fg_materialization', 'SPARK'), None)

#### Enter a description for each feature in the Feature Group

In [21]:
# Only the `date` feature is given a description here
weather_fg.update_feature_description("date", "Date of measurement of weather")

<hsfs.feature_group.FeatureGroup at 0x1785b5f00>

In [22]:
# Display the merged weather frame (one row per date, feature columns suffixed per city)
combined_weather_df

Unnamed: 0,date,temperature_2m_mean_flasjon,precipitation_sum_flasjon,wind_speed_10m_max_flasjon,wind_direction_10m_dominant_flasjon,temperature_2m_mean_hudiksvall,precipitation_sum_hudiksvall,wind_speed_10m_max_hudiksvall,wind_direction_10m_dominant_hudiksvall,temperature_2m_mean_ange,...,wind_speed_10m_max_ange,wind_direction_10m_dominant_ange,temperature_2m_mean_solleftea,precipitation_sum_solleftea,wind_speed_10m_max_solleftea,wind_direction_10m_dominant_solleftea,temperature_2m_mean_umea,precipitation_sum_umea,wind_speed_10m_max_umea,wind_direction_10m_dominant_umea
0,2023-01-01,-8.666417,0.000000,15.941944,265.520966,-5.731250,0.000000,16.039202,284.950745,-8.749249,...,13.207634,251.453522,-5.047917,0.0,18.416384,263.280090,-2.477083,0.300000,23.344549,286.180054
1,2023-01-02,-14.641418,0.000000,14.759999,267.631683,-8.097916,0.600000,13.627795,320.027954,-14.113833,...,9.779817,290.071594,-13.874999,0.2,10.829958,277.910614,-8.441667,0.000000,16.981165,291.899872
2,2023-01-03,-9.841416,0.000000,16.299694,272.360565,-7.706250,0.000000,13.532360,296.325409,-9.417999,...,12.387348,300.902130,-12.839585,0.0,12.303366,296.105499,-11.341666,0.000000,15.575981,337.380066
3,2023-01-04,-11.476833,1.900000,19.483284,79.366707,-6.047917,1.300000,19.483284,52.250797,-8.588834,...,16.179985,64.256348,-9.129167,0.0,15.077082,43.212765,-9.693751,0.200000,16.781561,18.239532
4,2023-01-05,-9.262250,2.400000,13.779114,98.105415,-5.389584,2.000000,17.551615,72.729340,-7.461750,...,15.124284,67.373535,-7.912500,0.7,15.946010,66.229439,-7.237501,2.899999,12.287555,20.448183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,2025-12-30,-5.051834,3.400000,22.843695,311.638092,-4.208333,0.100000,23.838961,330.785675,-5.036750,...,18.855375,322.978394,-7.577084,0.0,19.567083,309.549225,-9.481251,0.000000,25.893110,336.864685
1095,2025-12-31,-5.476833,4.100000,11.183201,194.364670,-6.991667,1.000000,11.866355,261.511078,-7.411751,...,14.584443,121.015274,-12.231250,0.4,12.535548,343.442780,-17.204166,0.000000,13.985663,332.503815
1096,2026-01-01,-6.776834,17.299999,22.942055,73.078445,-2.685416,12.099998,28.417774,108.197762,-4.759667,...,27.774298,91.420250,-5.452084,6.6,24.502489,97.868401,-10.950002,1.400000,18.072752,53.300491
1097,2026-01-02,-12.337251,2.100000,17.728857,32.389431,-5.210417,10.800001,28.739748,41.937622,-10.103415,...,20.756269,26.770164,-10.764584,0.0,18.188019,31.387003,-11.864583,0.600000,24.152887,25.315922
