# <span style='color:#ff5f27'> Initialization </span>

### Hopsworks Settings

In [2]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('airquality',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /home/federica_lorenzini/sml-bike-sharing
Added the following directory to the PYTHONPATH: /home/federica_lorenzini/sml-bike-sharing
HopsworksSettings initialized!


### Imports

In [3]:
import datetime
import requests
import pandas as pd
import hopsworks
from mlfs import util
import datetime
from pathlib import Path
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")






### Hopsworks Login

In [4]:
project = hopsworks.login()

2026-01-11 17:09:04,271 INFO: Initializing external client
2026-01-11 17:09:04,272 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-11 17:09:07,222 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279175


# <span style='color:#ff5f27'> Check CityaBikes API </span>

### Set City Data

In [5]:
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
city = 'Trento'
country = 'Italy'
latitude = 46.07
longitude = 11.12

### Perform a Request

In [6]:
# No API-key is required for this API

network_id = "e-motion-trento" 
url = f"https://api.citybik.es/v2/networks/{network_id}"

try:
    resp = requests.get(url).json()
    print("Request Successful!")
except:
    print("Something went wrong, please check the URL.")
    print("Networn Answer:")
    print(requests.get(url))


Request Successful!


### Visualize Answer

These info shall coincide with the variables set in the "Set City Data" cell.

In [7]:
# Print info
print(f"Network ID: {resp['network']['id']}")

print(f"City: {resp['network']['location']['city']}")
print(f"Contry: {resp['network']['location']['country']}")
print(f"Latitude: {resp['network']['location']['latitude']}")
print(f"Longitude: {resp['network']['location']['longitude']}")

# Print first 5 stations
stations = resp['network']['stations']
print("\nFirst five stations:")
for s in stations[:5]:
    print(f"{s['name']} -> Bikes: {s['free_bikes']}, Empty slots: {s['empty_slots']}")


Network ID: e-motion-trento
City: Trento
Contry: IT
Latitude: 46.06643205823519
Longitude: 11.122145390351879

First five stations:
10.02 Top Center -> Bikes: 1, Empty slots: 13
20.10 Noriglio -> Bikes: 4, Empty slots: 4
20.09 Sacco -> Bikes: 5, Empty slots: 6
10.18 Vannetti -> Bikes: 0, Empty slots: 0
11.01 Ospedale San Giovanni -> Bikes: 3, Empty slots: 9


# <span style='color:#ff5f27'> Load Historcal Bike Data </span>

### Load File List

In [8]:
list_file = f"{root_dir}/bike-historical-data/all-files.csv"
list_df = pd.read_csv(list_file, skipinitialspace=True)
list_df

Unnamed: 0,tag,file-name,country,city
0,e-motion-trento,202411-e-motion-trento-stats.parquet,Italy,Trento
1,e-motion-trento,202412-e-motion-trento-stats.parquet,Italy,Trento
2,e-motion-trento,202501-e-motion-trento-stats.parquet,Italy,Trento
3,e-motion-trento,202502-e-motion-trento-stats.parquet,Italy,Trento
4,e-motion-trento,202503-e-motion-trento-stats.parquet,Italy,Trento
5,e-motion-trento,202504-e-motion-trento-stats.parquet,Italy,Trento
6,e-motion-trento,202505-e-motion-trento-stats.parquet,Italy,Trento
7,e-motion-trento,202506-e-motion-trento-stats.parquet,Italy,Trento
8,e-motion-trento,202507-e-motion-trento-stats.parquet,Italy,Trento
9,e-motion-trento,202508-e-motion-trento-stats.parquet,Italy,Trento


### Read Files

In [25]:
bikes_df = pd.DataFrame()

for i in range(list_df.shape[0]):
    # Read file
    file_name = list_df.iloc[i]['file-name']
    df_i = pd.read_parquet(f"{root_dir}/bike-historical-data/{file_name}", engine='pyarrow')

    # Drop nones and add columns
    df_i.dropna(inplace=True)
    df_i['country'] = list_df.iloc[i]['country']
    df_i['city'] = list_df.iloc[i]['city']

    bikes_df = pd.concat([bikes_df, df_i], ignore_index=True)

# Adjust schema to match API
bikes_df = bikes_df.drop(columns=['nuid'])
bikes_df = bikes_df.rename(columns={'timestamp': 'date'})

bikes_df.head()
#print all station names
station_names = bikes_df['name']
for name in station_names:
    print(name)

10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Center
10.02 Top Cent

### Print Info

In [10]:
print(bikes_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206011 entries, 0 to 206010
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   tag        206011 non-null  object        
 1   id         206011 non-null  object        
 2   name       206011 non-null  object        
 3   latitude   206011 non-null  float64       
 4   longitude  206011 non-null  float64       
 5   bikes      206011 non-null  int32         
 6   free       206011 non-null  int32         
 7   extra      206011 non-null  object        
 8   date       206011 non-null  datetime64[us]
 9   country    206011 non-null  object        
 10  city       206011 non-null  object        
dtypes: datetime64[us](1), float64(2), int32(2), object(6)
memory usage: 15.7+ MB
None


# <span style='color:#ff5f27'> Load Historcal Weather Data </span>

Features Downloaded:

 * `weather_code`: Weather condition as a numeric code (WMO).
 * `apparent_temperature_mean`: Apparent temperature is the perceived feels-like temperature combining wind chill factor, relative humidity and solar radiation.
 * `daylight_duration`: Number of seconds of daylight per day.
 * `precipitation_sum`: Sum of daily precipitation (including rain, showers and snowfall).
 * `wind_speed_10m_max`: Maximum wind speed on the day.


### Download the Data

In [11]:
earliest_date = pd.Series.min(bikes_df['date'])
earliest_date = earliest_date.strftime('%Y-%m-%d')
earliest_date

'2024-11-01'

In [12]:
weather_df = util.get_historical_weather(city, earliest_date, str(today), latitude, longitude)

Coordinates 46.080841064453125°N 11.160572052001953°E
Elevation 193.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


### Print Info

In [13]:
weather_df.head()

Unnamed: 0,date,weather_code,apparent_temperature_mean,daylight_duration,precipitation_sum,wind_speed_10m_max,city
0,2024-11-01,2,14.199712,36237.121094,0.0,4.846648,Trento
1,2024-11-02,1,13.831336,36063.140625,0.0,3.319036,Trento
2,2024-11-03,3,13.089696,35891.15625,0.0,5.297018,Trento
3,2024-11-04,3,12.426098,35721.378906,0.0,4.68,Trento
4,2024-11-05,3,11.426768,35554.003906,0.0,5.393997,Trento


In [14]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   date                       437 non-null    datetime64[ns]
 1   weather_code               437 non-null    int32         
 2   apparent_temperature_mean  437 non-null    float32       
 3   daylight_duration          437 non-null    float32       
 4   precipitation_sum          437 non-null    float32       
 5   wind_speed_10m_max         437 non-null    float32       
 6   city                       437 non-null    object        
dtypes: datetime64[ns](1), float32(4), int32(1), object(1)
memory usage: 15.5+ KB


# <span style='color:#ff5f27'> Define Data Validation Rules </span>

### Bike Expectations

In [15]:
import great_expectations as ge
bikes_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="bikes_expectation_suite"
)

bikes_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"bikes",
            "min_value":-0.1,
            "max_value":100.0,
            "strict_min":True
        }
    )
)

bikes_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"free",
            "min_value":-0.1,
            "max_value":100.0,
            "strict_min":True
        }
    )
)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "free", "min_value": -0.1, "max_value": 100.0, "strict_min": true}, "meta": {}}

### Weather Expectations

In [16]:
import great_expectations as ge
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"weather_code",
            "min_value":-0.1,
            "max_value":70.1,
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"apparent_temperature_mean",
            "min_value":-20.1,
            "max_value":50.1,
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"daylight_duration",
            "min_value":-0.1,
            "max_value":86400.1, # 1 Day = 86400 seconds
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"precipitation_sum",
            "min_value":-0.1,
            "max_value":100.1,
            "strict_min":True
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column":"wind_speed_10m_max",
            "min_value":-0.1,
            "max_value":100.1,
            "strict_min":True
        }
    )
)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "wind_speed_10m_max", "min_value": -0.1, "max_value": 100.1, "strict_min": true}, "meta": {}}

# <span style='color:#ff5f27'> Load to Hopsworks </span>

### Load Metadata

In [17]:
fs = project.get_feature_store()

In [18]:
secrets = hopsworks.get_secrets_api()

dict_obj = {
    "country": country,
    "city": city,
    "latitude": latitude,
    "longitude": longitude
}

# Convert the dictionary to a JSON string
str_dict = json.dumps(dict_obj)

# Replace any existing secret with the new value
secret = secrets.get_secret("BIKES_LOCATION_JSON")
if secret is not None:
    secret.delete()
    print("Replacing existing BIKES_LOCATION_JSON")

secrets.create_secret("BIKES_LOCATION_JSON", str_dict)

Replacing existing BIKES_LOCATION_JSON
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('BIKES_LOCATION_JSON', 'PRIVATE')

### Load Bikes Data

In [19]:
bikes_fg = fs.get_or_create_feature_group(
    name='bikes_trento',
    description='Bikes Availability for Trento',
    version=2,
    primary_key=['id', 'city', 'country'],
    event_time='date',
    expectation_suite=bikes_expectation_suite
)

In [20]:
bikes_fg.insert(bikes_df)

2026-01-11 17:09:13,843 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279175/fs/1265791/fg/1893939


Uploading Dataframe: 100.00% |██████████| Rows 206011/206011 | Elapsed Time: 00:21 | Remaining Time: 00:00


Launching job: bikes_trento_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279175/jobs/named/bikes_trento_2_offline_fg_materialization/executions


(Job('bikes_trento_2_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "free",
           "min_value": -0.1,
           "max_value": 100.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 805119
         }
       },
       "result": {
         "observed_value": 0,
         "element_count": 206011,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2026-01-11T04:09:13.000843Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to

In [21]:
bikes_fg.update_feature_description("tag", "Network Tag")
bikes_fg.update_feature_description("id", "Sensor Identifier")
bikes_fg.update_feature_description("name", "Sensor Name")
bikes_fg.update_feature_description("latitude", "Sensor Latitude Coordinate")
bikes_fg.update_feature_description("longitude", "Sensor Longitude Coordinate")
bikes_fg.update_feature_description("bikes", "Number of Available Bikes")
bikes_fg.update_feature_description("free", "Number of Empty Slots")
bikes_fg.update_feature_description("extra", "Sensor Metadata")
bikes_fg.update_feature_description("date", "Measurement Day")
bikes_fg.update_feature_description("country", "Sensor Country")
bikes_fg.update_feature_description("city", "Sensor City")

<hsfs.feature_group.FeatureGroup at 0x7917383debf0>

### Load Weather Data

In [22]:
weather_fg = fs.get_or_create_feature_group(
    name='weather_trento',
    description='Weather for Trento',
    version=2,
    primary_key=['city'],
    event_time='date',
    expectation_suite=weather_expectation_suite
)

In [23]:
weather_fg.insert(weather_df, wait=True)

2026-01-11 17:10:26,106 INFO: 	5 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279175/fs/1265791/fg/1893940


Uploading Dataframe: 100.00% |██████████| Rows 437/437 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_trento_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279175/jobs/named/weather_trento_2_offline_fg_materialization/executions
2026-01-11 17:11:20,517 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-11 17:11:26,954 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-11 17:13:15,254 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2026-01-11 17:13:15,482 INFO: Waiting for log aggregation to finish.
2026-01-11 17:13:41,871 INFO: Execution finished successfully.


(Job('weather_trento_2_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "wind_speed_10m_max",
           "min_value": -0.1,
           "max_value": 100.1,
           "strict_min": true
         },
         "meta": {
           "expectationId": 805123
         }
       },
       "result": {
         "observed_value": 0.9178234934806824,
         "element_count": 437,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2026-01-11T04:10:26.000103Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectatio

In [24]:
weather_fg.update_feature_description("date", "Measurement Day")
weather_fg.update_feature_description("weather_code", "Weather Condition Numeric Code (WMO)")
weather_fg.update_feature_description("apparent_temperature_mean", "Apparent Temperature, Mean of the Day")
weather_fg.update_feature_description("daylight_duration", "Seconds of Sun for the Day")
weather_fg.update_feature_description("precipitation_sum", "Total Precipitation for the Day")
weather_fg.update_feature_description("wind_speed_10m_max", "Wind Speed")
weather_fg.update_feature_description("city", "City of Measurement")

<hsfs.feature_group.FeatureGroup at 0x791738390220>