# Aurora Forecasting - Part 02: Daily Feature Pipeline

üóíÔ∏è This notebook is divided into the following sections:
Initialize Hopsworks connection.

Fetch the latest real-time Solar Wind data from NOAA.

Fetch the latest Cloud Cover forecast for Stockholm, Lule√•, and Kiruna.

Update the Feature Groups in the Hopsworks Feature Store.

# Imports and Login

In [16]:
import pandas as pd
import datetime
import hopsworks
from config import HopsworksSettings
import util
import warnings
warnings.filterwarnings("ignore")
import numpy

# Setup settings
settings = HopsworksSettings()

print(settings.HOPSWORKS_PROJECT)

# Login to Hopsworks
project = hopsworks.login(
    project=settings.HOPSWORKS_PROJECT,
    api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value()
)
fs = project.get_feature_store()

HopsworksSettings initialized!
mac64
2026-01-09 00:05:41,275 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-09 00:05:41,279 INFO: Initializing external client
2026-01-09 00:05:41,280 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-09 00:05:42,679 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1299605


# Step 1: Get Real-time Solar Wind Data

We use the NOAA SWPC API to get the most recent measurements from the DSCOVR/ACE satellites. These will serve as the features for our real-time inference.

In [17]:
print("Fetching real-time solar wind data from NOAA...")

# Uses the helper function from util.py to fetch and merge mag/plasma data
new_solar_df = util.get_noaa_realtime_data(
    settings.NOAA_MAG_URL,
    settings.NOAA_PLASMA_URL,
    settings.KP_INDEX_URL
)

# Format the time_tag for Hopsworks compatibility
#new_solar_df['time'] = new_solar_df['time'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Drop unecessary columns if any (spoiler, there are)
new_solar_df.drop(columns=['bx_gsm', 'lon_gsm', 'lat_gsm', 'bt', 'temperature', 'a_running', 'station_count'], inplace=True, errors='ignore')

print(f"Successfully retrieved {len(new_solar_df)} new solar wind records.")
new_solar_df

Fetching real-time solar wind data from NOAA...
Raw Magnetometer data:
     bx_gsm  by_gsm  bz_gsm  lon_gsm  lat_gsm     bt             date_and_time
0    -0.16   -6.39   -7.32   268.58   -48.87   9.72 2026-01-07 23:00:00+00:00
1     0.19   -4.39   -9.30   272.49   -64.74  10.29 2026-01-08 00:00:00+00:00
2     2.01   -5.63   -8.95   289.61   -56.26  10.76 2026-01-08 01:00:00+00:00
3     1.91   -3.81   -9.01   296.70   -64.71   9.97 2026-01-08 02:00:00+00:00
4     1.59   -2.48   -8.17   302.75   -70.19   8.69 2026-01-08 03:00:00+00:00
5    -3.01    5.17   -2.18   120.22   -19.97   6.37 2026-01-08 04:00:00+00:00
6     0.77    2.46   -7.30    72.70   -70.58   7.74 2026-01-08 05:00:00+00:00
7     0.73    1.81  -11.61    67.94   -80.46  11.77 2026-01-08 06:00:00+00:00
8     2.94   -5.38    0.50   298.66     4.66   6.15 2026-01-08 07:00:00+00:00
9     1.29   -5.59   -2.93   283.03   -27.07   6.44 2026-01-08 08:00:00+00:00
10    2.70   -7.28    2.43   290.33    17.35   8.13 2026-01-08 09:00:0

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed,kp_index
0,-6.39,-7.32,2026-01-07 23:00:00+00:00,7.27,349.2,
1,-4.39,-9.3,2026-01-08 00:00:00+00:00,6.16,352.5,4.33
2,-5.63,-8.95,2026-01-08 01:00:00+00:00,3.39,354.1,
3,-3.81,-9.01,2026-01-08 02:00:00+00:00,7.32,355.2,
4,-2.48,-8.17,2026-01-08 03:00:00+00:00,11.88,362.3,3.33
5,5.17,-2.18,2026-01-08 04:00:00+00:00,19.24,368.8,
6,2.46,-7.3,2026-01-08 05:00:00+00:00,19.49,374.7,
7,1.81,-11.61,2026-01-08 06:00:00+00:00,5.61,369.1,3.67
8,-5.38,0.5,2026-01-08 07:00:00+00:00,10.51,382.1,
9,-5.59,-2.93,2026-01-08 08:00:00+00:00,6.45,370.3,


In [18]:
new_solar_df = util.aggregate_solar_wind_3h(new_solar_df)
new_solar_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 00:00:00+00:00,2026-01-08 03:00:00+00:00,-4.61,-5.63,-3.81,0.929731,-9.086667,-9.3,-8.95,0.187172,5.623333,3.39,7.32,2.019216,353.933333,352.5,355.2,1.357694,4.33
1,2026-01-08 03:00:00+00:00,2026-01-08 06:00:00+00:00,1.716667,-2.48,5.17,3.878793,-5.883333,-8.17,-2.18,3.236547,16.87,11.88,19.49,4.323274,368.6,362.3,374.7,6.202419,3.33
2,2026-01-08 06:00:00+00:00,2026-01-08 09:00:00+00:00,-3.053333,-5.59,1.81,4.213079,-4.68,-11.61,0.5,6.241787,7.523333,5.61,10.51,2.620407,373.833333,369.1,382.1,7.184242,3.67
3,2026-01-08 09:00:00+00:00,2026-01-08 12:00:00+00:00,-5.906667,-7.28,-4.53,1.375003,2.723333,1.89,3.85,1.01239,7.016667,3.98,8.68,2.633825,379.3,372.3,383.5,6.102459,2.0
4,2026-01-08 12:00:00+00:00,2026-01-08 15:00:00+00:00,-3.783333,-4.58,-2.2,1.371216,5.393333,2.5,8.09,2.800185,10.723333,8.55,15.02,3.721106,378.6,373.8,382.3,4.355456,1.0
5,2026-01-08 15:00:00+00:00,2026-01-08 18:00:00+00:00,-6.556667,-9.36,-5.04,2.43048,2.256667,-3.57,6.24,5.158239,5.23,3.24,8.01,2.481189,391.6,376.0,409.7,16.988526,1.0
6,2026-01-08 18:00:00+00:00,2026-01-08 21:00:00+00:00,-6.66,-8.07,-4.48,1.914863,-1.966667,-4.76,-0.31,2.43303,3.773333,2.87,4.85,1.001316,414.666667,403.4,435.8,18.315385,2.0


In [19]:
# Filter out rows with missing values and sort by date_and_time
new_solar_df = new_solar_df.dropna()
new_solar_df = new_solar_df.sort_values(["window_start"])
new_solar_df = new_solar_df.reset_index(drop=True)

new_solar_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 00:00:00+00:00,2026-01-08 03:00:00+00:00,-4.61,-5.63,-3.81,0.929731,-9.086667,-9.3,-8.95,0.187172,5.623333,3.39,7.32,2.019216,353.933333,352.5,355.2,1.357694,4.33
1,2026-01-08 03:00:00+00:00,2026-01-08 06:00:00+00:00,1.716667,-2.48,5.17,3.878793,-5.883333,-8.17,-2.18,3.236547,16.87,11.88,19.49,4.323274,368.6,362.3,374.7,6.202419,3.33
2,2026-01-08 06:00:00+00:00,2026-01-08 09:00:00+00:00,-3.053333,-5.59,1.81,4.213079,-4.68,-11.61,0.5,6.241787,7.523333,5.61,10.51,2.620407,373.833333,369.1,382.1,7.184242,3.67
3,2026-01-08 09:00:00+00:00,2026-01-08 12:00:00+00:00,-5.906667,-7.28,-4.53,1.375003,2.723333,1.89,3.85,1.01239,7.016667,3.98,8.68,2.633825,379.3,372.3,383.5,6.102459,2.0
4,2026-01-08 12:00:00+00:00,2026-01-08 15:00:00+00:00,-3.783333,-4.58,-2.2,1.371216,5.393333,2.5,8.09,2.800185,10.723333,8.55,15.02,3.721106,378.6,373.8,382.3,4.355456,1.0
5,2026-01-08 15:00:00+00:00,2026-01-08 18:00:00+00:00,-6.556667,-9.36,-5.04,2.43048,2.256667,-3.57,6.24,5.158239,5.23,3.24,8.01,2.481189,391.6,376.0,409.7,16.988526,1.0
6,2026-01-08 18:00:00+00:00,2026-01-08 21:00:00+00:00,-6.66,-8.07,-4.48,1.914863,-1.966667,-4.76,-0.31,2.43303,3.773333,2.87,4.85,1.001316,414.666667,403.4,435.8,18.315385,2.0


# Step 2: Get current weather data

In [20]:
weather_data = []
today = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

for city, coords in settings.CITIES.items():
    print(f"Fetching today cloud cover for {city}...")

    # Get current cloud cover percentage from Open-Meteo
    cloud_cover = util.get_city_weather_today(coords['lat'], coords['lon'])

    weather_data.append({
        'city': city,
        'date_and_time': today,
        'cloud_cover': cloud_cover
    })

new_weather_df = pd.DataFrame(weather_data)
# Convert date column from string to datetime format
new_weather_df['date_and_time'] = pd.to_datetime(new_weather_df['date_and_time'])

print(new_weather_df.dtypes)
new_weather_df.tail(100)

Fetching today cloud cover for Kiruna...
Fetching today cloud cover for Lule√•...
Fetching today cloud cover for Stockholm...
city                     object
date_and_time    datetime64[ns]
cloud_cover              object
dtype: object


Unnamed: 0,city,date_and_time,cloud_cover
0,Kiruna,2026-01-09 00:05:44,"{'time': '2026-01-09T00:00', 'cloud_cover': 7}"
1,Lule√•,2026-01-09 00:05:44,"{'time': '2026-01-09T00:00', 'cloud_cover': 95}"
2,Stockholm,2026-01-09 00:05:44,"{'time': '2026-01-09T00:00', 'cloud_cover': 95}"


# Step 3: Insert into Feature Groups

Now we push the new observations into the Feature Store. Hopsworks will handle the deduplication based on the primary keys defined in the backfill notebook.

In [21]:
print("Before casting:\n", new_solar_df)
# Clean and cast to correct types for Feature Store compatibility
# Convert numeric columns to float32 (Feature Store expects 'float' not 'double')
df = new_solar_df.copy()

for col in df.columns:
    if col not in ["window_start", "window_end"]:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('float32')

new_solar_df = df
# check data types of each column
print("After casting:\n", new_solar_df.dtypes)
new_solar_df

Before casting:
                window_start                window_end  by_gsm_mean  \
0 2026-01-08 00:00:00+00:00 2026-01-08 03:00:00+00:00    -4.610000   
1 2026-01-08 03:00:00+00:00 2026-01-08 06:00:00+00:00     1.716667   
2 2026-01-08 06:00:00+00:00 2026-01-08 09:00:00+00:00    -3.053333   
3 2026-01-08 09:00:00+00:00 2026-01-08 12:00:00+00:00    -5.906667   
4 2026-01-08 12:00:00+00:00 2026-01-08 15:00:00+00:00    -3.783333   
5 2026-01-08 15:00:00+00:00 2026-01-08 18:00:00+00:00    -6.556667   
6 2026-01-08 18:00:00+00:00 2026-01-08 21:00:00+00:00    -6.660000   

   by_gsm_min  by_gsm_max  by_gsm_std  bz_gsm_mean  bz_gsm_min  bz_gsm_max  \
0       -5.63       -3.81    0.929731    -9.086667       -9.30       -8.95   
1       -2.48        5.17    3.878793    -5.883333       -8.17       -2.18   
2       -5.59        1.81    4.213079    -4.680000      -11.61        0.50   
3       -7.28       -4.53    1.375003     2.723333        1.89        3.85   
4       -4.58       -2.20    1.3

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 00:00:00+00:00,2026-01-08 03:00:00+00:00,-4.61,-5.63,-3.81,0.929731,-9.086667,-9.3,-8.95,0.187172,5.623333,3.39,7.32,2.019216,353.933319,352.5,355.200012,1.357694,4.33
1,2026-01-08 03:00:00+00:00,2026-01-08 06:00:00+00:00,1.716667,-2.48,5.17,3.878793,-5.883333,-8.17,-2.18,3.236547,16.870001,11.88,19.49,4.323274,368.600006,362.299988,374.700012,6.202419,3.33
2,2026-01-08 06:00:00+00:00,2026-01-08 09:00:00+00:00,-3.053333,-5.59,1.81,4.213079,-4.68,-11.61,0.5,6.241786,7.523334,5.61,10.51,2.620407,373.833344,369.100006,382.100006,7.184242,3.67
3,2026-01-08 09:00:00+00:00,2026-01-08 12:00:00+00:00,-5.906667,-7.28,-4.53,1.375003,2.723333,1.89,3.85,1.01239,7.016667,3.98,8.68,2.633825,379.299988,372.299988,383.5,6.102458,2.0
4,2026-01-08 12:00:00+00:00,2026-01-08 15:00:00+00:00,-3.783333,-4.58,-2.2,1.371216,5.393333,2.5,8.09,2.800184,10.723333,8.55,15.02,3.721107,378.600006,373.799988,382.299988,4.355456,1.0
5,2026-01-08 15:00:00+00:00,2026-01-08 18:00:00+00:00,-6.556667,-9.36,-5.04,2.43048,2.256667,-3.57,6.24,5.158239,5.23,3.24,8.01,2.481189,391.600006,376.0,409.700012,16.988525,1.0
6,2026-01-08 18:00:00+00:00,2026-01-08 21:00:00+00:00,-6.66,-8.07,-4.48,1.914863,-1.966667,-4.76,-0.31,2.43303,3.773333,2.87,4.85,1.001316,414.666656,403.399994,435.799988,18.315386,2.0


In [23]:
# Retrieve references to the Feature Groups
solar_wind_fg = fs.get_feature_group(name="solar_wind_fg", version=4)
#city_weather_fg = fs.get_feature_group(name="city_weather_fg", version=2)

# Insert new data
# Note: For real-time pipelines, we often use online_enabled=True
# so the data is available for immediate inference.
solar_wind_fg.insert(new_solar_df)
#city_weather_fg.insert(new_weather_df)

print("Daily Feature Pipeline execution complete!")

Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 7/7 | Elapsed Time: 00:01 | Remaining Time: 00:00


Daily Feature Pipeline execution complete!
