# Aurora Forecasting - Part 02: Daily Feature Pipeline

🗒️ This notebook is divided into the following sections:
Initialize Hopsworks connection.

Fetch the latest real-time Solar Wind data from NOAA.

Fetch the latest Cloud Cover forecast for Stockholm, Luleå, and Kiruna.

Update the Feature Groups in the Hopsworks Feature Store.

# Imports and Login

In [17]:
import pandas as pd
import datetime
import hopsworks
from config import HopsworksSettings
import util
import warnings
warnings.filterwarnings("ignore")
import numpy

# Setup settings
# HopsworksSettings comes from the project's config module; this notebook reads
# the project name, the API key and the NOAA endpoint URLs from it.
settings = HopsworksSettings()

# Echo the target project name so the run log shows which project is being updated.
print(settings.HOPSWORKS_PROJECT)

# Login to Hopsworks
# NOTE(review): get_secret_value() suggests HOPSWORKS_API_KEY is a secret-wrapper
# type (e.g. pydantic SecretStr) - confirm in config.py. The raw key is only
# passed to the login call and is never printed.
project = hopsworks.login(
    project=settings.HOPSWORKS_PROJECT,
    api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value()
)
# Feature store handle used further down to look up the feature groups.
fs = project.get_feature_store()

HopsworksSettings initialized!
mac64
2026-01-09 20:30:35,225 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-09 20:30:35,243 INFO: Initializing external client
2026-01-09 20:30:35,244 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-09 20:30:36,705 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1299605


# Step 1: Get Real-time Solar Wind Data

We use the NOAA SWPC API to get the most recent measurements from the DSCOVR/ACE satellites. These will serve as the features for our real-time inference.

In [18]:
print("Fetching real-time solar wind data from NOAA...")

# Columns that are not kept as features. Names missing from the fetched frame
# are skipped silently (errors="ignore").
unused_columns = [
    "bx_gsm",
    "lon_gsm",
    "lat_gsm",
    "bt",
    "temperature",
    "a_running",
    "station_count",
]

# The util.py helper fetches and merges the mag/plasma data; it is handed the
# magnetometer, plasma and Kp-index URLs from the settings object.
new_solar_df = util.get_noaa_realtime_hourly_data(
    settings.NOAA_MAG_URL,
    settings.NOAA_PLASMA_URL,
    settings.KP_INDEX_URL,
).drop(columns=unused_columns, errors="ignore")

print(f"Successfully retrieved {len(new_solar_df)} new solar wind records.")
new_solar_df

Fetching real-time solar wind data from NOAA...
Raw Magnetometer data:
     bx_gsm  by_gsm  bz_gsm  lon_gsm  lat_gsm     bt             date_and_time
0     8.78   -4.48   -0.31   332.95    -1.79   9.86 2026-01-08 20:00:00+00:00
1     3.82   -9.02   -4.85   292.93   -26.35  10.93 2026-01-08 21:00:00+00:00
2     8.18   -5.14   -3.71   327.82   -21.02  10.35 2026-01-08 22:00:00+00:00
3     1.62   -2.30   -8.70   305.16   -72.11   9.14 2026-01-08 23:00:00+00:00
4    -1.38   -3.76   -8.34   249.84   -64.36   9.26 2026-01-09 00:00:00+00:00
5     3.67   -3.42   -2.89   317.04   -29.92   5.79 2026-01-09 01:00:00+00:00
6    -2.30   -2.55   -5.52   227.94   -58.07   6.50 2026-01-09 02:00:00+00:00
7     4.64   -9.67   -0.91   295.64    -4.85  10.76 2026-01-09 03:00:00+00:00
8     4.92   -3.85    1.85   321.97    16.48   6.52 2026-01-09 04:00:00+00:00
9    -4.90   -3.52   -4.61   215.72   -37.38   7.59 2026-01-09 05:00:00+00:00
10    1.93   -9.00   -6.20   282.13   -33.96  11.10 2026-01-09 06:00:0

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed,kp_index
0,-4.48,-0.31,2026-01-08 20:00:00+00:00,4.85,435.8,
1,-9.02,-4.85,2026-01-08 21:00:00+00:00,4.15,413.4,3.0
2,-5.14,-3.71,2026-01-08 22:00:00+00:00,6.82,434.4,
3,-2.3,-8.7,2026-01-08 23:00:00+00:00,6.13,,
4,-3.76,-8.34,2026-01-09 00:00:00+00:00,5.7,414.4,3.67
5,-3.42,-2.89,2026-01-09 01:00:00+00:00,5.1,412.8,
6,-2.55,-5.52,2026-01-09 02:00:00+00:00,6.06,410.4,
7,-9.67,-0.91,2026-01-09 03:00:00+00:00,6.82,,3.33
8,-3.85,1.85,2026-01-09 04:00:00+00:00,10.41,403.9,
9,-3.52,-4.61,2026-01-09 05:00:00+00:00,10.12,424.3,


In [19]:
# Aggregate the hourly records with util.aggregate_solar_wind_3h.
# The displayed result has one row per window (window_start / window_end) with
# mean/min/max/std columns per measurement plus kp_index.
# NOTE(review): new_solar_df still carries kp_index here; a later cell drops that
# column from new_solar_df, so this cell presumably has to run before it - confirm
# against util.py.
new_solar_aggregated_df = util.aggregate_solar_wind_3h(new_solar_df)
new_solar_aggregated_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 21:00:00+00:00,2026-01-09 00:00:00+00:00,-5.486667,-9.02,-2.3,3.373386,-5.753333,-8.7,-3.71,2.614772,5.7,4.15,6.82,1.385965,423.9,413.4,434.4,14.849242,3.0
1,2026-01-09 00:00:00+00:00,2026-01-09 03:00:00+00:00,-3.243333,-3.76,-2.55,0.624046,-5.583333,-8.34,-2.89,2.725552,5.62,5.1,6.06,0.484974,412.533333,410.4,414.4,2.013289,3.67
2,2026-01-09 03:00:00+00:00,2026-01-09 06:00:00+00:00,-5.68,-9.67,-3.52,3.459379,-1.223333,-4.61,1.85,3.241378,9.116667,6.82,10.41,1.99425,414.1,403.9,424.3,14.424978,3.33
3,2026-01-09 09:00:00+00:00,2026-01-09 12:00:00+00:00,-0.333333,-9.18,5.66,7.820648,6.06,-0.74,9.79,5.898212,5.26,4.41,6.42,1.04024,542.066667,531.1,552.1,10.531065,2.0
4,2026-01-09 12:00:00+00:00,2026-01-09 15:00:00+00:00,-4.593333,-6.42,-2.44,2.010008,4.6,-2.65,11.25,6.969397,4.823333,3.71,5.64,0.998616,520.4,487.2,556.5,34.740898,1.67
5,2026-01-09 15:00:00+00:00,2026-01-09 18:00:00+00:00,-6.156667,-7.13,-4.54,1.409764,-3.776667,-7.01,-0.37,3.323392,2.405,1.26,3.55,1.619275,465.2,458.0,472.4,10.182338,2.33


In [20]:
# Keep only windows without any missing value, order them chronologically by
# window_start, and renumber the index from 0.
new_solar_aggregated_df = (
    new_solar_aggregated_df
    .dropna()
    .sort_values(["window_start"])
    .reset_index(drop=True)
)

new_solar_aggregated_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 21:00:00+00:00,2026-01-09 00:00:00+00:00,-5.486667,-9.02,-2.3,3.373386,-5.753333,-8.7,-3.71,2.614772,5.7,4.15,6.82,1.385965,423.9,413.4,434.4,14.849242,3.0
1,2026-01-09 00:00:00+00:00,2026-01-09 03:00:00+00:00,-3.243333,-3.76,-2.55,0.624046,-5.583333,-8.34,-2.89,2.725552,5.62,5.1,6.06,0.484974,412.533333,410.4,414.4,2.013289,3.67
2,2026-01-09 03:00:00+00:00,2026-01-09 06:00:00+00:00,-5.68,-9.67,-3.52,3.459379,-1.223333,-4.61,1.85,3.241378,9.116667,6.82,10.41,1.99425,414.1,403.9,424.3,14.424978,3.33
3,2026-01-09 09:00:00+00:00,2026-01-09 12:00:00+00:00,-0.333333,-9.18,5.66,7.820648,6.06,-0.74,9.79,5.898212,5.26,4.41,6.42,1.04024,542.066667,531.1,552.1,10.531065,2.0
4,2026-01-09 12:00:00+00:00,2026-01-09 15:00:00+00:00,-4.593333,-6.42,-2.44,2.010008,4.6,-2.65,11.25,6.969397,4.823333,3.71,5.64,0.998616,520.4,487.2,556.5,34.740898,1.67
5,2026-01-09 15:00:00+00:00,2026-01-09 18:00:00+00:00,-6.156667,-7.13,-4.54,1.409764,-3.776667,-7.01,-0.37,3.323392,2.405,1.26,3.55,1.619275,465.2,458.0,472.4,10.182338,2.33


In [21]:
# Prepare the hourly real-time frame used for inference.
# - kp_index is removed because it is not useful for inference on the real-time
#   data. It is removed *before* dropna(): most hourly rows have no Kp value, so
#   dropping NaNs first would discard them.
# - errors="ignore" keeps this cell re-runnable: new_solar_df is overwritten here,
#   so on a second execution the column is already gone and a plain drop would
#   raise KeyError.
# - Rows that still miss a measurement are dropped, then the frame is ordered by
#   date_and_time and the index is renumbered from 0.
new_solar_df = (
    new_solar_df
    .drop(columns=["kp_index"], errors="ignore")
    .dropna()
    .sort_values(["date_and_time"])
    .reset_index(drop=True)
)
new_solar_df

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed
0,-4.48,-0.31,2026-01-08 20:00:00+00:00,4.85,435.8
1,-9.02,-4.85,2026-01-08 21:00:00+00:00,4.15,413.4
2,-5.14,-3.71,2026-01-08 22:00:00+00:00,6.82,434.4
3,-3.76,-8.34,2026-01-09 00:00:00+00:00,5.7,414.4
4,-3.42,-2.89,2026-01-09 01:00:00+00:00,5.1,412.8
5,-2.55,-5.52,2026-01-09 02:00:00+00:00,6.06,410.4
6,-3.85,1.85,2026-01-09 04:00:00+00:00,10.41,403.9
7,-3.52,-4.61,2026-01-09 05:00:00+00:00,10.12,424.3
8,2.52,9.79,2026-01-09 09:00:00+00:00,6.42,552.1
9,5.66,9.13,2026-01-09 10:00:00+00:00,4.41,531.1


# Step 3: Insert into Feature Groups

Now we push the new observations into the Feature Store. Hopsworks will handle the deduplication based on the primary keys defined in the backfill notebook.

In [22]:
print("Before casting the aggregated data:\n", new_solar_aggregated_df)

# Feature Store compatibility: every column except the two window timestamps is
# coerced to numeric (values that cannot be parsed become NaN) and stored as
# float32, because the Feature Store expects 'float' rather than 'double'.
# assign() works on a copy, so the frame printed above is not modified in place.
window_time_columns = ("window_start", "window_end")
new_solar_aggregated_df = new_solar_aggregated_df.assign(
    **{
        name: pd.to_numeric(new_solar_aggregated_df[name], errors="coerce").astype("float32")
        for name in new_solar_aggregated_df.columns
        if name not in window_time_columns
    }
)

# Show the resulting dtype of each column
print("After casting:\n", new_solar_aggregated_df.dtypes)
new_solar_aggregated_df

Before casting the aggregated data:
                window_start                window_end  by_gsm_mean  \
0 2026-01-08 21:00:00+00:00 2026-01-09 00:00:00+00:00    -5.486667   
1 2026-01-09 00:00:00+00:00 2026-01-09 03:00:00+00:00    -3.243333   
2 2026-01-09 03:00:00+00:00 2026-01-09 06:00:00+00:00    -5.680000   
3 2026-01-09 09:00:00+00:00 2026-01-09 12:00:00+00:00    -0.333333   
4 2026-01-09 12:00:00+00:00 2026-01-09 15:00:00+00:00    -4.593333   
5 2026-01-09 15:00:00+00:00 2026-01-09 18:00:00+00:00    -6.156667   

   by_gsm_min  by_gsm_max  by_gsm_std  bz_gsm_mean  bz_gsm_min  bz_gsm_max  \
0       -9.02       -2.30    3.373386    -5.753333       -8.70       -3.71   
1       -3.76       -2.55    0.624046    -5.583333       -8.34       -2.89   
2       -9.67       -3.52    3.459379    -1.223333       -4.61        1.85   
3       -9.18        5.66    7.820648     6.060000       -0.74        9.79   
4       -6.42       -2.44    2.010008     4.600000       -2.65       11.25   
5   

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 21:00:00+00:00,2026-01-09 00:00:00+00:00,-5.486667,-9.02,-2.3,3.373386,-5.753334,-8.7,-3.71,2.614772,5.7,4.15,6.82,1.385965,423.899994,413.399994,434.399994,14.849242,3.0
1,2026-01-09 00:00:00+00:00,2026-01-09 03:00:00+00:00,-3.243333,-3.76,-2.55,0.624046,-5.583333,-8.34,-2.89,2.725552,5.62,5.1,6.06,0.484974,412.533325,410.399994,414.399994,2.013289,3.67
2,2026-01-09 03:00:00+00:00,2026-01-09 06:00:00+00:00,-5.68,-9.67,-3.52,3.459378,-1.223333,-4.61,1.85,3.241378,9.116667,6.82,10.41,1.99425,414.100006,403.899994,424.299988,14.424978,3.33
3,2026-01-09 09:00:00+00:00,2026-01-09 12:00:00+00:00,-0.333333,-9.18,5.66,7.820648,6.06,-0.74,9.79,5.898211,5.26,4.41,6.42,1.04024,542.06665,531.099976,552.099976,10.531065,2.0
4,2026-01-09 12:00:00+00:00,2026-01-09 15:00:00+00:00,-4.593333,-6.42,-2.44,2.010008,4.6,-2.65,11.25,6.969398,4.823333,3.71,5.64,0.998616,520.400024,487.200012,556.5,34.740898,1.67
5,2026-01-09 15:00:00+00:00,2026-01-09 18:00:00+00:00,-6.156667,-7.13,-4.54,1.409764,-3.776667,-7.01,-0.37,3.323392,2.405,1.26,3.55,1.619274,465.200012,458.0,472.399994,10.182338,2.33


In [23]:
print("Before casting the real time data:\n", new_solar_df)

# Feature Store compatibility: every column except the timestamp is coerced to
# numeric (values that cannot be parsed become NaN) and stored as float32,
# because the Feature Store expects 'float' rather than 'double'.
# assign() works on a copy, so the frame printed above is not modified in place.
timestamp_column = "date_and_time"
new_solar_df = new_solar_df.assign(
    **{
        name: pd.to_numeric(new_solar_df[name], errors="coerce").astype("float32")
        for name in new_solar_df.columns
        if name != timestamp_column
    }
)

# Show the resulting dtype of each column
print("After casting:\n", new_solar_df.dtypes)
new_solar_df

Before casting the real time data:
     by_gsm  bz_gsm             date_and_time  density  speed
0    -4.48   -0.31 2026-01-08 20:00:00+00:00     4.85  435.8
1    -9.02   -4.85 2026-01-08 21:00:00+00:00     4.15  413.4
2    -5.14   -3.71 2026-01-08 22:00:00+00:00     6.82  434.4
3    -3.76   -8.34 2026-01-09 00:00:00+00:00     5.70  414.4
4    -3.42   -2.89 2026-01-09 01:00:00+00:00     5.10  412.8
5    -2.55   -5.52 2026-01-09 02:00:00+00:00     6.06  410.4
6    -3.85    1.85 2026-01-09 04:00:00+00:00    10.41  403.9
7    -3.52   -4.61 2026-01-09 05:00:00+00:00    10.12  424.3
8     2.52    9.79 2026-01-09 09:00:00+00:00     6.42  552.1
9     5.66    9.13 2026-01-09 10:00:00+00:00     4.41  531.1
10   -9.18   -0.74 2026-01-09 11:00:00+00:00     4.95  543.0
11   -2.44   11.25 2026-01-09 12:00:00+00:00     5.12  556.5
12   -4.92    5.20 2026-01-09 13:00:00+00:00     5.64  517.5
13   -6.42   -2.65 2026-01-09 14:00:00+00:00     3.71  487.2
14   -4.54   -0.37 2026-01-09 16:00:00+00:00     

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed
0,-4.48,-0.31,2026-01-08 20:00:00+00:00,4.85,435.799988
1,-9.02,-4.85,2026-01-08 21:00:00+00:00,4.15,413.399994
2,-5.14,-3.71,2026-01-08 22:00:00+00:00,6.82,434.399994
3,-3.76,-8.34,2026-01-09 00:00:00+00:00,5.7,414.399994
4,-3.42,-2.89,2026-01-09 01:00:00+00:00,5.1,412.799988
5,-2.55,-5.52,2026-01-09 02:00:00+00:00,6.06,410.399994
6,-3.85,1.85,2026-01-09 04:00:00+00:00,10.41,403.899994
7,-3.52,-4.61,2026-01-09 05:00:00+00:00,10.12,424.299988
8,2.52,9.79,2026-01-09 09:00:00+00:00,6.42,552.099976
9,5.66,9.13,2026-01-09 10:00:00+00:00,4.41,531.099976


In [24]:
# Retrieve references to the Feature Groups
# Both lookups happen before any insert, so a failed lookup stops the cell
# before anything is written.
# NOTE(review): the versions (6 and 1) are hard-coded; presumably they must match
# the feature groups created by the backfill notebook - confirm before bumping.
solar_wind_fg = fs.get_feature_group(name="solar_wind_fg", version=6)
solar_wind_aggregated_fg = fs.get_feature_group(name="solar_wind_aggregated_fg", version=1)

# Insert new data
# Note: For real-time pipelines, we often use online_enabled=True
# so the data is available for immediate inference.
# The hourly frame goes to solar_wind_fg, the aggregated windows to
# solar_wind_aggregated_fg. In the recorded run each insert uploaded the frame
# and launched an offline materialization job.
solar_wind_fg.insert(new_solar_df)
solar_wind_aggregated_fg.insert(new_solar_aggregated_df)

print("Daily Feature Pipeline execution complete!")

Uploading Dataframe: 100.00% |██████████| Rows 18/18 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: solar_wind_fg_6_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1299605/jobs/named/solar_wind_fg_6_offline_fg_materialization/executions


Uploading Dataframe: 100.00% |██████████| Rows 6/6 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: solar_wind_aggregated_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1299605/jobs/named/solar_wind_aggregated_fg_1_offline_fg_materialization/executions
Daily Feature Pipeline execution complete!
