# Aurora Forecasting - Part 02: Daily Feature Pipeline

🗒️ This notebook is divided into the following sections:
Initialize Hopsworks connection.

Fetch the latest real-time Solar Wind data from NOAA.

Fetch the latest Cloud Cover forecast for Stockholm, Luleå, and Kiruna.

Update the Feature Groups in the Hopsworks Feature Store.

# Imports and Login

In [17]:
import pandas as pd
import datetime
import hopsworks
from config import HopsworksSettings
import util
import warnings
warnings.filterwarnings("ignore")
import numpy

# Load the project configuration (project name, API key, NOAA endpoints) from config.py
settings = HopsworksSettings()
print(settings.HOPSWORKS_PROJECT)

# Authenticate against Hopsworks with the configured API key, then grab a
# handle to the project's feature store for the cells below.
project = hopsworks.login(
    api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value(),
    project=settings.HOPSWORKS_PROJECT,
)
fs = project.get_feature_store()

HopsworksSettings initialized!
mac64
2026-01-09 12:54:20,377 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-09 12:54:20,384 INFO: Initializing external client
2026-01-09 12:54:20,385 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-09 12:54:21,804 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1299605


# Step 1: Get Real-time Solar Wind Data

We use the NOAA SWPC API to get the most recent measurements from the DSCOVR/ACE satellites. These will serve as the features for our real-time inference.

In [18]:
print("Fetching real-time solar wind data from NOAA...")

# The util.py helper is given the magnetometer, plasma and Kp-index endpoints
# and returns a single merged DataFrame of recent measurements.
new_solar_df = util.get_noaa_realtime_hourly_data(
    settings.NOAA_MAG_URL,
    settings.NOAA_PLASMA_URL,
    settings.KP_INDEX_URL,
)

# `date_and_time` is kept exactly as the helper returns it; no string
# formatting is applied before the upload.

# Drop the columns this pipeline does not need. errors='ignore' tolerates any
# of them being absent from the fetched frame.
unused_columns = [
    'bx_gsm', 'lon_gsm', 'lat_gsm', 'bt',
    'temperature', 'a_running', 'station_count',
]
new_solar_df = new_solar_df.drop(columns=unused_columns, errors='ignore')

print(f"Successfully retrieved {len(new_solar_df)} new solar wind records.")
new_solar_df

Fetching real-time solar wind data from NOAA...
Raw Magnetometer data:
     bx_gsm  by_gsm  bz_gsm  lon_gsm  lat_gsm     bt             date_and_time
0     4.50   -4.57    2.50   314.57    21.25   6.89 2026-01-08 12:00:00+00:00
1     4.70   -4.58    5.59   315.72    40.40   8.62 2026-01-08 13:00:00+00:00
2     4.49   -2.20    8.09   333.90    58.27   9.51 2026-01-08 14:00:00+00:00
3     6.37   -5.04    6.24   321.68    37.54  10.24 2026-01-08 15:00:00+00:00
4     8.56   -9.36   -3.57   312.44   -15.74  13.17 2026-01-08 16:00:00+00:00
5     9.62   -5.27    4.10   331.28    20.50  11.71 2026-01-08 17:00:00+00:00
6    10.23   -7.43   -0.83   324.02    -3.77  12.67 2026-01-08 18:00:00+00:00
7     7.17   -8.07   -4.76   311.64   -23.78  11.80 2026-01-08 19:00:00+00:00
8     8.78   -4.48   -0.31   332.95    -1.79   9.86 2026-01-08 20:00:00+00:00
9     3.82   -9.02   -4.85   292.93   -26.35  10.93 2026-01-08 21:00:00+00:00
10    8.18   -5.14   -3.71   327.82   -21.02  10.35 2026-01-08 22:00:0

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed,kp_index
0,-4.57,2.5,2026-01-08 12:00:00+00:00,15.02,373.8,1.0
1,-4.58,5.59,2026-01-08 13:00:00+00:00,8.6,379.7,
2,-2.2,8.09,2026-01-08 14:00:00+00:00,8.55,382.3,
3,-5.04,6.24,2026-01-08 15:00:00+00:00,8.01,376.0,1.0
4,-9.36,-3.57,2026-01-08 16:00:00+00:00,3.24,389.1,
5,-5.27,4.1,2026-01-08 17:00:00+00:00,4.44,409.7,
6,-7.43,-0.83,2026-01-08 18:00:00+00:00,2.87,403.4,2.0
7,-8.07,-4.76,2026-01-08 19:00:00+00:00,3.6,404.8,
8,-4.48,-0.31,2026-01-08 20:00:00+00:00,4.85,435.8,
9,-9.02,-4.85,2026-01-08 21:00:00+00:00,4.15,413.4,3.0


In [19]:
# Roll the hourly records up with util.aggregate_solar_wind_3h. Per the
# displayed result, each row is a 3-hour window (window_start/window_end) with
# mean/min/max/std columns for by_gsm, bz_gsm, density and speed, plus kp_index.
new_solar_aggregated_df = util.aggregate_solar_wind_3h(new_solar_df)
new_solar_aggregated_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 12:00:00+00:00,2026-01-08 15:00:00+00:00,-3.783333,-4.58,-2.2,1.371216,5.393333,2.5,8.09,2.800185,10.723333,8.55,15.02,3.721106,378.6,373.8,382.3,4.355456,1.0
1,2026-01-08 15:00:00+00:00,2026-01-08 18:00:00+00:00,-6.556667,-9.36,-5.04,2.43048,2.256667,-3.57,6.24,5.158239,5.23,3.24,8.01,2.481189,391.6,376.0,409.7,16.988526,1.0
2,2026-01-08 18:00:00+00:00,2026-01-08 21:00:00+00:00,-6.66,-8.07,-4.48,1.914863,-1.966667,-4.76,-0.31,2.43303,3.773333,2.87,4.85,1.001316,414.666667,403.4,435.8,18.315385,2.0
3,2026-01-08 21:00:00+00:00,2026-01-09 00:00:00+00:00,-5.486667,-9.02,-2.3,3.373386,-5.753333,-8.7,-3.71,2.614772,5.7,4.15,6.82,1.385965,423.9,413.4,434.4,14.849242,3.0
4,2026-01-09 00:00:00+00:00,2026-01-09 03:00:00+00:00,-3.243333,-3.76,-2.55,0.624046,-5.583333,-8.34,-2.89,2.725552,5.62,5.1,6.06,0.484974,412.533333,410.4,414.4,2.013289,3.67
5,2026-01-09 03:00:00+00:00,2026-01-09 06:00:00+00:00,-5.68,-9.67,-3.52,3.459379,-1.223333,-4.61,1.85,3.241378,9.116667,6.82,10.41,1.99425,414.1,403.9,424.3,14.424978,3.33


In [20]:
# Discard any window that contains a missing value, order the remaining
# windows chronologically by window_start, and renumber the index from 0.
new_solar_aggregated_df = (
    new_solar_aggregated_df
    .dropna()
    .sort_values(["window_start"])
    .reset_index(drop=True)
)

new_solar_aggregated_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 12:00:00+00:00,2026-01-08 15:00:00+00:00,-3.783333,-4.58,-2.2,1.371216,5.393333,2.5,8.09,2.800185,10.723333,8.55,15.02,3.721106,378.6,373.8,382.3,4.355456,1.0
1,2026-01-08 15:00:00+00:00,2026-01-08 18:00:00+00:00,-6.556667,-9.36,-5.04,2.43048,2.256667,-3.57,6.24,5.158239,5.23,3.24,8.01,2.481189,391.6,376.0,409.7,16.988526,1.0
2,2026-01-08 18:00:00+00:00,2026-01-08 21:00:00+00:00,-6.66,-8.07,-4.48,1.914863,-1.966667,-4.76,-0.31,2.43303,3.773333,2.87,4.85,1.001316,414.666667,403.4,435.8,18.315385,2.0
3,2026-01-08 21:00:00+00:00,2026-01-09 00:00:00+00:00,-5.486667,-9.02,-2.3,3.373386,-5.753333,-8.7,-3.71,2.614772,5.7,4.15,6.82,1.385965,423.9,413.4,434.4,14.849242,3.0
4,2026-01-09 00:00:00+00:00,2026-01-09 03:00:00+00:00,-3.243333,-3.76,-2.55,0.624046,-5.583333,-8.34,-2.89,2.725552,5.62,5.1,6.06,0.484974,412.533333,410.4,414.4,2.013289,3.67
5,2026-01-09 03:00:00+00:00,2026-01-09 06:00:00+00:00,-5.68,-9.67,-3.52,3.459379,-1.223333,-4.61,1.85,3.241378,9.116667,6.82,10.41,1.99425,414.1,403.9,424.3,14.424978,3.33


In [21]:
# The Kp index is not useful as an input for real-time inference, so it is
# removed from the hourly frame.
#
# errors='ignore' keeps this cell re-runnable: the cell overwrites
# new_solar_df, so on a second execution kp_index is already gone and a plain
# drop would raise KeyError. This also matches the tolerant drop used when the
# data is fetched.
#
# kp_index must be dropped *before* dropna(): it is only populated on some
# hourly rows (see the fetch output), so removing NaN rows first would discard
# most of the measurements.
#
# Finally the rows are ordered by timestamp and the index is renumbered from 0.
new_solar_df = (
    new_solar_df
    .drop(columns=['kp_index'], errors='ignore')
    .dropna()
    .sort_values(["date_and_time"])
    .reset_index(drop=True)
)
new_solar_df

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed
0,-4.57,2.5,2026-01-08 12:00:00+00:00,15.02,373.8
1,-4.58,5.59,2026-01-08 13:00:00+00:00,8.6,379.7
2,-2.2,8.09,2026-01-08 14:00:00+00:00,8.55,382.3
3,-5.04,6.24,2026-01-08 15:00:00+00:00,8.01,376.0
4,-9.36,-3.57,2026-01-08 16:00:00+00:00,3.24,389.1
5,-5.27,4.1,2026-01-08 17:00:00+00:00,4.44,409.7
6,-7.43,-0.83,2026-01-08 18:00:00+00:00,2.87,403.4
7,-8.07,-4.76,2026-01-08 19:00:00+00:00,3.6,404.8
8,-4.48,-0.31,2026-01-08 20:00:00+00:00,4.85,435.8
9,-9.02,-4.85,2026-01-08 21:00:00+00:00,4.15,413.4


# Step 3: Insert into Feature Groups

Now we push the new observations into the Feature Store. Hopsworks will handle the deduplication based on the primary keys defined in the backfill notebook.

In [22]:
print("Before casting the aggregated data:\n", new_solar_aggregated_df)

# Feature Store compatibility: every column except the two window timestamps
# is parsed as numeric (unparseable values become NaN) and stored as float32,
# because the Feature Store expects 'float' rather than 'double'.
# assign() returns a new frame, so the input frame is not modified.
new_solar_aggregated_df = new_solar_aggregated_df.assign(
    **{
        name: pd.to_numeric(new_solar_aggregated_df[name], errors='coerce').astype('float32')
        for name in new_solar_aggregated_df.columns
        if name not in ("window_start", "window_end")
    }
)

# Show the resulting dtype of each column
print("After casting:\n", new_solar_aggregated_df.dtypes)
new_solar_aggregated_df

Before casting the aggregated data:
                window_start                window_end  by_gsm_mean  \
0 2026-01-08 12:00:00+00:00 2026-01-08 15:00:00+00:00    -3.783333   
1 2026-01-08 15:00:00+00:00 2026-01-08 18:00:00+00:00    -6.556667   
2 2026-01-08 18:00:00+00:00 2026-01-08 21:00:00+00:00    -6.660000   
3 2026-01-08 21:00:00+00:00 2026-01-09 00:00:00+00:00    -5.486667   
4 2026-01-09 00:00:00+00:00 2026-01-09 03:00:00+00:00    -3.243333   
5 2026-01-09 03:00:00+00:00 2026-01-09 06:00:00+00:00    -5.680000   

   by_gsm_min  by_gsm_max  by_gsm_std  bz_gsm_mean  bz_gsm_min  bz_gsm_max  \
0       -4.58       -2.20    1.371216     5.393333        2.50        8.09   
1       -9.36       -5.04    2.430480     2.256667       -3.57        6.24   
2       -8.07       -4.48    1.914863    -1.966667       -4.76       -0.31   
3       -9.02       -2.30    3.373386    -5.753333       -8.70       -3.71   
4       -3.76       -2.55    0.624046    -5.583333       -8.34       -2.89   
5   

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-08 12:00:00+00:00,2026-01-08 15:00:00+00:00,-3.783333,-4.58,-2.2,1.371216,5.393333,2.5,8.09,2.800184,10.723333,8.55,15.02,3.721107,378.600006,373.799988,382.299988,4.355456,1.0
1,2026-01-08 15:00:00+00:00,2026-01-08 18:00:00+00:00,-6.556667,-9.36,-5.04,2.43048,2.256667,-3.57,6.24,5.158239,5.23,3.24,8.01,2.481189,391.600006,376.0,409.700012,16.988525,1.0
2,2026-01-08 18:00:00+00:00,2026-01-08 21:00:00+00:00,-6.66,-8.07,-4.48,1.914863,-1.966667,-4.76,-0.31,2.43303,3.773333,2.87,4.85,1.001316,414.666656,403.399994,435.799988,18.315386,2.0
3,2026-01-08 21:00:00+00:00,2026-01-09 00:00:00+00:00,-5.486667,-9.02,-2.3,3.373386,-5.753334,-8.7,-3.71,2.614772,5.7,4.15,6.82,1.385965,423.899994,413.399994,434.399994,14.849242,3.0
4,2026-01-09 00:00:00+00:00,2026-01-09 03:00:00+00:00,-3.243333,-3.76,-2.55,0.624046,-5.583333,-8.34,-2.89,2.725552,5.62,5.1,6.06,0.484974,412.533325,410.399994,414.399994,2.013289,3.67
5,2026-01-09 03:00:00+00:00,2026-01-09 06:00:00+00:00,-5.68,-9.67,-3.52,3.459378,-1.223333,-4.61,1.85,3.241378,9.116667,6.82,10.41,1.99425,414.100006,403.899994,424.299988,14.424978,3.33


In [23]:
print("Before casting the real time data:\n", new_solar_df)

# Feature Store compatibility: every column except the timestamp is parsed as
# numeric (unparseable values become NaN) and stored as float32, because the
# Feature Store expects 'float' rather than 'double'.
# assign() returns a new frame, so the input frame is not modified.
new_solar_df = new_solar_df.assign(
    **{
        name: pd.to_numeric(new_solar_df[name], errors='coerce').astype('float32')
        for name in new_solar_df.columns
        if name not in ("date_and_time",)
    }
)

# Show the resulting dtype of each column
print("After casting:\n", new_solar_df.dtypes)
new_solar_df

Before casting the real time data:
     by_gsm  bz_gsm             date_and_time  density  speed
0    -4.57    2.50 2026-01-08 12:00:00+00:00    15.02  373.8
1    -4.58    5.59 2026-01-08 13:00:00+00:00     8.60  379.7
2    -2.20    8.09 2026-01-08 14:00:00+00:00     8.55  382.3
3    -5.04    6.24 2026-01-08 15:00:00+00:00     8.01  376.0
4    -9.36   -3.57 2026-01-08 16:00:00+00:00     3.24  389.1
5    -5.27    4.10 2026-01-08 17:00:00+00:00     4.44  409.7
6    -7.43   -0.83 2026-01-08 18:00:00+00:00     2.87  403.4
7    -8.07   -4.76 2026-01-08 19:00:00+00:00     3.60  404.8
8    -4.48   -0.31 2026-01-08 20:00:00+00:00     4.85  435.8
9    -9.02   -4.85 2026-01-08 21:00:00+00:00     4.15  413.4
10   -5.14   -3.71 2026-01-08 22:00:00+00:00     6.82  434.4
11   -3.76   -8.34 2026-01-09 00:00:00+00:00     5.70  414.4
12   -3.42   -2.89 2026-01-09 01:00:00+00:00     5.10  412.8
13   -2.55   -5.52 2026-01-09 02:00:00+00:00     6.06  410.4
14   -3.85    1.85 2026-01-09 04:00:00+00:00    1

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed
0,-4.57,2.5,2026-01-08 12:00:00+00:00,15.02,373.799988
1,-4.58,5.59,2026-01-08 13:00:00+00:00,8.6,379.700012
2,-2.2,8.09,2026-01-08 14:00:00+00:00,8.55,382.299988
3,-5.04,6.24,2026-01-08 15:00:00+00:00,8.01,376.0
4,-9.36,-3.57,2026-01-08 16:00:00+00:00,3.24,389.100006
5,-5.27,4.1,2026-01-08 17:00:00+00:00,4.44,409.700012
6,-7.43,-0.83,2026-01-08 18:00:00+00:00,2.87,403.399994
7,-8.07,-4.76,2026-01-08 19:00:00+00:00,3.6,404.799988
8,-4.48,-0.31,2026-01-08 20:00:00+00:00,4.85,435.799988
9,-9.02,-4.85,2026-01-08 21:00:00+00:00,4.15,413.399994


In [24]:
# Look up both existing Feature Groups (pinned versions) before writing
# anything, so a failed lookup aborts the cell with no data inserted.
solar_wind_fg = fs.get_feature_group(name="solar_wind_fg", version=6)
solar_wind_aggregated_fg = fs.get_feature_group(name="solar_wind_aggregated_fg", version=1)

# Push the new rows: hourly records first, then the 3-hour aggregates.
# Note: real-time pipelines often use online_enabled=True on the Feature Group
# so the data is available for immediate inference.
for feature_group, frame in (
    (solar_wind_fg, new_solar_df),
    (solar_wind_aggregated_fg, new_solar_aggregated_df),
):
    feature_group.insert(frame)

print("Daily Feature Pipeline execution complete!")

Uploading Dataframe: 100.00% |██████████| Rows 20/20 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: solar_wind_fg_6_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1299605/jobs/named/solar_wind_fg_6_offline_fg_materialization/executions


Uploading Dataframe: 100.00% |██████████| Rows 6/6 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: solar_wind_aggregated_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1299605/jobs/named/solar_wind_aggregated_fg_1_offline_fg_materialization/executions
Daily Feature Pipeline execution complete!
