# Aurora Forecasting - Part 02: Daily Feature Pipeline

🗒️ This notebook is divided into the following sections:
Initialize Hopsworks connection.

Fetch the latest real-time Solar Wind data from NOAA.

Fetch the latest Cloud Cover forecast for Stockholm, Luleå, and Kiruna.

Update the Feature Groups in the Hopsworks Feature Store.

# Imports and Login

In [1]:
import pandas as pd
import datetime
import hopsworks
from config import HopsworksSettings
import util
import warnings
warnings.filterwarnings("ignore")
import numpy

# Load the pipeline configuration. The settings object supplies the Hopsworks project
# name and API key used below, plus the NOAA endpoint URLs used when fetching data.
settings = HopsworksSettings()

# Echo the configured project name so the run log shows which project is targeted.
print(settings.HOPSWORKS_PROJECT)

# Log in to Hopsworks. The API key is unwrapped with get_secret_value() directly in
# the call so the plain-text key is never bound to a notebook-level variable.
project = hopsworks.login(
    project=settings.HOPSWORKS_PROJECT,
    api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value()
)
# Handle to the project's Feature Store; used later to look up the Feature Groups.
fs = project.get_feature_store()


HopsworksSettings initialized!
mac64
2026-01-10 22:28:50,727 INFO: Initializing external client
2026-01-10 22:28:50,728 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-10 22:28:52,442 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1299605


# Step 1: Get Real-time Solar Wind Data

We use the NOAA SWPC API to get the most recent measurements from the DSCOVR/ACE satellites. These will serve as the features for our real-time inference.

In [2]:
print("Fetching real-time solar wind data from NOAA...")

# util.get_noaa_realtime_hourly_data fetches the magnetometer, plasma and Kp-index
# feeds and merges them into a single frame (see util.py for the merge details).
# Columns that are not used as features are then dropped. errors='ignore' keeps the
# drop from failing when a feed does not contain one of them, and the non-inplace
# drop leaves the object returned by the helper untouched.
# Timestamps are deliberately left as datetimes; no string formatting is applied.
new_solar_df = util.get_noaa_realtime_hourly_data(
    settings.NOAA_MAG_URL,
    settings.NOAA_PLASMA_URL,
    settings.KP_INDEX_URL,
).drop(
    columns=['bx_gsm', 'lon_gsm', 'lat_gsm', 'bt', 'temperature', 'a_running', 'station_count'],
    errors='ignore',
)

print(f"Successfully retrieved {len(new_solar_df)} new solar wind records.")
new_solar_df

Fetching real-time solar wind data from NOAA...
Raw Magnetometer data:
     bx_gsm  by_gsm  bz_gsm  lon_gsm  lat_gsm     bt             date_and_time
0     0.81    2.18   -3.53    69.59   -56.68   4.23 2026-01-09 21:00:00+00:00
1     0.29   -6.90   -2.49   272.43   -19.83   7.34 2026-01-09 22:00:00+00:00
2     4.14   -2.90    0.01   325.03     0.08   5.05 2026-01-09 23:00:00+00:00
3     6.70   -3.82    0.05   330.31     0.39   7.71 2026-01-10 00:00:00+00:00
4     0.58   -2.33    5.88   283.99    67.81   6.35 2026-01-10 01:00:00+00:00
5     3.35   -0.96    6.44   344.00    61.56   7.32 2026-01-10 02:00:00+00:00
6     2.04   -7.11   -4.45   285.97   -31.07   8.63 2026-01-10 03:00:00+00:00
7     3.30   -5.21   -1.04   302.33    -9.54   6.25 2026-01-10 04:00:00+00:00
8     0.63    1.68    6.94    69.40    75.53   7.17 2026-01-10 05:00:00+00:00
9    -1.51   -1.13   -6.25   216.80   -73.17   6.52 2026-01-10 06:00:00+00:00
10    1.86   -5.06   -1.08   290.16   -11.36   5.50 2026-01-10 07:00:0

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed,kp_index
0,2.18,-3.53,2026-01-09 21:00:00+00:00,3.59,503.6,2.0
1,-6.9,-2.49,2026-01-09 22:00:00+00:00,2.92,487.9,
2,-2.9,0.01,2026-01-09 23:00:00+00:00,4.77,479.4,
3,-3.82,0.05,2026-01-10 00:00:00+00:00,4.1,479.1,2.0
4,-2.33,5.88,2026-01-10 01:00:00+00:00,5.56,480.3,
5,-0.96,6.44,2026-01-10 02:00:00+00:00,7.47,480.3,
6,-7.11,-4.45,2026-01-10 03:00:00+00:00,4.36,472.7,3.0
7,-5.21,-1.04,2026-01-10 04:00:00+00:00,4.3,474.8,
8,1.68,6.94,2026-01-10 05:00:00+00:00,5.33,490.9,
9,-1.13,-6.25,2026-01-10 06:00:00+00:00,5.0,497.7,3.67


In [3]:
# Aggregate the fetched solar wind frame with util.aggregate_solar_wind_3h.
# NOTE(review): judging by the helper's name and the displayed result, this yields one
# row per 3-hour window (window_start / window_end) with mean/min/max/std statistics
# for each measurement plus a kp_index column — confirm the exact windowing rules
# in util.py.
new_solar_aggregated_df = util.aggregate_solar_wind_3h(new_solar_df)
new_solar_aggregated_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-09 21:00:00+00:00,2026-01-10 00:00:00+00:00,-2.54,-6.9,2.18,4.550692,-2.003333,-3.53,0.01,1.819487,3.76,2.92,4.77,0.936643,490.3,479.4,503.6,12.277215,2.0
1,2026-01-10 00:00:00+00:00,2026-01-10 03:00:00+00:00,-2.37,-3.82,-0.96,1.43042,4.123333,0.05,6.44,3.538705,5.71,4.1,7.47,1.69,479.9,479.1,480.3,0.69282,2.0
2,2026-01-10 03:00:00+00:00,2026-01-10 06:00:00+00:00,-3.546667,-7.11,1.68,4.625044,0.483333,-4.45,6.94,5.845805,4.663333,4.3,5.33,0.578129,479.466667,472.7,490.9,9.957075,3.0
3,2026-01-10 06:00:00+00:00,2026-01-10 09:00:00+00:00,-1.316667,-5.06,2.24,3.653578,-4.33,-6.25,-1.08,2.83,4.04,2.29,5.0,1.517926,485.133333,475.1,497.7,11.511009,3.67
4,2026-01-10 09:00:00+00:00,2026-01-10 12:00:00+00:00,-3.463333,-6.79,1.88,4.673589,-4.856667,-8.41,-2.4,3.151513,3.903333,3.23,4.83,0.829538,483.8,464.1,505.6,20.829546,2.67
5,2026-01-10 12:00:00+00:00,2026-01-10 15:00:00+00:00,1.796667,-2.61,5.16,3.988688,-5.576667,-6.94,-3.96,1.506066,5.283333,3.54,6.17,1.509845,482.466667,478.9,486.3,3.7072,4.33
6,2026-01-10 15:00:00+00:00,2026-01-10 18:00:00+00:00,-3.656667,-7.2,-1.85,3.068816,-5.313333,-6.44,-3.5,1.585728,5.386667,3.01,7.71,2.350454,476.1,465.9,485.3,9.738583,3.33
7,2026-01-10 18:00:00+00:00,2026-01-10 21:00:00+00:00,5.23,0.61,13.36,7.062627,-5.336667,-7.1,-3.05,2.075098,4.926667,1.06,7.56,3.421013,504.333333,450.8,595.7,79.514296,6.0


In [4]:
# Keep only complete windows (rows without any missing value), order them
# chronologically by window_start, and renumber the index from 0.
new_solar_aggregated_df = (
    new_solar_aggregated_df
    .dropna()
    .sort_values(["window_start"])
    .reset_index(drop=True)
)

new_solar_aggregated_df

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-09 21:00:00+00:00,2026-01-10 00:00:00+00:00,-2.54,-6.9,2.18,4.550692,-2.003333,-3.53,0.01,1.819487,3.76,2.92,4.77,0.936643,490.3,479.4,503.6,12.277215,2.0
1,2026-01-10 00:00:00+00:00,2026-01-10 03:00:00+00:00,-2.37,-3.82,-0.96,1.43042,4.123333,0.05,6.44,3.538705,5.71,4.1,7.47,1.69,479.9,479.1,480.3,0.69282,2.0
2,2026-01-10 03:00:00+00:00,2026-01-10 06:00:00+00:00,-3.546667,-7.11,1.68,4.625044,0.483333,-4.45,6.94,5.845805,4.663333,4.3,5.33,0.578129,479.466667,472.7,490.9,9.957075,3.0
3,2026-01-10 06:00:00+00:00,2026-01-10 09:00:00+00:00,-1.316667,-5.06,2.24,3.653578,-4.33,-6.25,-1.08,2.83,4.04,2.29,5.0,1.517926,485.133333,475.1,497.7,11.511009,3.67
4,2026-01-10 09:00:00+00:00,2026-01-10 12:00:00+00:00,-3.463333,-6.79,1.88,4.673589,-4.856667,-8.41,-2.4,3.151513,3.903333,3.23,4.83,0.829538,483.8,464.1,505.6,20.829546,2.67
5,2026-01-10 12:00:00+00:00,2026-01-10 15:00:00+00:00,1.796667,-2.61,5.16,3.988688,-5.576667,-6.94,-3.96,1.506066,5.283333,3.54,6.17,1.509845,482.466667,478.9,486.3,3.7072,4.33
6,2026-01-10 15:00:00+00:00,2026-01-10 18:00:00+00:00,-3.656667,-7.2,-1.85,3.068816,-5.313333,-6.44,-3.5,1.585728,5.386667,3.01,7.71,2.350454,476.1,465.9,485.3,9.738583,3.33
7,2026-01-10 18:00:00+00:00,2026-01-10 21:00:00+00:00,5.23,0.61,13.36,7.062627,-5.336667,-7.1,-3.05,2.075098,4.926667,1.06,7.56,3.421013,504.333333,450.8,595.7,79.514296,6.0


In [5]:
# Prepare the hourly real-time frame:
#   * kp_index is removed because it is not useful for inference on the real-time data;
#     errors='ignore' makes the cell safe to re-run after the column is already gone
#     (otherwise the second run raises KeyError).
#   * rows with any missing measurement are discarded,
#   * rows are ordered chronologically and the index is renumbered from 0.
new_solar_df = (
    new_solar_df
    .drop(columns=['kp_index'], errors='ignore')
    .dropna()
    .sort_values(["date_and_time"])
    .reset_index(drop=True)
)
new_solar_df

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed
0,2.18,-3.53,2026-01-09 21:00:00+00:00,3.59,503.6
1,-6.9,-2.49,2026-01-09 22:00:00+00:00,2.92,487.9
2,-2.9,0.01,2026-01-09 23:00:00+00:00,4.77,479.4
3,-3.82,0.05,2026-01-10 00:00:00+00:00,4.1,479.1
4,-2.33,5.88,2026-01-10 01:00:00+00:00,5.56,480.3
5,-0.96,6.44,2026-01-10 02:00:00+00:00,7.47,480.3
6,-7.11,-4.45,2026-01-10 03:00:00+00:00,4.36,472.7
7,-5.21,-1.04,2026-01-10 04:00:00+00:00,4.3,474.8
8,1.68,6.94,2026-01-10 05:00:00+00:00,5.33,490.9
9,-1.13,-6.25,2026-01-10 06:00:00+00:00,5.0,497.7


# Step 3: Insert into Feature Groups

Now we push the new observations into the Feature Store. Hopsworks will handle the deduplication based on the primary keys defined in the backfill notebook.

In [6]:
print("Before casting the aggregated data:\n", new_solar_aggregated_df)

# The Feature Store expects 32-bit 'float' features rather than 64-bit 'double', so
# every column except the two window timestamps is re-typed to float32. Values that
# cannot be parsed as numbers become NaN (errors='coerce').
window_bound_columns = ("window_start", "window_end")
new_solar_aggregated_df = new_solar_aggregated_df.assign(**{
    name: pd.to_numeric(new_solar_aggregated_df[name], errors='coerce').astype('float32')
    for name in new_solar_aggregated_df.columns
    if name not in window_bound_columns
})

# Report the resulting dtype of each column
print("After casting:\n", new_solar_aggregated_df.dtypes)
new_solar_aggregated_df

Before casting the aggregated data:
                window_start                window_end  by_gsm_mean  \
0 2026-01-09 21:00:00+00:00 2026-01-10 00:00:00+00:00    -2.540000   
1 2026-01-10 00:00:00+00:00 2026-01-10 03:00:00+00:00    -2.370000   
2 2026-01-10 03:00:00+00:00 2026-01-10 06:00:00+00:00    -3.546667   
3 2026-01-10 06:00:00+00:00 2026-01-10 09:00:00+00:00    -1.316667   
4 2026-01-10 09:00:00+00:00 2026-01-10 12:00:00+00:00    -3.463333   
5 2026-01-10 12:00:00+00:00 2026-01-10 15:00:00+00:00     1.796667   
6 2026-01-10 15:00:00+00:00 2026-01-10 18:00:00+00:00    -3.656667   
7 2026-01-10 18:00:00+00:00 2026-01-10 21:00:00+00:00     5.230000   

   by_gsm_min  by_gsm_max  by_gsm_std  bz_gsm_mean  bz_gsm_min  bz_gsm_max  \
0       -6.90        2.18    4.550692    -2.003333       -3.53        0.01   
1       -3.82       -0.96    1.430420     4.123333        0.05        6.44   
2       -7.11        1.68    4.625044     0.483333       -4.45        6.94   
3       -5.06       

Unnamed: 0,window_start,window_end,by_gsm_mean,by_gsm_min,by_gsm_max,by_gsm_std,bz_gsm_mean,bz_gsm_min,bz_gsm_max,bz_gsm_std,density_mean,density_min,density_max,density_std,speed_mean,speed_min,speed_max,speed_std,kp_index
0,2026-01-09 21:00:00+00:00,2026-01-10 00:00:00+00:00,-2.54,-6.9,2.18,4.550692,-2.003333,-3.53,0.01,1.819487,3.76,2.92,4.77,0.936643,490.299988,479.399994,503.600006,12.277215,2.0
1,2026-01-10 00:00:00+00:00,2026-01-10 03:00:00+00:00,-2.37,-3.82,-0.96,1.43042,4.123333,0.05,6.44,3.538705,5.71,4.1,7.47,1.69,479.899994,479.100006,480.299988,0.69282,2.0
2,2026-01-10 03:00:00+00:00,2026-01-10 06:00:00+00:00,-3.546667,-7.11,1.68,4.625044,0.483333,-4.45,6.94,5.845805,4.663333,4.3,5.33,0.578129,479.466675,472.700012,490.899994,9.957074,3.0
3,2026-01-10 06:00:00+00:00,2026-01-10 09:00:00+00:00,-1.316667,-5.06,2.24,3.653578,-4.33,-6.25,-1.08,2.83,4.04,2.29,5.0,1.517926,485.133331,475.100006,497.700012,11.511009,3.67
4,2026-01-10 09:00:00+00:00,2026-01-10 12:00:00+00:00,-3.463333,-6.79,1.88,4.673589,-4.856667,-8.41,-2.4,3.151513,3.903333,3.23,4.83,0.829538,483.799988,464.100006,505.600006,20.829546,2.67
5,2026-01-10 12:00:00+00:00,2026-01-10 15:00:00+00:00,1.796667,-2.61,5.16,3.988688,-5.576667,-6.94,-3.96,1.506065,5.283333,3.54,6.17,1.509845,482.466675,478.899994,486.299988,3.7072,4.33
6,2026-01-10 15:00:00+00:00,2026-01-10 18:00:00+00:00,-3.656667,-7.2,-1.85,3.068816,-5.313334,-6.44,-3.5,1.585728,5.386667,3.01,7.71,2.350454,476.100006,465.899994,485.299988,9.738583,3.33
7,2026-01-10 18:00:00+00:00,2026-01-10 21:00:00+00:00,5.23,0.61,13.36,7.062627,-5.336667,-7.1,-3.05,2.075098,4.926667,1.06,7.56,3.421014,504.333344,450.799988,595.700012,79.514297,6.0


In [7]:
print("Before casting the real time data:\n", new_solar_df)

# The Feature Store expects 32-bit 'float' features rather than 64-bit 'double', so
# every column except the timestamp is re-typed to float32. Values that cannot be
# parsed as numbers become NaN (errors='coerce').
timestamp_columns = ("date_and_time",)
new_solar_df = new_solar_df.assign(**{
    name: pd.to_numeric(new_solar_df[name], errors='coerce').astype('float32')
    for name in new_solar_df.columns
    if name not in timestamp_columns
})

# Report the resulting dtype of each column
print("After casting:\n", new_solar_df.dtypes)
new_solar_df

Before casting the real time data:
     by_gsm  bz_gsm             date_and_time  density  speed
0     2.18   -3.53 2026-01-09 21:00:00+00:00     3.59  503.6
1    -6.90   -2.49 2026-01-09 22:00:00+00:00     2.92  487.9
2    -2.90    0.01 2026-01-09 23:00:00+00:00     4.77  479.4
3    -3.82    0.05 2026-01-10 00:00:00+00:00     4.10  479.1
4    -2.33    5.88 2026-01-10 01:00:00+00:00     5.56  480.3
5    -0.96    6.44 2026-01-10 02:00:00+00:00     7.47  480.3
6    -7.11   -4.45 2026-01-10 03:00:00+00:00     4.36  472.7
7    -5.21   -1.04 2026-01-10 04:00:00+00:00     4.30  474.8
8     1.68    6.94 2026-01-10 05:00:00+00:00     5.33  490.9
9    -1.13   -6.25 2026-01-10 06:00:00+00:00     5.00  497.7
10   -5.06   -1.08 2026-01-10 07:00:00+00:00     2.29  482.6
11    2.24   -5.66 2026-01-10 08:00:00+00:00     4.83  475.1
12   -5.48   -3.76 2026-01-10 09:00:00+00:00     3.23  464.1
13   -6.79   -2.40 2026-01-10 10:00:00+00:00     4.83  505.6
14    1.88   -8.41 2026-01-10 11:00:00+00:00     

Unnamed: 0,by_gsm,bz_gsm,date_and_time,density,speed
0,2.18,-3.53,2026-01-09 21:00:00+00:00,3.59,503.600006
1,-6.9,-2.49,2026-01-09 22:00:00+00:00,2.92,487.899994
2,-2.9,0.01,2026-01-09 23:00:00+00:00,4.77,479.399994
3,-3.82,0.05,2026-01-10 00:00:00+00:00,4.1,479.100006
4,-2.33,5.88,2026-01-10 01:00:00+00:00,5.56,480.299988
5,-0.96,6.44,2026-01-10 02:00:00+00:00,7.47,480.299988
6,-7.11,-4.45,2026-01-10 03:00:00+00:00,4.36,472.700012
7,-5.21,-1.04,2026-01-10 04:00:00+00:00,4.3,474.799988
8,1.68,6.94,2026-01-10 05:00:00+00:00,5.33,490.899994
9,-1.13,-6.25,2026-01-10 06:00:00+00:00,5.0,497.700012


In [8]:
# Look up the existing Feature Groups by name and version before writing anything,
# so a failed lookup aborts the cell before any data is inserted.
solar_wind_fg = fs.get_feature_group(
    name="solar_wind_fg",
    version=6,
)
solar_wind_aggregated_fg = fs.get_feature_group(
    name="solar_wind_aggregated_fg",
    version=1,
)

# Push the new rows: hourly data first, then the 3-hour aggregates.
# Note: real-time pipelines often create their Feature Groups with online_enabled=True
# so that inserted data is available for immediate inference.
for feature_group, frame in (
    (solar_wind_fg, new_solar_df),
    (solar_wind_aggregated_fg, new_solar_aggregated_df),
):
    feature_group.insert(frame)

print("Daily Feature Pipeline execution complete!")

Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 25/25 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: solar_wind_fg_6_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1299605/jobs/named/solar_wind_fg_6_offline_fg_materialization/executions


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 8/8 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: solar_wind_aggregated_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1299605/jobs/named/solar_wind_aggregated_fg_1_offline_fg_materialization/executions
Daily Feature Pipeline execution complete!
