In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import sys
import torch
sys.path.append("../")
from ionopy import MadrigalDatasetTimeSeries
from ionopy import weight_init


In [2]:
bucket_path='/home/ga00693/gcs-bucket/'
config={'madrigal_path': f'{bucket_path}/madrigal_data/processed/gps_data_tarr/csv_subsets/subset_tec_10mln.csv',
        'set_sw_path': f'{bucket_path}/karman-2025/data/sw_data/set_sw.csv',
        'celestrack_path': f'{bucket_path}/karman-2025/data/sw_data/celestrack_sw.csv',
        'omni_indices_path': f'{bucket_path}/karman-2025/data/omniweb_data/merged_omni_indices.csv',
        'omni_magnetic_field_path': f'{bucket_path}/karman-2025/data/omniweb_data/merged_omni_magnetic_field.csv',
        'omni_solar_wind_path': f'{bucket_path}/karman-2025/data/omniweb_data/merged_omni_solar_wind.csv',
        'jpld_path': f'{bucket_path}/jpld/subset_lat_lon/jpld_vtec_15min.csv',
        'use_celestrack': True,
        'use_set_sw': True,
        'use_jpld': True,
        'use_omni_indices': True,
        'use_omni_magnetic_field': True,
        'use_omni_solar_wind': True,
        'lag_days_proxies':144, # 81 days
        'proxies_resolution':1,  # 1 day
        'lag_minutes_omni':8640,
        'omni_resolution':60,  # 1 minute
        'lag_minutes_jpld':2160,
        'jpld_resolution':15,  # 15 minutes
}
madrigal_dataset = MadrigalDatasetTimeSeries(config,
                                             torch_type=torch.float32)


MadrigalDatasetTimeSeries initialized with min_date: 2010-06-13 00:00:00 and max_date: 2024-07-31 23:45:00
Loading Madrigal dataset with config:
 {'madrigal_path': '/home/ga00693/gcs-bucket//madrigal_data/processed/gps_data_tarr/csv_subsets/subset_tec_10mln.csv', 'set_sw_path': '/home/ga00693/gcs-bucket//karman-2025/data/sw_data/set_sw.csv', 'celestrack_path': '/home/ga00693/gcs-bucket//karman-2025/data/sw_data/celestrack_sw.csv', 'omni_indices_path': '/home/ga00693/gcs-bucket//karman-2025/data/omniweb_data/merged_omni_indices.csv', 'omni_magnetic_field_path': '/home/ga00693/gcs-bucket//karman-2025/data/omniweb_data/merged_omni_magnetic_field.csv', 'omni_solar_wind_path': '/home/ga00693/gcs-bucket//karman-2025/data/omniweb_data/merged_omni_solar_wind.csv', 'jpld_path': '/home/ga00693/gcs-bucket//jpld/subset_lat_lon/jpld_vtec_15min.csv', 'use_celestrack': True, 'use_set_sw': True, 'use_jpld': True, 'use_omni_indices': True, 'use_omni_magnetic_field': True, 'use_omni_solar_wind': True, 

Madrigal data loaded with shape: (9075780, 5)
Now removing 0 TEC values...
Madrigal data processed, final shape: (9074097, 5)
Input features shape: torch.Size([9074097, 7])

Loading JPLD dataset.
Loading time series data for jpld
Normalizing time series data for jpld

Loading Omni indices.
Loading time series data for omni_indices
Normalizing time series data for omni_indices

Loading Omni Solar Wind.
Loading time series data for omni_solar_wind
Normalizing time series data for omni_solar_wind

Loading Omni Magnetic Field.
Loading time series data for omni_magnetic_field
Normalizing time series data for omni_magnetic_field

Loading SET Solar Wind data.
Normalizing time series data for set_sw

Loading Celestrack data.
Normalizing time series data for celestrack


In [8]:
import pandas as pd
import torch
from omegaconf import OmegaConf
from tft_torch import tft
import tft_torch.loss as tft_loss
import torch.nn.init as init
import numpy as np
import wandb
from pyfiglet import Figlet
from termcolor import colored
from tqdm import tqdm
import argparse
import pprint
import time
from torch import optim
from torch.utils.data import RandomSampler, SequentialSampler
import random


In [9]:
print('Ionopy Model Training -> Forecasting the ionosphere vTEC')
f = Figlet(font='big')
print(colored(f.renderText('Ionopy 1.0'), 'red'))
f = Figlet(font='digital')
print(colored(f.renderText("Training Forecasting Model"), 'blue'))
#print(colored(f'Version {ionopy.__version__}\n','blue'))

Ionopy Model Training -> Forecasting the ionosphere vTEC
[31m _____                                __   ___  
|_   _|                              /_ | / _ \ 
  | |  ___  _ __   ___  _ __  _   _   | || | | |
  | | / _ \| '_ \ / _ \| '_ \| | | |  | || | | |
 _| || (_) | | | | (_) | |_) | |_| |  | || |_| |
|_____\___/|_| |_|\___/| .__/ \__, |  |_(_)___/ 
                       | |     __/ |            
                       |_|    |___/             
[0m
[34m+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+
|T|r|a|i|n|i|n|g| |F|o|r|e|c|a|s|t|i|n|g| |M|o|d|e|l|
+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+
[0m


In [10]:
wandb_inactive=True
if wandb_inactive == False:
    wandb.init(project='', group='', config=vars(opt))
    # wandb.init(mode="disabled")
    if opt.run_name != '':
        wandb.run.name = opt.run_name
        wandb.run.save()


# print('Arguments:\n{}\n'.format(' '.join(sys.argv[1:])))
# print('Config:')
# pprint.pprint(vars(opt), depth=2, width=1)
# print()


if 'float32':
    torch_type=torch.float32
elif 'float64':
    torch_type=torch.float64
else:
    raise ValueError('Invalid torch type. Only float32 and float64 are supported')

# if opt.device.startswith('cuda'):
#     device = torch.device(opt.device if torch.cuda.is_available() else 'cpu')
# else:
#     device=torch.device('cpu') 
#print(f"Device is {device}")

In [11]:
madrigal_dataset[0]['inputs'].shape,madrigal_dataset[0]['jpld'].shape,madrigal_dataset[0]['celestrack'].shape, madrigal_dataset[0]['set_sw'].shape, madrigal_dataset[0]['omni_magnetic_field'].shape, madrigal_dataset[0]['omni_solar_wind'].shape, madrigal_dataset[0]['omni_indices'].shape

(torch.Size([7]),
 torch.Size([145, 20]),
 torch.Size([145, 1]),
 torch.Size([145, 9]),
 torch.Size([145, 3]),
 torch.Size([145, 4]),
 torch.Size([145, 3]))

In [12]:
# set configuration
num_historical_numeric=0

if madrigal_dataset.config['omni_indices_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['omni_indices'].shape[1]
if madrigal_dataset.config['omni_magnetic_field_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['omni_magnetic_field'].shape[1]
if madrigal_dataset.config['omni_solar_wind_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['omni_solar_wind'].shape[1]
if madrigal_dataset.config['celestrack_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['celestrack'].shape[1]
if madrigal_dataset.config['set_sw_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['set_sw'].shape[1]
if madrigal_dataset.config['jpld_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['jpld'].shape[1]

print(f"Historical input features of the model: {num_historical_numeric}")

input_dimension=len(madrigal_dataset[0]['inputs'])
print(f"Static features of the model: {input_dimension}")

Historical input features of the model: 40
Static features of the model: 7


In [13]:
model_type='tft'
dropout=0.2
state_size=64
lstm_layers=2
attention_heads=4

if num_historical_numeric==0:
    raise ValueError('No historical numeric data found in the dataset')
if model_type=='tft':
    data_props = {'num_historical_numeric': num_historical_numeric,
                'num_static_numeric': input_dimension,
                'num_future_numeric': 1,
                }

    configuration = {
                    'model':
                        {
                            'dropout': dropout,
                            'state_size': state_size,
                            'output_quantiles': [0.1, 0.5, 0.9],
                            'lstm_layers': lstm_layers,
                            'attention_heads': attention_heads,
                        },
                    'task_type': 'regression',
                    'target_window_start': None,
                    'data_props': data_props,
                    }
    # initialize TFT model 
    ts_ionopy_model = tft.TemporalFusionTransformer(OmegaConf.create(configuration))
    # weight init
    ts_ionopy_model.apply(weight_init)


#ts_ionopy_model.to(device)
# if opt.model_path is not None:
#     ts_karman_model.load_state_dict(torch.load(opt.model_path))

num_params=sum(p.numel() for p in ts_ionopy_model.parameters() if p.requires_grad)
print(f'Karman model num parameters: {num_params}')

#(q_0.9-q_0.1)/2.5632

Karman model num parameters: 1433223


In [14]:
from torch.utils.data import RandomSampler, SequentialSampler

idx_test_fold=2
test_month_idx = 2 * (idx_test_fold - 1)
validation_month_idx = test_month_idx + 2
print(test_month_idx,validation_month_idx)
madrigal_dataset._set_indices(test_month_idx=[test_month_idx], validation_month_idx=[validation_month_idx],custom={ 2012: {"validation":8, "test":9},
                                                                                                                    2013: {"validation":4, "test":5},
                                                                                                                    2015: {"validation":2, "test":3},
                                                                                                                    2022: {"validation":0, "test":1},
                                                                                                                    2024: {"validation":5,"test":6}})
train_dataset = madrigal_dataset.train_dataset()
validation_dataset = madrigal_dataset.validation_dataset()
test_dataset = madrigal_dataset.test_dataset()
print(f'Training dataset example: {train_dataset[0].items()}')

2 4
Creating training, validation and test sets.


15 years to iterate through.: 100%|██████████| 15/15 [00:09<00:00,  1.60it/s]


Train size: 7329735
Validation size: 718736
Test size: 771343
Training dataset example: dict_items([('date', '2010-06-13 00:02:30.000000'), ('inputs', tensor([ 0.3294, -0.9442,  0.0109,  0.9999,  0.5778,  0.1392,  0.9903])), ('tec', tensor(-0.2729)), ('dtec', tensor(0.7676)), ('jpld', tensor([[-0.5653, -1.2753, -1.2941,  ...,  0.1433,  0.3472,  0.3095],
        [-0.4282, -1.3155, -1.2281,  ...,  0.0794,  0.3472,  0.3430],
        [-0.2803, -1.3155, -1.0452,  ..., -0.0039,  0.3313,  0.3595],
        ...,
        [-1.1379, -1.3155, -0.9886,  ...,  0.4316,  0.5836,  0.4881],
        [-0.8198, -1.1266, -0.9341,  ...,  0.4580,  0.5557,  0.4881],
        [-0.6244, -0.9322, -0.9611,  ...,  0.5096,  0.5416,  0.4407]])), ('omni_indices', tensor([[ 1.0824,  0.3498,  0.9382],
        [ 1.4279, -0.1262,  0.7425],
        [ 1.0824, -0.6543,  0.4396],
        [ 1.4279, -0.6543, -1.5251],
        [ 1.0824, -0.6543, -0.8765],
        [ 0.0606, -0.6034, -0.7049],
        [ 0.3978, -0.3962, -1.0670],
  

In [15]:
train_sampler = RandomSampler(train_dataset, num_samples=len(train_dataset))
validation_sampler = RandomSampler(validation_dataset, num_samples=len(validation_dataset))
test_sampler = SequentialSampler(test_dataset)


In [None]:
lr=.4*1e-3
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, list(ts_ionopy_model.parameters())),
    lr=lr,
    amsgrad=True,
)

#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[25,50,75,100,125,150,175,200,225,230,240,250,260,270], gamma=0.8, verbose=False)
criterion=torch.nn.MSELoss()

# And the dataloader
#seed them
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7f2264a6ec90>

In [25]:
batch_size=32
seed_worker=0
num_workers=0


train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=num_workers,
    sampler=train_sampler,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g
)
validation_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=num_workers,
    sampler=validation_sampler,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=num_workers,
    sampler=test_sampler,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g
)

In [27]:
el=next(iter(train_loader))  # prefetch the first batch

In [38]:
el['jpld'].shape

torch.Size([32, 11, 20])

In [None]:
historical_ts_numeric=[]
try:
    historical_ts_numeric+=batch['jpld'][:,:-1,:]
except KeyError:
    #do nothing:
    pass
try:
    historical_ts_numeric+=batch['celestrack'][:,:-1,:]
except KeyError:
    #do nothing:
    pass
try:
    historical_ts_numeric+=batch['set_sw'][:,:-1,:]
except KeyError:
    #do nothing:
    pass
try:
    historical_ts_numeric+=batch['omni_indices'][:,:-1,:]
except KeyError:
    #do nothing:
    pass
try:
    historical_ts_numeric+=batch['omni_magnetic_field'][:,:-1,:]
except KeyError:
    #do nothing:
    pass
try:
    historical_ts_numeric+=batch['omni_solar_wind'][:,:-1,:]
except KeyError:
    #do nothing:
    pass

In [77]:
8640/60, 15*144

(144.0, 2160)

In [71]:
torch.cat(historical_ts_numeric,dim=2)

RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 10 but got size 81 for tensor number 1 in the list.

In [66]:
batch['jpld'][:,:-1,:].shape,historical_ts_numeric[5].shape

(torch.Size([32, 10, 20]), torch.Size([32, 2880, 4]))

In [70]:
torch.stack(historical_ts_numeric)#,dim=2).shape

RuntimeError: stack expects each tensor to be equal size, but got [32, 10, 20] at entry 0 and [32, 81, 1] at entry 1

In [35]:
torch.cat(historical_ts_numeric,dim=2).shape

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)