In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import sys
import torch
sys.path.append("../")
from ionopy import MadrigalDatasetTimeSeries
from ionopy import weight_init


In [2]:
base_dir='/Users/ga00693/data_madrigal'
config={'madrigal_path': f'{base_dir}/subset_tec_10mln.csv',
        'set_sw_path': f'{base_dir}/set_sw.csv',
        'celestrack_path': f'{base_dir}/celestrack_sw.csv',
        'omni_indices_path': f'{base_dir}/merged_omni_indices.csv',
        'omni_magnetic_field_path':f'{base_dir}/merged_omni_magnetic_field.csv',
        'omni_solar_wind_path':f'{base_dir}/merged_omni_solar_wind.csv',
        'jpld_path': f'{base_dir}/jpld_vtec_15min.csv',
        'use_celestrack': True,
        'use_set_sw': True,
        'use_jpld': True,
        'use_omni_indices': True,
        'use_omni_magnetic_field': True,
        'use_omni_solar_wind': True,
        'lag_days_proxies':81, # 81 days
        'proxies_resolution':1,  # 1 day
        'lag_minutes_omni':2 * 24 * 60,
        'omni_resolution':1,  # 1 minute
        'lag_minutes_jpld':150,
        'jpld_resolution':15,  # 15 minutes
}
madrigal_dataset = MadrigalDatasetTimeSeries(config,
                                             torch_type=torch.float32)


MadrigalDatasetTimeSeries initialized with min_date: 2010-06-13 00:00:00 and max_date: 2024-07-31 23:45:00
Loading Madrigal dataset with config:
 {'madrigal_path': '/Users/ga00693/data_madrigal/subset_tec_10mln.csv', 'set_sw_path': '/Users/ga00693/data_madrigal/set_sw.csv', 'celestrack_path': '/Users/ga00693/data_madrigal/celestrack_sw.csv', 'omni_indices_path': '/Users/ga00693/data_madrigal/merged_omni_indices.csv', 'omni_magnetic_field_path': '/Users/ga00693/data_madrigal/merged_omni_magnetic_field.csv', 'omni_solar_wind_path': '/Users/ga00693/data_madrigal/merged_omni_solar_wind.csv', 'jpld_path': '/Users/ga00693/data_madrigal/jpld_vtec_15min.csv', 'use_celestrack': True, 'use_set_sw': True, 'use_jpld': True, 'use_omni_indices': True, 'use_omni_magnetic_field': True, 'use_omni_solar_wind': True, 'lag_days_proxies': 81, 'proxies_resolution': 1, 'lag_minutes_omni': 2880, 'omni_resolution': 1, 'lag_minutes_jpld': 150, 'jpld_resolution': 15}
Madrigal data loaded with shape: (9075780, 5

In [3]:
import pandas as pd
import torch
from omegaconf import OmegaConf
from tft_torch import tft
import tft_torch.loss as tft_loss
import torch.nn.init as init
import numpy as np
import wandb
from pyfiglet import Figlet
from termcolor import colored
from tqdm import tqdm
import argparse
import pprint
import time
from torch import optim
from torch.utils.data import RandomSampler, SequentialSampler
import random


In [4]:
print('Ionopy Model Training -> Forecasting the ionosphere vTEC')
f = Figlet(font='5lineoblique')
print(colored(f.renderText('Ionopy 1.0'), 'red'))
f = Figlet(font='digital')
print(colored(f.renderText("Training Forecasting Model"), 'blue'))
#print(colored(f'Version {ionopy.__version__}\n','blue'))

Ionopy Model Training -> Forecasting the ionosphere vTEC
[31m                                                                              
   ___   ___                                                                  
      / /                                                              ___    
     / /     ___       __      ___      ___                /_  /     //   ) ) 
    / /    //   ) ) //   ) ) //   ) ) //   ) ) //   / /     / /     //   / /  
   / /    //   / / //   / / //   / / //___/ / ((___/ /     / /     //   / /   
__/ /___ ((___/ / //   / / ((___/ / //            / /     / /    (|(___/ /    
[0m
[34m+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+
|T|r|a|i|n|i|n|g| |F|o|r|e|c|a|s|t|i|n|g| |M|o|d|e|l|
+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+
[0m


In [5]:
wandb_inactive=True
if wandb_inactive == False:
    wandb.init(project='', group='', config=vars(opt))
    # wandb.init(mode="disabled")
    if opt.run_name != '':
        wandb.run.name = opt.run_name
        wandb.run.save()


# print('Arguments:\n{}\n'.format(' '.join(sys.argv[1:])))
# print('Config:')
# pprint.pprint(vars(opt), depth=2, width=1)
# print()


if 'float32':
    torch_type=torch.float32
elif 'float64':
    torch_type=torch.float64
else:
    raise ValueError('Invalid torch type. Only float32 and float64 are supported')

# if opt.device.startswith('cuda'):
#     device = torch.device(opt.device if torch.cuda.is_available() else 'cpu')
# else:
#     device=torch.device('cpu') 
#print(f"Device is {device}")

In [6]:
madrigal_dataset[0]['inputs'].shape,madrigal_dataset[0]['jpld'].shape,madrigal_dataset[0]['celestrack'].shape, madrigal_dataset[0]['set_sw'].shape, madrigal_dataset[0]['omni_magnetic_field'].shape, madrigal_dataset[0]['omni_solar_wind'].shape, madrigal_dataset[0]['omni_indices'].shape

(torch.Size([7]),
 torch.Size([11, 20]),
 torch.Size([82, 1]),
 torch.Size([82, 9]),
 torch.Size([2881, 3]),
 torch.Size([2881, 4]),
 torch.Size([2881, 3]))

In [7]:
# set configuration
num_historical_numeric=0

if madrigal_dataset.config['omni_indices_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['omni_indices'].shape[1]
if madrigal_dataset.config['omni_magnetic_field_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['omni_magnetic_field'].shape[1]
if madrigal_dataset.config['omni_solar_wind_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['omni_solar_wind'].shape[1]
if madrigal_dataset.config['celestrack_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['celestrack'].shape[1]
if madrigal_dataset.config['set_sw_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['set_sw'].shape[1]
if madrigal_dataset.config['jpld_path'] is not None:
    num_historical_numeric+=madrigal_dataset[0]['jpld'].shape[1]

print(f"Historical input features of the model: {num_historical_numeric}")

input_dimension=len(madrigal_dataset[0]['inputs'])
print(f"Static features of the model: {input_dimension}")

Historical input features of the model: 40
Static features of the model: 7


In [8]:
model_type='tft'
dropout=0.2
state_size=64
lstm_layers=2
attention_heads=4

if num_historical_numeric==0:
    raise ValueError('No historical numeric data found in the dataset')
if model_type=='tft':
    data_props = {'num_historical_numeric': num_historical_numeric,
                'num_static_numeric': input_dimension,
                'num_future_numeric': 1,
                }

    configuration = {
                    'model':
                        {
                            'dropout': dropout,
                            'state_size': state_size,
                            'output_quantiles': [0.1, 0.5, 0.9],
                            'lstm_layers': lstm_layers,
                            'attention_heads': attention_heads,
                        },
                    'task_type': 'regression',
                    'target_window_start': None,
                    'data_props': data_props,
                    }
    # initialize TFT model 
    ts_ionopy_model = tft.TemporalFusionTransformer(OmegaConf.create(configuration))
    # weight init
    ts_ionopy_model.apply(weight_init)


#ts_ionopy_model.to(device)
# if opt.model_path is not None:
#     ts_karman_model.load_state_dict(torch.load(opt.model_path))

num_params=sum(p.numel() for p in ts_ionopy_model.parameters() if p.requires_grad)
print(f'Karman model num parameters: {num_params}')

#(q_0.9-q_0.1)/2.5632

Karman model num parameters: 1433223


In [None]:
from torch.utils.data import RandomSampler, SequentialSampler

idx_test_fold=2
test_month_idx = 2 * (idx_test_fold - 1)
validation_month_idx = test_month_idx + 2
print(test_month_idx,validation_month_idx)
madrigal_dataset._set_indices(test_month_idx=[test_month_idx], validation_month_idx=[validation_month_idx],custom={ 2012: {"validation":8, "test":9},
                                                                                                                    2013: {"validation":4, "test":5},
                                                                                                                    2015: {"validation":2, "test":3},
                                                                                                                    2022: {"validation":0, "test":1},
                                                                                                                    2024: {"validation":5,"test":6}})
train_dataset = madrigal_dataset.train_dataset()
validation_dataset = madrigal_dataset.validation_dataset()
test_dataset = madrigal_dataset.test_dataset()
print(f'Training dataset example: {train_dataset[0].items()}')

2 4
Creating training, validation and test sets.


15 years to iterate through.: 100%|██████████| 15/15 [00:07<00:00,  2.14it/s]

Train size: 7329735
Validation size: 718736
Test size: 771343
Training dataset example: dict_items([('date', '2010-06-13 00:02:30.000000'), ('inputs', tensor([ 0.3294, -0.9442,  0.0109,  0.9999,  0.5778,  0.1392,  0.9903])), ('tec', tensor(-0.2729)), ('dtec', tensor(0.7676)), ('jpld', tensor([[-1.9474e+00, -2.1021e+00, -6.4197e-01, -8.6000e-02, -1.9941e-01,
         -9.8301e-01, -1.8532e+00, -6.0305e-02, -8.8329e-01, -2.0373e-01,
          7.5752e-02, -1.6949e+00,  1.0042e-01, -1.0778e+00, -7.2952e-01,
          3.5696e-01, -2.4823e-01,  4.9686e-01,  5.8356e-01,  4.5663e-01],
        [-1.9474e+00, -2.1021e+00, -6.8695e-01, -1.6257e-01, -2.1614e-01,
         -9.8301e-01, -1.9911e+00, -5.4295e-02, -9.2390e-01, -1.5301e-01,
          5.3175e-02, -1.6759e+00,  1.4999e-01, -1.2200e+00, -6.2681e-01,
          3.9137e-01, -2.6722e-01,  4.7105e-01,  5.8356e-01,  4.5663e-01],
        [-1.9474e+00, -1.9502e+00, -7.5702e-01, -2.1116e-01, -2.1614e-01,
         -9.4123e-01, -2.0398e+00, -6.6339e-02




In [10]:
train_sampler = RandomSampler(train_dataset, num_samples=len(train_dataset))
validation_sampler = RandomSampler(validation_dataset, num_samples=len(validation_dataset))
test_sampler = SequentialSampler(test_dataset)


In [11]:
lr=.4*1e-3
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, list(ts_ionopy_model.parameters())),
    lr=lr,
    amsgrad=True,
)

#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[25,50,75,100,125,150,175,200,225,230,240,250,260,270], gamma=0.8, verbose=False)
criterion=torch.nn.MSELoss()

# And the dataloader
#seed them
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x325e8d0f0>

In [13]:
batch_size=32
seed_worker=0
num_workers=0


train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=num_workers,
    sampler=train_sampler,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g
)
validation_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=num_workers,
    sampler=validation_sampler,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    pin_memory=False,
    num_workers=num_workers,
    sampler=test_sampler,
    drop_last=True,
    worker_init_fn=seed_worker,
    generator=g
)