#### Before running this notebook, please make sure to download and unpack the FTS dataset:
The Forecaster Test Set (FTS) contains both the test sequences and the corresponding forecasts of the three models (GPTCast8x8, GPTCast16x16, and Linda) that are used in the paper.
- run the `download_data.py` script in the `data` directory.
```bash
cd data
python download_data.py
```
- unpack the downloaded `fts.tar` file in the `data` directory.
```bash
cd data
tar -xvf fts.tar
```

In [None]:
import sys
sys.path.append('..')
import os

import pickle
from tqdm import tqdm
from multiprocessing import Pool

from pysteps.verification.ensscores import rankhist_init, rankhist_accum
from glob import glob
import xarray as xr

import numpy as np
from tqdm import tqdm


Pysteps configuration file found at: /home/gabriele/Documents/fbk/meteo/GPTCast/.venv/lib/python3.12/site-packages/pysteps/pystepsrc



In [2]:
def reflectivity_to_rainrate(arr: np.ndarray,
                             minmax: tuple = (-20, 60),
                             a: float = 200.0,
                             b: float = 1.6):
    """
    Convert radar reflectivity (dBZ) to rain rate (mm/h) with a Z-R power law.

    The reflectivity is converted from decibels to linear units
    (``Z = 10 ** (dBZ / 10)``) and the relation ``Z = a * R ** b`` is then
    inverted for ``R``. Rain rates below 0.04 mm/h are set to zero.

    Parameters
    ----------
    arr : np.ndarray, array-like or scalar
        Reflectivity in dBZ. The input is never modified.
    minmax : tuple
        Not used by this function; kept only so that existing calls passing
        it keep working.
    a : float
        Multiplicative coefficient of the Z-R relation.
    b : float
        Exponent of the Z-R relation.

    Returns
    -------
    np.ndarray
        Rain rate in mm/h with the same shape as the input (a 0-d array for
        scalar input). NaN values are propagated unchanged.
    """
    # np.asarray lets plain lists and Python/NumPy scalars through as well.
    dbz = np.asarray(arr)
    Z = 10.0 ** (dbz / 10.0)  # wradlib.trafo.idecibel
    rr = np.asarray((Z / a) ** (1.0 / b))  # wradlib.zr.z_to_r
    # np.where (instead of in-place masking) also works on 0-d results;
    # the typed zero keeps the dtype of the computed rain rate.
    return np.where(rr < 0.04, rr.dtype.type(0), rr)

def generate_rank_hist_by_lead_time(obs, pred, pred_times):
    """
    Build one pysteps rank-histogram table per lead time.

    ``pred`` is indexed as ``pred[:, i, :, :, :]`` and ``obs`` as
    ``obs[i, :, :, :]`` for the i-th entry of ``pred_times``; the size of the
    first axis of ``pred`` is passed to ``rankhist_init`` as the number of
    ensemble members.

    Returns a dict mapping each lead time to its accumulated table.
    """
    assert pred.shape[1] == len(pred_times), 'Sizes mismatch!'

    # Create an empty rank histogram for every lead time first
    n_members = pred.shape[0]
    rankhist_tables = {}
    for lead_time in pred_times:
        rankhist_tables[lead_time] = rankhist_init(n_members)

    # Then accumulate the forecast/observation pair of each lead time
    for slot, lead_time in enumerate(pred_times):
        forecast_at_lt = pred[:, slot, :, :, :]
        observed_at_lt = obs[slot, :, :, :]
        rankhist_accum(rankhist_tables[lead_time], forecast_at_lt, observed_at_lt)

    return rankhist_tables

def generate_rank_hist(obs, pred):
    """
    Build a single pysteps rank-histogram table from ``pred`` and ``obs``.

    The size of the first axis of ``pred`` is passed to ``rankhist_init`` as
    the number of ensemble members. The table is returned in a dict under the
    fixed key ``1000``.
    """
    pooled_table = rankhist_init(pred.shape[0])
    rankhist_accum(pooled_table, pred, obs)
    return {1000: pooled_table}

def sum_tables_by_lead_times(tables):
    """
    Sum the ``'n'`` counts of several rank-histogram tables, key by key.

    Parameters
    ----------
    tables : sequence of dict
        Each element maps a lead-time key to a table dict holding an ``'n'``
        array of counts. Every key found in the later tables must also be
        present in the first one.

    Returns
    -------
    dict
        A new dict with the keys of ``tables[0]``; each entry copies the
        fields of the first table and holds the element-wise sum of all
        ``'n'`` arrays. The input tables are left untouched, so calling this
        function again on the same list gives the same result.

    Raises
    ------
    IndexError
        If ``tables`` is empty.
    """
    if len(tables) == 0:
        raise IndexError('Cannot sum an empty sequence of tables')

    # Start from an independent copy of the first table: aliasing it would
    # overwrite the caller's tables[0] and double count on a second call.
    tab_all = {
        lt: {**entry, 'n': np.array(entry['n'], copy=True)}
        for lt, entry in tables[0].items()
    }
    for tab in tables[1:]:
        for lt in tab:
            tab_all[lt]['n'] = tab[lt]['n'] + tab_all[lt]['n']
    return tab_all

In [3]:
# Model whose forecasts are verified in this run.
# available models are "gptcast_16x16", "gptcast_8x8" and "linda"
mod_name = 'gptcast_8x8'
# mod_name = 'gptcast_16x16'
# mod_name = 'linda'

# Directory searched for the model's input `.nc` files, and directory that
# receives the pickled result (both relative to this notebook's folder).
input_data_path = f'../data/fts/{mod_name}/'
output_data_path = f'../data/verification_fts/{mod_name}/'

# File name of the pickled rank-histogram tables written in the last cell.
output_fn = 'tables_verification_nc_rank_hist.pkl'

In [4]:
# Lead times of the forecast steps: 5, 10, ..., 120 (presumably minutes - TODO confirm).
pred_times = list(range(5, 125, 5))
# Subset of lead times for which rank histograms are computed.
rank_times = [15, 30, 60, 90, 120]

# Crop window applied by the worker functions: the "lat" bounds slice the
# dataset's `height` dimension and the "lon" bounds its `width` dimension.
min_lat = 0
max_lat = 256
min_lon = 0
max_lon = 256

In [5]:
# Collect, in sorted order, every NetCDF file found for the selected model
file_list = sorted(glob(input_data_path + '*.nc'))
len(file_list)

197

In [6]:
# Positions inside pred_times of the lead times listed in rank_times
idx_rank_times = [
    position
    for position, lead_time in enumerate(pred_times)
    if lead_time in rank_times
]
print(idx_rank_times)
print(pred_times)

[2, 5, 11, 17, 23]
[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120]


In [7]:
def worker_selected_lt(file_i: str):
    """
    Compute the per-lead-time rank-histogram tables for one forecast file.

    The dataset is cropped to the configured height/width window, the
    ``ensemble`` and ``observation`` variables are converted to rain rate and
    restricted to the lead times in ``idx_rank_times``. A singleton axis is
    then inserted after the lead-time axis of both arrays, which is the
    layout ``generate_rank_hist_by_lead_time`` indexes.

    Parameters
    ----------
    file_i : str
        Path of the NetCDF file to process.

    Returns
    -------
    dict
        One rank-histogram table per entry of ``rank_times``.
    """
    # Fixed seed so every worker call is reproducible
    # (presumably pysteps breaks rank ties randomly - TODO confirm).
    np.random.seed(seed=44)

    # The context manager closes the file as soon as the arrays are in memory,
    # so pool workers do not pile up open file handles.
    with xr.open_dataset(file_i) as nc_file:
        cropped = nc_file.sel(height=slice(min_lat, max_lat), width=slice(min_lon, max_lon))
        ens_pred = reflectivity_to_rainrate(cropped.ensemble.data)[:, idx_rank_times, :, :]
        obs = reflectivity_to_rainrate(cropped.observation.data)[idx_rank_times, :, :]

    ens = np.expand_dims(ens_pred, axis=2)
    obs = np.expand_dims(obs, axis=1)
    rank_hist_table = generate_rank_hist_by_lead_time(obs, ens, rank_times)

    return rank_hist_table

In [8]:
# Calculating rank hist table for each selected lead time
# One task per file is handed to a pool of 30 worker processes; `imap` yields
# the results in the order of `file_list` while tqdm displays the progress.
with Pool(30) as p:
    tables = list(tqdm(p.imap(worker_selected_lt, file_list), total=len(file_list)))

100%|██████████| 197/197 [01:20<00:00,  2.45it/s]


In [9]:
# Merge the per-file tables into one table per selected lead time
tab_by_lead_time = sum_tables_by_lead_times(tables)
tab_by_lead_time

{15: {'num_ens_members': 20,
  'n': array([1189113,  631264,  548704,  518232,  486220,  471580,  465389,
          466091,  448771,  441660,  446324,  435142,  436190,  438600,
          444806,  447071,  454738,  470445,  489970,  529710, 1022206]),
  'X_min': None},
 30: {'num_ens_members': 20,
  'n': array([995158, 584339, 512985, 467011, 447101, 432872, 417538, 405236,
         395498, 388060, 384284, 377463, 376414, 380401, 384075, 386733,
         380665, 386636, 422845, 462941, 963981]),
  'X_min': None},
 60: {'num_ens_members': 20,
  'n': array([619819, 412505, 378400, 350362, 333285, 318623, 313410, 303173,
         296523, 291785, 287278, 282063, 282426, 280732, 278110, 278831,
         280055, 287865, 303449, 335661, 645337]),
  'X_min': None},
 90: {'num_ens_members': 20,
  'n': array([340921, 256231, 239188, 224685, 214777, 208785, 202605, 196515,
         191937, 188170, 183059, 179595, 178512, 176332, 174808, 174430,
         176740, 180634, 195722, 213242, 350873]),
 

In [10]:
def worker_all(file_i: str):
    """
    Compute one pooled rank-histogram table for one forecast file.

    The dataset is cropped to the configured height/width window, the
    ``ensemble`` and ``observation`` variables are converted to rain rate and
    restricted to the lead times in ``idx_rank_times``. All of those lead
    times are accumulated together into a single table.

    Parameters
    ----------
    file_i : str
        Path of the NetCDF file to process.

    Returns
    -------
    dict
        The dict returned by ``generate_rank_hist`` (a single pooled table).
    """
    # Fixed seed so every worker call is reproducible
    # (presumably pysteps breaks rank ties randomly - TODO confirm).
    np.random.seed(seed=44)

    # The context manager closes the file as soon as the arrays are in memory,
    # so pool workers do not pile up open file handles.
    with xr.open_dataset(file_i) as nc_file:
        # Crop both spatial dimensions, like worker_selected_lt does, so that
        # min_lon/max_lon are honoured here as well.
        cropped = nc_file.sel(height=slice(min_lat, max_lat), width=slice(min_lon, max_lon))
        ens_pred = reflectivity_to_rainrate(cropped.ensemble.data)[:, idx_rank_times, :, :]
        obs = reflectivity_to_rainrate(cropped.observation.data)[idx_rank_times, :, :]

    rank_hist_table = generate_rank_hist(obs, ens_pred)

    return rank_hist_table

In [11]:
# Calculating one rank hist table pooled over the selected lead times
# (worker_all only uses the lead times in idx_rank_times, not every forecast step).
# One task per file is handed to a pool of 30 worker processes; `imap` yields
# the results in the order of `file_list` while tqdm displays the progress.
with Pool(30) as p:
    tables = list(tqdm(p.imap(worker_all, file_list), total=len(file_list)))

100%|██████████| 197/197 [01:19<00:00,  2.49it/s]


In [12]:
# Merge the per-file pooled tables into a single table
tab_all_lt = sum_tables_by_lead_times(tables)
tab_all_lt

{1000: {'num_ens_members': 20,
  'n': array([3291117, 2007760, 1796291, 1673148, 1589945, 1537455, 1501760,
         1473065, 1434869, 1408014, 1395053, 1364403, 1361569, 1362396,
         1366511, 1372855, 1379080, 1415072, 1509086, 1649318, 3144266]),
  'X_min': None}}

In [13]:
# Store the pooled table next to the per-lead-time ones, under the key 1000
# that generate_rank_hist uses for it.
tab_by_lead_time[1000] = tab_all_lt[1000]
tab_by_lead_time

{15: {'num_ens_members': 20,
  'n': array([1189113,  631264,  548704,  518232,  486220,  471580,  465389,
          466091,  448771,  441660,  446324,  435142,  436190,  438600,
          444806,  447071,  454738,  470445,  489970,  529710, 1022206]),
  'X_min': None},
 30: {'num_ens_members': 20,
  'n': array([995158, 584339, 512985, 467011, 447101, 432872, 417538, 405236,
         395498, 388060, 384284, 377463, 376414, 380401, 384075, 386733,
         380665, 386636, 422845, 462941, 963981]),
  'X_min': None},
 60: {'num_ens_members': 20,
  'n': array([619819, 412505, 378400, 350362, 333285, 318623, 313410, 303173,
         296523, 291785, 287278, 282063, 282426, 280732, 278110, 278831,
         280055, 287865, 303449, 335661, 645337]),
  'X_min': None},
 90: {'num_ens_members': 20,
  'n': array([340921, 256231, 239188, 224685, 214777, 208785, 202605, 196515,
         191937, 188170, 183059, 179595, 178512, 176332, 174808, 174430,
         176740, 180634, 195722, 213242, 350873]),
 

In [14]:
# Save score tables to file
os.makedirs(output_data_path, exist_ok=True)
# The context manager closes the file even if pickling raises.
with open(output_data_path + output_fn, "wb") as out_file:
    pickle.dump(tab_by_lead_time, out_file)