In [1]:
from haversine import haversine_vector
import pandas as pd
import numpy as np
import glob
import os
import sys
sys.path.append('../resources/library')
from tropical_cyclone.georeferencing import round_to_grid

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# select the model to analyze
selected_model = '01_vgg_v3_relu'
# define inference directory to draw detections
dataset_dir = '../data/inference'
# get ibtracs directory
ibtracs_src = '../data/ibtracs/filtered/ibtracs_main-tracks_6h_1980-2021_TS-NR-ET-MX-SS-DS.csv'
# define test years
test_years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
# kilometer threshold
max_distance = 300

In [3]:
# csv file containing all the data dates
dates_csv = '../data/inference/dates.csv'
# read dates csv file
dates_df = pd.read_csv(dates_csv)
# convert dates to pandas
dates_df['dates'] = pd.to_datetime(dates_df['dates'])
# get only dates in test years
dates_df = dates_df[dates_df['dates'].dt.year.isin(test_years)]
# get dates as array
dates = dates_df['dates'].to_numpy()

In [4]:
# get model directory
model_dir = os.path.join(dataset_dir, selected_model)
# get inference filenames
inference_files =sorted(glob.glob(os.path.join(model_dir,'*.csv')))

In [5]:
# load csv files
csv_files = []
for file in inference_files:
    csv_files.append(pd.read_csv(file, index_col=0))
# merge csv files together
detections = pd.concat(csv_files).reset_index(drop=True)
# convert iso time with pandas
detections['ISO_TIME'] = pd.to_datetime(detections['ISO_TIME'])
# add WS as np.inf
detections['WS'] = np.inf
detections

Unnamed: 0,ISO_TIME,LAT,LON,WS
0,2014-07-26 00:00:00,66.50,101.75,inf
1,2014-07-26 00:00:00,67.50,290.75,inf
2,2014-07-26 00:00:00,56.75,244.25,inf
3,2014-07-26 00:00:00,53.00,252.50,inf
4,2014-07-26 00:00:00,45.25,123.25,inf
...,...,...,...,...
3270,2015-09-18 18:00:00,15.25,211.50,inf
3271,2015-09-18 18:00:00,20.00,230.00,inf
3272,2015-09-18 18:00:00,18.00,245.75,inf
3273,2015-09-18 18:00:00,16.25,282.25,inf


In [6]:
columns = ['ISO_TIME','SID','NATURE','WMO_WIND','LAT','LON']
# load ibtracs
ibtracs = pd.read_csv(ibtracs_src, index_col=0)
# convert iso time with pandas
ibtracs['ISO_TIME'] = pd.to_datetime(ibtracs['ISO_TIME'])
# get only those detections that are within test years
ibtracs = ibtracs[ibtracs['ISO_TIME'].isin(dates)].reset_index(drop=True)
# get only some columns from ibtracs
ibtracs = ibtracs[columns]
# round lat and lon to be comparable with training data
ibtracs['LAT'] = round_to_grid(ibtracs['LAT'], grid_res=0.25)
ibtracs['LON'] = round_to_grid(ibtracs['LON'], grid_res=0.25)
ibtracs

  ibtracs = pd.read_csv(ibtracs_src, index_col=0)


Unnamed: 0,ISO_TIME,SID,NATURE,WMO_WIND,LAT,LON
0,2011-04-01 00:00:00,2011091N09112,NR,,9.00,111.75
1,2011-04-01 06:00:00,2011091N09112,NR,,8.75,111.75
2,2011-04-01 12:00:00,2011091N09112,NR,,8.75,111.50
3,2011-04-01 18:00:00,2011091N09112,TS,,8.75,111.75
4,2011-04-02 00:00:00,2011091N09112,TS,,9.00,111.50
...,...,...,...,...,...,...
20272,2020-12-23 18:00:00,2020353N06129,TS,,8.00,105.25
20273,2020-12-24 00:00:00,2020353N06129,TS,,7.75,104.00
20274,2020-12-24 06:00:00,2020353N06129,TS,,7.75,102.75
20275,2020-12-24 12:00:00,2020353N06129,TS,,8.25,101.75


# Localization

In [7]:
# merge together detections and ibtracs
matches = pd.merge(left=detections, right=ibtracs, on='ISO_TIME')
# compute haversine distance between any couple of points
matches['HDIST'] = haversine_vector(array1=matches[['LAT_x','LON_x']].to_numpy(), array2=matches[['LAT_y','LON_y']].to_numpy(), normalize=True)
matches

Unnamed: 0,ISO_TIME,LAT_x,LON_x,WS,SID,NATURE,WMO_WIND,LAT_y,LON_y,HDIST
0,2014-07-26 00:00:00,66.50,101.75,inf,2014197N10137,ET,,39.50,126.00,3363.378507
1,2014-07-26 00:00:00,66.50,101.75,inf,2014204N10239,TS,40,12.00,224.00,10118.807471
2,2014-07-26 00:00:00,66.50,101.75,inf,2014207N12255,DS,30,12.25,255.00,10988.663027
3,2014-07-26 00:00:00,67.50,290.75,inf,2014197N10137,ET,,39.50,126.00,8047.852134
4,2014-07-26 00:00:00,67.50,290.75,inf,2014204N10239,TS,40,12.00,224.00,7798.388355
...,...,...,...,...,...,...,...,...,...,...
11459,2015-09-18 18:00:00,16.25,282.25,inf,2015258N12320,TS,25,18.00,312.75,3243.363360
11460,2015-09-18 18:00:00,16.25,282.25,inf,2015260N12187,TS,30,17.25,184.00,10317.199783
11461,2015-09-18 18:00:00,19.25,311.75,inf,2015257N14152,TS,75,26.75,142.75,14764.343827
11462,2015-09-18 18:00:00,19.25,311.75,inf,2015258N12320,TS,25,18.00,312.75,174.418652


In [8]:
# make a copy of the matches dataframe
mts = matches.copy()
# remove all the distances above 300 km
mts = mts[mts['HDIST'] < max_distance]
# group by LATx and LONx and find the minimum (to remove x duplicates)
mts = mts.groupby(by=['ISO_TIME','LAT_x','LON_x','SID','NATURE','WMO_WIND']).min('HDIST').reset_index()
# repeat grouping by LATy and LONy and find the minimum (to remove y duplicates)
mts = mts.groupby(by=['ISO_TIME','LAT_y','LON_y','SID','NATURE','WMO_WIND']).min('HDIST')
# reset the index
mts = mts.reset_index()
# show result
mts

Unnamed: 0,ISO_TIME,LAT_y,LON_y,SID,NATURE,WMO_WIND,LAT_x,LON_x,WS,HDIST
0,2014-07-26 00:00:00,12.00,224.00,2014204N10239,TS,40,14.00,222.50,inf,275.438216
1,2014-07-26 00:00:00,12.25,255.00,2014207N12255,DS,30,13.50,254.50,inf,149.187017
2,2014-07-26 00:00:00,39.50,126.00,2014197N10137,ET,,39.00,124.75,inf,121.145163
3,2014-07-26 06:00:00,12.25,223.75,2014204N10239,TS,35,13.75,222.25,inf,232.873285
4,2014-07-26 06:00:00,13.25,254.50,2014207N12255,TS,30,14.50,253.00,inf,213.396211
...,...,...,...,...,...,...,...,...,...,...
480,2015-09-18 12:00:00,17.50,313.50,2015258N12320,TS,25,18.75,312.50,inf,174.603638
481,2015-09-18 12:00:00,25.50,142.25,2015257N14152,TS,80,25.00,142.50,inf,61.018333
482,2015-09-18 18:00:00,17.25,184.00,2015260N12187,TS,30,17.75,183.00,inf,119.738384
483,2015-09-18 18:00:00,18.00,312.75,2015258N12320,TS,25,19.25,311.75,inf,174.418652


In [13]:
# print(f"{ibtracs.value_counts('NATURE') - mts.value_counts('NATURE')}\n\nout of\n\n{ibtracs.value_counts('NATURE')}")

In [10]:
print(f"Model {selected_model} Localization results")
print(f"   Min distance ({np.round(mts['HDIST'].min(),2)} km)")
print(f"   Max distance ({np.round(mts['HDIST'].max(),2)} km)")
print(f"   Average distance ({np.round(mts['HDIST'].mean(),2)} km)")
print(f"   Median distance ({np.round(mts['HDIST'].median(),2)} km)")

Model 01_vgg_v3_relu Localization results
   Min distance (24.82 km)
   Max distance (297.97 km)
   Average distance (151.28 km)
   Median distance (147.5 km)


# Classification

In [11]:
n_dets = len(detections)
n_tp = len(mts)
n_obs = len(ibtracs)

In [12]:
print(f"Model {selected_model} Classification results")
print(f"   TP : {n_tp} out of {n_obs} observations")
print(f"   FP : {n_dets - n_tp} out of {n_dets} ML detections")
print(f"   FN : {n_obs - n_tp} out of {n_obs} observations")

Model 01_vgg_v3_relu Classification results
   TP : 485 out of 20277 observations
   FP : 2790 out of 3275 ML detections
   FN : 19792 out of 20277 observations
