In [None]:
from haversine import haversine_vector
import pandas as pd
import numpy as np
import glob
import os
import sys
sys.path.append('../resources/library')
from tropical_cyclone.georeferencing import round_to_grid
from tropical_cyclone.cyclone import init_track_dataframe, tracking_algorithm, track_matching
from tropical_cyclone.visualize import plot_tracks

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Model whose inference output is analyzed; uncomment exactly one line to switch.
# selected_model = '02_swin_msl_vo_850'
# selected_model = '03_vgg_v3_relu_ks3_msl_vo_850'
# selected_model = '04_vgg_v3_relu_ks5_msl_vo_850'
# selected_model = '05_vgg_v3_linear_ks3_msl_vo_850'
# selected_model = '06_swin_msl_vo_850'
selected_model = '07_vgg_v3_silu_ks3_msl_vo_850'

# root folder of the inference output (one sub-folder per model)
dataset_dir = '../data/inference'
# filtered IBTrACS CSV used as ground truth
ibtracs_src = '../data/ibtracs/filtered/ibtracs_main-tracks_6h_1980-2021_TS-NR-ET-MX-SS-DS.csv'
# evaluation years (same as paper): 1980 through 2019 inclusive
test_years = list(range(1980, 2020))
# test_years = [1983, 1984, 1993, 1994, 2003, 2004, 2013, 2014]
# test_years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
# test_years = [1993]
# detection/observation pairs at or beyond this distance (km) are not considered a match
max_distance_detection = 1000.0

In [None]:
# folder holding the selected model's per-year detection files
model_dir = os.path.join(dataset_dir, selected_model)
# one '<year>.csv' path per evaluation year, in the order of `test_years`
inference_files = []
for year in test_years:
    inference_files.append(os.path.join(model_dir, f'{year}.csv'))
model_dir, inference_files

In [None]:
# read every yearly detection file (first CSV column is the index)
csv_files = [pd.read_csv(path, index_col=0) for path in inference_files]
# stack the yearly frames, renumber the rows, parse the timestamps and
# attach a WS column filled with +inf
detections = (
    pd.concat(csv_files)
    .reset_index(drop=True)
    .assign(
        ISO_TIME=lambda frame: pd.to_datetime(frame['ISO_TIME']),
        WS=np.inf,
    )
)
detections

In [None]:
# IBTrACS columns needed by the analysis
columns = ['ISO_TIME','SID','NATURE','WMO_WIND','LAT','LON']
# load IBTrACS, parse the timestamps, keep the selected columns and snap the
# coordinates to the 0.25 grid so they are comparable with the training data
observations = (
    pd.read_csv(ibtracs_src, index_col=0)
    .assign(ISO_TIME=lambda frame: pd.to_datetime(frame['ISO_TIME']))
    .loc[:, columns]
    .assign(
        LAT=lambda frame: round_to_grid(frame['LAT'], grid_res=0.25),
        LON=lambda frame: round_to_grid(frame['LON'], grid_res=0.25),
    )
)
observations

In [6]:
# inner join on the timestamp: only times present in BOTH frames survive
tmp = detections.merge(observations, on='ISO_TIME', how='inner')
# keep the joined rows that fall in the evaluation years
in_test_years = tmp['ISO_TIME'].dt.year.isin(test_years)
tmp = tmp[in_test_years]
# timestamps (with repetitions) later used to filter both frames via `isin`
dates = tmp['ISO_TIME'].to_numpy()

In [None]:
# dataset sizes before restricting both frames to their common timestamps
print(f'There are {len(observations)} observations and {len(detections)} detections')

In [8]:
# flag the rows whose timestamp appears in `dates` (times shared by both frames)
detections_in_common = detections['ISO_TIME'].isin(dates)
observations_in_common = observations['ISO_TIME'].isin(dates)
# NOTE: both frames are overwritten here, so the unfiltered data is only
# recoverable by re-running the loading cells
detections = detections.loc[detections_in_common].reset_index(drop=True)
observations = observations.loc[observations_in_common].reset_index(drop=True)

# select only TCs belonging to a certain nature
# observations = observations[observations['NATURE'].isin(['TS','SS','ET'])]

In [None]:
# dataset sizes after keeping only the timestamps present in both frames
print(f'There are {len(observations)} observations and {len(detections)} detections')

# Localization

In [None]:
# pair every detection with every observation sharing the same timestamp;
# after the merge `*_x` columns come from detections and `*_y` from IBTrACS
matches = detections.merge(observations, on='ISO_TIME')
# coordinates of both sides as (lat, lon) arrays, row-aligned with `matches`
detected_coords = matches[['LAT_x','LON_x']].to_numpy()
observed_coords = matches[['LAT_y','LON_y']].to_numpy()
# haversine distance of each pair (treated as km by the threshold applied later)
matches['HDIST'] = haversine_vector(array1=detected_coords, array2=observed_coords, normalize=True)
matches.head()

In [None]:
# Keep only the detection/observation pairs closer than `max_distance_detection` km.
matches = matches[matches['HDIST'] < max_distance_detection]

# De-duplicate by selecting, inside each group, the WHOLE row with the smallest
# HDIST. The previous `.groupby(...).min('HDIST')` passed 'HDIST' positionally as
# `numeric_only`, i.e. it minimised every numeric column independently, so the
# surviving LAT_x/LON_x/WS values could come from different detections than the
# one that produced the reported distance. Groups, row count, row order and
# HDIST values are unchanged by this rewrite; all original columns are now kept.
#
# NOTE(review): groupby drops groups whose key contains NaN (default
# dropna=True), so pairs with a missing WMO_WIND are discarded here exactly as
# before — confirm that this is intended.
# NOTE(review): because SID is part of the detection-side key, one detection
# can stay matched to several storms at the same time step — confirm.

# one row per (time, detection position, storm)
detection_keys = ['ISO_TIME','LAT_x','LON_x','SID','NATURE','WMO_WIND']
closest_per_detection = matches.groupby(by=detection_keys)['HDIST'].idxmin()
matches = matches.loc[closest_per_detection]

# one row per (time, observation position, storm): the closest detection wins
observation_keys = ['ISO_TIME','LAT_y','LON_y','SID','NATURE','WMO_WIND']
closest_per_observation = matches.groupby(by=observation_keys)['HDIST'].idxmin()
matches = matches.loc[closest_per_observation].reset_index(drop=True)

# show result
matches

In [None]:
# distance between each matched detection and its observation
match_distances = matches['HDIST']
(
    min_distance_localization,
    max_distance_localization,
    mean_distance_localization,
    median_distance_localization,
) = (
    match_distances.min(),
    match_distances.max(),
    match_distances.mean(),
    match_distances.median(),
)

# localization report, one line per statistic
localization_report = (
    f"Model {selected_model} Localization results",
    f"   Min distance ({np.round(min_distance_localization,2)} km)",
    f"   Max distance ({np.round(max_distance_localization,2)} km)",
    f"   Average distance ({np.round(mean_distance_localization,2)} km)",
    f"   Median distance ({np.round(median_distance_localization,2)} km)",
)
for report_line in localization_report:
    print(report_line)

In [13]:
# plot_detections(detections, observations)

# Classification

In [14]:
def F_beta(beta, precision, recall):
    """Return the F-beta score of a precision/recall pair.

    F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall)

    Args:
        beta: weight of recall relative to precision (beta > 1 favours recall).
        precision: scalar precision.
        recall: scalar recall.

    Returns:
        The F-beta score, or 0.0 when the denominator is zero (for example
        when precision and recall are both 0), instead of raising
        ZeroDivisionError.
    """
    beta_squared = beta**2
    denominator = (beta_squared * precision) + recall
    if denominator == 0:
        # no true positives at all: the score is defined as 0
        return 0.0
    return (1 + beta_squared) * ((precision * recall) / denominator)

In [15]:
# totals on each side, and the matched pairs counted as true positives
n_dets, n_obs = len(detections), len(observations)
n_tp = len(matches)
# detections without a match are false positives,
# observations without a match are false negatives
n_fp, n_fn = n_dets - n_tp, n_obs - n_tp

In [16]:
# share of detections that are true positives
precision = n_tp / (n_tp + n_fp)
# share of observations that were detected
recall = n_tp / (n_tp + n_fn)
# recall-weighted F score, expressed as a percentage
f2_score = 100 * F_beta(2, precision, recall)

In [None]:
# classification report, one line per metric
classification_report = (
    f"Model {selected_model} Classification results",
    f"   F2 : {np.round(f2_score,2)} % (precision={np.round(precision, 2)}, recall={np.round(recall,2)})",
    f"   TP : {n_tp} out of {n_obs} observations ({np.round(n_tp / n_obs * 100)} %)",
    f"   FP : {n_fp} out of {n_dets} ML detections ({np.round(n_fp / n_dets * 100)} %)",
    f"   FN : {n_fn} out of {n_obs} observations ({np.round(n_fn / n_obs * 100)} %)",
)
for report_line in classification_report:
    print(report_line)

# Tracking

In [18]:
# minimum number of points a track must have; used to filter the observed
# tracks and passed to `tracking_algorithm`
# NOTE(review): the original comment said "1 day", but the IBTrACS file name
# indicates 6-hourly data, for which 12 points span more than one day — confirm
min_track_count = 12
# maximum distance (in km) between each consecutive tc; passed to `tracking_algorithm`
max_distance_tracking = 400.0
# minimum speed of wind in order to consider the track true
# NOTE(review): in the visible cells this value is only written to the results table
min_wind_speed = 17.0
# maximum distance between matches between tracks
# NOTE(review): only written to the results table in the visible cells; the
# matching itself uses `max_track_distance_matching`
max_track_distance_tracking = 300.0

# NOTE(review): `grid_res` and `km_to_deg` are not referenced by the visible cells
grid_res = 0.25
km_to_deg = 110.474

# whether or not to plot the tracks
plot = False

In [None]:
# the IBTrACS storm id plays the role of the track id
observed_tracks = observations.rename(columns={'SID':'TRACK_ID'})
# number of points of each track, broadcast back onto the rows of that track
track_lengths = observed_tracks['TRACK_ID'].value_counts()
points_in_own_track = observed_tracks['TRACK_ID'].map(track_lengths)
# ids of the tracks that are long enough for the comparison
valid_observations_sids = observed_tracks.loc[points_in_own_track >= min_track_count, 'TRACK_ID'].unique()
# keep only the rows belonging to those tracks
long_enough = observed_tracks['TRACK_ID'].isin(valid_observations_sids)
observed_tracks = observed_tracks[long_enough].reset_index(drop=True)
observed_tracks.head()

In [None]:
# Apply the tracking scheme, caching its output next to the model's inference
# files. The cache path is derived from `model_dir` instead of a hard-coded
# absolute path, so the notebook runs on any checkout.
# NOTE(review): the cache is keyed by model only; after changing `test_years`
# or the tracking thresholds, delete tracking.csv to avoid reusing stale tracks.
tracking_src = os.path.join(model_dir, 'tracking.csv')
if not os.path.exists(tracking_src):
    # no cache yet: build the tracks from the detections and store them to disk
    detected_tracks = init_track_dataframe(detections)
    detected_tracks = tracking_algorithm(detected_tracks, max_distance_tracking, min_track_count)
    detected_tracks.to_csv(tracking_src)
else:
    detected_tracks = pd.read_csv(tracking_src, index_col=0)
    # the CSV round trip turns timestamps into strings; restore the datetime
    # dtype so that a cached run behaves like a fresh one in the time-based
    # comparisons further down
    detected_tracks['ISO_TIME'] = pd.to_datetime(detected_tracks['ISO_TIME'])
detected_tracks.head()

In [21]:
# paper_detected_tracks = init_track_dataframe(detections)
# paper_detected_tracks = paper_tracking_algorithm(paper_detected_tracks, max_distance, min_track_count)
# paper_detected_tracks.head()

In [None]:
print(f'There are:')
print(f'   - {len(detected_tracks["TRACK_ID"].unique())} detected tracks')
# print(f'   - {len(paper_detected_tracks["TRACK_ID"].unique())} detected tracks (paper)')
print(f'   - {len(observed_tracks["TRACK_ID"].unique())} observed tracks')

In [33]:
# draw detected and observed tracks only when plotting is enabled in the config
if plot:
    plot_tracks(detected_tracks, observed_tracks)
# plot_tracks(detected_tracks[pd.to_datetime(detected_tracks['ISO_TIME']).dt.year.isin([2005])], observed_tracks[pd.to_datetime(observed_tracks['ISO_TIME']).dt.year.isin([2005])])

# Track Matching

In [25]:
# distance threshold passed to `track_matching` and, as `max_dist`, to dynamicopy's `match_tracks`
# NOTE(review): presumably in km like the other thresholds — confirm; it has
# the same value as `max_track_distance_tracking`
max_track_distance_matching = 300.0

In [None]:
# pair detected tracks with observed tracks using `max_track_distance_matching`
# NOTE(review): the result is expected to expose 'DET_TRACK_ID' and
# 'OBS_TRACK_ID' columns with '' marking an unmatched side (that is how the
# scores are derived from it) — confirm against `track_matching`
track_matches = track_matching(detected_tracks, observed_tracks, max_track_distance_matching)
track_matches

In [None]:
# an empty string marks the side of a pair that has no track
detected_present = track_matches['DET_TRACK_ID'] != ''
detected_absent = track_matches['DET_TRACK_ID'] == ''
observed_present = track_matches['OBS_TRACK_ID'] != ''
observed_absent = track_matches['OBS_TRACK_ID'] == ''

# H = hits = true positives: both a detected and an observed track
H = len(track_matches[detected_present & observed_present])
# M = misses = false negatives: observed track without a detected one
M = len(track_matches[detected_absent & observed_present])
# FA = false alarms = false positives: detected track without an observed one
FA = len(track_matches[detected_present & observed_absent])

# probability of detection and false alarm ratio
POD = (H / (H + M))
FAR = (FA / (H + FA))

for report_line in (
    f"Hits : {H}",
    f"Miss : {M}",
    f"False Alarm : {FA}",
    f"POD : {POD}",
    f"FAR : {FAR}",
):
    print(report_line)

# Save to file

In [None]:
# column layout of the results table (one row per analysis run)
columns = [
    'model',
    'max_distance_detection',
    'n_dets',
    'n_tp',
    'n_obs',
    'n_fp',
    'n_fn',
    'precision',
    'recall',
    'f2_score',
    'min_distance_localization',
    'max_distance_localization',
    'mean_distance_localization',
    'median_distance_localization',
    'min_track_count',
    'max_distance_tracking',
    'min_wind_speed',
    'max_track_distance_matching',
    'max_track_distance_tracking',
    'H',
    'M',
    'FA',
    'POD',
    'FAR',
    'ibtracs_src',
    'test_years',
]
# The results table lives in the inference data directory. The path is built
# from `dataset_dir` instead of a hard-coded absolute path, so the notebook
# runs on any checkout.
dst = os.path.join(dataset_dir, 'results_analysis.csv')
if os.path.exists(dst):
    # continue the existing table (first CSV column is the row index)
    results = pd.read_csv(dst, index_col=0)
else:
    # first run: start from an empty table with the expected columns
    results = pd.DataFrame(columns=columns)
results

In [None]:
# values describing this run, keyed by results-table column
# NOTE: re-running this cell appends the same run again
run_summary = {
    'model': selected_model,
    'max_distance_detection': max_distance_detection,
    'n_dets': n_dets,
    'n_tp': n_tp,
    'n_obs': n_obs,
    'n_fp': n_fp,
    'n_fn': n_fn,
    'precision': precision,
    'recall': recall,
    'f2_score': f2_score,
    'min_distance_localization': min_distance_localization,
    'max_distance_localization': max_distance_localization,
    'mean_distance_localization': mean_distance_localization,
    'median_distance_localization': median_distance_localization,
    'min_track_count': min_track_count,
    'max_distance_tracking': max_distance_tracking,
    'min_wind_speed': min_wind_speed,
    'max_track_distance_matching': max_track_distance_matching,
    'max_track_distance_tracking': max_track_distance_tracking,
    'H': H,
    'M': M,
    'FA': FA,
    'POD': POD,
    'FAR': FAR,
    'ibtracs_src': ibtracs_src,
    'test_years': test_years,
}
# wrap each value in a one-element list so the frame has exactly one row
# (the list of years then occupies a single cell)
run_row = pd.DataFrame(data={column: [value] for column, value in run_summary.items()})
results = pd.concat([results, run_row])
results

In [30]:
# renumber the rows 0..n-1 (the concatenation keeps the index labels of its inputs)
results = results.reset_index(drop=True)
# overwrite the results table on disk
results.to_csv(dst)

In [None]:
# Make the bundled dynamicopy package importable. The path is relative, like
# the library path appended in the import cell, instead of a hard-coded
# absolute path.
sys.path.append('../resources/library/dynamicopy-0.6.1')
import dynamicopy

# mapping from this notebook's column names to the lowercase names used for the dynamicopy frames
columnsd = {'ISO_TIME': 'time','LAT':'lat','LON':'lon','TRACK_ID':'track_id','WS':'ws'}
columns = list(columnsd.values())

# IBTrACS as loaded by dynamicopy, restricted to the three basins of interest
bobs = dynamicopy.load_ibtracs()
bobs = bobs[bobs['basin'].isin(['WNP','ENP','NATL'])].reset_index(drop=True)

dets = detected_tracks.rename(columns={'ISO_TIME':'time','LAT':'lat','LON':'lon','TRACK_ID':'track_id'})
obss = observed_tracks.rename(columns={'ISO_TIME':'time','LAT':'lat','LON':'lon','TRACK_ID':'track_id'})

# wrap every longitude into [-180, 180) so the three frames share one convention
dets['lon'] = (dets['lon'] + 540) % 360 - 180
obss['lon'] = (obss['lon'] + 540) % 360 - 180
bobs['lon'] = (bobs['lon'] + 540) % 360 - 180
# Spatial domain: longitudes 100..320 (0-360 convention), latitudes 0..70.
# The bounds must be tested on 0-360 longitudes: after the wrap above every
# value is below 180, so comparing the wrapped longitudes with 100..320 kept
# only [100, 180) and silently discarded everything east of the antimeridian
# (the ENP and NATL basins selected above).
bobs_lon_360 = bobs['lon'] % 360
bobs = bobs[(bobs_lon_360>=100) & (bobs_lon_360<=320) & (bobs['lat']>=0) & (bobs['lat']<=70)]
# keep only the time steps shared by detections and observations
bobs = bobs[bobs['time'].isin(dates)]

In [None]:
# match our detected tracks against the IBTrACS tracks loaded through dynamicopy
match_bourdin = dynamicopy.match_tracks(dets, bobs, "ours", 'bourdin', max_dist=max_track_distance_matching, min_overlap=0, ref=True)

# distinct reference tracks that got a match, and distinct tracks on each side
matched_reference_ids = match_bourdin['id_bourdin'].unique()
n_match = len(matched_reference_ids)
n_observations = len(bobs['track_id'].unique())
n_detections = len(dets['track_id'].unique())

# NOTE: POD, FAR, H, M and FA overwrite the values computed from `track_matching`
POD = n_match / n_observations
FAR = 1 - (n_match / n_detections)
H = n_match
M = (n_observations-n_match)
FA = n_detections - n_match

for report_line in (
    f"Hits : {H}",
    f"Misses : {M}",
    f"False Alarms : {FA}",
    f"POD : {POD}",
    f"FAR : {FAR}",
):
    print(report_line)

In [None]:
# match our detected tracks against the observed (filtered IBTrACS) tracks
match_our_ibtracs = dynamicopy.match_tracks(dets, obss, "ours", 'ibtracs', max_dist=max_track_distance_matching, min_overlap=0, ref=True)

# distinct observed tracks that got a match, and distinct tracks on each side
matched_observed_ids = match_our_ibtracs['id_ibtracs'].unique()
n_match = len(matched_observed_ids)
n_observations = len(obss['track_id'].unique())
n_detections = len(dets['track_id'].unique())

# NOTE: POD, FAR, H, M and FA overwrite the values computed by earlier cells
POD = n_match / n_observations
FAR = 1 - (n_match / n_detections)
H = n_match
M = (n_observations-n_match)
FA = n_detections - n_match

# POD, FAR, H, M, FA
for report_line in (
    f"Hits : {H}",
    f"Misses : {M}",
    f"False Alarms : {FA}",
    f"POD : {POD}",
    f"FAR : {FAR}",
):
    print(report_line)

In [None]:
# --- dynamicopy IBTrACS tracks matched against our observed tracks ---
match_obs_ibtracs = dynamicopy.match_tracks(bobs, obss, "bourdin", 'ibtracs', max_dist=max_track_distance_matching, min_overlap=0, ref=True)

# a match is counted once per pair: take the smaller of the two distinct-id counts
matched_bourdin_ids = match_obs_ibtracs['id_bourdin'].unique()
matched_ibtracs_ids = match_obs_ibtracs['id_ibtracs'].unique()
n_match = min(len(matched_bourdin_ids), len(matched_ibtracs_ids))
n_observations = len(obss['track_id'].unique())
n_detections = len(bobs['track_id'].unique())

# NOTE: POD, FAR, H, M and FA overwrite the values computed by earlier cells
POD = n_match / n_observations
FAR = 1 - (n_match / n_detections)
H = n_match
M = (n_observations-n_match)
FA = n_detections - n_match

# POD, FAR, H, M, FA
for report_line in (
    f"Hits : {H}",
    f"Misses : {M}",
    f"False Alarms : {FA}",
    f"POD : {POD}",
    f"FAR : {FAR}",
):
    print(report_line)

# --- same comparison with the two data sets passed in the opposite order ---
match_obs_ibtracs = dynamicopy.match_tracks(obss, bobs, "ibtracs", 'bourdin', max_dist=max_track_distance_matching, min_overlap=0, ref=True)

matched_bourdin_ids = match_obs_ibtracs['id_bourdin'].unique()
matched_ibtracs_ids = match_obs_ibtracs['id_ibtracs'].unique()
n_match = min(len(matched_bourdin_ids), len(matched_ibtracs_ids))
n_observations = len(bobs['track_id'].unique())
n_detections = len(obss['track_id'].unique())

POD = n_match / n_observations
FAR = 1 - (n_match / n_detections)
H = n_match
M = (n_observations-n_match)
FA = n_detections - n_match

# POD, FAR, H, M, FA
for report_line in (
    f"\nHits : {H}",
    f"Misses : {M}",
    f"False Alarms : {FA}",
    f"POD : {POD}",
    f"FAR : {FAR}",
):
    print(report_line)

# Paper Results

In the paper, with the ML ensemble we have the following results:

- F2-score : 53 %
- Euclidean distance : 117.06 km
- Hit rate : 88.91 %
- POD : 71.49 %
- FAR : 23.00 %