In [3]:
import netCDF4 as nc
import numpy as np
import pandas as pd
from netCDF4 import num2date

file_path = "IBTrACS.WP.v04r01.nc"
ds = nc.Dataset(file_path)

In [87]:

iso_time_raw = ds.variables['iso_time'][:]  
iso_time_str = np.array([''.join(ch.decode('utf-8') for ch in row).strip() if not np.ma.is_masked(row) else '' for row in iso_time_raw.reshape(-1, iso_time_raw.shape[2])])

# print(iso_time_str[:2])  

iso_time_parsed = pd.to_datetime(iso_time_str, errors='coerce')

# print(iso_time_parsed[:2])  

gpm_start = pd.Timestamp("2000-06-01T00:00:00")
gpm_end = pd.Timestamp("2025-02-21T12:30:00")

valid_time_mask = (iso_time_parsed >= gpm_start) & (iso_time_parsed <= gpm_end)

sid_raw = ds.variables['sid'][:]  
sid_str = [''.join([ch.decode('utf-8') for ch in sid_raw[i, :]]).strip() for i in range(sid_raw.shape[0])]


In [None]:
time_var = ds.variables['time'][:]

time_units = ds.variables['time'].units
time_calendar = ds.variables['time'].calendar if 'calendar' in ds.variables['time'].ncattrs() else 'standard'

# to datetime
iso_time_dt = num2date(time_var, time_units, calendar=time_calendar)

# print(iso_time_dt[0:1])  

# GPM range
gpm_start = pd.Timestamp("2000-06-01T00:00:00")
gpm_end = pd.Timestamp("2025-02-21T12:30:00")

valid_time_mask = (iso_time_dt >= gpm_start) & (iso_time_dt <= gpm_end)

# strom id
sid_raw = ds.variables['sid'][:]  
# turn to string
sid_str = [''.join([ch.decode('utf-8') for ch in sid_raw[i, :]]).strip() for i in range(sid_raw.shape[0])]
# print(sid_str[0])  


[[cftime.DatetimeGregorian(1884, 6, 24, 16, 0, 0, 27, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 24, 18, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 24, 21, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 0, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 3, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 4, 0, 0, 27, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 6, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 9, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 12, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 15, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 16, 0, 0, 27, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 18, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(1884, 6, 25, 21, 0, 0, 40, has_year_zero=False)
  cftime.DatetimeGregorian(188

In [81]:
# manully select the variables from usa data （Their fill value is the same -9999.0）
# I use the merged latitudes and longitudes, strom speed and direction from IBTrACS to make GPM data align with data from all sources
# 'usa_gust'(max speed of gust) and 'usa_eye'(eye diameter) are always NaN, I am not sure if they are important, but finally threw them away
variables_to_extract = [
    'usa_lat', 'usa_lon', 'storm_dir', 'storm_speed', 'usa_wind', 'usa_pres', 'usa_poci', 'usa_roci', 
    'usa_rmw','usa_eye'
]

filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        if fill_value is not None:
            filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        else:
            filtered_values = np.where(filtered_values == -12345.0, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)
filtered_df.to_csv("filtered_storms.csv", index=False, na_rep="")

In [None]:
# iso_time for filtering, time-costing
# manully select the variables from usa data （Their fill value is the same -9999.0）
# I use the merged latitudes and longitudes, strom speed and direction from IBTrACS to make GPM data align with data from all sources
# 'usa_gust'(max speed of gust) and 'usa_eye'(eye diameter) are always NaN, I am not sure if they are important, but finally threw them away
variables_to_extract = [
    'usa_lat', 'usa_lon', 'storm_dir', 'storm_speed', 'usa_wind', 'usa_pres', 'usa_poci', 'usa_roci', 
    'usa_rmw'
]
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []
for storm_idx, sid in enumerate(sid_str):
    time_mask = (iso_time_parsed >= gpm_start) & (iso_time_parsed <= gpm_end)
    
    # time 
    valid_times = iso_time_parsed[time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, :][time_mask[:var_data.shape[1]]]

        if fill_value is not None:
            filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        else:
            filtered_values = np.where(filtered_values == -12345.0, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)
filtered_df.to_csv("filtered_storms.csv", index=False, na_rep="")




In [52]:
# Haversine formula
def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c  
    return km

shenzhen_lat = 22.5431
shenzhen_lon = 114.0579

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['usa_lon'], filtered_df['usa_lat'], 
                                                shenzhen_lon, shenzhen_lat)

filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]


filtered_df_shenzhen.to_csv("filtered_storms_shenzhen_500km.csv", index=False)

# Delete observations without full USA values
cols_to_check = ['usa_wind', 'usa_pres', 'usa_poci', 'usa_roci', 'usa_rmw']

filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv("filtered_storms_deleted.csv", index=False)

print(filtered_df_deleted.head())

                sid                        time  usa_lat     usa_lon  \
2370  2001204N19127  2001-07-24 00:00:00.000040     20.4  118.300003   
2371  2001204N19127  2001-07-24 03:00:00.000040     20.5  117.599998   
2372  2001204N19127  2001-07-24 06:00:00.000040     20.5  116.900002   
2373  2001204N19127  2001-07-24 09:00:00.000040     20.5  116.300003   
2374  2001204N19127  2001-07-24 12:00:00.000040     20.6  115.699997   

      storm_dir  storm_speed  usa_wind  usa_pres  usa_poci  usa_roci  usa_rmw  \
2370      280.0         14.0      55.0     984.0    1005.0      52.0     50.0   
2371      275.0         13.0      63.0     978.0    1005.0      79.0     45.0   
2372      275.0         12.0      70.0     972.0    1005.0     105.0     40.0   
2373      275.0         11.0      78.0     965.0    1005.0     120.0     38.0   
2374      275.0         11.0      85.0     958.0    1005.0     135.0     35.0   

      usa_eye  distance_to_shenzhen  
2370  -9999.0            499.432200  
2371

In [13]:
# Convert time to standard format for submitting to GEE
filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("filtered_storms_deleted_formatted.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_de

## JAPAN

In [45]:
# manully select the variables from usa data （Their fill value is the same -9999.0）
# I use the merged latitudes and longitudes, strom speed and direction from IBTrACS to make GPM data align with data from all sources
# 'usa_gust'(max speed of gust) and 'usa_eye'(eye diameter) are always NaN, I am not sure if they are important, but finally threw them away
variables_to_extract = ['lat','lon',
    'tokyo_lat', 'tokyo_lon','storm_dir', 'storm_speed', 'tokyo_grade', 'tokyo_wind', 'tokyo_pres',
    'tokyo_r50_dir', 'tokyo_r50_long', 'tokyo_r50_short', 'tokyo_r30_dir', 'tokyo_r30_long', 
    'tokyo_r30_short', 'tokyo_land'
]
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        if fill_value is not None:
            filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        else:
            filtered_values = np.where(filtered_values == -12345.0, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_JPN.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

# Delete observations without full USA values
cols_to_check = ['tokyo_wind', 'tokyo_pres']

# This standard: those with all JPN values missing are deleted, then number of observations: 2403
# filtered_df_deleted = filtered_df_gd[~(filtered_df_gd[cols_to_check] == -9999.0).all(axis=1)]

# Then new standard: those with at least one JPN values missing are deleted
# number of observations :503

filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]


In [46]:
# Convert time to standard format for submitting to GEE
filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_JPN_formatted.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_de

CMA(CN) has only lat, lon, wind and pressure

In [62]:


variables_to_extract = ['lat','lon',
'cma_lat', 'cma_lon','storm_dir', 'storm_speed', 'cma_wind', 'cma_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_cma.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['cma_wind', 'cma_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_cma_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_cma_formatted.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_de

In [58]:
variables_to_extract = ['lat','lon',
    'kma_lat', 'kma_lon','storm_dir', 'storm_speed', 'kma_wind', 'kma_pres', 'kma_r50_dir', 'kma_r50_long', 'kma_r50_short', 'kma_r30_dir', 'kma_r30_long', 'kma_r30_short'
]
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_KMA.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['kma_wind', 'kma_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]


In [57]:
filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_KMA_formatted.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_deleted['time'] = filtered_df_de

New Delhi

In [None]:


variables_to_extract = ['lat','lon',
    'newdelhi_lat', 'newdelhi_lon','storm_dir', 'storm_speed', 
    'newdelhi_wind', 'newdelhi_pres', 'newdelhi_dp', 'newdelhi_poci']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_Ndl.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = [ 
    'newdelhi_wind', 'newdelhi_pres', 'newdelhi_dp', 'newdelhi_poci']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_Ndl_formatted.csv", index=False)


LA Reunion

In [None]:

variables_to_extract = ['lat','lon',
'reunion_lat', 'reunion_lon','storm_dir', 'storm_speed', 
'reunion_pres', 'reunion_wind','reunion_rmw','reunion_gust']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_reunion.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = [ 'reunion_wind'
'reunion_pres', 'reunion_rmw','reunion_gust']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_reunion_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_reunion_formatted.csv", index=False)


In [65]:
variables_to_extract = ['lat','lon',
'bom_lat', 'bom_lon','storm_dir', 'storm_speed', 
'bom_wind', 'bom_pres','bom_rmw', 'bom_roci', 'bom_poci', 'bom_eye', 'bom_gust']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_bom.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = [ 
'bom_wind', 'bom_pres','bom_rmw', 'bom_roci', 'bom_poci', 'bom_eye', 'bom_gust']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_bom_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_bom_formatted.csv", index=False)


Nadi

In [69]:

variables_to_extract = ['lat','lon',
'nadi_lat', 'nadi_lon','storm_dir', 'storm_speed', 'nadi_wind', 'nadi_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_nadi.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['nadi_wind', 'nadi_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_nadi_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_nadi_formatted.csv", index=False)


Newzealand  Wellington

In [71]:


variables_to_extract = ['lat','lon',
'wellington_lat', 'wellington_lon', 'storm_dir', 'storm_speed', 'wellington_wind', 'wellington_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_nz.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['wellington_wind', 'wellington_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_nz_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_nz_formatted.csv", index=False)


In [75]:

variables_to_extract = ['lat','lon',
'ds824_lat', 'ds824_lon','storm_dir', 'storm_speed', 'ds824_wind', 'ds824_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_ds824.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['ds824_wind', 'ds824_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_ds824_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_ds824_formatted.csv", index=False)


In [None]:
# ['td9636_lat', 'td9636_lon', 'td9636_stage', 'td9636_wind', 'td9636_pres']

variables_to_extract = ['lat','lon',
'td9636_lat', 'td9636_lon','storm_dir', 'storm_speed', 'td9636_wind', 'td9636_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_td9636.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['td9636_wind', 'td9636_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_td9636_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_td9636_formatted.csv", index=False)


In [80]:
# ['td9635_lat', 'td9635_lon', 'td9635_stage', 'td9635_wind', 'td9635_pres']

variables_to_extract = ['lat','lon',
'td9635_lat', 'td9635_lon','storm_dir', 'storm_speed', 'td9635_wind', 'td9635_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_td9635.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['td9635_wind', 'td9635_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_td9635_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_td9635_formatted.csv", index=False)


In [83]:
# ['neumann_lat', 'neumann_lon', 'neumann_class', 'neumann_wind', 'neumann_pres']

variables_to_extract = ['lat','lon',
'neumann_lat', 'neumann_lon','storm_dir', 'storm_speed', 'neumann_wind', 'neumann_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_neumann.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['neumann_wind', 'neumann_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_neumann_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_neumann_formatted.csv", index=False)


In [85]:
# 'source_mlc', 'mlc_lat', 'mlc_lon', 'mlc_class', 'mlc_wind', 'mlc_pres'


variables_to_extract = ['lat','lon',
'mlc_lat', 'mlc_lon', 'storm_dir', 'storm_speed', 'mlc_wind', 'mlc_pres']
filtered_data = {'sid': [], 'time': []}

for var_name in variables_to_extract:
    filtered_data[var_name] = []

for storm_idx, sid in enumerate(sid_str):
    time_mask = valid_time_mask[storm_idx, :]  

# time 
    valid_times = iso_time_dt[storm_idx, time_mask]
    filtered_data['sid'].extend([sid] * len(valid_times))
    filtered_data['time'].extend(valid_times)

    # add parameters' data      
    for var_name in variables_to_extract:
        var_data = ds.variables[var_name][:]
        fill_value = ds.variables[var_name].getncattr('_FillValue') if '_FillValue' in ds.variables[var_name].ncattrs() else None

        filtered_values = var_data[storm_idx, time_mask]

        
        filtered_values = np.where(filtered_values == fill_value, np.nan, filtered_values)
        filtered_values = np.array(filtered_values, dtype=float)
        filtered_values = filtered_values.astype(float)                    
        filtered_values = np.round(filtered_values, 6)
        filtered_data[var_name].extend(filtered_values)

filtered_df = pd.DataFrame(filtered_data)

filtered_df.to_csv("storms_mlc.csv", index=False, na_rep="")

filtered_df['distance_to_shenzhen'] = haversine(filtered_df['lon'], filtered_df['lat'], 
                                                shenzhen_lon, shenzhen_lat)


filtered_df_shenzhen = filtered_df[filtered_df['distance_to_shenzhen'] <= 500]

cols_to_check = ['mlc_wind', 'mlc_pres']


filtered_df_deleted = filtered_df_shenzhen[~(filtered_df_shenzhen[cols_to_check] == -9999.0).any(axis=1)]
filtered_df_deleted.to_csv('storms_mlc_sz.csv', index=False)

filtered_df_deleted['time'] = filtered_df_deleted['time'].astype(str)
filtered_df_deleted['time'] = pd.to_datetime(filtered_df_deleted['time'], errors='coerce')
# yyyy-MM-dd HH:mm:ss
filtered_df_deleted['time'] = filtered_df_deleted['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
filtered_df_deleted.to_csv("storms_mlc_formatted.csv", index=False)
