In [None]:
import pandas as pd

def read_csv_to_dataframe(file_path):
  """Reads a CSV file into a pandas DataFrame.

  Args:
    file_path: The path to the CSV file.

  Returns:
    A pandas DataFrame containing the data from the CSV file,
    or None if an error occurs.
  """
  try:
    df = pd.read_csv(file_path)
    return df
  except FileNotFoundError:
    print(f"Error: File not found at '{file_path}'")
    return None
  except pd.errors.ParserError:
    print(f"Error: Could not parse the CSV file at '{file_path}'")
    return None
  except Exception as e:
    print(f"An unexpected error occurred: {e}")
    return None


import pandas as pd
import json

def read_json(file_path):
  """Reads a JSON file into a pandas DataFrame.

  Args:
    file_path: The path to the JSON file.

  Returns:
    A pandas DataFrame containing the data from the JSON file,
    or None if an error occurs.
  """
  try:
    with open(file_path, 'r') as file:
      data = json.load(file)

    return data

  except FileNotFoundError:
    print(f"Error: File not found at '{file_path}'")
    return None
  except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in file '{file_path}'")
    return None
  except Exception as e:
    print(f"An unexpected error occurred: {e}")
    return None

def getHurDatObsDF(hurdat):
    df = pd.DataFrame()
    for row in hurdat:
        storm_name = row['name'] + row['storm_id']
        for obs in row['observations']:
            newObs = obs.copy()
            newObs['storm_name'] = storm_name
            df = pd.concat([df, pd.DataFrame([newObs])], ignore_index=True)
    return df

In [None]:
hurdat = read_json('processed-hurdat2.json')
hurdat

[{'storm_id': '1950',
  'name': 'ABLE',
  'num_records': 51,
  'observations': [{'date': '1950-08-12',
    'time': '00:00 UTC',
    'latitude': 17.1,
    'longitude': -55.5,
    'wind_speed': 35,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '06:00 UTC',
    'latitude': 17.7,
    'longitude': -56.3,
    'wind_speed': 40,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '12:00 UTC',
    'latitude': 18.2,
    'longitude': -57.4,
    'wind_speed': 45,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '18:00 UTC',
    'latitude': 19.0,
    'longitude': -58.6,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '00:00 UTC',
    'latitude': 20.0,
    'longitude': -60.0,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '06:00 UTC',
    'latitude': 20.7,
    'longitude': -61.1,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '12:00 UTC',
    'latitude': 21.3

In [None]:
hurdat_df = getHurDatObsDF(hurdat)
hurdat_df

Unnamed: 0,date,time,latitude,longitude,wind_speed,pressure,storm_name
0,1950-08-12,00:00 UTC,17.1,-55.5,35,-999,ABLE1950
1,1950-08-12,06:00 UTC,17.7,-56.3,40,-999,ABLE1950
2,1950-08-12,12:00 UTC,18.2,-57.4,45,-999,ABLE1950
3,1950-08-12,18:00 UTC,19.0,-58.6,50,-999,ABLE1950
4,1950-08-13,00:00 UTC,20.0,-60.0,50,-999,ABLE1950
...,...,...,...,...,...,...,...
27859,2023-10-23,18:00 UTC,11.5,-83.2,25,1007,TWENTY-ONE2023
27860,2023-10-24,00:00 UTC,12.2,-83.4,25,1007,TWENTY-ONE2023
27861,2023-10-24,01:30 UTC,12.4,-83.5,25,1007,TWENTY-ONE2023
27862,2023-10-24,06:00 UTC,13.0,-83.8,25,1007,TWENTY-ONE2023


In [None]:
def searchForHurricane(lat, lng, dateTimeStr):
    date = dateTimeStr[0:10]
    time = dateTimeStr[11:16] + ' UTC'
    print(date, time)
    return hurdat_df[(hurdat_df['latitude'] == lat) & (hurdat_df['longitude'] == lng) & (hurdat_df['date'] == date) & (hurdat_df['time'] == time)]

In [None]:
era5AllLatLng = read_csv_to_dataframe('era5ForPossiblyNotAllLatLongOnHurDat2ObservationDates.csv')
era5AllLatLng

Unnamed: 0,latitude,longitude,valid_time,msl,sst,vertical_wind_shear,relative_humidity,vorticity
0,6.0,-99.0,2020-05-07 00:00:00,100857.0,302.0,6.353390,70.682290,0.000014
1,6.0,-99.0,2020-05-20 00:00:00,100948.0,302.0,4.723331,91.982090,0.000015
2,6.0,-99.0,2020-05-20 12:00:00,100979.0,302.0,3.085481,91.525840,0.000110
3,6.0,-99.0,2020-05-21 00:00:00,100929.0,302.0,6.755966,78.600490,0.000042
4,6.0,-99.0,2020-07-19 12:00:00,100904.0,301.0,8.187444,65.447105,0.000018
...,...,...,...,...,...,...,...,...
16241,41.0,-63.0,2020-08-17 12:00:00,100957.0,301.0,5.334395,92.450970,0.000020
16242,41.0,-63.0,2020-08-17 18:00:00,100620.0,301.0,7.991423,63.981960,0.000032
16243,41.0,-58.0,2020-08-31 06:00:00,100860.0,300.0,7.764542,82.190010,0.000041
16244,41.0,-54.0,2020-08-16 12:00:00,100931.0,300.0,9.093582,75.314610,0.000055


In [None]:
rowsMatchingObs = read_csv_to_dataframe('rowsMatching2020ObservationsDates.csv')
rowsMatchingObs = rowsMatchingObs.dropna() # some sst values are NaN?
rowsMatchingObs

Unnamed: 0,latitude,longitude,valid_time,msl,sst,vertical_wind_shear,relative_humidity,vorticity
0,28.0,-79.0,2020-05-16 18:00:00,100933.0,299.0,3.730620,65.402400,0.000176
1,28.0,-78.0,2020-05-16 18:00:00,101065.0,299.0,4.696961,73.052444,0.000144
2,28.0,-78.0,2020-05-17 00:00:00,100989.0,299.0,11.209113,71.477080,0.000150
3,29.0,-78.0,2020-05-17 00:00:00,100933.0,298.0,10.880550,71.320500,0.000178
4,29.0,-78.0,2020-05-17 06:00:00,100975.0,298.0,6.782000,67.788246,0.000146
...,...,...,...,...,...,...,...,...
3050,13.0,-84.0,2020-11-17 12:00:00,100017.0,301.0,19.659649,91.256490,0.000064
3052,14.0,-84.0,2020-11-17 12:00:00,99835.0,301.0,12.241644,97.742940,0.000494
3061,13.0,-88.0,2020-11-18 06:00:00,100740.0,301.0,15.612284,98.965570,0.000233
3065,13.0,-89.0,2020-11-18 12:00:00,100836.0,301.0,12.168504,97.515305,0.000385


The below would probably just be best put in a 2D data structure so you could get O(1) accesses. You could put all the data in that location too so there was no need to figure out what data corresponds to that lat/long

In [None]:
from scipy.spatial import KDTree
import pandas as pd
import numpy as np

latLngPairs = era5AllLatLng[['latitude', 'longitude']].to_numpy()
latLngTree = KDTree(latLngPairs)

dist, idx = latLngTree.query([[6,-13]], k=5)

for i in idx:
    print(era5AllLatLng.iloc[i])

      latitude  longitude           valid_time       msl    sst  \
601        6.0      -12.0  2020-10-17 18:00:00  100998.0  301.0   
602        6.0      -11.0  2020-10-17 18:00:00  100979.0  301.0   
2473       8.0      -13.0  2020-10-18 18:00:00  100990.0  301.0   
2474       8.0      -13.0  2020-10-26 18:00:00  100924.0  302.0   
2471       8.0      -14.0  2020-05-07 06:00:00  100996.0  302.0   

      vertical_wind_shear  relative_humidity  vorticity  
601              0.858433          62.783030   0.000011  
602              0.956752          62.557198   0.000015  
2473             3.237162          67.329230   0.000014  
2474             9.512054          63.194946   0.000018  
2471             3.574359          75.195110   0.000015  


In [None]:
allAttributes = era5AllLatLng[['latitude', 'longitude', 'msl', 'sst', 'vertical_wind_shear', 'relative_humidity', 'vorticity']].to_numpy()

allAttributesTree = KDTree(allAttributes)

print(str(allAttributes[0]) + '\n----------\n')
dist, idx = allAttributesTree.query(allAttributes[0], k=5)

for i in idx:
    print(era5AllLatLng.iloc[i])

[ 6.0000000e+00 -9.9000000e+01  1.0085700e+05  3.0200000e+02
  6.3533897e+00  7.0682290e+01  1.3573421e-05]
----------

latitude                               6.0
longitude                            -99.0
valid_time             2020-05-07 00:00:00
msl                               100857.0
sst                                  302.0
vertical_wind_shear                6.35339
relative_humidity                 70.68229
vorticity                         0.000014
Name: 0, dtype: object
latitude                               7.0
longitude                            -99.0
valid_time             2020-10-15 12:00:00
msl                               100858.0
sst                                  300.0
vertical_wind_shear               8.472389
relative_humidity                 67.43022
vorticity                         0.000023
Name: 617, dtype: object
latitude                               6.0
longitude                            -98.0
valid_time             2020-05-09 00:00:00
msl            

The exact lat longs are not present here since we rounded them off. But shouldn't there be more. There are only 3k rows?

In [None]:
notPresent = 0
for i in hurdat:
    for obs in i['observations']:
        lat = obs['latitude']
        lon = obs['longitude']
        date = obs['date']
        time = obs['time'][0:5]
        dateStr = f'{date} {time}:00'



        check = (rowsMatchingObs['latitude'] == lat) & (rowsMatchingObs['longitude'] == lon) & (rowsMatchingObs['valid_time'] == dateStr)
        if rowsMatchingObs[check].empty:
            notPresent += 1
        else:
            print(i)

print(notPresent)

num = 0
for i in hurdat:
    num += len(i['observations'])
print(num)

{'storm_id': '2020', 'name': 'CRISTOBAL', 'num_records': 45, 'observations': [{'date': '2020-06-01', 'time': '18:00 UTC', 'latitude': 19.4, 'longitude': -90.9, 'wind_speed': 25, 'pressure': 1006}, {'date': '2020-06-02', 'time': '00:00 UTC', 'latitude': 19.6, 'longitude': -91.6, 'wind_speed': 25, 'pressure': 1005}, {'date': '2020-06-02', 'time': '06:00 UTC', 'latitude': 19.6, 'longitude': -92.1, 'wind_speed': 30, 'pressure': 1005}, {'date': '2020-06-02', 'time': '12:00 UTC', 'latitude': 19.5, 'longitude': -92.5, 'wind_speed': 35, 'pressure': 1004}, {'date': '2020-06-02', 'time': '18:00 UTC', 'latitude': 19.2, 'longitude': -92.6, 'wind_speed': 40, 'pressure': 1001}, {'date': '2020-06-03', 'time': '00:00 UTC', 'latitude': 19.0, 'longitude': -92.5, 'wind_speed': 45, 'pressure': 996}, {'date': '2020-06-03', 'time': '06:00 UTC', 'latitude': 18.9, 'longitude': -92.3, 'wind_speed': 50, 'pressure': 994}, {'date': '2020-06-03', 'time': '12:00 UTC', 'latitude': 18.8, 'longitude': -92.2, 'wind_spe

Huge number of dates not present. OH BECAUSE there are only 2020!!!
When accounting for that, 0 missing.

In [None]:
notPresent = 0
for i in hurdat:
    for obs in i['observations']:
        if obs['date'][0:5] != '2020':
            continue
        date = obs['date']
        time = obs['time'][0:5]
        dateStr = f'{date} {time}:00'



        check = (rowsMatchingObs['valid_time'] == dateStr)
        if rowsMatchingObs[check].empty:
            notPresent += 1

print(notPresent)

num = 0
for i in hurdat:
    num += len(i['observations'])
print(num)

0
27864


In [None]:
notPresent = 0
for i in hurdat:
    for obs in i['observations']:
        if obs['date'][0:5] != '2020':
            continue
        date = obs['date']
        time = obs['time'][0:5]
        dateStr = f'{date} {time}:00'


        check = (era5AllLatLng['valid_time'] == dateStr)
        if era5AllLatLng[check].empty:
            notPresent += 1

print(notPresent)

num = 0
for i in hurdat:
    num += len(i['observations'])
print(num)

0
27864


In [None]:
def searchForRow(df, lat, lng, dateTimeStr):
    return df[(df['latitude'] == lat) & (df['longitude'] == lng) & (df['valid_time'] == dateTimeStr)]

In [None]:
'''
Look at the attributes of the hurricanes from hurdat2 without lat/long and see which ones are similar on meteorological data
'''
rowsMatchingObsTreeInput = rowsMatchingObs[['msl', 'sst', 'vertical_wind_shear', 'relative_humidity', 'vorticity']].to_numpy()

rowsMatchingObsNumberAttrTree = KDTree(rowsMatchingObsTreeInput)

print(rowsMatchingObs.iloc[0])
print('------------')
dist, idx = rowsMatchingObsNumberAttrTree.query(rowsMatchingObsTreeInput[0], k=5)

for ix, i in enumerate(idx):
    print(searchForRow(rowsMatchingObs, rowsMatchingObs.iloc[i]['latitude'], rowsMatchingObs.iloc[i]['longitude'], rowsMatchingObs.iloc[i]['valid_time']))
    # print(rowsMatchingObs.iloc[i])
    print(dist[ix])
    print('**********')

latitude                              28.0
longitude                            -79.0
valid_time             2020-05-16 18:00:00
msl                               100933.0
sst                                  299.0
vertical_wind_shear                3.73062
relative_humidity                  65.4024
vorticity                         0.000176
Name: 0, dtype: object
------------
   latitude  longitude           valid_time       msl    sst  \
0      28.0      -79.0  2020-05-16 18:00:00  100933.0  299.0   

   vertical_wind_shear  relative_humidity  vorticity  
0              3.73062            65.4024   0.000176  
0.0
**********
     latitude  longitude           valid_time       msl    sst  \
771      13.0      -46.0  2020-08-13 00:00:00  100934.0  301.0   

     vertical_wind_shear  relative_humidity  vorticity  
771             2.041944           64.81503   0.000228  
2.862975574359587
**********
     latitude  longitude           valid_time       msl    sst  \
399      34.0      -77.0

In [None]:
dateTimeStr = '2020-09-10 12:00:00'
date = dateTimeStr[0:10]
time = dateTimeStr[11:16] + ' UTC'
print(date, time)
hurdat_df[(hurdat_df['date'] == date) & (hurdat_df['time'] == time)]
# hurdat_df[(hurdat_df['latitude'] == lat) & (hurdat_df['longitude'] == lng) & (hurdat_df['date'] == date) & (hurdat_df['time'] == time)]

2020-09-10 12:00 UTC


Unnamed: 0,date,time,latitude,longitude,wind_speed,pressure,storm_name
25520,2020-09-10,12:00 UTC,21.3,-48.9,45,998,PAULETTE2020
25609,2020-09-10,12:00 UTC,18.4,-35.3,40,1002,RENE2020


In [None]:
dateTimeStr = '2020-09-10 12:00:00'
date = dateTimeStr[0:10]
time = dateTimeStr[11:16] + ' UTC'
print(date, time)
rowsMatchingObs[(rowsMatchingObs['valid_time'] == dateTimeStr)]

2020-09-10 12:00 UTC


Unnamed: 0,latitude,longitude,valid_time,msl,sst,vertical_wind_shear,relative_humidity,vorticity
1257,21.0,-49.0,2020-09-10 12:00:00,100471.0,301.0,3.689332,83.94174,0.000267
1258,21.0,-48.0,2020-09-10 12:00:00,100770.0,301.0,7.175591,76.68992,0.000127
1259,22.0,-49.0,2020-09-10 12:00:00,100474.0,301.0,19.063284,97.45919,0.000443
1260,22.0,-48.0,2020-09-10 12:00:00,100815.0,301.0,23.080862,94.81247,0.000139
1580,18.0,-36.0,2020-09-10 12:00:00,101110.0,300.0,14.793862,90.66482,0.000141
1581,18.0,-35.0,2020-09-10 12:00:00,101037.0,300.0,18.697905,83.29773,0.000262
1582,19.0,-36.0,2020-09-10 12:00:00,101026.0,300.0,12.426521,79.15945,0.000205
1583,19.0,-35.0,2020-09-10 12:00:00,100924.0,300.0,2.709577,64.62881,0.000303


Do search within certain radius of neighbors and see how they plot.

In [118]:
hurdat_df[hurdat_df['date'] == '2020-05-19']

Unnamed: 0,date,time,latitude,longitude,wind_speed,pressure,storm_name
25159,2020-05-19,00:00 UTC,36.2,-73.1,50,991,ARTHUR2020
25160,2020-05-19,06:00 UTC,36.8,-71.4,50,990,ARTHUR2020
25161,2020-05-19,12:00 UTC,37.0,-69.5,55,989,ARTHUR2020
25162,2020-05-19,18:00 UTC,36.9,-67.8,50,991,ARTHUR2020


In [121]:
stormNameCol = []
for idx, row in rowsMatchingObs.iterrows():
    date = row['valid_time'][0:10]
    time = row['valid_time'][11:16] + ' UTC'
    lat = row['latitude']
    lon = row['longitude']

    hurdat_df_row = hurdat_df[(hurdat_df['date'] == date) & (hurdat_df['time'] == time) & (hurdat_df['latitude'] < lat+1) & (hurdat_df['latitude'] > lat-1)& (hurdat_df['longitude'] < lon+1) & (hurdat_df['longitude'] > lon-1)]
    if not hurdat_df_row.empty:
        stormNameCol.append(hurdat_df_row['storm_name'].iloc[0])
    else:
        print('!!!')
        stormNameCol.append(None)

print(stormNameCol)


['ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'ARTHUR2020', 'BERTHA2020', 'BERTHA2020', 'BERTHA2020', 'BERTHA2020', 'BERTHA2020', 'BERTHA2020', 'BERTHA2020', 'BERTHA2020', 'BERTHA2020', 'CRISTOBAL2020', 'CRISTOBAL2020', 'CRISTOBAL2020', 'CRISTOBAL

In [126]:
stormNameColNP = np.array(stormNameCol)
rowsMatchingObs['storm_name'] = stormNameColNP
rowsMatchingObs.to_csv('rowsMatching2020ObservationsDatesWithNames.csv')

In [131]:
rowsMatchingObs[rowsMatchingObs['storm_name'] == 'ARTHUR2020']

Unnamed: 0,latitude,longitude,valid_time,msl,sst,vertical_wind_shear,relative_humidity,vorticity,storm_name
0,28.0,-79.0,2020-05-16 18:00:00,100933.0,299.0,3.73062,65.4024,0.000176,ARTHUR2020
1,28.0,-78.0,2020-05-16 18:00:00,101065.0,299.0,4.696961,73.052444,0.000144,ARTHUR2020
2,28.0,-78.0,2020-05-17 00:00:00,100989.0,299.0,11.209113,71.47708,0.00015,ARTHUR2020
3,29.0,-78.0,2020-05-17 00:00:00,100933.0,298.0,10.88055,71.3205,0.000178,ARTHUR2020
4,29.0,-78.0,2020-05-17 06:00:00,100975.0,298.0,6.782,67.788246,0.000146,ARTHUR2020
5,29.0,-77.0,2020-05-17 06:00:00,101053.0,298.0,9.961258,78.2556,0.00015,ARTHUR2020
6,30.0,-78.0,2020-05-17 06:00:00,100978.0,298.0,9.919909,70.19047,0.000164,ARTHUR2020
7,30.0,-77.0,2020-05-17 06:00:00,101065.0,297.0,3.56652,75.57652,0.000156,ARTHUR2020
8,30.0,-78.0,2020-05-17 12:00:00,100961.0,298.0,6.920937,77.149475,0.000184,ARTHUR2020
9,30.0,-77.0,2020-05-17 12:00:00,100967.0,297.0,8.013131,77.6427,0.000208,ARTHUR2020


In [127]:
def findSimilarHurricanesAccordingToWeatherData(points):
    rowsMatchingObsTreeInput = rowsMatchingObs[['msl', 'sst', 'vertical_wind_shear', 'relative_humidity', 'vorticity']].to_numpy()
    rowsMatchingObsNumberAttrTree = KDTree(rowsMatchingObsTreeInput)

    hurricaneFreq = {}

    for point in points:
        dist, idx = rowsMatchingObsNumberAttrTree.query(point, k=5)
        for ix, i in enumerate(idx):
            if rowsMatchingObs.iloc[i]['storm_name'] in hurricaneFreq:
                hurricaneFreq = hurricaneFreq.get([rowsMatchingObs.iloc[i]['storm_name']], 0) + 1
            # print(searchForRow(rowsMatchingObs, rowsMatchingObs.iloc[i]['latitude'], rowsMatchingObs.iloc[i]['longitude'], rowsMatchingObs.iloc[i]['valid_time']))
            # print(rowsMatchingObs.iloc[i])
            # print(dist[ix])
            # print('**********')

    return hurricaneFreq

def getColsForHurricane(df, hurricaneName, cols=['latitude', 'longitude']):
    return np.array((df[df['storm_name'] == hurricaneName])[cols])

def getColsForHurDatRowObs(hurricaneName, cols=['latitude', 'longitude']):
    df_match = rowsMatchingObs[rowsMatchingObs['storm_name'] == hurricaneName]
    print(df_match)
    return np.array(df_match[cols])

    # return getColsForHurricane(rowsMatchingObs, hurricaneName, cols)

In [128]:
katrinaLatLng = getColsForHurDatRowObs('KATRINA2005', cols=['msl', 'sst', 'vertical_wind_shear', 'relative_humidity', 'vorticity'])
findSimilarHurricanesAccordingToWeatherData(katrinaLatLng)

Empty DataFrame
Columns: [latitude, longitude, valid_time, msl, sst, vertical_wind_shear, relative_humidity, vorticity, storm_name]
Index: []


{}

In [None]:
'''
I want to compare the meteorological data for all the hurricanes in the hurdat2 dataset
for this i need the points in era5 for each hurricane
this needs to include the storm_name in each column





'''