In [1]:
!pip install cdsapi

Collecting cdsapi
  Downloading cdsapi-0.7.5-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting datapi (from cdsapi)
  Downloading datapi-0.1.1-py3-none-any.whl.metadata (17 kB)
Collecting multiurl>=0.3.2 (from datapi->cdsapi)
  Downloading multiurl-0.3.3.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading cdsapi-0.7.5-py2.py3-none-any.whl (12 kB)
Downloading datapi-0.1.1-py3-none-any.whl (26 kB)
Building wheels for collected packages: multiurl
  Building wheel for multiurl (setup.py) ... [?25l[?25hdone
  Created wheel for multiurl: filename=multiurl-0.3.3-py3-none-any.whl size=21230 sha256=d616fe8b78f5500fcde10cc959cf7cdfc6680eba75c2ce9cdf6a3dcfb9c7bc88
  Stored in directory: /root/.cache/pip/wheels/be/05/e0/65a6edb0a000498aeaefbadd80228bf5ed1bdbb82840ca1692
Successfully built multiurl
Installing collected packages: multiurl, datapi, cdsapi
Successfully installed cdsapi-0.7.5 datapi-0.1.1 multiurl-0.3.3


In [3]:
# To redownload processed datasets

import requests
import os

def downloadProcessedDataset(url, file_path):
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an exception for bad status codes

    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an exception for bad status codes

    with open(file_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192): #8KB chunks
            f.write(chunk)

    print(f"File downloaded to: {file_path}")

downloadProcessedDataset('https://object-store.os-api.cci2.ecmwf.int/cci2-prod-cache/fa0a2809d2170d85f22464487b4c71a.zip', '/content/pressure.zip')
downloadProcessedDataset('https://object-store.os-api.cci2.ecmwf.int/cci2-prod-cache/2da99adab6184270e2e8389f0b3484ea.zip', '/content/msl_sst.zip')

File downloaded to: /content/pressure.zip
File downloaded to: /content/msl_sst.zip


In [4]:
!unzip pressure.zip

Archive:  pressure.zip
  inflating: data_stream-oper_stepType-instant.nc  


In [5]:
!unzip msl_sst.zip

Archive:  msl_sst.zip
replace data_stream-oper_stepType-instant.nc? [y]es, [n]o, [A]ll, [N]one, [r]ename: r
new name: msl_sst.zip
replace msl_sst.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: r
new name: msl_sst.nc
  inflating: msl_sst.nc              


In [1]:
import xarray as xr

# Load the NetCDF dataset
mslAndSST = xr.open_dataset('msl_sst.nc')
pressure = xr.open_dataset('data_stream-oper_stepType-instant.nc')

print(mslAndSST)
print(pressure)

<xarray.Dataset> Size: 860MB
Dimensions:     (valid_time: 856, latitude: 261, longitude: 481)
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) datetime64[ns] 7kB 2020-05-01 ... 2020-11-30T18:...
  * latitude    (latitude) float64 2kB 65.0 64.75 64.5 64.25 ... 0.5 0.25 0.0
  * longitude   (longitude) float64 4kB -120.0 -119.8 -119.5 ... -0.5 -0.25 0.0
    expver      (valid_time) <U4 14kB ...
Data variables:
    msl         (valid_time, latitude, longitude) float32 430MB ...
    sst         (valid_time, latitude, longitude) float32 430MB ...
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2024-12-08T04:02 GRIB to CDM+CF via cfgrib-0.9.1...
<xarray.Dataset> Size: 5GB
Dimensions:         (valid_time: 856, pressure_lev

In [2]:
import pandas as pd
import numpy as np

try:
    u_850 = pressure['u'].sel(pressure_level=850)
    u_200 = pressure['u'].sel(pressure_level=200)
    v_850 = pressure['v'].sel(pressure_level=850)
    v_200 = pressure['v'].sel(pressure_level=200)

    vertical_wind_shear = np.sqrt((u_850 - u_200)**2 + (v_850 - v_200)**2)

    pressure['vertical_wind_shear'] = vertical_wind_shear

except KeyError as e:
    print(f"Error: Key {e} not found in the dataset. Check variable and level names.")

except ValueError as e:
    print(f"Error processing data: {e}")

pressure

In [3]:
pressure_700 = pressure['r'].sel(pressure_level=700)
pressure['relative_humidity'] = pressure_700

pressure

In [4]:
vorticity = pressure['vo'].sel(pressure_level=850)
pressure['vorticity'] = vorticity
pressure

In [5]:
def dropColumns(ds, cols=['expver', 'number']):
    for i in cols:
        if i in ds:
            ds = ds.drop_vars(i)
            print(f"Column '{i}' dropped successfully.")
        else:
            print(f"Column '{i}' not found in the dataset.")

    return ds

pressure = dropColumns(pressure, cols=['r', 'u', 'v', 'vo'])
pressure

Column 'r' dropped successfully.
Column 'u' dropped successfully.
Column 'v' dropped successfully.
Column 'vo' dropped successfully.


In [6]:
pressure = dropColumns(pressure, cols=['expver', 'pressure_level', 'number'])
pressure

Column 'expver' dropped successfully.
Column 'pressure_level' dropped successfully.
Column 'number' dropped successfully.


In [7]:
pressure_df = pressure.to_dataframe()
pressure_df.reset_index(inplace=True)
pressure_df

Unnamed: 0,valid_time,latitude,longitude,vertical_wind_shear,relative_humidity,vorticity
0,2020-05-01 00:00:00,65.0,-120.00,10.906098,96.652046,-0.000003
1,2020-05-01 00:00:00,65.0,-119.75,11.006050,97.007515,-0.000002
2,2020-05-01 00:00:00,65.0,-119.50,11.106084,97.523140,-0.000004
3,2020-05-01 00:00:00,65.0,-119.25,11.131938,98.273140,-0.000007
4,2020-05-01 00:00:00,65.0,-119.00,11.071660,98.984077,-0.000016
...,...,...,...,...,...,...
107463091,2020-11-30 18:00:00,0.0,-1.00,4.230934,47.203552,0.000004
107463092,2020-11-30 18:00:00,0.0,-0.75,4.546702,47.797302,0.000004
107463093,2020-11-30 18:00:00,0.0,-0.50,4.905839,47.996521,0.000003
107463094,2020-11-30 18:00:00,0.0,-0.25,5.152977,48.437927,0.000007


In [8]:
import json

hurdat = json.load(open('processed-hurdat2.json'))
hurdat

[{'storm_id': '1950',
  'name': 'ABLE',
  'num_records': 51,
  'observations': [{'date': '1950-08-12',
    'time': '00:00 UTC',
    'latitude': 17.1,
    'longitude': -55.5,
    'wind_speed': 35,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '06:00 UTC',
    'latitude': 17.7,
    'longitude': -56.3,
    'wind_speed': 40,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '12:00 UTC',
    'latitude': 18.2,
    'longitude': -57.4,
    'wind_speed': 45,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '18:00 UTC',
    'latitude': 19.0,
    'longitude': -58.6,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '00:00 UTC',
    'latitude': 20.0,
    'longitude': -60.0,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '06:00 UTC',
    'latitude': 20.7,
    'longitude': -61.1,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '12:00 UTC',
    'latitude': 21.3

In [12]:
hurdatObservations = []
for row in hurdat:
    for obs in row['observations']:
        newObs = obs.copy()
        newObs['storm_name'] = row['name'] + row['storm_id']
        newObs['datetime'] = obs['date'] + ' ' + obs['time'][0:5] + ':00'
        hurdatObservations.append(newObs)

hurdatObservations

[{'date': '1950-08-12',
  'time': '00:00 UTC',
  'latitude': 17.1,
  'longitude': -55.5,
  'wind_speed': 35,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 00:00:00'},
 {'date': '1950-08-12',
  'time': '06:00 UTC',
  'latitude': 17.7,
  'longitude': -56.3,
  'wind_speed': 40,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 06:00:00'},
 {'date': '1950-08-12',
  'time': '12:00 UTC',
  'latitude': 18.2,
  'longitude': -57.4,
  'wind_speed': 45,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 12:00:00'},
 {'date': '1950-08-12',
  'time': '18:00 UTC',
  'latitude': 19.0,
  'longitude': -58.6,
  'wind_speed': 50,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 18:00:00'},
 {'date': '1950-08-13',
  'time': '00:00 UTC',
  'latitude': 20.0,
  'longitude': -60.0,
  'wind_speed': 50,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-13 00:00:00'},
 {'date': '1950-08-13',
 

In [13]:
import pandas as pd

hurdat_df = pd.DataFrame(hurdatObservations)
hurdat_df.to_csv('hurdat_data.csv', index=False)

Unnamed: 0,date,time,latitude,longitude,wind_speed,pressure,storm_name,datetime
0,1950-08-12,00:00 UTC,17.1,-55.5,35,-999,ABLE1950,1950-08-12 00:00:00
1,1950-08-12,06:00 UTC,17.7,-56.3,40,-999,ABLE1950,1950-08-12 06:00:00
2,1950-08-12,12:00 UTC,18.2,-57.4,45,-999,ABLE1950,1950-08-12 12:00:00
3,1950-08-12,18:00 UTC,19.0,-58.6,50,-999,ABLE1950,1950-08-12 18:00:00
4,1950-08-13,00:00 UTC,20.0,-60.0,50,-999,ABLE1950,1950-08-13 00:00:00
...,...,...,...,...,...,...,...,...
27859,2023-10-23,18:00 UTC,11.5,-83.2,25,1007,TWENTY-ONE2023,2023-10-23 18:00:00
27860,2023-10-24,00:00 UTC,12.2,-83.4,25,1007,TWENTY-ONE2023,2023-10-24 00:00:00
27861,2023-10-24,01:30 UTC,12.4,-83.5,25,1007,TWENTY-ONE2023,2023-10-24 01:30:00
27862,2023-10-24,06:00 UTC,13.0,-83.8,25,1007,TWENTY-ONE2023,2023-10-24 06:00:00


In [33]:
def find_closest_row(dataframe, target_lat, target_long, target_datetime):
    # Filter rows with matching datetime
    filtered_df = dataframe[dataframe['valid_time'] == target_datetime]

    if filtered_df.empty:
        return None  # No match for datetime

    # Compute distances (Haversine or simple Euclidean here)
    distances = np.sqrt((filtered_df["latitude"] - target_lat)**2 + (filtered_df["longitude"] - target_long)**2)

    # Find the index of the minimum distance
    closest_index = distances.idxmin()
    return dataframe.loc[closest_index]

In [35]:
for index, row in hurdat_df.iterrows():
    if row['date'][0:4] != '2020':
        continue
    # Only want 1 row per lat/long. Since hurdat observations don't exactly line up with increments of lat/lng from era5, have to look within a bound
    # Latitude increment is 0.25 so looking at roughly 0.25/2 =0.125 above and below. To break ties looking at 0.124 below and 0.126 above
    # Longitude increment is 0.2 so looking at 0.1 bound.
    # checkLat = (pressure_df['latitude'] < row['latitude'] + 0.124) & (pressure_df['latitude'] > row['latitude'] - 0.126)
    # checkLng = (pressure_df['longitude'] < row['longitude'] + 0.9) & (pressure_df['longitude'] > row['longitude'] - 0.11)
    # checkTime = (pressure_df['valid_time'] == row['datetime'])

    # if pressure_df[checkLat & checkLng & checkTime].empty:
    #     print(row['storm_name'])
    #     print(row['latitude'] + 0.124, row['latitude'] - 0.126)
    #     print(row['longitude'] + 0.9, row['longitude'] + 0.11)
    closestRow = find_closest_row(pressure_df, row['latitude'], row['longitude'], row['datetime'])
    if closestRow is None:
        print(row['storm_name'])
        print(row['datetime'])

BERTHA2020
2020-05-27 13:30:00
CRISTOBAL2020
2020-06-03 13:00:00
CRISTOBAL2020
2020-06-07 22:00:00
FAY2020
2020-07-10 20:00:00
GONZALO2020
2020-07-25 15:30:00
HANNA2020
2020-07-25 22:00:00
HANNA2020
2020-07-25 23:15:00
ISAIAS2020
2020-07-30 16:15:00
ISAIAS2020
2020-07-31 09:00:00
ISAIAS2020
2020-08-01 13:00:00
ISAIAS2020
2020-08-04 03:10:00
LAURA2020
2020-08-21 20:30:00
LAURA2020
2020-08-21 23:30:00
LAURA2020
2020-08-23 04:30:00
LAURA2020
2020-08-24 02:00:00
NANA2020
2020-09-03 03:00:00
PAULETTE2020
2020-09-14 08:50:00
SALLY2020
2020-09-16 09:45:00
BETA2020
2020-09-22 02:45:00
ALPHA2020
2020-09-18 18:40:00
GAMMA2020
2020-10-03 16:45:00
GAMMA2020
2020-10-06 03:00:00
DELTA2020
2020-10-07 10:30:00
DELTA2020
2020-10-09 23:00:00
ZETA2020
2020-10-27 03:55:00
ZETA2020
2020-10-28 21:00:00
ETA2020
2020-11-03 21:00:00
ETA2020
2020-11-08 08:55:00
ETA2020
2020-11-09 04:00:00
ETA2020
2020-11-12 09:20:00
IOTA2020
2020-11-17 03:40:00


In [8]:
mslAndSST = dropColumns(mslAndSST, cols=['expver', 'number'])
mslAndSST

Column 'expver' dropped successfully.
Column 'number' dropped successfully.


In [9]:
mslSST_df = mslAndSST.to_dataframe()
mslSST_df.reset_index(inplace=True)
mslSST_df

Unnamed: 0,valid_time,latitude,longitude,msl,sst
0,2020-05-01 00:00:00,65.0,-120.00,101768.0625,
1,2020-05-01 00:00:00,65.0,-119.75,101779.3125,
2,2020-05-01 00:00:00,65.0,-119.50,101791.0625,
3,2020-05-01 00:00:00,65.0,-119.25,101807.8125,
4,2020-05-01 00:00:00,65.0,-119.00,101819.8125,
...,...,...,...,...,...
107463091,2020-11-30 18:00:00,0.0,-1.00,101121.6875,299.939697
107463092,2020-11-30 18:00:00,0.0,-0.75,101120.4375,299.980713
107463093,2020-11-30 18:00:00,0.0,-0.50,101117.4375,300.115479
107463094,2020-11-30 18:00:00,0.0,-0.25,101114.6875,300.203369


In [None]:
# Convert 'valid_time' to datetime objects if they aren't already
pressure_df['valid_time'] = pd.to_datetime(pressure_df['valid_time'])
mslSST_df['valid_time'] = pd.to_datetime(mslSST_df['valid_time'])

# Perform the merge operation
merged_df = pd.merge(pressure_df, mslSST_df, on=['latitude', 'longitude', 'valid_time'], how='inner')

merged_df

Need to download the ERA5 data in chunks each year from 1950-2024. Do all the processing for pressure and the msl data. Comb through the rows and only keep the ones that match hurdat paths. Save each of those years individually and keep going.