In [72]:
!pip install cdsapi



In [73]:
from google.colab import userdata
apikey = userdata.get("CDS_TOKEN")
with open("/root/.cdsapirc", "w") as f:
    print("url: https://cds.climate.copernicus.eu/api", file=f)
    print(f"key: {apikey}", file=f)

In [74]:
import requests
import os
import cdsapi

def downloadProcessedDataset(url, file_path):
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an exception for bad status codes

    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an exception for bad status codes

    with open(file_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192): #8KB chunks
            f.write(chunk)

    print(f"File downloaded to: {file_path}")

def downloadNewPressureDataset(years, months, fileName):
    dataset = "reanalysis-era5-pressure-levels"
    request = {
        "product_type": ["reanalysis"],
        "variable": ["relative_humidity",
            "u_component_of_wind",
            "v_component_of_wind",
            "vorticity"],
        "year": years,
        "month": months,
        "day": [
            "01", "02", "03",
            "04", "05", "06",
            "07", "08", "09",
            "10", "11", "12",
            "13", "14", "15",
            "16", "17", "18",
            "19", "20", "21",
            "22", "23", "24",
            "25", "26", "27",
            "28", "29", "30",
            "31"
        ],
        "time": [
            "00:00",
            "06:00",
            "12:00",
            "18:00"
        ],
        "pressure_level": ["200", "700", "850"],
        "data_format": "netcdf",
        "download_format": "zip",
        "area": [65, -120, 0, 0]
    }

    client = cdsapi.Client()
    client.retrieve(dataset, request, fileName)

def downloadNewSingleLevelsDataset(years, months, fileName):
    dataset = "reanalysis-era5-single-levels"
    request = {
        "product_type": ["reanalysis"],
        "variable": ["mean_sea_level_pressure",
            "sea_surface_temperature"],
        "year": years,
        "month": months,
        "day": [
            "01", "02", "03",
            "04", "05", "06",
            "07", "08", "09",
            "10", "11", "12",
            "13", "14", "15",
            "16", "17", "18",
            "19", "20", "21",
            "22", "23", "24",
            "25", "26", "27",
            "28", "29", "30",
            "31"
        ],
        "time": [
            "00:00",
            "06:00",
            "12:00",
            "18:00"
        ],
        "data_format": "netcdf",
        "download_format": "zip",
        "area": [65, -120, 0, 0]
    }

    client = cdsapi.Client()
    client.retrieve(dataset, request, fileName)

In [75]:
import xarray as xr

def unzipFile(fileName):
    !unzip -o $fileName

def openDataset(fileName):
    return xr.open_dataset(fileName)

In [82]:
import pandas as pd
import numpy as np

def dropColumns(ds, cols=['expver', 'number']):
    for i in cols:
        if i in ds:
            ds = ds.drop_vars(i)
            print(f"Column '{i}' dropped successfully.")
        else:
            print(f"Column '{i}' not found in the dataset.")

    return ds

def preprocessPressureData(ds):
    try:
        u_850 = ds['u'].sel(pressure_level=850)
        u_200 = ds['u'].sel(pressure_level=200)
        v_850 = ds['v'].sel(pressure_level=850)
        v_200 = ds['v'].sel(pressure_level=200)

        vertical_wind_shear = np.sqrt((u_850 - u_200)**2 + (v_850 - v_200)**2)

        ds['vertical_wind_shear'] = vertical_wind_shear

    except KeyError as e:
        print(f"Error: Key {e} not found in the dataset. Check variable and level names.")

    except ValueError as e:
        print(f"Error processing data: {e}")

    pressure_700 = ds['r'].sel(pressure_level=700)
    ds['relative_humidity'] = pressure_700

    vorticity = ds['vo'].sel(pressure_level=850)
    ds['vorticity'] = vorticity

    ds = dropColumns(ds, cols=['r', 'u', 'v', 'vo'])
    ds = dropColumns(ds, cols=['expver', 'pressure_level', 'number'])

    df = ds.to_dataframe()
    df.reset_index(inplace=True)
    return df

def preprocessTemperature(ds):
    ds = dropColumns(ds, cols=['expver', 'number'])

    df = ds.to_dataframe()
    df.reset_index(inplace=True)
    return df

In [None]:
import pandas as pd

hurdat_df = pd.read_csv('hurdat_data.csv')
hurdat_df

In [78]:
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    """Calculate the great-circle distance between two points on the Earth."""
    R = 6371  # Earth's radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def find_closest_row(dataframe, target_lat, target_long, target_datetime):
    # Filter rows with matching datetime
    filtered_df = dataframe[dataframe['valid_time'] == target_datetime]

    if filtered_df.empty:
        return None  # No match for datetime

    # Compute distances (Haversine or simple Euclidean here)
    # distances = np.sqrt((filtered_df["latitude"] - target_lat)**2 + (filtered_df["longitude"] - target_long)**2)
    distances = haversine(target_lat, target_long, filtered_df["latitude"], filtered_df["longitude"])

    # Find the index of the minimum distance
    closest_index = distances.idxmin()
    return dataframe.loc[closest_index].copy()

In [79]:
def findRowsMatchingYear(df, reference_df):
    newRows = []
    for idx, row in df.iterrows():
        # print(row['storm_name'], row['latitude'], row['longitude'], row['datetime'])
        closestRow = find_closest_row(reference_df, row['latitude'], row['longitude'], row['datetime'])
        if closestRow is not None:
            closestRow['storm_name'] = row['storm_name']
            closestRow['hurricane_path_latitude'] = row['latitude']
            closestRow['hurricane_path_longitude'] = row['longitude']
            # print(closestRow)

            newRows.append(closestRow)

    return newRows

In [80]:
import pandas as pd

def findEarliestAndLatestMonthRange(df, year):
    # Filter rows based on the year
    year_df = df[df['date'].str.startswith(str(year))]

    # Convert the 'date' column to datetime
    dates = pd.to_datetime(year_df['date'], format='%Y-%m-%d')

    # Extract the months
    months = dates.dt.month

    # Find the earliest and latest months
    earliest_month = months.min()
    latest_month = months.max()

    return list(range(earliest_month, latest_month+1))

In [12]:
!mkdir data
!mkdir processed

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘processed’: File exists


In [None]:
for year in range(2006, 2024):
    downloadNewPressureDataset([year], findEarliestAndLatestMonthRange(hurdat_df, year), f'/content/data/pressure_{year}.zip')
    unzipFile(f'/content/data/pressure_{year}.zip')
    pressure_df = openDataset(f'/content/data_stream-oper_stepType-instant.nc')
    pressure_df = preprocessPressureData(pressure_df)

    year_df = hurdat_df[hurdat_df['date'].str.startswith(str(year))]
    print(f'Before {hurdat_df.shape[0]}')
    print(f'After {year_df.shape[0]}')
    rowsMatchingYear = findRowsMatchingYear(year_df, pressure_df)

    newRows_df = pd.DataFrame(rowsMatchingYear)
    newRows_df.to_csv(f'/content/processed/{year}_Era5_Rows.csv', index=False)


    print(year)
    print(newRows_df)


In [84]:
for year in range(1957, 2024):
    downloadNewSingleLevelsDataset([year], findEarliestAndLatestMonthRange(hurdat_df, year), f'/content/data/temperature_{year}.zip')
    unzipFile(f'/content/data/temperature_{year}.zip')
    temperature_df = openDataset(f'/content/data_stream-oper_stepType-instant.nc')
    temperature_df = preprocessTemperature(temperature_df)

    year_df = hurdat_df[hurdat_df['date'].str.startswith(str(year))]
    print(f'Before {hurdat_df.shape[0]}')
    print(f'After {year_df.shape[0]}')
    rowsMatchingYear = findRowsMatchingYear(year_df, temperature_df)

    newRows_df = pd.DataFrame(rowsMatchingYear)
    newRows_df.to_csv(f'/content/processed-temperature/{year}_Era5_Temperature_Rows.csv', index=False)


    print(year)
    print(newRows_df)


2024-12-16 02:35:51,895 INFO [2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
INFO:datapi.legacy_api_client:[2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
2024-12-16 02:35:51,899 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
INFO:datapi.le

KeyboardInterrupt: 

-------------------------------------
Combining downloaded processed rows

In [None]:
allRows = []
for year in range(1950, 2023+1):
    df = pd.read_csv(f'/content/processed-data/{year}_Era5_Rows.csv')
    print(df)
    allRows.append(df)

allRows_df = pd.concat(allRows, ignore_index=True)
allRows_df.to_csv('/content/1950-2023-hurricanePath_era5_rows.csv', index=False)

-----------------
Constructing KD Tree

In [59]:
import pandas as pd

# Assuming the CSV file is in the current working directory or you provide the full path
df = pd.read_csv('/content/1950-2023-hurricanePath_era5_rows.csv')

# Now you can work with the DataFrame 'df'
# Example: print the first 5 rows
print(df)

                valid_time  latitude  longitude  vertical_wind_shear  \
0      1950-08-12 00:00:00     17.00     -55.50            10.592421   
1      1950-08-12 06:00:00     17.75     -56.25            15.716592   
2      1950-08-12 12:00:00     18.25     -57.50            11.104238   
3      1950-08-12 18:00:00     19.00     -58.50            13.652968   
4      1950-08-13 00:00:00     20.00     -60.00             9.493796   
...                    ...       ...        ...                  ...   
27214  2023-10-23 12:00:00     11.50     -82.50             7.004334   
27215  2023-10-23 18:00:00     11.50     -83.25             7.655430   
27216  2023-10-24 00:00:00     12.25     -83.50             1.646199   
27217  2023-10-24 06:00:00     13.00     -83.75             4.906357   
27218  2023-10-24 12:00:00     13.50     -84.50             8.622382   

       relative_humidity  vorticity      storm_name  hurricane_path_latitude  \
0              12.747797   0.000054        ABLE1950    

In [60]:
from sklearn.neighbors import KDTree
import pandas as pd

# Select relevant columns for KD-tree construction
features = ['vertical_wind_shear', 'relative_humidity', 'vorticity']
data = df[features].values  # Convert to NumPy array

# Construct the KD-tree
tree = KDTree(data, leaf_size=2)  # You can adjust leaf_size as needed

# Example usage: find the k-nearest neighbors for a specific point
# Replace with your desired query point (e.g., a new observation)
query_point = [10, 50, 0.005] #example

distances, indices = tree.query([query_point], k=5) #Finds 5 nearest neighbors

print("Distances:", distances)
print("Indices of nearest neighbors:", indices)


# Get the corresponding rows from the DataFrame for the nearest neighbors
nearest_neighbors_df = df.iloc[indices[0]]

nearest_neighbors_df

Distances: [[0.04011646 0.24499769 0.31191236 0.50785918 0.55556244]]
Indices of nearest neighbors: [[26868  1108  7407    92 24820]]


Unnamed: 0,valid_time,latitude,longitude,vertical_wind_shear,relative_humidity,vorticity,storm_name,hurricane_path_latitude,hurricane_path_longitude
26868,2023-09-06 00:00:00,28.0,-33.75,9.978934,49.966194,0.000235,KATIA2023,28.0,-33.7
1108,1953-08-30 18:00:00,14.0,-35.0,9.765664,49.928684,4.7e-05,CAROL1953,14.0,-35.0
7407,1975-08-24 12:00:00,22.5,-69.75,10.294131,49.89631,3.4e-05,CAROLINE1975,22.4,-69.8
92,1950-08-29 00:00:00,23.5,-87.75,9.577256,49.7186,0.000256,BAKER1950,23.5,-87.7
24820,2020-08-16 06:00:00,20.0,-63.5,10.404525,49.619232,1.2e-05,JOSEPHINE2020,20.0,-63.4


In [61]:
!pip install cartopy



In [62]:
def find_similar_storms(storm_name, df, tree, k=5):
    """
    Finds similar storms based on KD-tree nearest neighbors.

    Args:
        storm_name (str): The name of the storm to find similar storms for.
        df (pd.DataFrame): The DataFrame containing storm data.
        tree (KDTree): The KDTree object built from storm features.
        k (int, optional): The number of nearest neighbors to consider. Defaults to 5.

    Returns:
        list: A list of tuples, where each tuple contains the similar storm name,
              frequency count, and total distance.
    """
    storm_rows = df[df['storm_name'] == storm_name]
    storm_stats = storm_rows[['vertical_wind_shear', 'relative_humidity', 'vorticity']]

    storm_counts = {}
    storm_dist = {}

    for idx, (shear, humid, vort) in storm_stats.iterrows():
        point = (shear, humid, vort)
        distances, indices = tree.query([point], k=k)

        nearest_neighbors = df.iloc[indices[0]]
        nearest_neighbors = nearest_neighbors[nearest_neighbors['storm_name'] != storm_name]

        for i, neighbor_storm_name in enumerate(nearest_neighbors['storm_name']):
            storm_counts[neighbor_storm_name] = storm_counts.get(neighbor_storm_name, 0) + 1
            storm_dist[neighbor_storm_name] = storm_dist.get(neighbor_storm_name, 0) + distances[0][i]

    sorted_storm_counts = sorted(storm_counts.items(), key=lambda item: item[1], reverse=True)

    # Return the top similar storms with counts and distances
    return [(storm_name, count, storm_dist[storm_name]) for storm_name, count in sorted_storm_counts[:5]]

# Example usage:
similar_storms = find_similar_storms('KATRINA2005', df, tree)

for storm_name, count, total_distance in similar_storms:
    print(f"{storm_name}: {count} @ {total_distance}")

HANNA2008: 3 @ 0.332025152088705
GABRIELLE1989: 2 @ 0.20453511158165466
BEULAH1967: 2 @ 0.10473296892733955
EASY1951: 2 @ 0.03357988564693728
DANIELLE2022: 2 @ 0.05299595745815091


In [None]:
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

def plot_similar_hurricane_paths(storm_name, save_path, df, sorted_storm_counts):
    """
    Plots the paths of similar hurricanes on a world map.

    Args:
        storm_name (str): The name of the storm to find similar storms for.
        save_path (str): The path to save the plot image.
        df (pd.DataFrame): The DataFrame containing storm data.
        sorted_storm_counts (list): A list of tuples, where each tuple contains the
                                   similar storm name and frequency count, sorted
                                   in descending order of frequency.
    """
    hurricanes_to_plot = [hurricane for hurricane, _, _ in sorted_storm_counts]
    hurricanes_to_plot.insert(0, storm_name)

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())

    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.LAND)

    for hurricane_name in hurricanes_to_plot:
        hurricane_data = df[df['storm_name'] == hurricane_name]
        ax.plot(hurricane_data['hurricane_path_longitude'], hurricane_data['hurricane_path_latitude'],
                transform=ccrs.PlateCarree(), label=hurricane_name)

    ax.set_title(f'Hurricane Paths Similar to {storm_name}')  # Title with storm name
    ax.legend()

    plt.savefig(save_path)  # Save the plot
    plt.show()  # Display the plot


# Example usage:
plot_similar_hurricane_paths('KATRINA2005', '/content/plots/my_plot.png', df, similar_storms)

In [None]:
for storm in df['storm_name'].unique():
    similar_storms = find_similar_storms(storm, df, tree)
    plot_similar_hurricane_paths(storm, f'/content/plots/{storm}_plot.png', df, similar_storms)

In [None]:
!zip plots.zip -r plots

----------------------

In [None]:
!unzip test.zip -d renamed

Archive:  test.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of test.zip or
        test.zip.zip, and cannot find test.zip.ZIP, period.


In [None]:
!unzip msl_sst.zip

Archive:  msl_sst.zip
replace data_stream-oper_stepType-instant.nc? [y]es, [n]o, [A]ll, [N]one, [r]ename: r
new name: msl_sst.zip
replace msl_sst.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: r
new name: msl_sst.nc
  inflating: msl_sst.nc              


In [None]:
import xarray as xr

# Load the NetCDF dataset
mslAndSST = xr.open_dataset('msl_sst.nc')
pressure = xr.open_dataset('data_stream-oper_stepType-instant.nc')

print(mslAndSST)
print(pressure)

<xarray.Dataset> Size: 860MB
Dimensions:     (valid_time: 856, latitude: 261, longitude: 481)
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) datetime64[ns] 7kB 2020-05-01 ... 2020-11-30T18:...
  * latitude    (latitude) float64 2kB 65.0 64.75 64.5 64.25 ... 0.5 0.25 0.0
  * longitude   (longitude) float64 4kB -120.0 -119.8 -119.5 ... -0.5 -0.25 0.0
    expver      (valid_time) <U4 14kB ...
Data variables:
    msl         (valid_time, latitude, longitude) float32 430MB ...
    sst         (valid_time, latitude, longitude) float32 430MB ...
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2024-12-08T04:02 GRIB to CDM+CF via cfgrib-0.9.1...
<xarray.Dataset> Size: 5GB
Dimensions:         (valid_time: 856, pressure_lev

In [None]:
import pandas as pd
import numpy as np

try:
    u_850 = pressure['u'].sel(pressure_level=850)
    u_200 = pressure['u'].sel(pressure_level=200)
    v_850 = pressure['v'].sel(pressure_level=850)
    v_200 = pressure['v'].sel(pressure_level=200)

    vertical_wind_shear = np.sqrt((u_850 - u_200)**2 + (v_850 - v_200)**2)

    pressure['vertical_wind_shear'] = vertical_wind_shear

except KeyError as e:
    print(f"Error: Key {e} not found in the dataset. Check variable and level names.")

except ValueError as e:
    print(f"Error processing data: {e}")

pressure

In [None]:
pressure_700 = pressure['r'].sel(pressure_level=700)
pressure['relative_humidity'] = pressure_700

pressure

In [None]:
vorticity = pressure['vo'].sel(pressure_level=850)
pressure['vorticity'] = vorticity
pressure

In [None]:
def dropColumns(ds, cols=['expver', 'number']):
    for i in cols:
        if i in ds:
            ds = ds.drop_vars(i)
            print(f"Column '{i}' dropped successfully.")
        else:
            print(f"Column '{i}' not found in the dataset.")

    return ds

pressure = dropColumns(pressure, cols=['r', 'u', 'v', 'vo'])
pressure

In [None]:
pressure = dropColumns(pressure, cols=['expver', 'pressure_level', 'number'])
pressure

In [None]:
pressure_df = pressure.to_dataframe()
pressure_df.reset_index(inplace=True)
pressure_df

In [None]:
import json

hurdat = json.load(open('processed-hurdat2.json'))
hurdat

[{'storm_id': '1950',
  'name': 'ABLE',
  'num_records': 51,
  'observations': [{'date': '1950-08-12',
    'time': '00:00 UTC',
    'latitude': 17.1,
    'longitude': -55.5,
    'wind_speed': 35,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '06:00 UTC',
    'latitude': 17.7,
    'longitude': -56.3,
    'wind_speed': 40,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '12:00 UTC',
    'latitude': 18.2,
    'longitude': -57.4,
    'wind_speed': 45,
    'pressure': -999},
   {'date': '1950-08-12',
    'time': '18:00 UTC',
    'latitude': 19.0,
    'longitude': -58.6,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '00:00 UTC',
    'latitude': 20.0,
    'longitude': -60.0,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '06:00 UTC',
    'latitude': 20.7,
    'longitude': -61.1,
    'wind_speed': 50,
    'pressure': -999},
   {'date': '1950-08-13',
    'time': '12:00 UTC',
    'latitude': 21.3

In [None]:
hurdatObservations = []
for row in hurdat:
    for obs in row['observations']:
        newObs = obs.copy()
        newObs['storm_name'] = row['name'] + row['storm_id']
        newObs['datetime'] = obs['date'] + ' ' + obs['time'][0:5] + ':00'
        hurdatObservations.append(newObs)

hurdatObservations

[{'date': '1950-08-12',
  'time': '00:00 UTC',
  'latitude': 17.1,
  'longitude': -55.5,
  'wind_speed': 35,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 00:00:00'},
 {'date': '1950-08-12',
  'time': '06:00 UTC',
  'latitude': 17.7,
  'longitude': -56.3,
  'wind_speed': 40,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 06:00:00'},
 {'date': '1950-08-12',
  'time': '12:00 UTC',
  'latitude': 18.2,
  'longitude': -57.4,
  'wind_speed': 45,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 12:00:00'},
 {'date': '1950-08-12',
  'time': '18:00 UTC',
  'latitude': 19.0,
  'longitude': -58.6,
  'wind_speed': 50,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-12 18:00:00'},
 {'date': '1950-08-13',
  'time': '00:00 UTC',
  'latitude': 20.0,
  'longitude': -60.0,
  'wind_speed': 50,
  'pressure': -999,
  'storm_name': 'ABLE1950',
  'datetime': '1950-08-13 00:00:00'},
 {'date': '1950-08-13',
 

In [None]:
import pandas as pd

hurdat_df = pd.DataFrame(hurdatObservations)
hurdat_df.to_csv('hurdat_data.csv', index=False)

In [None]:
for index, row in hurdat_df.iterrows():
    if row['date'][0:4] != '2020':
        continue
    closestRow = find_closest_row(pressure_df, row['latitude'], row['longitude'], row['datetime'])
    if closestRow is None:
        print(row['storm_name'])
        print(row['datetime'])

BERTHA2020
2020-05-27 13:30:00
CRISTOBAL2020
2020-06-03 13:00:00
CRISTOBAL2020
2020-06-07 22:00:00
FAY2020
2020-07-10 20:00:00
GONZALO2020
2020-07-25 15:30:00
HANNA2020
2020-07-25 22:00:00
HANNA2020
2020-07-25 23:15:00
ISAIAS2020
2020-07-30 16:15:00
ISAIAS2020
2020-07-31 09:00:00
ISAIAS2020
2020-08-01 13:00:00
ISAIAS2020
2020-08-04 03:10:00
LAURA2020
2020-08-21 20:30:00
LAURA2020
2020-08-21 23:30:00
LAURA2020
2020-08-23 04:30:00
LAURA2020
2020-08-24 02:00:00
NANA2020
2020-09-03 03:00:00
PAULETTE2020
2020-09-14 08:50:00
SALLY2020
2020-09-16 09:45:00
BETA2020
2020-09-22 02:45:00
ALPHA2020
2020-09-18 18:40:00
GAMMA2020
2020-10-03 16:45:00
GAMMA2020
2020-10-06 03:00:00
DELTA2020
2020-10-07 10:30:00
DELTA2020
2020-10-09 23:00:00
ZETA2020
2020-10-27 03:55:00
ZETA2020
2020-10-28 21:00:00
ETA2020
2020-11-03 21:00:00
ETA2020
2020-11-08 08:55:00
ETA2020
2020-11-09 04:00:00
ETA2020
2020-11-12 09:20:00
IOTA2020
2020-11-17 03:40:00


In [None]:
mslAndSST = dropColumns(mslAndSST, cols=['expver', 'number'])
mslAndSST

Column 'expver' dropped successfully.
Column 'number' dropped successfully.


In [None]:
mslSST_df = mslAndSST.to_dataframe()
mslSST_df.reset_index(inplace=True)
mslSST_df

Unnamed: 0,valid_time,latitude,longitude,msl,sst
0,2020-05-01 00:00:00,65.0,-120.00,101768.0625,
1,2020-05-01 00:00:00,65.0,-119.75,101779.3125,
2,2020-05-01 00:00:00,65.0,-119.50,101791.0625,
3,2020-05-01 00:00:00,65.0,-119.25,101807.8125,
4,2020-05-01 00:00:00,65.0,-119.00,101819.8125,
...,...,...,...,...,...
107463091,2020-11-30 18:00:00,0.0,-1.00,101121.6875,299.939697
107463092,2020-11-30 18:00:00,0.0,-0.75,101120.4375,299.980713
107463093,2020-11-30 18:00:00,0.0,-0.50,101117.4375,300.115479
107463094,2020-11-30 18:00:00,0.0,-0.25,101114.6875,300.203369


In [None]:
# Convert 'valid_time' to datetime objects if they aren't already
pressure_df['valid_time'] = pd.to_datetime(pressure_df['valid_time'])
mslSST_df['valid_time'] = pd.to_datetime(mslSST_df['valid_time'])

# Perform the merge operation
merged_df = pd.merge(pressure_df, mslSST_df, on=['latitude', 'longitude', 'valid_time'], how='inner')

merged_df

Need to download the ERA5 data in chunks each year from 1950-2024. Do all the processing for pressure and the msl data. Comb through the rows and only keep the ones that match hurdat paths. Save each of those years individually and keep going.