In [1]:
import xarray as xr 

# Open the raw NetCDF file as an xarray Dataset.
# NOTE(review): the absolute, user-specific path makes this cell non-portable;
# consider deriving it from a configurable data directory.
ds = xr.open_dataset('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/01_raw/e16fe5f664d5290665a25e5afad51ce9.nc')

In [2]:
# Show the repr of the freshly opened dataset for inspection.
ds

In [3]:
import numpy as np
from dinosaur import horizontal_interpolation
from dinosaur import spherical_harmonic
from dinosaur import xarray_utils
import neuralgcm

# --- Source grid -------------------------------------------------------------
# Node counts come from the coordinate arrays of `ds`; the latitude spacing and
# longitude offset are inferred from those same coordinates.
source_lat = ds.latitude.values
source_lon = ds.longitude.values

source_grid = spherical_harmonic.Grid(
    latitude_nodes=len(source_lat),
    longitude_nodes=len(source_lon),
    latitude_spacing=xarray_utils.infer_latitude_spacing(ds.latitude),
    longitude_offset=xarray_utils.infer_longitude_offset(ds.longitude),
)

# --- Target grid -------------------------------------------------------------
# Only the LENGTHS of these two arrays are used below (180 latitude nodes and
# 360 longitude nodes). The grid itself is requested with 'gauss' latitude
# spacing, so its node latitudes need not coincide with these values.
target_lat = np.arange(-90, 90, 1.0)  # 180 values: -90 ... 89 (90 is excluded)
target_lon = np.arange(0, 360, 1.0)   # 360 values: 0 ... 359

target_grid = spherical_harmonic.Grid(
    latitude_nodes=len(target_lat),
    longitude_nodes=len(target_lon),
    latitude_spacing='gauss',
    longitude_offset=0.0,  # first longitude node at 0 degrees
)

# --- Regridding --------------------------------------------------------------
# Conservative regridder from the source grid to the target grid.
regridder = horizontal_interpolation.ConservativeRegridder(
    source_grid=source_grid,
    target_grid=target_grid,
    skipna=True
)

# Variables carried over to the regridded dataset.
variables_to_regrid = ['t2m', 'sst', 'tp', 'pev']

# Regrid every listed variable, keyed by its name.
regridded_vars = {
    name: xarray_utils.regrid(ds[name], regridder)
    for name in variables_to_regrid
}

# Assemble the regridded variables into a new Dataset.
ds_regridded = xr.Dataset(regridded_vars)



In [4]:
# Rebind `ds` to the regridded dataset; every later cell that reads `ds` sees
# the regridded data.
# NOTE(review): the regridding cell also reads `ds`, so re-running it after this
# cell would operate on the already regridded data instead of the raw file.
ds = ds_regridded

In [5]:
# Show the repr of the dataset now bound to `ds` (the regridded one).
ds

In [6]:
import numpy as np
import xarray as xr

# Build the `target` array from two variables of `ds`.

# Variables that make up the target, in channel order.
variables = ['t2m', 'sst']

# One DataArray per selected variable.
data_arrays = [ds[name] for name in variables]

# Collapse the (latitude, longitude) grid of each DataArray into a single
# 'node' dimension.
data_arrays_flattened = [
    gridded.stack(node=('latitude', 'longitude')) for gridded in data_arrays
]

# Stack the per-variable NumPy arrays along a new trailing axis, one channel per
# variable.
# presumably the result is (time, nodes, channels) — TODO confirm the dimension
# order of the regridded DataArrays.
target = np.stack(
    [flattened.to_numpy() for flattened in data_arrays_flattened],
    axis=-1,
)


In [7]:
# Display the stacked target array.
target

array([[[244.37465,       nan],
        [244.36493,       nan],
        [244.35608,       nan],
        ...,
        [247.77246, 271.45972],
        [247.7742 , 271.45972],
        [247.77585, 271.45972]],

       [[233.83032,       nan],
        [233.82687,       nan],
        [233.82463,       nan],
        ...,
        [249.8451 , 271.4602 ],
        [249.85892, 271.4602 ],
        [249.87314, 271.4602 ]],

       [[225.74751,       nan],
        [225.74786,       nan],
        [225.74991,       nan],
        ...,
        [253.00006, 271.45972],
        [253.00504, 271.45972],
        [253.0102 , 271.45972]],

       ...,

       [[224.98901,       nan],
        [224.98492,       nan],
        [224.98158,       nan],
        ...,
        [260.8049 , 271.46045],
        [260.81042, 271.46045],
        [260.81616, 271.46045]],

       [[235.16318,       nan],
        [235.15271,       nan],
        [235.14279,       nan],
        ...,
        [252.50299, 271.45996],
        [252.50853

In [8]:
# Persist the target array as target.npy in the model-input directory.
# NOTE(review): absolute, user-specific path — not portable across machines.
np.save('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/target.npy', target)

In [9]:
# Boolean validity mask with the same shape as `target`:
# True where a value is present, False where the value is NaN.
mask = np.logical_not(np.isnan(target))

In [10]:
# Display the validity mask.
mask

array([[[ True, False],
        [ True, False],
        [ True, False],
        ...,
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       [[ True, False],
        [ True, False],
        [ True, False],
        ...,
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       [[ True, False],
        [ True, False],
        [ True, False],
        ...,
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       ...,

       [[ True, False],
        [ True, False],
        [ True, False],
        ...,
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       [[ True, False],
        [ True, False],
        [ True, False],
        ...,
        [ True,  True],
        [ True,  True],
        [ True,  True]],

       [[ True, False],
        [ True, False],
        [ True, False],
        ...,
        [ True,  True],
        [ True,  True],
        [ True,  True]]])

In [11]:
# Persist the validity mask as mask.npy in the model-input directory.
# NOTE(review): absolute, user-specific path — not portable across machines.
np.save('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/mask.npy', mask)

In [12]:
import numpy as np
import xarray as xr

# Build the `covariates` array from two variables of `ds`.

# Variables that make up the covariates, in channel order.
variables = ['tp', 'pev']

# One DataArray per selected variable.
data_arrays = [ds[name] for name in variables]

# Collapse the (latitude, longitude) grid of each DataArray into a single
# 'node' dimension.
data_arrays_flattened = [
    gridded.stack(node=('latitude', 'longitude')) for gridded in data_arrays
]

# Stack the per-variable NumPy arrays along a new trailing axis, one channel per
# variable.
# presumably the result is (time, nodes, channels) — TODO confirm the dimension
# order of the regridded DataArrays.
covariates = np.stack(
    [flattened.to_numpy() for flattened in data_arrays_flattened],
    axis=-1,
)


In [13]:
# Display the stacked covariates array.
covariates

array([[[ 6.8090281e-05, -6.4479813e-05],
        [ 6.8090274e-05, -6.4586922e-05],
        [ 6.8090274e-05, -6.4637505e-05],
        ...,
        [ 3.4012645e-04,  1.3783574e-07],
        [ 3.4161820e-04,  1.3783574e-07],
        [ 3.4282752e-04,  1.3783574e-07]],

       [[ 1.0168270e-04, -3.0486590e-05],
        [ 1.0110193e-04, -3.0563067e-05],
        [ 1.0065243e-04, -3.0603795e-05],
        ...,
        [ 5.6263909e-04, -5.9604645e-08],
        [ 5.6481862e-04, -5.9604645e-08],
        [ 5.6690286e-04, -5.9604645e-08]],

       [[ 2.1595132e-04,  2.5787708e-06],
        [ 2.1529190e-04,  2.6204737e-06],
        [ 2.1478313e-04,  2.6416642e-06],
        ...,
        [ 5.3761341e-04, -7.4505806e-08],
        [ 5.3921936e-04, -7.4505806e-08],
        [ 5.4090080e-04, -7.4505806e-08]],

       ...,

       [[ 7.7386831e-05, -3.2376074e-06],
        [ 7.7386816e-05, -3.1983966e-06],
        [ 7.7273755e-05, -3.1145642e-06],
        ...,
        [ 4.4049055e-04,  1.4901161e-08],
     

In [14]:
# Persist the covariates array as covariates.npy in the model-input directory.
# NOTE(review): absolute, user-specific path — not portable across machines.
np.save('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/covariates.npy', covariates)

In [15]:
import pandas as pd
import os

def generate_and_save_metadata(df: pd.DataFrame, lat_col: str = 'lat', lon_col: str = 'lon', save_directory: str = None) -> pd.DataFrame:
    """
    Build a node lookup table from the distinct coordinate pairs of ``df`` and
    write it to ``metadata.parquet``.

    Each distinct (latitude, longitude) pair, taken in order of first
    appearance, receives a consecutive integer ``node_id`` starting at 0.

    Parameters:
    - df (pd.DataFrame): Frame holding at least the two coordinate columns.
    - lat_col (str): Name of the latitude column.
    - lon_col (str): Name of the longitude column.
    - save_directory (str, optional): Directory that receives ``metadata.parquet``;
                                      it is created when missing. When falsy, the
                                      file goes to the current working directory.

    Returns:
    - pd.DataFrame: The lookup table, indexed by 'node_id', with the latitude and
                    longitude columns.

    Raises:
    - ValueError: If either coordinate column is absent from ``df``.
    """
    # Guard: both coordinate columns have to be present.
    available = df.columns
    if not (lat_col in available and lon_col in available):
        raise ValueError(f"DataFrame must contain '{lat_col}' and '{lon_col}' columns.")

    # Distinct coordinate pairs, renumbered 0..n-1; that numbering becomes the
    # 'node_id' index of the result.
    coords = df[[lat_col, lon_col]].drop_duplicates().reset_index(drop=True)
    metadata = coords.assign(node_id=coords.index).set_index('node_id')

    # Resolve the output location, creating the directory when one is given.
    file_path = "metadata.parquet"
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)
        file_path = os.path.join(save_directory, file_path)

    metadata.to_parquet(file_path)
    print(f"Metadata file saved at: {file_path}")

    return metadata


In [16]:
import pandas as pd
import os

# Node coordinates are read from the stacked 'node' index of one flattened
# DataArray; the first entry of `data_arrays_flattened` is used.
da_flattened = data_arrays_flattened[0]

# MultiIndex backing the 'node' dimension.
node_index = da_flattened.indexes['node']

# Pull both coordinate levels out of the MultiIndex as plain arrays.
latitudes, longitudes = (
    node_index.get_level_values(level).values
    for level in ('latitude', 'longitude')
)

# One row per node, with the column names expected by
# `generate_and_save_metadata`.
df = pd.DataFrame({'lat': latitudes, 'lon': longitudes})

# Assign node ids and write metadata.parquet into the model-input directory.
metadata = generate_and_save_metadata(
    df,
    lat_col='lat',
    lon_col='lon',
    save_directory='/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/',
)


Metadata file saved at: /Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/metadata.parquet


In [17]:
# metadata

In [18]:
import pandas as pd
# Reload the metadata table from the Parquet file written earlier, replacing the
# in-memory `metadata` returned by generate_and_save_metadata.
metadata = pd.read_parquet("/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/metadata.parquet")

In [19]:
# Display the reloaded metadata table.
metadata

Unnamed: 0_level_0,lat,lon
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-89.236642,0.0
1,-89.236642,1.0
2,-89.236642,2.0
3,-89.236642,3.0
4,-89.236642,4.0
...,...,...
64795,89.236642,355.0
64796,89.236642,356.0
64797,89.236642,357.0
64798,89.236642,358.0


In [21]:
# Convert the metadata DataFrame to a NumPy array. Only the columns are kept;
# the 'node_id' index is not part of the array (the row position stands in for it).
metadata_array = metadata.to_numpy()


In [23]:
# Display the metadata array.
metadata_array

array([[-89.23664167,   0.        ],
       [-89.23664167,   1.        ],
       [-89.23664167,   2.        ],
       ...,
       [ 89.23664167, 357.        ],
       [ 89.23664167, 358.        ],
       [ 89.23664167, 359.        ]])

In [24]:
# Persist the metadata array as metadata.npy in the model-input directory.
# NOTE(review): absolute, user-specific path — not portable across machines.
np.save('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/metadata.npy', metadata_array)

In [27]:
from typing import Optional, Union, List

from tsl.datasets.prototypes import TabularDataset

class DroughtDataset(TabularDataset):
    """Tabular dataset assembled from pre-computed ``.npy`` files.

    The files ``target.npy``, ``mask.npy``, ``covariates.npy``,
    ``knn_distances.npy`` and ``metadata.npy`` are read from ``root`` and handed
    to ``TabularDataset``. The last three are registered as the covariates
    ``'u'``, ``'distances'`` and ``'metadata'`` respectively.
    """

    similarity_options = {'distance', 'correlation'}

    def __init__(self,
                 root: Optional[str] = None
                 ):
        """
        Args:
            root (str, optional): Directory containing the ``.npy`` input files.
                ``None`` means the current working directory.
        """
        # `load()` reads `self.root`, so it must be set before loading.
        self.root = root

        # Load data
        target, mask, u, dist, metadata = self.load()

        covariates = {
            'u': (u),
            'metadata': (metadata),
            'distances': (dist)
        }

        super().__init__(target=target,
                         mask=mask,
                         covariates=covariates,
                         similarity_score='distance',
                         temporal_aggregation='mean',
                         spatial_aggregation='mean',
                         name='DroughtDataset')

    def load(self):
        """
        Load the dataset arrays from ``self.root``.

        Returns:
            tuple: ``(target, mask, u, dist, metadata)`` — the arrays stored in
            ``target.npy``, ``mask.npy``, ``covariates.npy``,
            ``knn_distances.npy`` and ``metadata.npy``, in that order.
        """
        # Join with os.path so `root` works with or without a trailing
        # separator; a missing root falls back to the current directory.
        root = self.root if self.root is not None else ""

        target_path = os.path.join(root, "target.npy")
        mask_path = os.path.join(root, "mask.npy")
        dist_path = os.path.join(root, "knn_distances.npy")
        covariates_path = os.path.join(root, "covariates.npy")
        metadata_path = os.path.join(root, "metadata.npy")

        target = np.load(target_path)
        mask = np.load(mask_path)
        u = np.load(covariates_path)
        dist = np.load(dist_path)
        # Previously this re-loaded the distance file, so the 'metadata'
        # covariate silently contained the distance matrix.
        metadata = np.load(metadata_path)

        return target, mask, u, dist, metadata

    def compute_similarity(self, method: str, **kwargs):
        """
        Compute similarity matrix based on the specified method.

        Args:
            method (str): The similarity computation method ('distance' or 'correlation').
            **kwargs: Additional keyword arguments for similarity computation.
                ``theta`` sets the Gaussian kernel bandwidth for 'distance';
                when absent, the standard deviation of ``self.distances`` is used.

        Returns:
            numpy.ndarray: Computed similarity matrix.

        Raises:
            ValueError: If an unknown similarity method is provided.
        """
        if method == "distance":
            # Only compute the default bandwidth when the caller did not pass
            # one: np.std over the full distance matrix is not free.
            theta = kwargs['theta'] if 'theta' in kwargs else np.std(self.distances)
            return self.gaussian_kernel(self.distances, theta=theta)
        elif method == "correlation":
            # The target may be a DataFrame (has `.values`) or already a NumPy
            # array, as it is when built from the .npy files loaded here.
            target_values = np.asarray(getattr(self.target, 'values', self.target))
            if target_values.ndim != 3:
                # Flat layout: split the columns into (nodes, features).
                # NOTE(review): relies on `self.target_node_feature` being
                # provided by the base class — confirm against the tsl version in use.
                target_values = target_values.reshape(
                    len(target_values), -1, len(self.target_node_feature))
            # Average over the target features -> (time, nodes)
            target_mean = target_values.mean(axis=2)
            # Correlation between nodes (columns are the variables).
            # NOTE(review): NaNs in the target propagate into the result.
            corr = np.corrcoef(target_mean, rowvar=False)
            return (corr + 1) / 2  # Map correlations from [-1, 1] to [0, 1]
        else:
            raise ValueError(f"Unknown similarity method: {method}")

    @staticmethod
    def gaussian_kernel(distances, theta):
        """
        Compute Gaussian kernel similarity from distances.

        Evaluates ``exp(-d**2 / (2 * theta**2))`` element-wise.

        Args:
            distances (numpy.ndarray): Distance matrix.
            theta (float): Kernel bandwidth parameter.

        Returns:
            numpy.ndarray: Gaussian kernel similarity matrix.
        """
        return np.exp(-(distances ** 2) / (2 * (theta ** 2)))

In [28]:
# Build the dataset from the .npy files in the model-input directory.
# NOTE(review): absolute, user-specific path — not portable across machines.
dataset = DroughtDataset(root='/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/')

In [30]:
# Inspect the distance matrix; presumably exposed as an attribute because it was
# registered under the 'distances' covariate key — confirm against tsl.
dataset.distances

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [86]:
# Inspect the target held by the dataset.
dataset.target

array([[[247.86041, 271.45972],
        [247.86041, 271.45972],
        [247.86041, 271.45972],
        ...,
        [244.09088,       nan],
        [244.09088,       nan],
        [244.09088,       nan]],

       [[250.77211, 271.4602 ],
        [250.77211, 271.4602 ],
        [250.77211, 271.4602 ],
        ...,
        [233.52406,       nan],
        [233.52406,       nan],
        [233.52406,       nan]],

       [[253.26758, 271.45972],
        [253.26758, 271.45972],
        [253.26758, 271.45972],
        ...,
        [225.29492,       nan],
        [225.29492,       nan],
        [225.29492,       nan]],

       ...,

       [[261.77277, 271.46045],
        [261.77277, 271.46045],
        [261.77277, 271.46045],
        ...,
        [224.07355,       nan],
        [224.07355,       nan],
        [224.07355,       nan]],

       [[252.4182 , 271.45996],
        [252.4182 , 271.45996],
        [252.4182 , 271.45996],
        ...,
        [235.00414,       nan],
        [235.00414

In [87]:
# Report the dataset's `has_mask` flag.
print(f"Has missing values: {dataset.has_mask}")

Has missing values: True


In [88]:
# Inspect the mask held by the dataset.
dataset.mask

array([[[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       ...,

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]]])

In [89]:
# NOTE(review): repeats the previous `dataset.mask` display cell; one of the two
# can be removed.
dataset.mask

array([[[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       ...,

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]],

       [[ True,  True],
        [ True,  True],
        [ True,  True],
        ...,
        [ True, False],
        [ True, False],
        [ True, False]]])

In [90]:
# Inspect the covariates registered on the dataset.
dataset.covariates

{'u': array([[[ 4.3678284e-04,  1.3783574e-07],
         [ 4.3678284e-04,  1.3783574e-07],
         [ 4.3678284e-04,  1.3783574e-07],
         ...,
         [ 1.0681152e-04, -5.3983182e-05],
         [ 1.0681152e-04, -5.3983182e-05],
         [ 1.0681152e-04, -5.3983182e-05]],
 
        [[ 5.2356720e-04, -5.9604645e-08],
         [ 5.2356720e-04, -5.9604645e-08],
         [ 5.2356720e-04, -5.9604645e-08],
         ...,
         [ 1.3637543e-04, -2.0563602e-05],
         [ 1.3637543e-04, -2.0563602e-05],
         [ 1.3637543e-04, -2.0563602e-05]],
 
        [[ 6.1416626e-04, -7.4505806e-08],
         [ 6.1416626e-04, -7.4505806e-08],
         [ 6.1416626e-04, -7.4505806e-08],
         ...,
         [ 2.7084351e-04,  3.2633543e-06],
         [ 2.7084351e-04,  3.2633543e-06],
         [ 2.7084351e-04,  3.2633543e-06]],
 
        ...,
 
        [[ 5.3977966e-04,  1.4901161e-08],
         [ 5.3977966e-04,  1.4901161e-08],
         [ 5.3977966e-04,  1.4901161e-08],
         ...,
         [ 9

In [31]:
# Gaussian-kernel similarity computed from the dataset's distance matrix; with no
# `theta` passed, the bandwidth is the standard deviation of the distances.
sim = dataset.compute_similarity("distance")  # or dataset.compute_similarity()

In [32]:
# Display the similarity matrix.
sim

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)

In [34]:
# Request the graph connectivity from the dataset. No arguments are passed, so
# whatever defaults `get_connectivity` defines are used.
# NOTE(review): the recorded output of this cell is an AssertionError, i.e. this
# call failed in the saved run and still needs to be resolved.
connectivity = dataset.get_connectivity()

AssertionError: 