In [1]:
import xarray as xr 

ds = xr.open_dataset('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/01_raw/e16fe5f664d5290665a25e5afad51ce9.nc')

In [2]:
ds 

In [3]:
import numpy as np
import xarray as xr

# Assuming your dataset is called ds
# Example dataset variables: t2m, sst, tp, pev

# 1. Extract relevant variables
variables = ['t2m', 'sst', 'tp', 'pev']
data_arrays = [ds[var] for var in variables]

# 2. Flatten latitude and longitude into a single node dimension
# Combine the latitude and longitude as one "node" dimension
data_arrays_flattened = [da.stack(node=('latitude', 'longitude')) for da in data_arrays]

# 3. Convert each variable's DataArray to a numpy array and add a new channel dimension
# (so we have the shape (time, nodes, channels))
target_array = np.stack([da.to_numpy() for da in data_arrays_flattened], axis=-1)

# Now target_array will have shape (time, nodes, channels)


In [4]:
target_array

array([[[ 2.4786041e+02,  2.7145972e+02,  4.3678284e-04,  1.3783574e-07],
        [ 2.4786041e+02,  2.7145972e+02,  4.3678284e-04,  1.3783574e-07],
        [ 2.4786041e+02,  2.7145972e+02,  4.3678284e-04,  1.3783574e-07],
        ...,
        [ 2.4409088e+02,            nan,  1.0681152e-04, -5.3983182e-05],
        [ 2.4409088e+02,            nan,  1.0681152e-04, -5.3983182e-05],
        [ 2.4409088e+02,            nan,  1.0681152e-04, -5.3983182e-05]],

       [[ 2.5077211e+02,  2.7146021e+02,  5.2356720e-04, -5.9604645e-08],
        [ 2.5077211e+02,  2.7146021e+02,  5.2356720e-04, -5.9604645e-08],
        [ 2.5077211e+02,  2.7146021e+02,  5.2356720e-04, -5.9604645e-08],
        ...,
        [ 2.3352406e+02,            nan,  1.3637543e-04, -2.0563602e-05],
        [ 2.3352406e+02,            nan,  1.3637543e-04, -2.0563602e-05],
        [ 2.3352406e+02,            nan,  1.3637543e-04, -2.0563602e-05]],

       [[ 2.5326758e+02,  2.7145972e+02,  6.1416626e-04, -7.4505806e-08],
        

In [5]:
# Save as .npy file
np.save('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/target_array.npy', target_array)

In [15]:
from typing import Optional, Union, List

from tsl.datasets.prototypes import TabularDataset

class DroughtDataset(TabularDataset):

    similarity_options = {'distance', 'correlation'}

    def __init__(self,
                 root: str = None,
                 freq: Optional[str] = None,
                 ):

        self.root = root

        # Load data
        target = self.load()

        super().__init__(target=target,
                        #  mask=mask,
                        #  covariates=covariates,
                        #  freq=freq,
                         similarity_score='distance',
                         temporal_aggregation='mean',
                         spatial_aggregation='mean',
                         name='DroughtDataset')

    def load(self):
        """
        Load data from files.

        Returns:
            tuple: Containing dataframe, mask, metadata, and distance matrix.
        """
        data_path = f"{self.root}target_array.npy"

        # Load main data
        target = np.load(data_path)

        return target

    def compute_similarity(self, method: str, **kwargs):
        """
        Compute similarity matrix based on the specified method.

        Args:
            method (str): The similarity computation method ('distance' or 'correlation').
            **kwargs: Additional keyword arguments for similarity computation.

        Returns:
            numpy.ndarray: Computed similarity matrix.

        Raises:
            ValueError: If an unknown similarity method is provided.
        """
        if method == "distance":
            # Calculate a Gaussian kernel similarity from the distance matrix, using a default or provided 'theta'
            theta = kwargs.get('theta', np.std(self.distances))
            return self.gaussian_kernel(self.distances, theta=theta)
        elif method == "correlation":
            # Compute the average correlation between nodes over the target features
            # Reshape target data to have nodes as columns
            target_values = self.target.values.reshape(len(self.target), -1, len(self.target_node_feature))
            # Average over the target features
            target_mean = target_values.mean(axis=2)
            # Compute correlation between nodes
            corr = np.corrcoef(target_mean, rowvar=False)
            return (corr + 1) / 2  # Normalize to [0, 1]
        else:
            raise ValueError(f"Unknown similarity method: {method}")

    @staticmethod
    def gaussian_kernel(distances, theta):
        """
        Compute Gaussian kernel similarity from distances.

        Args:
            distances (numpy.ndarray): Distance matrix.
            theta (float): Kernel bandwidth parameter.

        Returns:
            numpy.ndarray: Gaussian kernel similarity matrix.
        """
        return np.exp(-(distances ** 2) / (2 * (theta ** 2)))

In [16]:
dataset = DroughtDataset(root='/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/')

In [17]:
dataset

DroughtDataset(length=96, n_nodes=1038240, n_channels=4)

In [18]:
dataset.target

array([[[ 2.4786041e+02,  2.7145972e+02,  4.3678284e-04,  1.3783574e-07],
        [ 2.4786041e+02,  2.7145972e+02,  4.3678284e-04,  1.3783574e-07],
        [ 2.4786041e+02,  2.7145972e+02,  4.3678284e-04,  1.3783574e-07],
        ...,
        [ 2.4409088e+02,            nan,  1.0681152e-04, -5.3983182e-05],
        [ 2.4409088e+02,            nan,  1.0681152e-04, -5.3983182e-05],
        [ 2.4409088e+02,            nan,  1.0681152e-04, -5.3983182e-05]],

       [[ 2.5077211e+02,  2.7146021e+02,  5.2356720e-04, -5.9604645e-08],
        [ 2.5077211e+02,  2.7146021e+02,  5.2356720e-04, -5.9604645e-08],
        [ 2.5077211e+02,  2.7146021e+02,  5.2356720e-04, -5.9604645e-08],
        ...,
        [ 2.3352406e+02,            nan,  1.3637543e-04, -2.0563602e-05],
        [ 2.3352406e+02,            nan,  1.3637543e-04, -2.0563602e-05],
        [ 2.3352406e+02,            nan,  1.3637543e-04, -2.0563602e-05]],

       [[ 2.5326758e+02,  2.7145972e+02,  6.1416626e-04, -7.4505806e-08],
        

In [None]:
dataset.target

In [24]:
import xarray as xr
import numpy as np
import pandas as pd

# Load the dataset
# ds = xr.open_dataset('your_dataset.nc')

# For the purpose of this example, we'll assume 'ds' is already defined

# Variables to include
variables = ['t2m', 'sst', 'tp', 'pev']

# Stack latitude and longitude into 'node'
ds_stacked = ds.stack(node=('latitude', 'longitude'))

# Initialize an empty DataFrame
df = pd.DataFrame()

for var in variables:
    # Extract data variable
    da = ds_stacked[var]
    
    # Convert to DataFrame
    df_var = da.to_pandas()
    
    # Create MultiIndex columns
    df_var.columns = pd.MultiIndex.from_tuples(
        [(node, var) for node in df_var.columns],
        names=['node', 'channel']
    )
    
    # Concatenate variables
    df = pd.concat([df, df_var], axis=1)

# Set 'date' as the index if not already
if df.index.name != 'date':
    df = df.set_index('date')

# Convert to numpy array
numpy_array = df.values.reshape(df.shape[0], -1, len(variables))

print("DataFrame shape:", df.shape)
print("numpy.ndarray shape:", numpy_array.shape)


DataFrame shape: (96, 4152960)
numpy.ndarray shape: (96, 1038240, 4)


In [25]:
numpy_array

array([[[ 2.4786041e+02,  2.4786041e+02,  2.4786041e+02,  2.4786041e+02],
        [ 2.4786041e+02,  2.4786041e+02,  2.4786041e+02,  2.4786041e+02],
        [ 2.4786041e+02,  2.4786041e+02,  2.4786041e+02,  2.4786041e+02],
        ...,
        [-5.3983182e-05, -5.3983182e-05, -5.3983182e-05, -5.3983182e-05],
        [-5.3983182e-05, -5.3983182e-05, -5.3983182e-05, -5.3983182e-05],
        [-5.3983182e-05, -5.3983182e-05, -5.3983182e-05, -5.3983182e-05]],

       [[ 2.5077211e+02,  2.5077211e+02,  2.5077211e+02,  2.5077211e+02],
        [ 2.5077211e+02,  2.5077211e+02,  2.5077211e+02,  2.5077211e+02],
        [ 2.5077211e+02,  2.5077211e+02,  2.5077211e+02,  2.5077211e+02],
        ...,
        [-2.0563602e-05, -2.0563602e-05, -2.0563602e-05, -2.0563602e-05],
        [-2.0563602e-05, -2.0563602e-05, -2.0563602e-05, -2.0563602e-05],
        [-2.0563602e-05, -2.0563602e-05, -2.0563602e-05, -2.0563602e-05]],

       [[ 2.5326758e+02,  2.5326758e+02,  2.5326758e+02,  2.5326758e+02],
        

In [8]:
import dask.dataframe as dd

ddf = ds.to_dask_dataframe()


In [9]:
ddf.to_parquet('/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/02_intermediate/era5.parquet')


In [11]:
import polars as pl

df = pl.read_parquet("/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/02_intermediate/era5.parquet/part.0.parquet")

In [12]:
df

date,latitude,longitude,number,expver,t2m,sst,tp,pev,__null_dask_index__
i64,f64,f64,i64,str,f32,f32,f32,f32,i64
20150101,90.0,0.0,0,"""0001""",247.860413,271.459717,0.000437,1.3784e-7,0
20150101,90.0,0.25,0,"""0001""",247.860413,271.459717,0.000437,1.3784e-7,1
20150101,90.0,0.5,0,"""0001""",247.860413,271.459717,0.000437,1.3784e-7,2
20150101,90.0,0.75,0,"""0001""",247.860413,271.459717,0.000437,1.3784e-7,3
20150101,90.0,1.0,0,"""0001""",247.860413,271.459717,0.000437,1.3784e-7,4
…,…,…,…,…,…,…,…,…,…
20221201,-90.0,358.75,0,"""0001""",244.45224,,0.00025,-0.000031,99671035
20221201,-90.0,359.0,0,"""0001""",244.45224,,0.00025,-0.000031,99671036
20221201,-90.0,359.25,0,"""0001""",244.45224,,0.00025,-0.000031,99671037
20221201,-90.0,359.5,0,"""0001""",244.45224,,0.00025,-0.000031,99671038


In [13]:
df = df.drop(["expver", "__null_dask_index__"])

In [14]:
df

date,latitude,longitude,number,t2m,sst,tp,pev
i64,f64,f64,i64,f32,f32,f32,f32
20150101,90.0,0.0,0,247.860413,271.459717,0.000437,1.3784e-7
20150101,90.0,0.25,0,247.860413,271.459717,0.000437,1.3784e-7
20150101,90.0,0.5,0,247.860413,271.459717,0.000437,1.3784e-7
20150101,90.0,0.75,0,247.860413,271.459717,0.000437,1.3784e-7
20150101,90.0,1.0,0,247.860413,271.459717,0.000437,1.3784e-7
…,…,…,…,…,…,…,…
20221201,-90.0,358.75,0,244.45224,,0.00025,-0.000031
20221201,-90.0,359.0,0,244.45224,,0.00025,-0.000031
20221201,-90.0,359.25,0,244.45224,,0.00025,-0.000031
20221201,-90.0,359.5,0,244.45224,,0.00025,-0.000031


In [19]:
import polars as pl

# Assuming 'df' is your existing DataFrame
df = df.with_columns(
    pl.col("date")
    .cast(pl.Utf8)  # Step 1: Cast integer to string
    .str.strptime(pl.Date, format="%Y%m%d")  # Step 2: Parse string to date
)


In [20]:
df 

date,latitude,longitude,number,t2m,sst,tp,pev
date,f64,f64,i64,f32,f32,f32,f32
2015-01-01,90.0,0.0,0,247.860413,271.459717,0.000437,1.3784e-7
2015-01-01,90.0,0.25,0,247.860413,271.459717,0.000437,1.3784e-7
2015-01-01,90.0,0.5,0,247.860413,271.459717,0.000437,1.3784e-7
2015-01-01,90.0,0.75,0,247.860413,271.459717,0.000437,1.3784e-7
2015-01-01,90.0,1.0,0,247.860413,271.459717,0.000437,1.3784e-7
…,…,…,…,…,…,…,…
2022-12-01,-90.0,358.75,0,244.45224,,0.00025,-0.000031
2022-12-01,-90.0,359.0,0,244.45224,,0.00025,-0.000031
2022-12-01,-90.0,359.25,0,244.45224,,0.00025,-0.000031
2022-12-01,-90.0,359.5,0,244.45224,,0.00025,-0.000031


In [21]:
df.write_parquet("/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/03_primary/era5_data.parquet")