In [1]:
import xarray as xr 

# Load the raw ERA5 monthly-averaged single-levels NetCDF file into `ds`.
# The path is relative, so it is resolved against the kernel's current working directory.
ds = xr.open_dataset('ml-drought-forecasting/soil-water-forecasting/data/01_raw/ERA5_monthly_averaged_data_on_single_levels.nc')

In [2]:
# Display the freshly loaded dataset so its contents can be inspected.
ds

# Format datatype

In [3]:
import pandas as pd

# Convert the 'date' coordinate to datetime: the values are rendered as strings
# and parsed with the YYYYMMDD pattern (e.g. "20160101").
# NOTE(review): this overwrites ds['date'] in place, so re-running the cell on
# already-converted values will presumably no longer match '%Y%m%d' — confirm.
ds['date'] = pd.to_datetime(ds['date'].astype(str), format='%Y%m%d')

# Fill NaN values

In [4]:
# INTO 3.data_preprocessing.ipynb 

import xarray as xr
from typing import List, Union

def fillna_in_variables(
    ds: xr.Dataset,
    variables: List[str],
    fill_value: Union[int, float]
) -> xr.Dataset:
    """
    Replace missing (NaN) entries in selected variables of a dataset.

    Every requested name is validated before any work is done; the input
    dataset is then copied and only the requested variables are replaced
    by their NaN-filled versions.

    Args:
        ds (xr.Dataset): Dataset holding the variables to process.
        variables (List[str]): Names of the variables whose NaN values should be filled.
        fill_value (Union[int, float]): Replacement for each NaN (e.g., 0).

    Returns:
        xr.Dataset: A copy of `ds` in which the listed variables contain no NaN
        values; variables that were not listed are carried over unchanged.

    Raises:
        ValueError: If any name in `variables` is not present in `ds`.
            The first missing name (in list order) is reported.

    Example:
        filled_ds = fillna_in_variables(ds, ["t2m", "swvl1"], fill_value=0)
    """
    # Validate up front so nothing is copied when a name is wrong.
    for var in variables:
        if var in ds:
            continue
        raise ValueError(f"Variable '{var}' not found in the dataset.")

    # Work on a copy and swap in the filled version of each requested variable.
    result = ds.copy()
    for name in variables:
        original_values = result[name]
        result[name] = original_values.fillna(fill_value)

    return result

# Replace every NaN in the 'sst' variable with 0; all other variables are left as they are.
# NOTE(review): the linear interpolation performed later will presumably blend these
# zeros with neighbouring real values — confirm that this is intended.
ds = fillna_in_variables(ds, ["sst"], fill_value=0)

# Drop irrelevant variables

# Interpolation

In [5]:
import numpy as np
import xarray as xr

# Step 1: Build the target axes at 1° spacing
target_lat = np.arange(-90, 90.1, 1)   # -90 to 90 degrees inclusive (the 90.1 stop only serves to keep 90 in the range)
target_lon = np.arange(0, 360, 1)      # 0 to 359 degrees inclusive (the stop value 360 is excluded)

# Step 2: Wrap the target axes in a Dataset (optional, for reference only;
# the interpolation call below uses the two arrays directly)
target_grid = xr.Dataset(
    dict(
        latitude=(["latitude"], target_lat),
        longitude=(["longitude"], target_lon),
    )
)

# Step 3: Put latitude in ascending order when it is stored descending
# (i.e. the last latitude value is smaller than the first)
if ds.latitude[-1] < ds.latitude[0]:
    ds = ds.sortby("latitude")

# Step 4: Linearly interpolate every variable onto the target axes;
# `ds` is replaced by the regridded dataset
ds = ds.interp({"latitude": target_lat, "longitude": target_lon}, method="linear")


In [6]:
# Display the dataset after interpolation to check the new latitude/longitude axes.
ds

# Smaller data range for testing 

In [7]:
# Keep only the entries whose 'date' label falls in the range 2016-01-01 to 2023-12-01.
# `ds` is overwritten, so dates outside this range are no longer available in later cells.
ds = ds.sel(date=slice("2016-01-01", "2023-12-01"))


In [8]:
# Display the dataset after the date-range selection.
ds

In [9]:
# Write the preprocessed dataset to the intermediate-data folder as NetCDF.
# The path is relative to the working directory — the same convention used when
# the raw file is opened at the top of this notebook — instead of an absolute,
# machine-specific location, so the notebook also runs outside the original workspace.
ds.to_netcdf('ml-drought-forecasting/soil-water-forecasting/data/02_intermediate/preprocessed_data.nc')