In [38]:
import xarray as xr

example = xr.open_dataset('C:/Users/23603526/Documents/GitHub/WeatherForecasting/data\era5_ground_truth_1464x49x69.nc')

base = xr.open_dataset('C:/Users/23603526/Documents/GitHub/WeatherForecasting/data\era5_2015_2020.nc')
base = base.sortby('latitude', ascending=True)
base.load()

In [39]:
# Drop expver and number dimension

base = base.drop('expver')
base = base.drop('number')


  base = base.drop('expver')
  base = base.drop('number')


In [40]:
# Rename valid_time dimension to time

base = base.rename({'valid_time': 'time'})

In [42]:
base['wind_speed'] = (base.u**2 + base.v**2)**0.5

In [43]:
training = base.sel(time=slice('2015-01-01', '2019-12-31'))
testing = base.sel(time=slice('2020-01-01', '2020-12-31'))

In [45]:
training.to_netcdf('C:/Users/23603526/Documents/GitHub/WeatherForecasting/data/era5_training.nc')

In [47]:
testing.to_netcdf('C:/Users/23603526/Documents/GitHub/WeatherForecasting/data/era5_testing.nc')

In [None]:
from torch.utils.data import Dataset
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from typing import Tuple

class WeatherDataset(Dataset):
    def __init__(self, dataset, window_size: int = 5, steps: int = 1, data_split: str = 'train') -> None:
        self.dataset = dataset

        self.min_value = self.dataset.wind_speed.values.min()
        self.max_value = self.dataset.wind_speed.values.max()

        self.mean_value = self.dataset.wind_speed.values.mean()
        self.std_value = self.dataset.wind_speed.values.std()    

        self.window_size = window_size
        self.steps = steps
        self.data_split = data_split    


    def __len__(self) -> int:

        if self.data_split == 'train':
            dataset_length = len(self.X_train)
        elif self.data_split == 'val':
            dataset_length = len(self.X_val)
        elif self.data_split == 'test':
            dataset_length = len(self.X_test)
        else:
            raise ValueError("data_split must be 'train', 'val', or 'test'")
        
        total_window_size = (self.window_size + self.steps)  * self.intervals
        num_windows = dataset_length - total_window_size + self.intervals  
        
        return max(0, num_windows)  


    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

        if self.data_split == 'train':
            x = self.X_train_t[idx:idx + self.window_size]
            F = self.F_train_t[idx + self.window_size]
            y = self.X_train_t[idx + self.window_size:idx + self.window_size + self.steps] 
            return x, F, y
        elif self.data_split == 'val':
            x = self.X_val_t[idx:idx + self.window_size]
            F = self.F_val_t[idx + self.window_size]
            y = self.X_val_t[idx + self.window_size:idx + self.window_size + self.steps] 
            return x, F, y
        elif self.data_split == 'test':
            x = self.X_test_t[idx:idx + self.window_size]
            F = self.F_test_t[idx + self.window_size]
            y = self.X_test_t[idx + self.window_size:idx + self.window_size + self.steps] 
            return x, F, y
        else:
            raise ValueError("data_split must be 'train', 'val', or 'test'")


    def split_data(self, test_size: float = 0.1, val_size: float = 0.2, random_state: int = 42) -> None:
        
        
        data = self.dataset.wind_speed.values
        forcings = np.stack([self.dataset.time.dt.hour.values, self.dataset.time.dt.month.values], axis=-1)
        time_values = self.dataset.time.values

        # Split the data into train, validation, and test sets

        self.X_train, self.X_test, self.F_train, self.F_test, self.T_train, self.T_test = train_test_split(data, forcings, time_values, test_size=test_size, shuffle=False)

        self.X_train, self.X_val, self.F_train, self.F_val, self.T_train, self.T_val = train_test_split(self.X_train, self.F_train, self.T_train, test_size=val_size, shuffle=False)
    

    def normalize_data(self, method: str = 'min_max') -> None:
        """
        Normalizes the training, validation, and testing data using mean and standard deviation.

        Returns:
            None: Updates the instance attributes with normalized data as tensors.
        """

        self.X_train_t = self.normalize(self.X_train, method)
        self.X_val_t = self.normalize(self.X_val, method)
        self.X_test_t = self.normalize(self.X_test, method)

        # Convert to tensors
        self.X_train_t = torch.tensor(self.X_train_t).float()

        self.X_val_t = torch.tensor(self.X_val_t).float()

        self.X_test_t = torch.tensor(self.X_test_t).float()

        # Convert forcings to tensors
        self.F_train_t = torch.tensor(self.F_train).float()
        self.F_val_t = torch.tensor(self.F_val).float()
        self.F_test_t = torch.tensor(self.F_test).float()


    def normalize(self, data: np.ndarray, method: str = 'avg_std') -> np.ndarray:
        if method == 'min_max':
            return (data - self.min_value) / (self.max_value - self.min_value)
        else:
            return (data - self.mean_value) / self.std_value


In [25]:
weather_dataset = WeatherDataset(base)
weather_dataset.split_data()
weather_dataset.normalize_data()