# Model input data prep
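In this notebook we prepare the following 5 files for our model:

- target - our target variable (`swvl1`)
- mask - marks where the target is available (1) and where it is NaN (0)
- covariates - features
- metadata - latitude and longitude of every node
- distance matrix - pairwise geographical distances between the nodes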

## Load data

In [None]:
import os

import numpy as np
import pandas as pd
import xarray as xr

# Location of the engineered-features NetCDF file. The default is the original
# machine-specific absolute path; set the DROUGHT_FEATURES_PATH environment
# variable to point at the file when running on any other machine.
features_path = os.environ.get(
    'DROUGHT_FEATURES_PATH',
    '/Users/adamprzychodni/Documents/Repos/ml-drought-forecasting/ml-modeling-pipeline/data/04_feature/features.nc',
)
ds = xr.open_dataset(features_path)

In [None]:
# Inspect the loaded dataset.
ds

## Target

In [None]:
import numpy as np

# Variables that form the prediction target; each entry becomes one channel.
targets = ['swvl1']
data_arrays = [ds[name] for name in targets]

# Collapse the latitude/longitude grid into a single stacked "node" dimension,
# so that every grid cell is addressed as one node.
data_arrays_flattened = [
    field.stack(node=('latitude', 'longitude')) for field in data_arrays
]

# Convert every stacked variable to a numpy array and join them along a new
# trailing channel axis. Expected layout is (time, nodes, channels), assuming
# each variable is indexed by time, latitude and longitude - TODO confirm.
target = np.stack(
    tuple(field.to_numpy() for field in data_arrays_flattened),
    axis=-1,
)

In [None]:
# Preview the target array built in the previous cell.
target

In [None]:
# Save as .npy file.
# np.save does not create missing folders, and this is the first cell that
# writes into 05_model_input, so make sure the directory exists beforehand.
target_path = 'ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/target.npy'
os.makedirs(os.path.dirname(target_path), exist_ok=True)
np.save(target_path, target)

## Mask 

In [None]:
# Binary availability mask with the same shape as target:
# 1 where a target value is present, 0 where it is NaN.
mask = np.logical_not(np.isnan(target)).astype(int)

In [None]:
# Preview the mask (1 = target value present, 0 = target is NaN).
mask

In [None]:
# Save as .npy file.
# np.save does not create missing folders, so create the output directory
# first if it is not there yet (no-op when it already exists).
mask_path = 'ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/mask.npy'
os.makedirs(os.path.dirname(mask_path), exist_ok=True)
np.save(mask_path, mask)

## Covariates 

In [None]:
import numpy as np
import xarray as xr

# Covariate (feature) variables read from ds, in the order they will appear
# along the channel axis. 'swvl1' is intentionally left out of this list
# because it is the target variable.
variables = [
    't2m', 'd2m', 'msl', 'sp', 'sst', 'skt', 'e', 'pev', 'mlspr', 'ro', 'slt',
    'stl1', 'cvh', 'lai_hv', 'cvl', 'tcc', 'mper', 'tco3', 'lsm',
]
data_arrays = [ds[name] for name in variables]

# Merge latitude and longitude into one stacked "node" dimension.
data_arrays_flattened = [
    field.stack(node=('latitude', 'longitude')) for field in data_arrays
]

# One numpy array per variable, joined along a new trailing channel axis
# (expected layout: (time, nodes, channels)), with every NaN replaced by 0.0
# so that the model never receives missing covariate values.
covariates = np.nan_to_num(
    np.stack([field.to_numpy() for field in data_arrays_flattened], axis=-1),
    nan=0.0,
)

In [None]:
# Preview the covariate array (NaNs have already been replaced with 0.0).
covariates

In [None]:
# Save as .npy file.
# np.save does not create missing folders, so create the output directory
# first if it is not there yet (no-op when it already exists).
covariates_path = 'ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/covariates.npy'
os.makedirs(os.path.dirname(covariates_path), exist_ok=True)
np.save(covariates_path, covariates)

## Metadata

In [None]:
import pandas as pd
import os

def generate_and_save_metadata(df: pd.DataFrame, lat_col: str = 'lat', lon_col: str = 'lon', save_directory: str = None) -> pd.DataFrame:
    """
    Build a node lookup table from coordinate columns and write it to Parquet.

    Every distinct (latitude, longitude) pair found in ``df`` becomes one row.
    Rows are numbered from 0 in order of first appearance, and that number is
    used as the ``node_id`` index of the result. The table is always written
    to a file named ``metadata.parquet``.

    Parameters:
    - df (pd.DataFrame): Frame holding at least the two coordinate columns.
    - lat_col (str): Name of the latitude column.
    - lon_col (str): Name of the longitude column.
    - save_directory (str, optional): Directory that receives ``metadata.parquet``;
                                      it is created when missing. If None or empty,
                                      the file goes to the current working directory.

    Returns:
    - pd.DataFrame: The metadata, indexed by 'node_id', with the latitude and
                    longitude columns.

    Raises:
    - ValueError: If either coordinate column is absent from ``df``.
    """
    # Refuse to continue unless both coordinate columns are available.
    has_coords = lat_col in df.columns and lon_col in df.columns
    if not has_coords:
        raise ValueError(f"DataFrame must contain '{lat_col}' and '{lon_col}' columns.")

    # Keep one row per coordinate pair, renumber from 0, and promote that
    # running number to the 'node_id' index.
    metadata = (
        df[[lat_col, lon_col]]
        .drop_duplicates()
        .reset_index(drop=True)
        .assign(node_id=lambda coords: coords.index)
        .set_index('node_id')
    )

    # Resolve the destination, creating the target directory when one is given.
    file_name = "metadata.parquet"
    if save_directory:
        os.makedirs(save_directory, exist_ok=True)
        file_path = os.path.join(save_directory, file_name)
    else:
        file_path = file_name

    metadata.to_parquet(file_path)
    print(f"Metadata file saved at: {file_path}")

    return metadata


In [None]:
import pandas as pd
import os

# This cell relies on data_arrays_flattened left over from an earlier cell.
# Only the node coordinates are needed here, so the first stacked DataArray
# is used (presumably all of them share the same 'node' index).
da_flattened = data_arrays_flattened[0]
node_index = da_flattened.indexes['node']

# Pull both levels of the 'node' MultiIndex out as plain numpy arrays.
latitudes, longitudes = (
    node_index.get_level_values(level).values
    for level in ('latitude', 'longitude')
)

# One row per node, using the column names generate_and_save_metadata expects.
df = pd.DataFrame({'lat': latitudes, 'lon': longitudes})

# Assign node ids and write metadata.parquet into the model-input folder.
metadata = generate_and_save_metadata(
    df,
    save_directory='ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/',
    lon_col='lon',
    lat_col='lat',
)


In [None]:
import pandas as pd
# Reload the metadata from the Parquet file on disk.
metadata_path = "ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/metadata.parquet"
metadata = pd.read_parquet(metadata_path)

In [None]:
# Inspect the node metadata reloaded from Parquet.
metadata

In [None]:
# Convert the metadata DataFrame to a numpy ndarray.
# node_id is the index (set in generate_and_save_metadata), so only the
# coordinate columns end up in the array: one [lat, lon] row per node.
metadata_array = metadata.to_numpy()

In [None]:
# Persist the coordinate array next to the other model inputs as a .npy file.
metadata_npy_path = 'ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/metadata.npy'
np.save(metadata_npy_path, metadata_array)

## Distance matrix 

In [None]:
from tsl.ops.similarities import geographical_distance
# Calculate geographical distances between all nodes.
# metadata_array holds one [lat, lon] row per node. to_rad=True asks tsl to
# convert the coordinates to radians first, i.e. the values are assumed to be
# in degrees - TODO confirm the dataset's coordinate units.
# NOTE(review): the result is presumably a dense (nodes x nodes) matrix, so its
# size grows quadratically with the number of grid cells.
distances = geographical_distance(metadata_array, to_rad=True)

In [None]:
# Preview the distances returned by geographical_distance.
distances

In [None]:
# Persist the distance matrix next to the other model inputs as a .npy file.
distances_path = 'ml-drought-forecasting/ml-modeling-pipeline/data/05_model_input/distances.npy'
np.save(distances_path, distances)