# Explore Merging Inputs into Xarray Datasets

# Imports
Using the environment that you set up in the example notebook, run the following.


In [30]:
import sys
import os
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

# Demo Merging Inputs into Single DataSet
Variations on Paul's refactoring branch: https://github.com/EcohydrologyTeam/ClearWater-modules/blob/bd0c52267f4000303c1d593f7975ad8fc5423788/src/clearwater_modules_v2/run_dev.py 

In [2]:
# Show the working directory; the default DATA_ROOT below is built relative to it.
Path.cwd()

PosixPath('/Users/aaufdenkampe/Documents/Python/ClearWater-modules/examples/dev_sandbox')

## Read CSVs & Process into Dataframes

In [3]:
# Model run name; appended as the last path component of DATA_ROOT below.
model_name = 'sumwere_creek_coarse_p48'

In [4]:
# Locate the input data directory for whoever is running the notebook.
user = "anthony"

if user != "paul":
    # Default: a Clearwater-riverine checkout located three directory levels
    # above the current working directory.
    DATA_ROOT = (
        Path.cwd().parent.parent.parent
        / "Clearwater-riverine/examples/data_temp"
        / model_name
    )
else:
    # Paul's local Windows checkout.
    DATA_ROOT = (
        Path(r"C:\Users\ptomasula\Repositories\ClearWater-modules\data_temp")
        / model_name
    )
# Displays True when the resolved directory is present on this machine.
DATA_ROOT.exists()

True

In [5]:
def process_csv_inputs(
    csv_path: os.PathLike,
    index_col: int | str = 0,
) -> pd.DataFrame: 
    df = pd.read_csv(csv_path, index_col=index_col, parse_dates=True)
    df.dropna(axis="index", how="all", inplace=True)
    frequency = pd.infer_freq(df.index)
    df.index.freq = frequency
    df.index.rename(f"time_{frequency}", inplace=True)
    return df


In [6]:
# Air temperature: 5-minute observations, such as from a weather station.
# NOTE(review): the displayed values (~65-72) look like degF although the
# column is named TairC -- confirm the units.
# NOTE(review): the file suffix is p28 while model_name ends in p48 -- confirm.
air_temperature_df = process_csv_inputs(DATA_ROOT / "cwr_boundary_conditions_TairC_p28.csv")
air_temperature_df.info()
air_temperature_df

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1440 entries, 2022-05-12 00:00:00 to 2022-05-16 23:55:00
Freq: 5min
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TairC   1440 non-null   float64
dtypes: float64(1)
memory usage: 22.5 KB


Unnamed: 0_level_0,TairC
time_5min,Unnamed: 1_level_1
2022-05-12 00:00:00,66.0
2022-05-12 00:05:00,66.0
2022-05-12 00:10:00,66.0
2022-05-12 00:15:00,66.0
2022-05-12 00:20:00,65.0
...,...
2022-05-16 23:35:00,72.0
2022-05-16 23:40:00,72.0
2022-05-16 23:45:00,71.0
2022-05-16 23:50:00,72.0


In [7]:
# Solar radiation: hourly estimates, such as from a climate reanalysis
# dataset (e.g. ERA5, NLDAS, or CONUS404).
solar_df = process_csv_inputs(DATA_ROOT / "cwr_boundary_conditions_q_Solar_p28.csv")
solar_df.info()
solar_df

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 168 entries, 2022-05-10 17:00:00 to 2022-05-17 16:00:00
Freq: h
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   q_Solar  168 non-null    float64
dtypes: float64(1)
memory usage: 2.6 KB


Unnamed: 0_level_0,q_Solar
time_h,Unnamed: 1_level_1
2022-05-10 17:00:00,447.101675
2022-05-10 18:00:00,239.106842
2022-05-10 19:00:00,60.404514
2022-05-10 20:00:00,0.090223
2022-05-10 21:00:00,0.000000
...,...
2022-05-17 12:00:00,927.736352
2022-05-17 13:00:00,948.337344
2022-05-17 14:00:00,905.315856
2022-05-17 15:00:00,802.671790


In [8]:
# Add other required hourly meteorological boundary conditions.
# Each new column is a constant value, except vapor pressure, which is the
# pressure column minus unit-variance noise.
met_hourly_df = solar_df.copy()
met_hourly_df["cloudiness"] = 0.1
met_hourly_df["wind_speed"] = 3.0
met_hourly_df["atmospheric_pressure"] = 1013.0
# A seeded generator (arbitrary fixed seed 42) makes the noise -- and therefore
# the displayed table -- reproducible on Restart & Run All.
# NOTE(review): vapor pressure is set to roughly atmospheric pressure (~1013);
# presumably a placeholder -- confirm the intended magnitude and units.
met_hourly_df["atmospheric_vapor_pressure"] = (
    met_hourly_df["atmospheric_pressure"]
    - np.random.default_rng(42).normal(0, 1, size=len(met_hourly_df))
)
met_hourly_df.info()
met_hourly_df

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 168 entries, 2022-05-10 17:00:00 to 2022-05-17 16:00:00
Freq: h
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   q_Solar                     168 non-null    float64
 1   cloudiness                  168 non-null    float64
 2   wind_speed                  168 non-null    float64
 3   atmospheric_pressure        168 non-null    float64
 4   atmospheric_vapor_pressure  168 non-null    float64
dtypes: float64(5)
memory usage: 7.9 KB


Unnamed: 0_level_0,q_Solar,cloudiness,wind_speed,atmospheric_pressure,atmospheric_vapor_pressure
time_h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-05-10 17:00:00,447.101675,0.1,3.0,1013.0,1013.432787
2022-05-10 18:00:00,239.106842,0.1,3.0,1013.0,1013.727130
2022-05-10 19:00:00,60.404514,0.1,3.0,1013.0,1013.867946
2022-05-10 20:00:00,0.090223,0.1,3.0,1013.0,1012.757997
2022-05-10 21:00:00,0.000000,0.1,3.0,1013.0,1012.491687
...,...,...,...,...,...
2022-05-17 12:00:00,927.736352,0.1,3.0,1013.0,1014.060578
2022-05-17 13:00:00,948.337344,0.1,3.0,1013.0,1013.741071
2022-05-17 14:00:00,905.315856,0.1,3.0,1013.0,1012.782859
2022-05-17 15:00:00,802.671790,0.1,3.0,1013.0,1013.867833


In [47]:
# Convert the hourly frame to an xarray Dataset and print the Dataset's total
# size in bytes, as a first check of whether the time coordinate is shared
# between the variables rather than stored once per variable.
met_hourly_ds = met_hourly_df.to_xarray()
print(met_hourly_ds.nbytes)
met_hourly_ds

8064


### Check sizes

In [58]:
# Bytes reported for the hourly time coordinate.
met_hourly_ds.time_h.nbytes

1344

In [59]:
# Bytes reported for a single data variable (q_Solar).
met_hourly_ds.q_Solar.nbytes

1344

In [65]:
# Same variable, measured on the array returned by .values.
met_hourly_ds.q_Solar.values.nbytes

1344

In [64]:
# Confirm that the size of the dataset equals the sum of its data arrays plus
# the time coordinate counted only once. The figure is derived from the dataset
# itself instead of re-typing numbers copied from the outputs above.
met_hourly_ds.time_h.nbytes + sum(
    data_array.nbytes for data_array in met_hourly_ds.data_vars.values()
)

8064

### Check Memory addresses

In [56]:
# Identity of the object returned when the coordinate is taken from the Dataset.
id(met_hourly_ds.time_h)

4982576800

In [55]:
# Identity of the object returned when the coordinate is reached via q_Solar.
id(met_hourly_ds.q_Solar.time_h)

4980861520

In [57]:
# Identity of the object returned when the coordinate is reached via cloudiness.
id(met_hourly_ds.cloudiness.time_h)

4980857824

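NOTE: It looks like there is a different memory address for the time coordinate when it is accessed via an xr.DataArray. Because `id()` identifies the Python object returned by each access rather than the underlying data buffer, this alone does not show whether the coordinate values are duplicated in memory; a check such as `np.shares_memory` on the `.values` arrays would be needed to confirm.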

## Convert to Xarray Datasets & Merge

In [28]:
# Frames to merge: 5-minute air temperature and hourly meteorology, each with
# its own time index.
df_list = [air_temperature_df, met_hourly_df]

In [29]:
# Turn every frame into its own Dataset, then combine them into a single
# Dataset with xr.merge.
met_bc_ds = xr.merge([frame.to_xarray() for frame in df_list])
met_bc_ds

In [None]:
# Total bytes reported for the merged Dataset.
met_bc_ds.nbytes

31104

In [33]:
# Bytes reported for the 5-minute time coordinate of the merged Dataset.
met_bc_ds.time_5min.nbytes

11520

In [44]:
# Bytes of the hourly time coordinate's values array in the merged Dataset.
met_bc_ds.time_h.values.nbytes

1344

# Demo Multiple Time Coordinates in a Merged Dataset
and loop through the different variables

## Load Xarray Tutorial Datasets

In [3]:
# Open xarray's 'air_temperature' tutorial dataset.
# NOTE(review): presumably fetched/cached on first use -- confirm it is
# available offline if this notebook must run without network access.
input_ds = xr.tutorial.open_dataset('air_temperature')
input_ds

## Create new DataArray with downsampled time dimension

In [4]:
# Build a daily-mean DataArray on its own 'day' dimension: resample to 1-day
# means, subtract 273.15, then relabel the dimension, the array name and the
# units in one chain.
daily_da = (
    (input_ds.air.resample(time='1D').mean() - 273.15)
    .rename({'time': 'day'})
    .rename('temp')
    .assign_attrs(units='degC')
)
daily_da

In [5]:
# Add the daily DataArray to the input dataset as variable 'temp'. This
# modifies input_ds in place; the loop further down reads input_ds.day.
input_ds['temp'] = daily_da
input_ds

## Run a Process along Two Time Coordinates

In [10]:
# Print every step of the 'time' coordinate that is also a 'day' step.
# The day values are extracted once, outside the loop, instead of being
# rebuilt from the Dataset on every iteration.
day_values = input_ds.day.values
for step in input_ds.time.values:
    if step in day_values:
        print(step)

2013-01-01T00:00:00.000000000
2013-01-02T00:00:00.000000000
2013-01-03T00:00:00.000000000
2013-01-04T00:00:00.000000000
2013-01-05T00:00:00.000000000
2013-01-06T00:00:00.000000000
2013-01-07T00:00:00.000000000
2013-01-08T00:00:00.000000000
2013-01-09T00:00:00.000000000
2013-01-10T00:00:00.000000000
2013-01-11T00:00:00.000000000
2013-01-12T00:00:00.000000000
2013-01-13T00:00:00.000000000
2013-01-14T00:00:00.000000000
2013-01-15T00:00:00.000000000
2013-01-16T00:00:00.000000000
2013-01-17T00:00:00.000000000
2013-01-18T00:00:00.000000000
2013-01-19T00:00:00.000000000
2013-01-20T00:00:00.000000000
2013-01-21T00:00:00.000000000
2013-01-22T00:00:00.000000000
2013-01-23T00:00:00.000000000
2013-01-24T00:00:00.000000000
2013-01-25T00:00:00.000000000
2013-01-26T00:00:00.000000000
2013-01-27T00:00:00.000000000
2013-01-28T00:00:00.000000000
2013-01-29T00:00:00.000000000
2013-01-30T00:00:00.000000000
2013-01-31T00:00:00.000000000
2013-02-01T00:00:00.000000000
2013-02-02T00:00:00.000000000
2013-02-03