# merge_hourly_standardization debugging 
Demonstrates issues with the `merge_hourly_standardization` function. <br>Date: May 18, 2025

In [87]:
import xarray as xr 
import pandas as pd
import logging

from merge_hourly_standardization import merge_hourly_standardization

### 1. Read in a sample zarr and set up the input to the function
1. Make a simple logger that just prints to the console (required input to function)
2. Read in zarr as xr.Dataset 
3. Convert xr.Dataset --> pd.DataFrame (required input to function)
4. Change MultiIndex DataFrame to a flat (single-level) DataFrame, with `time` and `station` as normal columns 
5. Input DataFrame, variable attributes, and logger to `merge_hourly_standardization`

In [None]:
# Console-only root logger; merge_hourly_standardization takes a logger argument
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Open the sample station zarr store as an xr.Dataset
s3_path = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_69007093217.zarr/"
ds = xr.open_zarr(s3_path)

# Map each data variable name to its attribute dictionary
var_attrs = {name: ds[name].attrs for name in ds.data_vars}

# Dataset --> DataFrame, then move the index levels into ordinary columns
# so the frame has a flat, single-level index
df = ds.to_dataframe().reset_index()

# Run the function under investigation on the flattened frame
df_after_merge, var_attrs = merge_hourly_standardization(df, var_attrs, logger)

### 2. Observe the original DataFrame
Note how `station` and `time` are columns 

In [None]:
# Flattened input DataFrame: `station` and `time` appear as regular columns
df

Unnamed: 0,station,time,anemometer_height_m,elevation,elevation_eraqc,lat,lon,pr,pr_depth_qc,pr_duration,...,sfcWind_eraqc,sfcWind_method,sfcWind_qc,tas,tas_eraqc,tas_qc,tdps,tdps_eraqc,tdps_qc,thermometer_height_m
0,ASOSAWOS_69007093217,1991-01-02 12:00:00,,41.0,,36.683,-121.767,0.0,9.0,NaT,...,,N,1,19,,1,19,28.0,1,
1,ASOSAWOS_69007093217,1991-01-02 13:00:00,,41.0,,36.683,-121.767,,,NaT,...,,N,1,19,,1,19,28.0,1,
2,ASOSAWOS_69007093217,1991-01-02 14:00:00,,41.0,,36.683,-121.767,,,NaT,...,,N,1,19,,1,19,28.0,1,
3,ASOSAWOS_69007093217,1991-01-02 15:00:00,,41.0,,36.683,-121.767,0.0,9.0,NaT,...,,N,1,19,,1,19,28.0,1,
4,ASOSAWOS_69007093217,1991-01-02 16:00:00,,41.0,,36.683,-121.767,,,NaT,...,,N,1,19,,1,19,28.0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14264,ASOSAWOS_69007093217,1993-08-31 19:00:00,,41.0,,36.683,-121.767,,,NaT,...,,N,1,19,,1,19,28.0,1,
14265,ASOSAWOS_69007093217,1993-08-31 20:00:00,,41.0,,36.683,-121.767,,,NaT,...,,N,1,19,,1,19,28.0,1,
14266,ASOSAWOS_69007093217,1993-08-31 21:00:00,,41.0,,36.683,-121.767,,,NaT,...,,N,1,19,,1,19,28.0,1,
14267,ASOSAWOS_69007093217,1993-08-31 22:00:00,,41.0,,36.683,-121.767,,,NaT,...,,N,1,19,,1,19,28.0,1,


### 3. Observe the output DataFrame
Note how `time` is the index of the DataFrame, and many of the column names end in `_x` or `_y`. <br>`pd.merge` adds these suffixes when the two DataFrames being merged share column names that are not used as join keys.<br><br>I believe this issue is coming from this step in the `merge_hourly_standardization` function, which merges on `time` only: 

```python 
result = reduce(
        lambda left, right: pd.merge(left, right, on=["time"], how="outer"),
        result_list,
    )
```

In [99]:
# DataFrame returned by merge_hourly_standardization.
# In the output below, `time` is the index and duplicated columns carry `_x`/`_y` suffixes.
df_after_merge

Unnamed: 0_level_0,station_x,lat_x,lon_x,elevation_x,anemometer_height_m_x,thermometer_height_m_x,tas_x,tdps_x,psl_x,ps_altimeter_x,...,lat_y,lon_y,pr_y,ps_altimeter_y,psl_y,sfcWind,sfcWind_dir,tas_y,tdps_y,thermometer_height_m_y
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1991-01-02 12:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,0.0,19,19,4.1,90.0,19,19,
1991-01-02 13:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,,19,19,3.6,90.0,19,19,
1991-01-02 14:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,,19,19,3.6,100.0,19,19,
1991-01-02 15:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,0.0,19,19,2.6,100.0,19,19,
1991-01-02 16:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,,19,19,3.1,90.0,19,19,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993-08-31 19:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,,19,19,3.6,300.0,19,19,
1993-08-31 20:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,,19,19,5.1,290.0,19,19,
1993-08-31 21:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,,19,19,5.6,310.0,19,19,
1993-08-31 22:00:00,ASOSAWOS_69007093217,36.683,-121.767,41.0,,,19.0,19.0,19.0,19.0,...,36.683,-121.767,,19,19,6.1,300.0,19,19,
