# Calculating weighted time series on regridded WOA data
**Author:** Denisse Fierro Arcos  
**Date:** 2024-10-04 

## Loading libraries

In [2]:
import xarray as xr
import xesmf as xe
import pandas as pd
from glob import glob
import geopandas as gpd
import os
import numpy as np
import rioxarray

## Loading WOA data

In [2]:
#Monthly climatological mean temperature ("t_an") from the WOA23 zarr store
temp_month = xr.open_zarr(
    os.path.join(
        '/g/data/vf71/WOA_data/global',
        'woa23_month_clim_mean_temp_1981-2010.zarr/'
    )
)['t_an']
temp_month

Unnamed: 0,Array,Chunk
Bytes,2.64 GiB,75.15 MiB
Shape,"(12, 57, 720, 1440)","(12, 57, 120, 240)"
Dask graph,36 chunks in 2 graph layers,36 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.64 GiB 75.15 MiB Shape (12, 57, 720, 1440) (12, 57, 120, 240) Dask graph 36 chunks in 2 graph layers Data type float32 numpy.ndarray",12  1  1440  720  57,

Unnamed: 0,Array,Chunk
Bytes,2.64 GiB,75.15 MiB
Shape,"(12, 57, 720, 1440)","(12, 57, 120, 240)"
Dask graph,36 chunks in 2 graph layers,36 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Loading sample GFDL data

In [3]:
#Sample GFDL climatology ("thetao") used as the target for depth levels and grid
gfdl = xr.open_zarr(
    os.path.join(
        '/g/data/vf71/fishmip_inputs/ISIMIP3a/global_inputs/obsclim',
        '025deg/comp_clim_woa',
        'gfdl-mom6-cobalt2_obsclim_global_clim_mean_temp_1981_2010.zarr/'
    )
)['thetao']
gfdl

Unnamed: 0,Array,Chunk
Bytes,138.43 MiB,5.54 MiB
Shape,"(35, 720, 1440)","(35, 144, 288)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 138.43 MiB 5.54 MiB Shape (35, 720, 1440) (35, 144, 288) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  720  35,

Unnamed: 0,Array,Chunk
Bytes,138.43 MiB,5.54 MiB
Shape,"(35, 720, 1440)","(35, 144, 288)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Interpolate depth values to match GFDL

In [17]:
#Interpolate WOA data along "depth" onto the depth levels used by GFDL
temp_month = temp_month.interp(depth = gfdl['depth'].values)
temp_month

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,46.14 MiB
Shape,"(12, 35, 720, 1440)","(12, 35, 120, 240)"
Dask graph,36 chunks in 10 graph layers,36 chunks in 10 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.62 GiB 46.14 MiB Shape (12, 35, 720, 1440) (12, 35, 120, 240) Dask graph 36 chunks in 10 graph layers Data type float32 numpy.ndarray",12  1  1440  720  35,

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,46.14 MiB
Shape,"(12, 35, 720, 1440)","(12, 35, 120, 240)"
Dask graph,36 chunks in 10 graph layers,36 chunks in 10 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Calculate regridder 

In [15]:
#Conservative regridder going from the WOA grid to the GFDL grid
reg = xe.Regridder(
    ds_in = temp_month,
    ds_out = gfdl,
    method = 'conservative'
)
reg

--------------------------------------------------------------------------
but there are no active ports detected (or Open MPI was unable to use
them).  This is most certainly not what you wanted.  Check your
cables, subnet manager configuration, etc.  The openib BTL will be
ignored for this job.

  Local host: gadi-cpu-bdw-0014
--------------------------------------------------------------------------
  lon_bnds = ds.cf.get_bounds('longitude')


xESMF Regridder 
Regridding algorithm:       conservative 
Weight filename:            conservative_720x1440_720x1440.nc 
Reuse pre-computed weights? False 
Input grid shape:           (720, 1440) 
Output grid shape:          (720, 1440) 
Periodic in longitude?      False

## Apply regridder

In [18]:
#Regrid WOA data, asking for (144, 288) chunks along the output spatial axes
temp_woa_reg = reg(
    indata = temp_month,
    output_chunks = (144, 288)
)
temp_woa_reg

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,66.45 MiB
Shape,"(12, 35, 720, 1440)","(12, 35, 144, 288)"
Dask graph,25 chunks in 18 graph layers,25 chunks in 18 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.62 GiB 66.45 MiB Shape (12, 35, 720, 1440) (12, 35, 144, 288) Dask graph 25 chunks in 18 graph layers Data type float32 numpy.ndarray",12  1  1440  720  35,

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,66.45 MiB
Shape,"(12, 35, 720, 1440)","(12, 35, 144, 288)"
Dask graph,25 chunks in 18 graph layers,25 chunks in 18 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Change month coordinate to month names

In [19]:
#Label the "month" coordinate with full month names produced by '%B'
temp_woa_reg['month'] = (
    pd.date_range('1981-01-01', freq = 'MS', periods = 12)
    .strftime('%B')
)
temp_woa_reg

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,66.45 MiB
Shape,"(12, 35, 720, 1440)","(12, 35, 144, 288)"
Dask graph,25 chunks in 18 graph layers,25 chunks in 18 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.62 GiB 66.45 MiB Shape (12, 35, 720, 1440) (12, 35, 144, 288) Dask graph 25 chunks in 18 graph layers Data type float32 numpy.ndarray",12  1  1440  720  35,

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,66.45 MiB
Shape,"(12, 35, 720, 1440)","(12, 35, 144, 288)"
Dask graph,25 chunks in 18 graph layers,25 chunks in 18 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Save results

In [20]:
#Write regridded data to a zarr store, overwriting any existing store (mode "w")
temp_woa_reg.to_zarr(
    os.path.join(
        '/g/data/vf71/WOA_data/global',
        'regridded_woa_month_clim_mean_temp_1981-2010.zarr'
    ),
    mode = 'w',
    consolidated = True
)

<xarray.backends.zarr.ZarrStore at 0x14f0af68d740>

## Load supporting files

In [3]:
#Loading FishMIP regional models shapefile
rmes = gpd.read_file(
    os.path.join('/g/data/vf71/shared_resources/FishMIP_regional_models', 
                 'FishMIP_regional_models.shp'))
#Loading FishMIP regional models gridded mask
mask_ras = xr.open_dataset(
    os.path.join('/g/data/vf71/shared_resources/FishMIPMasks',
                 'merged_regional_fishmip', 
                 'gfdl-mom6-cobalt2_areacello_15arcmin_fishMIP_regional_merged.nc')).region
#Renaming coordinate dimensions
mask_ras = mask_ras.rename({'latitude': 'lat', 'longitude': 'lon'})
#Rechunking data to make it more manageable
mask_ras = mask_ras.chunk({'lat': 144, 'lon': 288})

#Getting a list of all regridded monthly WOA zarr files available
#Sorted so files are processed in the same order on every run
WOA_zarr = sorted(glob('/g/data/vf71/WOA_data/global/regridded*month*.zarr'))

#Define (or create) folders where outputs will be stored
base_out_month = '/g/data/vf71/WOA_data/regional/monthly/regridded'
base_out_clim = '/g/data/vf71/WOA_data/regional/climatology/comp_clim/'
#Both folders are created because either can be chosen as output destination
for folder_out in (base_out_month, base_out_clim):
    os.makedirs(folder_out, exist_ok = True)

## Define supporting functions

In [5]:
def mask_boolean_ard_data(file_path, boolean_mask_ras):
    '''
    Open a zarr store as an analysis ready data (ARD) data array. That is, 
    apply the same horizontal chunks as the gridded mask and keep only grid 
    cells where the mask equals 1.
    
    Inputs:
    file_path (character): Full file path where the zarr store is located. The
    store must contain exactly one data variable with "depth", "lat" and "lon"
    dimensions (and optionally a "month" dimension with 12 entries).
    boolean_mask_ras (boolean data array): Data array to be used as initial mask
    to decrease the size of the original dataset. This mask makes no distinction
    between regional models, it simply identifies grid cells within regional 
    model boundaries with the value of 1. It must be chunked, with a single 
    chunk size along "lat" and a single chunk size along "lon".
    
    Outputs:
    da (data array): ARD data array containing data only for grid cells within
    regional model boundaries.
    '''

    #Getting chunks from gridded mask to apply it to model data array
    #Unpacking into a one-item list raises a ValueError if the mask has more 
    #than one distinct chunk size along a dimension
    [lat_chunk] = np.unique(boolean_mask_ras.chunksizes['lat'])
    [lon_chunk] = np.unique(boolean_mask_ras.chunksizes['lon'])
    
    #Open dataset and select its only data variable
    ds = xr.open_zarr(file_path)
    [var] = list(ds.data_vars)
    da = ds[var]
    #Ensure horizontal chunks are the same as mask. A chunk size of -1 keeps
    #the full dimension in one chunk regardless of the number of depth levels
    if 'month' in da.dims:
        da = da.chunk({'month': -1, 'depth': -1, 'lat': lat_chunk, 
                       'lon': lon_chunk})
        #Make sure months are in correct format
        da['month'] = pd.date_range(start = '2010-01-01', periods = 12, 
                                    freq = 'MS').strftime('%B')
    else:
        da = da.chunk({'depth': -1, 'lat': lat_chunk, 'lon': lon_chunk})
        
    #Apply mask for all regions to decrease dataset size
    da = da.where(boolean_mask_ras == 1)
    
    #Add spatial information to dataset
    da.rio.set_spatial_dims(x_dim = 'lon', y_dim = 'lat', inplace = True)
    da.rio.write_crs('epsg:4326', inplace = True)

    return da

In [6]:
def mask_ard_data(ard_da, shp_mask, file_out):
    '''
    Clip an analysis ready data (ARD) data array using the boundaries of a 
    regional model and save the clipped data to disk.
    
    Inputs:
    ard_da (data array): Analysis ready data (ARD) data array produced by the 
    function "mask_boolean_ard_data"
    shp_mask (shapefile): Shapefile containing the boundaries of regional models
    file_out (character): Full file path where masked data should be stored. 
    It must end with "zarr" or "parquet", as this defines the output format.
    
    Outputs:
    No data is returned, but masked file will be stored in specified file path.

    Raises:
    ValueError if "file_out" does not end with "zarr" or "parquet".
    '''

    #Check output format before clipping to avoid doing work that is not saved
    if not file_out.endswith(('zarr', 'parquet')):
        raise ValueError(
            f'Unsupported output format for "{file_out}". ' +
            'File path must end with "zarr" or "parquet".')

    #Clip data using regional shapefile
    da_mask = ard_da.rio.clip(shp_mask.geometry, shp_mask.crs, drop = True, 
                              all_touched = True)
    #Remove spatial information
    da_mask = da_mask.drop_vars('spatial_ref')
    da_mask.encoding = {}

    #Check file extension included in path to save data
    if file_out.endswith('zarr'):
        #Rechunk any dimension with chunks of different sizes before saving
        for i, c in enumerate(da_mask.chunks):
            if len(c) > 1 and len(set(c)) > 1:
                print(f'Rechunking {file_out}.')
                print(f'Dimension "{da_mask.dims[i]}" has unequal chunks.')
                da_mask = da_mask.chunk({da_mask.dims[i]: '200MB'})
        da_mask.to_zarr(file_out, consolidated = True, mode = 'w')
    elif file_out.endswith('parquet'):
        #Keep data array attributes to be recorded in final data frame
        da_attrs = ard_da.attrs
        da_attrs = pd.DataFrame([da_attrs])
        if 'month' in ard_da.dims:
            ind_wider = ['lat', 'lon', 'month', 'depth', 'vals']
        else:
            ind_wider = ['lat', 'lon', 'depth', 'vals']
        #Turn extracted data into data frame and remove rows with NA values
        df = da_mask.to_series().to_frame().reset_index().dropna()
        #Changing column name to standardise across variables
        df = df.rename(columns = {ard_da.name: 'vals'}).reset_index(drop = True)
        #Reorganise data
        df = df[ind_wider]
        #Include original dataset attributes as extra columns. Attributes form
        #a single row, so they are aligned to the first row of the data frame
        df = pd.concat([df, da_attrs], axis = 1)
        #Saving data frame
        df.to_parquet(file_out)

## Apply supporting functions

In [17]:
#Applying functions to WOA files
for f in WOA_zarr:
    #Open data array as ARD
    da = mask_boolean_ard_data(f, mask_ras)
    #Output file name - same as input, but with parquet extension
    base_name = os.path.splitext(os.path.basename(f))[0] + '.parquet'

    #Choose output folder based on file name (monthly data or climatology)
    if 'month' in base_name:
        dir_out = base_out_month
    else:
        dir_out = base_out_clim
    #Ensure output folder exists before attempting to save any files
    os.makedirs(dir_out, exist_ok = True)

    #Extract data for each region included in the regional mask
    for region in rmes.region:
        #Get polygon for each region
        mask = rmes[rmes.region == region]
        #Clean name of region for use in output file
        reg_name = region.lower().replace(" ", "-").replace("'", "")
        #File name out - Adding region name after "woa_". Replacement applied 
        #to file name only, so folders in the path are never altered
        file_out = os.path.join(
            dir_out, base_name.replace('woa_', f'woa_{reg_name}_'))
        #Extract data and save masked data - but only if file does not already exist
        if os.path.exists(file_out):
            continue
        mask_ard_data(da, mask, file_out)