In [1]:
import os
from glob import glob
#Base folder containing Earth System Model (ESM) data
base_dir = '/g/data/vf71/fishmip_inputs/ISIMIP3a/global_inputs/obsclim/025deg'
#Get a list of all files containing monthly ESM outputs (depth is excluded)
list_files = glob(os.path.join(base_dir, '*monthly*.nc'))

In [2]:
list_files[34:]

['/g/data/vf71/fishmip_inputs/ISIMIP3a/global_inputs/obsclim/025deg/gfdl-mom6-cobalt2_obsclim_thetao_15arcmin_global_monthly_1961_2010.nc',
 '/g/data/vf71/fishmip_inputs/ISIMIP3a/global_inputs/obsclim/025deg/gfdl-mom6-cobalt2_obsclim_zooc-vint_15arcmin_global_monthly_1961_2010.nc',
 '/g/data/vf71/fishmip_inputs/ISIMIP3a/global_inputs/obsclim/025deg/gfdl-mom6-cobalt2_obsclim_so-surf_15arcmin_global_monthly_1961_2010.nc',
 '/g/data/vf71/fishmip_inputs/ISIMIP3a/global_inputs/obsclim/025deg/gfdl-mom6-cobalt2_obsclim_tos_15arcmin_global_monthly_1961_2010.nc',
 '/g/data/vf71/fishmip_inputs/ISIMIP3a/global_inputs/obsclim/025deg/gfdl-mom6-cobalt2_obsclim_so-bot_15arcmin_global_monthly_1961_2010.nc']

# Calculating mean climatology for GFDL data (3D fields)
**Author:** Denisse Fierro Arcos  
**Date:** 2024-09-09  
  
Calculating mean climatological conditions within the boundaries of FishMIP regional models using GFDL-MOM6-COBALT2 model outputs. Only variables that include multiple depth bins are processed here. Climatologies calculated here are shown as maps in shiny app.

In [19]:
import xarray as xr
import pandas as pd
import os
from glob import glob

In [18]:
#Location of zarr files
base_dir = '/g/data/vf71/fishmip_inputs/ISIMIP3a/regional_inputs/obsclim/025deg'

#Get list of zarr files
zarr_list = glob(os.path.join('*zarr'))

#Folder where mean climatologies with all data will be saved
base_out_maps = file.path(base_dir, "maps_data")
os.makedirs(base_out_maps, exist_ok = True)

#Folder where mean climatologies for comparison will be saved
base_out_comp = file.path(base_out_maps, "comp_clim")
os.makedirs(base_out_comp, exist_ok = True)

'/g/data/vf71/fishmip_inputs/ISIMIP3a/regional_inputs/obsclim/025deg/gfdl-mom6-cobalt2_obsclim_o2_15arcmin_gulf-of-guinea_monthly_1961_2010.zarr'

In [50]:
def calc_clim(file_path, path_out, monthly = False, **kwargs):
    '''
    Open netCDF files and calculate climatologies.
    
    Inputs:
    file_path (character): Full file path where data is stored
    path_out (character): Full file path where masked data should be stored
    monthly (boolean): Default is FALSE. If set to TRUE, monthly climatology is
    calculated
    min_year (integer): Optional. First year to be included in climatology
    max_year (integer): Optional. Last year to be included in climatology
    '''

    #Get base file path
    if monthly:
        base_file = os.path.basename(file_path).replace('monthly', 
                                                        'mthly_clim_mean')
    else:
        base_file = os.path.basename(file_path).replace('monthly', 
                                                        'climatological_mean')
    base_file = base_file.replace('zarr', 'parquet')
        
    #Load file
    ds = xr.open_zarr(file_path)
    #Get name of variable
    [var] = list(ds.data_vars)
    ds = ds[var]

    #Save attributes
    ds_attrs = pd.DataFrame([ds.attrs])
    
    #Get years included in dataset
    years = pd.unique(ds.time.dt.year)

    #Check start year is later or equal to first year in data
    if 'min_year' in kwargs.keys():
        if min_year < min(years):
            print('"min_year" must be later or equal to the first year '+
                   'included in the data. Calculating mean values from ' +
                   str(min(years)))
            min_year = str(min(years))
        else:
            print('Calculating mean values from ' + str(min_year))
            min_year = str(min_year)
            base_file = base_file.replace(str(min(years)), min_year)
    if 'max_year' in kwargs.keys():
        if max_year > max(years):
            print('"max_year" must be earlier or equal to the last year '+
                   'included in the data. Calculating mean values from ' +
                   str(max(years)))
            max_year = str(max(years))
        else:
            print('Calculating mean values from ' + str(max_year))
            max_year = str(max_year)
            base_file = base_file.replace(str(max(years)), max_year)

    #Filter data 
    ds = ds.sel(time = slice(min_year, max_year))

    #Calculate climatology
    if monthly:
        ds_clim = ds.groupby('time.month').mean('time')
        ind_wider = ['lat', 'lon', 'month', 'vals']
    else:
        ds_clim = ds.mean('time')
        ind_wider = ['lat', 'lon', 'vals']

    #Turn extracted data into data frame and remove rows with NA values
    df = ds_clim.to_series().to_frame().reset_index().dropna()
    #Changing column name to standardise across variables
    df = df.rename(columns = {ds.name: 'vals'}).reset_index(drop = True)
    #Reorganise data
    df = df[ind_wider]
    #Include original dataset attributes
    df = pd.concat([df, da_attrs], axis = 1)
    #Saving data frame
    df.to_parquet(os.path.join(path_out, base_file))

Unnamed: 0,Array,Chunk
Bytes,870.30 MiB,124.33 MiB
Shape,"(600, 35, 97, 112)","(600, 5, 97, 112)"
Dask graph,7 chunks in 2 graph layers,7 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 870.30 MiB 124.33 MiB Shape (600, 35, 97, 112) (600, 5, 97, 112) Dask graph 7 chunks in 2 graph layers Data type float32 numpy.ndarray",600  1  112  97  35,

Unnamed: 0,Array,Chunk
Bytes,870.30 MiB,124.33 MiB
Shape,"(600, 35, 97, 112)","(600, 5, 97, 112)"
Dask graph,7 chunks in 2 graph layers,7 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [84]:
for f in zarr_list:
    calc_clim(f, base_out_maps)
    calc_clim(f, base_out_comp, monthly = True, min_year = 1981, max_year = 2010)

'gfdl-mom6-cobalt2_obsclim_o2_15arcmin_gulf-of-guinea_climatological_mean_1999_2010.zarr'