In [69]:
import pandas as pd
import os

In [70]:
# Column layout of the catalog table. The frame itself stays empty; in the
# cells shown here it is only used as the source of this column order.
fs38_columns = [
    'path', 'file_type', 'realm', 'frequency', 'table_id', 'project_id',
    'institution_id', 'source_id', 'experiment_id', 'member_id',
    'variable_id', 'grid_label', 'version', 'time_range',
]
df_fs38 = pd.DataFrame(columns=fs38_columns)

In [71]:
# Change this variable to specify the absolute path to the directory containing this notebook.
# Later cells build file locations by appending sub-paths that start with "/"
# (e.g. "/file_lists/..." and "/esm/...") directly to this string.
notebook_path = "/g/data/tm70/pb9726/access-issm-recipes/intake_esm_catalog"

In [72]:
# Read the headerless file list and label its single column "path".
file_list_path = notebook_path+"/file_lists/projection_ais.txt"
df = pd.read_csv(file_list_path, header=None)
df.columns = ["path"]

In [73]:

# Function to split the filename into components based on '_'
def split_filename(file_path):
    """Split a file name on '_' and map the pieces to catalog fields.

    The base name of ``file_path`` (with a trailing ``.nc`` removed) is split
    on ``_``. By position the pieces are used as:

    * piece 0 -> ``variable_id``
    * piece 1 -> appended to ``'ISMIP6-Projection-'`` to form ``project_id``
    * piece 2 -> ``institution_id``
    * piece 3 -> ``source_id``
    * pieces 4 and onwards, re-joined with ``_`` -> ``experiment_id``

    Special case: when piece 2 is ``'ILTS'`` and there are at least six
    pieces, the institution is written as ``'ILTS_PIK'`` and the source and
    experiment positions shift right by one.
    NOTE(review): piece 3 is assumed to be 'PIK' in that case but is not
    checked - confirm against the file list.

    Parameters
    ----------
    file_path : str
        Path or bare file name to parse.

    Returns
    -------
    dict or None
        Dictionary with the keys ``project_id``, ``institution_id``,
        ``source_id``, ``experiment_id`` and ``variable_id`` (in that order),
        or ``None`` when the name has fewer than five '_'-separated pieces.
    """
    filename = os.path.basename(file_path)
    # Drop only a trailing '.nc'; a blanket str.replace would also delete a
    # '.nc' that happens to appear in the middle of the name.
    if filename.endswith('.nc'):
        filename = filename[:-len('.nc')]
    components = filename.split('_')

    # Too few pieces to fill every field: signal "not parseable" explicitly.
    if len(components) < 5:
        return None

    # The institution 'ILTS_PIK' contains the separator itself, so it occupies
    # two pieces and pushes the following fields one position to the right.
    is_ilts_pik = components[2] == 'ILTS' and len(components) >= 6
    source_pos = 4 if is_ilts_pik else 3

    return {
        'project_id': 'ISMIP6-Projection-' + components[1],
        'institution_id': 'ILTS_PIK' if is_ilts_pik else components[2],
        'source_id': components[source_pos],
        # Any extra pieces after the source belong to the experiment id.
        'experiment_id': '_'.join(components[source_pos + 1:]),
        'variable_id': components[0],
    }
# Keep only the paths that mention "AIS", parse each file name, and spread the
# parsed fields over their own columns.
parsed_fields = ['project_id', 'institution_id','source_id', 'experiment_id', 'variable_id']
ais_rows = df['path'].str.contains("AIS")
is_df = df[ais_rows].copy()
is_df[parsed_fields] = is_df['path'].apply(lambda x: pd.Series(split_filename(x)))

In [74]:
# Start from an empty frame that only carries the column order of df_fs38,
# then copy every column of is_df into it. Columns that is_df does not
# provide are left unfilled by this cell.
df_final = pd.DataFrame(columns=df_fs38.columns)
df_final[is_df.columns] = is_df

In [75]:
# Output frequency label per group of institutions, applied in this order.
frequency_groups = [
    ("year", ['UTAS','UCIJPL','IMAU','ULB','JPL1','DOE','VUW','NCAR','VUB']),
    ("6month", ['ILTS_PIK','AWI','PIK','LSCE']),
]
for frequency_label, institutions in frequency_groups:
    matching_rows = df_final[df_final.institution_id.isin(institutions)].index
    df_final.loc[matching_rows, "frequency"] = frequency_label

In [76]:
# Columns that hold the same value for every row of the catalog.
constant_columns = {"file_type": "f", "realm": "cryosphere"}
for column_name, constant_value in constant_columns.items():
    df_final[column_name] = constant_value

In [77]:
# Time range label per group of institutions, applied in this order.
time_range_groups = [
    ("201501-210101", ['UTAS','UCIJPL','JPL1','DOE','VUW','NCAR','VUB','LSCE']),
    ("201507-218707", ['ILTS_PIK','AWI']),
    ("201507-221507", ['PIK']),
    ("201511-210111", ['ULB']),
    ("201507-210107", ['IMAU']),
]
for time_range_label, institutions in time_range_groups:
    matching_rows = df_final[df_final.institution_id.isin(institutions)].index
    df_final.loc[matching_rows, "time_range"] = time_range_label

In [78]:
# Display the assembled catalog table for a visual check before it is written out.
df_final

Unnamed: 0,path,file_type,realm,frequency,table_id,project_id,institution_id,source_id,experiment_id,member_id,variable_id,grid_label,version,time_range
0,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projection-AIS,UTAS,ElmerIce,exp05,,yvelbase,,,201501-210101
1,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projection-AIS,UTAS,ElmerIce,exp05,,strbasemag,,,201501-210101
2,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projection-AIS,UTAS,ElmerIce,exp05,,tendlibmassbf,,,201501-210101
3,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projection-AIS,UTAS,ElmerIce,exp05,,sftgif,,,201501-210101
4,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projection-AIS,UTAS,ElmerIce,exp05,,libmassbffl,,,201501-210101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10331,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,6month,,ISMIP6-Projection-AIS,AWI,PISM1,expA3,,base,,,201507-218707
10332,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,6month,,ISMIP6-Projection-AIS,AWI,PISM1,expA3,,yvelmean,,,201507-218707
10333,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,6month,,ISMIP6-Projection-AIS,AWI,PISM1,expA3,,dlithkdt,,,201507-218707
10334,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,6month,,ISMIP6-Projection-AIS,AWI,PISM1,expA3,,lifmassbf,,,201507-218707


In [58]:
# Write the catalog table as plain CSV, without the row index.
catalog_csv_path = notebook_path+"/esm/access-kj13-ismip6-ais.csv"
df_final.to_csv(catalog_csv_path, index=False)

In [79]:
import gzip

# Produce a gzip-compressed copy of the CSV next to the original file.
plain_csv_path = notebook_path+"/esm/access-kj13-ismip6-ais.csv"
gzipped_csv_path = notebook_path+"/esm/access-kj13-ismip6-ais.csv.gz"
with open(plain_csv_path, 'rb') as f_in, gzip.open(gzipped_csv_path, 'wb') as f_out:
    f_out.writelines(f_in)

# Write the Intake ESM Catalog

In [80]:
import json

In [81]:
# Top-level description of the catalog. "catalog_file" names the gzipped CSV
# and "assets" points at the CSV column that holds each file's path.
# Each key appears exactly once: a repeated key in a dict literal is silently
# overridden by its last occurrence, which hides copy/paste mistakes.
data = {
    "id" :  'access-nri-kj13-ismip6-ais',
    "title" : 'access-nri-kj13-ismip6-ais',
    "description" :  "ACCESS-NRI Replica datasets for model evaluation. Includes ISMIP6 AIS model outputs.",
    "catalog_file" :  "access-kj13-ismip6-ais.csv.gz",
    'assets' : {
          'column_name': 'path',
          'format': 'netcdf'
        },
    "esmcat_version" : '0.1.0',
}

In [84]:
# Columns whose combined values define one group of files.
groupby_columns = [
    'file_type',
    'project_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'member_id',
    'frequency',
    'realm',
    'table_id',
    'variable_id',
    'grid_label',
    'version',
]

# Within a group, files are joined along the existing 'time' dimension,
# keyed by the 'time_range' column.
join_along_time = {
    'type': 'join_existing',
    'attribute_name': 'time_range',
    'options': {'dim': 'time'},
}

data['aggregation_control'] = {
    'variable_column_name': 'variable_id',
    'groupby_attrs': groupby_columns,
    'aggregations': [join_along_time],
}

In [86]:
# One {'column_name': ...} entry per listed catalog column, in this order.
attribute_columns = [
    'file_type',
    'project_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'member_id',
    'frequency',
    'realm',
    'table_id',
    'variable_id',
    'grid_label',
    'version',
    'time_range',
]
data["attributes"] = [{'column_name': column} for column in attribute_columns]

In [87]:
# Display the assembled catalog description for a visual check before it is saved.
data

{'id': 'access-nri-kj13-ismip6-ais',
 'title': 'access-nri-kj13-ismip6-ais',
 'description': 'ACCESS-NRI Replica datasets for model evaluation. Includes ISMIP6 AIS model outputs.',
 'catalog_file': 'access-kj13-ismip6-ais.csv.gz',
 'assets': {'column_name': 'path', 'format': 'netcdf'},
 'aggregation_control': {'variable_column_name': 'variable_id',
  'groupby_attrs': ['file_type',
   'project_id',
   'institution_id',
   'source_id',
   'experiment_id',
   'member_id',
   'frequency',
   'realm',
   'table_id',
   'variable_id',
   'grid_label',
   'version'],
  'aggregations': [{'type': 'join_existing',
    'attribute_name': 'time_range',
    'options': {'dim': 'time'}}]},
 'esmcat_version': '0.1.0',
 'attributes': [{'column_name': 'file_type'},
  {'column_name': 'project_id'},
  {'column_name': 'institution_id'},
  {'column_name': 'source_id'},
  {'column_name': 'experiment_id'},
  {'column_name': 'member_id'},
  {'column_name': 'frequency'},
  {'column_name': 'realm'},
  {'column_na

In [88]:
# Serialise the catalog description to JSON in the "/esm" sub-directory.
catalog_json_path = notebook_path+'/esm/projection_ais_catalog.json'
with open(catalog_json_path, 'w') as catalog_json:
    json.dump(data, catalog_json)