In [1]:
import pandas as pd
import os

In [2]:
# Column layout of the catalog table; df_fs38 is an empty frame that only
# carries these column names, in this order.
catalog_columns = [
    'path',
    'file_type',
    'realm',
    'frequency',
    'table_id',
    'project_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'member_id',
    'variable_id',
    'grid_label',
    'version',
    'time_range',
]
df_fs38 = pd.DataFrame(columns=catalog_columns)

In [3]:
# Prefix prepended to every input/output path in this notebook
# (file_lists/... for the input list, esm/... for the written catalog files).
# './' resolves them against the current working directory.
filepath = './'

In [4]:
# Load the list of files: the text file has no header row, and its single
# column is labelled "path" after reading.
file_list_path = filepath+"file_lists/projection_2300.txt"
df = pd.read_csv(file_list_path, header=None)
df.columns = ["path"]

In [21]:
# Function to split the filename into components based on '_'
def split_filename(file_path):
    """Parse a file name into catalog attributes.

    The base name of ``file_path`` has a trailing ``.nc`` removed and is then
    split on ``_``. Two layouts are recognised:

    * exactly five components: component 0 is the variable, 2 the institution,
      3 the source and 4 the experiment;
    * more than five components with ``'tca'`` as the second one: component 0
      plus ``'_tca'`` is the variable, 3 the institution, 4 the source and all
      remaining components (re-joined with ``_``) the experiment.

    Parameters
    ----------
    file_path : str
        Path or bare file name to parse.

    Returns
    -------
    dict or None
        A dict with the keys ``project_id``, ``institution_id``, ``source_id``,
        ``experiment_id`` and ``variable_id`` (in that order), or ``None`` when
        the name is a ``.docx``/``.pdf`` document or matches neither layout.
    """
    filename = os.path.basename(file_path)

    # Strip only a trailing ".nc". A plain str.replace would also delete any
    # ".nc" that happens to occur inside one of the name components.
    if filename.endswith('.nc'):
        filename = filename[:-len('.nc')]

    # Documents that sit next to the data files are not catalog entries.
    if filename.endswith(('.docx', '.pdf')):
        return None

    components = filename.split('_')

    if len(components) == 5:
        variable_id = components[0]
        institution_id, source_id, experiment_id = components[2:5]
    elif len(components) > 5 and components[1] == 'tca':
        variable_id = components[0] + '_tca'
        institution_id, source_id = components[3], components[4]
        # The experiment name may itself contain underscores.
        experiment_id = '_'.join(components[5:])
    else:
        # Unrecognised layout: report "no match" explicitly.
        return None

    # Key order is kept stable because callers may expand the dict by position.
    return {
        'project_id': 'ISMIP6-Projections-2300',
        'institution_id': institution_id,
        'source_id': source_id,
        'experiment_id': experiment_id,
        'variable_id': variable_id,
    }
# Keep only the paths that contain "AIS" and attach the attributes parsed from
# each file name as new columns.
parsed_columns = ['project_id', 'institution_id', 'source_id', 'experiment_id', 'variable_id']
is_df = df[df['path'].str.contains("AIS")].copy()

# Build the parsed table from plain dict records with explicit column labels.
# Values are then placed by key name, so the result does not depend on the
# column order that a row-wise apply(pd.Series) would produce; paths that
# split_filename cannot parse (None) become rows of missing values. It also
# avoids constructing one pandas Series per row.
parsed_records = [split_filename(path) or {} for path in is_df['path']]
is_df[parsed_columns] = pd.DataFrame(parsed_records, index=is_df.index, columns=parsed_columns)

In [69]:
# Start from an empty frame that has the reference catalog columns of df_fs38,
# then copy in every column that is_df provides. Catalog columns that is_df
# does not provide are left unfilled by this cell.
df_final = pd.DataFrame(columns=df_fs38.columns)
df_final[is_df.columns] = is_df

In [70]:
# Attributes that take the same value for every row of the catalog.
constant_columns = (
    ("file_type", "f"),
    ("realm", "cryosphere"),
    ("frequency", "year"),
)
for column_name, constant_value in constant_columns:
    df_final[column_name] = constant_value

In [71]:
# time_range label to write for each group of institutions. Rows whose
# institution_id is not listed here keep whatever time_range they already have.
institutions_by_time_range = {
    "201601-230101": ['DC', 'ILTS', 'NCAR', 'NORCE', 'PIK', 'ULB', 'UNN', 'UTAS', 'IGE'],
    "201507-230007": ['DOE', 'LSCE', 'VUW'],
    "201501-230001": ['IMAU', 'UCIJPL', 'UCM', 'VUB'],
}

for time_range_label, institutions in institutions_by_time_range.items():
    # Select rows directly with the boolean mask instead of filtering the
    # frame and looking the rows up again through their index labels.
    in_group = df_final["institution_id"].isin(institutions)
    df_final.loc[in_group, "time_range"] = time_range_label

In [72]:
# Display the assembled catalog table for a visual check before writing it out.
df_final

Unnamed: 0,path,file_type,realm,frequency,table_id,project_id,institution_id,source_id,experiment_id,member_id,variable_id,grid_label,version,time_range
0,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,LSCE,GRISLI,expAE04,,tendlibmassbffl,,,201507-230007
1,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,LSCE,GRISLI,expAE04,,zvelsurf,,,201507-230007
2,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,LSCE,GRISLI,expAE04,,strbasemag,,,201507-230007
3,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,LSCE,GRISLI,expAE04,,tendlifmassbf,,,201507-230007
4,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,LSCE,GRISLI,expAE04,,tendlicalvf,,,201507-230007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18014,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,NORCE,CISM3-MAR364-ERA-t1-local,ctrlAE,,xvelbase,,,201601-230101
18015,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,NORCE,CISM3-MAR364-ERA-t1-local,ctrlAE,,orog,,,201601-230101
18016,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,NORCE,CISM3-MAR364-ERA-t1-local,ctrlAE,,yvelsurf,,,201601-230101
18017,/g/data/kj13/datasets/ismip6/ISMIP6-Projection...,f,cryosphere,year,,ISMIP6-Projections-2300,NORCE,CISM3-MAR364-ERA-t1-local,ctrlAE,,libmassbfgr,,,201601-230101


In [73]:
# Write the catalog table as plain CSV; the row index is not written.
catalog_csv_path = filepath+"esm/access-kj13-ismip6-2300.csv"
df_final.to_csv(catalog_csv_path, index=False)

In [74]:
import gzip

# Copy the CSV, line by line, into a gzip-compressed file next to it.
plain_csv_path = filepath+"esm/access-kj13-ismip6-2300.csv"
gzipped_csv_path = filepath+"esm/access-kj13-ismip6-2300.csv.gz"
with open(plain_csv_path, 'rb') as plain_csv, gzip.open(gzipped_csv_path, 'wb') as gzipped_csv:
    for csv_line in plain_csv:
        gzipped_csv.write(csv_line)

# Write the Intake ESM Catalog

In [75]:
import json

In [76]:
# Catalog description; filled in field by field in the following cells and
# finally serialised to JSON.
data = dict()

In [77]:
# Identification of the catalog and the name of the table file it points to.
data.update({
    "id": 'access-nri-kj13-ismip6-2300',
    "title": 'access-nri-kj13-ismip6-2300',
    "description": "ACCESS-NRI Replica datasets for model evaluation. Includes ISMIP6 AIS 2300 model outputs.",
    "catalog_file": "access-kj13-ismip6-2300.csv.gz",
})

In [78]:
# The 'path' column of the table locates each asset; assets are netCDF files.
data['assets'] = dict(column_name='path', format='netcdf')

In [79]:
# Columns listed under 'groupby_attrs' in the aggregation settings.
groupby_columns = [
    'file_type',
    'project_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'member_id',
    'frequency',
    'realm',
    'table_id',
    'variable_id',
    'grid_label',
    'version',
]

# Single aggregation rule: a 'join_existing' on the time_range attribute
# along the 'time' dimension.
time_join = {
    'type': 'join_existing',
    'attribute_name': 'time_range',
    'options': {'dim': 'time'},
}

data['aggregation_control'] = {
    'variable_column_name': 'variable_id',
    'groupby_attrs': groupby_columns,
    'aggregations': [time_join],
}

In [80]:
# Catalog specification version. 'catalog_file' is assigned again here with
# the same value it was given when the identification fields were set.
data.update({
    'esmcat_version': '0.1.0',
    'catalog_file': 'access-kj13-ismip6-2300.csv.gz',
})

In [81]:
# One {'column_name': ...} entry per attribute column, in this order.
attribute_columns = (
    'file_type',
    'project_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'member_id',
    'frequency',
    'realm',
    'table_id',
    'variable_id',
    'grid_label',
    'version',
    'time_range',
)
data["attributes"] = [{'column_name': column_name} for column_name in attribute_columns]

In [82]:
# Display the assembled catalog description for a visual check before saving.
data

{'id': 'access-nri-kj13-ismip6-2300',
 'title': 'access-nri-kj13-ismip6-2300',
 'description': 'ACCESS-NRI Replica datasets for model evaluation. Includes ISMIP6 AIS 2300 model outputs.',
 'catalog_file': 'access-kj13-ismip6-2300.csv.gz',
 'assets': {'column_name': 'path', 'format': 'netcdf'},
 'aggregation_control': {'variable_column_name': 'variable_id',
  'groupby_attrs': ['file_type',
   'project_id',
   'institution_id',
   'source_id',
   'experiment_id',
   'member_id',
   'frequency',
   'realm',
   'table_id',
   'variable_id',
   'grid_label',
   'version'],
  'aggregations': [{'type': 'join_existing',
    'attribute_name': 'time_range',
    'options': {'dim': 'time'}}]},
 'esmcat_version': '0.1.0',
 'attributes': [{'column_name': 'file_type'},
  {'column_name': 'project_id'},
  {'column_name': 'institution_id'},
  {'column_name': 'source_id'},
  {'column_name': 'experiment_id'},
  {'column_name': 'member_id'},
  {'column_name': 'frequency'},
  {'column_name': 'realm'},
  {'c

In [83]:
# Serialise the catalog description to JSON in the esm/ output directory.
catalog_json_path = filepath+'esm/projection_2300_catalog.json'
with open(catalog_json_path, 'w') as catalog_json:
    json.dump(data, catalog_json)