#  Sample mapfile generation for esgpublish operation

## Set constants
- Filenames should be complete paths, given relative to the location of this Notebook file

In [1]:
import os
import requests

# Name of the metadata listing ("metafile") that the mapfiles are built from.
METAFILE = "exeter-221.txt"

# "Project" identifier: the path component at which each dataset id starts.
FIRST_ID = "input4MIPs"

# Work inside the mapfiles directory so relative filenames resolve there.
os.chdir("/home/jovyan/mapfiles")

## Run this bash code in a terminal
- easiest to run in same directory as the Notebook
- wget is required
- scans a directory tree for .nc files
- generates the "metafile" 

In [3]:
%%sh

# GNU parallel binary used to fan get_meta.sh out over the files found below.
PARALLEL=/home/jovyan/conda-envs/pubpip/bin/parallel

# get_meta.sh must be present in the working directory; fetch it once with:
# wget https://raw.githubusercontent.com/sashakames/aims-pub-util/master/get_meta.sh

# Root of the directory tree to scan.  The path should contain the project
# identifier set in the constants cell.
dn="/p/user_pub/work/input4MIPs/CMIP7/CMIP/uoexeter/UOEXETER-CMIP-2-2-1"

# Run get_meta.sh on every .nc file under $dn and collect the output as the
# metafile.  The output filename should match METAFILE in the constants cell.
# Variables are quoted so paths containing spaces are not word-split, and the
# search is limited to *.nc so stray non-data files do not end up in a mapfile.
find "$dn" -type f -name '*.nc' | "$PARALLEL" bash get_meta.sh {} > exeter-221.txt

## Open the input file

In [4]:
# Load the metafile into memory: one entry per line, with trailing
# whitespace (including the newline) removed.
arr = []
with open(METAFILE) as f:
    arr.extend(raw.rstrip() for raw in f)


## Iterate through input data and write out mapfile

In [5]:
# Maps each dataset id ("<dotted.path.components>#<version>") to the list of
# mapfile lines belonging to that dataset.
out_dict = {}

for line in arr:
    # Whitespace-separated fields of a metafile line:
    #   <checksum> <path> <timestamp> <size>
    parts = line.split()
    if not parts:
        # A blank line (e.g. a stray trailing newline in the metafile) holds
        # no record; indexing into it would raise IndexError.
        continue

    checksum = parts[0]
    path = parts[1]
    ts = parts[2]  # timestamp
    sz = parts[3]  # size; split() has already removed surrounding whitespace

    # The dataset id is made of the path components from FIRST_ID up to (not
    # including) the second-to-last one, joined with dots.  The second-to-last
    # component, minus its first character (the "v" of e.g. "v20250521"),
    # follows after "#".  The last component is the file name itself.
    pp2 = path.split('/')
    idx = pp2.index(FIRST_ID)
    last = len(pp2) - 2
    dset_id = ".".join(pp2[idx:last]) + "#" + pp2[last][1:]

    out_line = " | ".join([
        dset_id,
        path,
        sz,
        "mod_time=" + ts,
        "checksum=" + checksum,
        "checksum_type=SHA256",
    ]) + "\n"

    out_dict.setdefault(dset_id, []).append(out_line)

# Write one mapfile per dataset, named after the dataset id with "#" replaced
# by ".v" so the version reads e.g. ".v20250521".
for dset_id, map_lines in out_dict.items():
    outfilename = dset_id.replace("#", ".v") + ".map"
    print("Writing " + outfilename)
    with open(outfilename, "w") as f:
        f.writelines(map_lines)

Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.monC.vd.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.monC.reff.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.monC.nd.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.monC.sad.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.day.utsvolcemis.gn.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.monC.ext.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.monC.asy.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.monC.ssa.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.mon.nd.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.mon.sad.gnz.v20250521.map
Writing input4MIPs.CMIP7.CMIP.uoexeter.UOEXETER-CMIP-2-2-1.atmos.mon.reff.gnz.

## esgpublish operation

- See https://esg-publisher.readthedocs.org/
- Ensure you have followed the steps to install and configure the publisher to create an esg.yaml file
- Assuming you have saved your config file to the current directory as esg.yaml and you have the mapfile `test-file.map`, you should run in a terminal:

esgpublish --map test-file.map --config esg.yaml --json <json-file> # might use .json

## Running test publishing using the esgcet module directly

The cells below handle a "generic" publishing workflow for testing purposes.  These are not intended for long production publication runs.

### 1.)  Import statements

In [None]:
from esgcet.input4mips import input4mips

### 2.)  Configure the publisher

In [None]:
# Settings handed to the esgcet publisher class in the next cell.
# NOTE(review): the data node and globus values look like placeholders for a
# test run; confirm before pointing this at a real deployment.
argdict = {
    "silent": False,
    "verbose": False,
    "cert": "",  # must be set for restricted ESGF1 publishing relying on certs
    "index_node": "esgf-fedtest.llnl.gov",
    "data_node": "fake-test-datanode.llnl.gov",
    "data_roots": {"/p/user_pub/work": "user_pub_work"},
    "globus": "FAKE",
    "dtn": 'none',
    "replica": False,
    "proj": FIRST_ID,
    "json_file": None,
    "auth": False,
    "user_project_config": {},
    "test": True,
    "verify": True,
    "mountpoints": None,
    "autoc_command": None,
    "enable_archive": False,
    "disable_further_info": True,
}

# Name the mapfiles exactly as the generation cell wrote them: the dataset id
# with "#" replaced by ".v", plus ".map".  The earlier "{dsid}.map" literal
# had no f-prefix, so every entry was the literal text "{dsid}.map".
# NOTE(review): confirm that esgcet accepts a list of mapfiles for "fullmap".
argdict["fullmap"] = [dsid.replace("#", ".v") + ".map" for dsid in out_dict]

### 3.) Create a publisher and run the workflow

In [None]:
# Build the input4mips publisher from the settings collected in argdict.
publisher = input4mips(argdict)

# Run the publisher's workflow with those settings.
publisher.workflow()

In [None]:
from esgcet.pub_client import publisherClient

In [None]:
# Client constructed with an empty first argument, the host
# "esgf-node.llnl.gov" and auth switched off.
# NOTE(review): the first positional argument is presumably a certificate
# path; confirm against esgcet.pub_client.
pubCli = publisherClient("", "esgf-node.llnl.gov", auth=False)


In [None]:
# Retract three records; each argument has the form "<dataset id>|<host>".
# NOTE(review): these ids are SOLARIS-HEPPA CMIP6Plus datasets (day, fx and mon),
# not the datasets mapped earlier in this notebook; confirm they are still
# the intended targets before running this cell.
pubCli.retract("input4MIPs.CMIP6Plus.CMIP.SOLARIS-HEPPA.SOLARIS-HEPPA-CMIP-4-2.atmos.day.multiple.gn.v20240718|esgf-data2.llnl.gov")
pubCli.retract("input4MIPs.CMIP6Plus.CMIP.SOLARIS-HEPPA.SOLARIS-HEPPA-CMIP-4-2.atmos.fx.multiple.gn.v20240718|esgf-data2.llnl.gov")
pubCli.retract("input4MIPs.CMIP6Plus.CMIP.SOLARIS-HEPPA.SOLARIS-HEPPA-CMIP-4-2.atmos.mon.multiple.gn.v20240718|esgf-data2.llnl.gov")

In [12]:
# Ask the esg-search endpoint for up to 400 records of project DRCDP,
# returning only their instance_id field, in the solr+json format.
# NOTE(review): the project here is DRCDP, not FIRST_ID; confirm that is intended.
res = requests.get(
    "https://esgf-node.llnl.gov/esg-search/search/?format=application%2fsolr%2bjson&project=DRCDP&limit=400&fields=instance_id",
    timeout=60,  # requests has no default timeout; fail rather than hang forever
)
# Surface an HTTP error here instead of as a confusing JSON failure later.
res.raise_for_status()

In [13]:
# Decode the body of the search response as JSON.
jobj=res.json()

In [14]:
# Display the first returned record to check its layout.
jobj["response"]["docs"][0]

{'instance_id': 'DRCDP.CMIP6.NAM.TTU.STAR-ESDM-V1.ACCESS-CM2.CMIP.historical.r1i1p1f1.day.pr.v20241130',
 'score': 1.0}

In [15]:
# Write one line per search result to maplst.txt: the record's instance_id
# followed by ".map".
with open("maplst.txt", "w") as f:
    f.writelines(
        f'{doc["instance_id"]}.map\n' for doc in jobj["response"]["docs"]
    )