#  Sample mapfile generation for esgpublish operation

## Set constants
- Filenames should be complete paths, given relative to this Notebook file

In [None]:
import os

# Filename of the metadata file
METAFILE = "metadata-file.txt"
# Project identifier
FIRST_ID = "cosmo-rea"

# Work from the mapfiles directory so the relative filenames resolve there
os.chdir("/home/jovyan/mapfiles")

## Bash code to get filesystem metadata
- Runs in same directory as the Notebook
- wget is required
- scans a directory tree for .nc files
- generates the "metafile" 

In [None]:
%%bash

# Fetch the helper script; -nc skips the download when get_meta.sh is already
# present, so re-running this cell does not pile up get_meta.sh.1, .2, ...
wget -nc https://raw.githubusercontent.com/sashakames/aims-pub-util/master/get_meta.sh
dn="/path/to/data/project"  # Provide path to the data you wish to scan.  Note this will checksum all the files in the subtree and can take some time
# Run the helper once per .nc file.  -exec passes each path as a single
# argument, so paths are not word-split the way a `for fn in $(find ...)` loop
# would split them.
find "$dn" -name "*.nc" -exec bash get_meta.sh {} \; > metadata-file.txt  #  file name should match const above

## Open the input file

In [None]:
# Collect every line of the metafile with trailing whitespace removed
arr = []
with open(METAFILE) as metafile:
    arr.extend(entry.rstrip() for entry in metafile)


## Iterate through input data and write out mapfile

In [None]:
# Maps each dataset id to the list of mapfile lines that belong to it
out_dict = {}

for line in arr:

    parts = line.split()
    if not parts:
        # Blank line (e.g. a trailing empty line in the metafile): nothing to map
        continue
    if len(parts) < 4:
        raise ValueError(
            "Malformed metadata line (expected checksum, path, timestamp, size): "
            + repr(line)
        )

    # Field order consumed here: checksum, path, timestamp, size.
    # Any additional fields on the line are ignored.
    checksum, path, ts, sz = parts[:4]

    pp2 = path.split('/')
    idx = pp2.index(FIRST_ID)  # raises ValueError if FIRST_ID is not a path component
    last = len(pp2) - 2  # second-to-last component: the directory holding the file
    # Dataset id = dotted path components from FIRST_ID up to (not including)
    # that directory, then "#" and the directory name minus its first character
    # (presumably a leading "v" of a version directory -- confirm against the
    # data layout).
    dset_id = ".".join(pp2[idx:last]) + "#" + pp2[last][1:]

    out_line = ' | '.join([
        dset_id,
        path,
        sz,
        "mod_time=" + ts,
        "checksum=" + checksum,
        "checksum_type=SHA256",
    ]) + "\n"

    out_dict.setdefault(dset_id, []).append(out_line)

# Write one mapfile per dataset id, named after the id with "#" replaced by ".v"
for dset_id, map_lines in out_dict.items():
    outfilename = dset_id.replace("#", ".v") + ".map"
    print("Writing " + outfilename)
    with open(outfilename, "w") as f:
        f.writelines(map_lines)

## esgpublish operation

- See https://esg-publisher.readthedocs.org/
- Ensure you have followed the steps to install and configure the publisher and have created an esg.yaml file
- Assuming you have saved your config file to the current directory as esg.yaml and you have the mapfile `test-file.map`, you should run in a terminal:

esgpublish --map test-file.map --config esg.yaml --no-auth  # No auth used in esgf-docker index install

## Running test publishing using the esgcet module directly

The cells below handle a "generic" publishing workflow for testing purposes.  These are not intended for long production publication runs.

### 1.)  Import statements

In [None]:
from esgcet.generic_netcdf import GenericPublisher

### 2.)  Configure the publisher

In [None]:
# Settings handed to the publisher, collected in a single literal
argdict = {
    "silent": False,
    "verbose": False,
    "cert": "",   # must be set for restricted ESGF1 publishing relying on certs
    "index_node": "esgf-fedtest.llnl.gov",
    "data_node": "fake-test-datanode.llnl.gov",
    "data_roots": {"/p/user_pub/work": "user_pub_work"},
    "globus": "FAKE",
    "dtn": 'none',
    "replica": False,
    "proj": FIRST_ID,
    "json_file": None,
    "auth": False,
    "user_project_config": {},
    "test": True,
    "verify": True,
    "mountpoints": None,
    "autoc_command": None,
    "enable_archive": False,
}

### 3.) Create a publisher and run the workflow

In [None]:
# Construct the publisher from the settings collected in argdict
publisher = GenericPublisher(argdict)

# Invoke the publisher's workflow() entry point
publisher.workflow()