In [None]:
import os
# Show the current working directory; the prod/ and log/ directories below are created relative to it.
print(os.getcwd())
# IPython shell escape: create the prod/ and log/ directories if they do not exist yet
# and print the path of the python executable that is found first on PATH.
! mkdir -p prod && mkdir -p log && which python

## Context

Different file formats and usage scenarios of the dataconverter typically provide different combinations of instance data<br>
for concepts: one file format may report the voltage and microscope fabrication details, while the next may not.<br>
Therefore, not all required and recommended concepts of an appdef can necessarily be populated during<br>
parsing (i.e. when running the dataconverter) unless the missing metadata are streamed in from other sources (e.g. custom ELNs).<br>
Consequently, the validation part of the dataconverter may issue `WARNING:` messages about missing concepts or other non-compliance of<br>
the actual instance data that are written to the NeXus/HDF5 file.<br>

## What does this code do?

This code collects these warnings from the stderr logs of a set of (e.g. batch-processed) conversion tasks to help developers<br>
detect and fix common patterns where the application definition is not completely fulfilled.<br>

In [None]:
# Selection of test cases: each key names a group of inputs and maps to a list whose entries are
# either a single file name or a tuple of file names that belong together.
# Entries are switched on/off by (un)commenting the respective line.
# NOTE(review): presumably the keys are parser names and this dict is consumed by a
# batch-conversion step that is not part of the cells shown here - confirm.
tests = {
    #"eln": [("eln_data.yaml", "em.oasis.specific.yaml")],

    #"nxs_nion": ["2022-02-18_Metadata_Kuehbach.zip"],
    #"rsciio_velox": ["258.fbbd9cdfcf8be1b1d6e056cdf4186a0505df9188bbf52ebe36e74e271a8c972b.emd", "265.2dccb74e742d9807d736689ee1bdeb149aabbbccbecab89ab0d5855678a2b338.emd"],
    #"image_tfs": ["ALN_baoh_021.tif", "ETD_image.tif", "T3_image.tif", "NavCam_normal_vis_light_ccd.tif", "0c8nA_3deg_003_AplusB_test.tif"],
    #"rsciio_gatan": ["262.b97850f6c6b100740813c34d20eda294ef66a6130daa42a6cf975c4572ff599c.dm3", "265.6a232b951fe99bb06c481e6863b5b590df194b0766c915218586c81077417e8a.dm4"],

    #"image_hitachi": [("360.tif", "360.txt")],
    #"image_jeol": [("20240227_A1_2m_0_FA3_1.tif", "20240227_A1_2m_0_FA3_1.txt")],
    #"image_zeiss": ["SE2.tif"],
    #"image_tescan": [("Ti3C2VacDriedFilm19.tif", "Ti3C2VacDriedFilm19.hdr"), "CZ04-2_102_Pic_2.tif"],
    #"image_point_electronic": ["Defekt1.tif"],
    #"image_protochips": ["ReductionOfFeOxSmall.zip"],
    #"image_fei_legacy": ["BF_02_40kx.tif", "SEM_Image_-_SliceImage_-_109.tif"],

    #"hfive_oxford": ["57764_CR_ZG_380.h5oina", "EBSDCleanedMapData7.h5oina", "173_0057.h5oina", "173_0056.h5oina"],  
    #"hfive_apex": ["InGaN_nanowires_map.edaxh5", "InGaN_nanowires_linescan.edaxh5", "InGaN_nanowires_spectra.edaxh5"]  # , "2023-08-16_Ni_NFDI.edaxh5", "AlGaO.h5", "VInP_108_L2.h5"],
    #"hfive_bruker": ["066_0025.h5"],
    #"hfive_edax": ["001_0050.h5", "229_2097.oh5"],

    "image_diffraction_pattern": ["original_data.zip"],
    #"conventions": ["em.conventions.yaml"],
    #"hfive_dreamthreed_legacy": ["067_0003.dream3d", "SmallIN100_Final.dream3d", "244_0014.dream3d"],
}

In [None]:
# --- configuration -----------------------------------------------------------
# Directory that is searched recursively for stderr.*.txt log files.
root_directory = f"{os.getcwd()}/log"
# If True, every reported WARNING type is printed together with the logs it occurs in.
verbose = True
# Log files (basename only) that are skipped entirely.
blacklist = ["stderr.nxs_nion.2022-02-18_Metadata_Kuehbach.zip.txt", "stderr.image_diffraction_pattern.original_data.zip.txt"]
# Log files (basename only) to restrict the analysis to; not applied by default,
# pass only=whitelist to collect_warnings() to activate it.
whitelist = ["stderr.nxs_nion.2022-02-18_Metadata_Kuehbach.zip.txt"]
# WARNING lines containing any of these substrings are excluded from the report.
ignored_tokens = ["is being written", "written without documentation"]


def collect_warnings(log_directory, skip=(), only=None):
    """Collect all WARNING lines from the stderr logs below log_directory.

    The directory tree is walked recursively. Only files whose name starts with
    "stderr." and ends with ".txt" are read (as UTF-8).

    Parameters
    ----------
    log_directory : str
        Root directory to search.
    skip : collection of str
        File names (basename only) that are not read.
    only : collection of str or None
        If not None, only files whose basename is in this collection are read.

    Returns
    -------
    dict
        Maps each distinct line that starts with "WARNING: " to the list of log
        file names in which it was found. A file name is listed once per
        occurrence of the line in that file.
    """
    found = {}
    for root, _dirs, files in os.walk(log_directory):
        for fname in files:
            if not (fname.startswith("stderr.") and fname.endswith(".txt")):
                continue
            if fname in skip:
                continue
            if only is not None and fname not in only:
                continue
            with open(os.path.join(root, fname), mode="r", encoding="utf8") as fp:
                txt = fp.read().replace("\r\n", "\n")  # windows to unix EOL conversion
            # empty lines and "#" comment lines can never start with "WARNING: ",
            # so no separate pre-filtering is needed
            for line in txt.split("\n"):
                if line.startswith("WARNING: "):
                    found.setdefault(line, []).append(fname)
    return found


def report_warnings(found, ignored=(), verbose=True):
    """Count, and optionally print, the WARNING types in found.

    Parameters
    ----------
    found : dict
        Mapping of WARNING line to list of log file names, as returned by
        collect_warnings().
    ignored : collection of str
        A WARNING line containing any of these substrings is neither printed
        nor counted.
    verbose : bool
        If True, print each counted WARNING with its number of occurrences,
        followed by the log file names with the leading "stderr." removed.

    Returns
    -------
    int
        Number of distinct WARNING lines that are not ignored. The count does
        not depend on verbose.
    """
    n_types = 0
    for message, fnames in found.items():
        if any(token in message for token in ignored):
            continue
        n_types += 1
        if verbose:
            print(f"{message}, {len(fnames)}")
            for entry in fnames:
                print(f"\t{entry[len('stderr.'):]}")
    return n_types


# --- run ---------------------------------------------------------------------
issues = collect_warnings(root_directory, skip=blacklist)
n_issues = report_warnings(issues, ignored=ignored_tokens, verbose=verbose)
print(f"There are {n_issues} types of WARNINGs across the set")