## Analyze parsed metadata from nionswift to improve mapping onto NXem for the nxs_nion parser of pynxtools-em

In [None]:
import yaml
import numpy as np
import re
import os
import json
from ast import literal_eval
from ase.data import chemical_symbols
import pandas as pd

# Show the current working directory of the kernel.
print(os.getcwd())

# Root directory under which the cells below look for script/, log/ and the
# metadata spreadsheet; "CHANGEME" is a placeholder written with "/" that is
# converted to the separator of the running platform.
directory = "CHANGEME".replace("/", os.sep)
print(directory)
# serialized_axes = "[{'offset': -0.07421380365287618, 'scale': 7.247441762976189e-05, 'units': 'rad'}, {'offset': -0.07421380365287618, 'scale': 7.247441762976189e-05, 'units': 'rad'}]"
# serialized_dims = "(1,)"

### Variants of NXdata taken and how to interpret these

In [2]:
def deserialize_to_list_of_axes(serialized: str) -> list[dict[str, float | int | str]]:
    """Deserialize into a list of dictionaries each representing a physical dimension axis."""
    deserialized = literal_eval(serialized)
    any_dict_not_an_axis = False
    for obj in deserialized:
        if isinstance(obj, dict):
            if (
                all(key in obj for key in ("offset", "scale", "units"))
                and len(obj.keys()) == 3
            ):
                continue
        any_dict_not_an_axis = True
    if not any_dict_not_an_axis:
        return deserialized
    return []


# print(deserialize_to_list_of_axes(serialized_axes))

In [3]:
def deserialize_to_tuple_of_dimensions(serialized: str) -> tuple[int]:
    """Deserialize a Python-literal string into a shape-like tuple of integers.

    Returns the evaluated tuple when the literal is a tuple whose members are
    all integers, otherwise an empty tuple.
    """
    candidate = literal_eval(serialized)
    if not isinstance(candidate, tuple):
        return ()
    for extent in candidate:
        if not isinstance(extent, int):
            return ()
    return candidate


# print(deserialize_to_tuple_of_dimensions(serialized_dims))

In [4]:
def tokenize_axes(list_of_axes: list[dict[str, float | int | str]]) -> str:
    """Create a token of the sorted axes to identify the sub-space in phase space."""
    token = []
    for axis in list_of_axes:
        if axis["units"] == "":
            token.append("unitless")
        else:
            token.append(axis["units"])
    return ";".join(token)


# print(tokenize_axes(deserialize_to_list_of_axes(serialized_axes)))

In [None]:
nsproj_to_eln = {}
# identify for which projects we should have collected logfiles
with open(
    f"{directory}{os.sep}script{os.sep}nsproj_to_eln.yaml", encoding="utf-8"
) as fp:
    # safe_load yields None for an empty document, fall back to an empty mapping
    nsproj_to_eln = yaml.safe_load(fp) or {}
# Invert the mapping: the first nsproj listed for an ELN file is kept, every
# further nsproj pointing to the same ELN file is only reported.
eln_to_nsproj = {}
# Bind the loop variables up front so that the clean-up via del below does not
# raise a NameError when the mapping is empty.
nsproj, eln = None, None
for nsproj, eln in nsproj_to_eln.items():
    if eln not in eln_to_nsproj:
        eln_to_nsproj[eln] = nsproj
    else:
        print(f"WARNING::Duplicated key for {nsproj} !")
del nsproj, eln

<div style="padding:10px; border-left:5px solid #f39c12; background:#fdf5e6;">
<b>⚠️ Warning:</b> In the future, we need to prefix everything with project IDs, because users tend to keep copies of nsproj files that have the same binary content!
</div>

### Run and analyze projects that have not yet been touched

In [7]:
# Switch for this cell and the two following ones; when False their bodies are skipped.
analyze_untouched_projects = False
if (
    analyze_untouched_projects
):  # identify for which projects we should collect a logfile
    # names of the logfiles that are expected, one per distinct ELN file name
    nsproj_expected: set[str] = set()
    with open(f"{directory}{os.sep}script{os.sep}nsproj_to_eln.yaml") as fp:
        nsproj_to_eln = yaml.safe_load(fp)
        # identify all possible project, based on step02.*.ods configuration file from run02
        for remote_nsproj_rel_fpath, remote_eln_abs_fpath in nsproj_to_eln.items():
            # turn the ELN file path into a logfile name: drop the scratch
            # directory prefix and the ".eln_data.yaml" suffix, append ".log"
            clean_hash = f"{remote_eln_abs_fpath.replace('/scratch/pynxtools-em/', '').replace('.eln_data.yaml', '')}.log"
            # print(clean_hash)
            if clean_hash not in nsproj_expected:
                nsproj_expected.add(clean_hash)
            else:
                # The string literal below is a no-op expression statement; it
                # holds a disabled filter with nsproj paths that is not evaluated.
                """
                if remote_nsproj_rel_fpath not in ("../../nion_data/Haas/GaN_NWs_N3757_holey_C_new_60kV/GaN_NWs_N3757_holey_C_new_60kV.nsproj",
                                                   "../../nion_data/Haas/2020-12-01_SbOx_6M/.ipynb_checkpoints/2020-12-01_SbOx_6M-checkpoint.nsproj",
                                                   "../../nion_data/Haas/2020-12-01_SbOx_6M/2020-12-01_SbOx_6M.nsproj",
                                                   "../../nion_data/Haeusler/06Oct2022_BTO-STO-Trilayer.nsproj",
                                                   "../../nion_data/Haeusler/2022/01Nov2022_STO-BTO-Superlattice/01Nov2022_STO-BTO-Superlattice.nsproj",
                                                   "../../nion_data/Haeusler/2022/02Nov2022_STO-BTO-Superlattice Data/02Nov2022_STO-BTO-Superlattice_200kV.nsproj",
                                                   "../../nion_data/Haeusler/2024/MBE_060/Lamelle_1/29Jan_2024_DPC Data/2024_29Jan_MBE060_Lamelle_1.nsproj",
                                                   "../../nion_data/Haeusler/A1212_22-06-2021/A1212_22-06-2021.nsproj",
                                                   "../../nion_data/Haeusler/A1213_22-09-2021/20210922 sample A.nsproj",
                                                   "../../nion_data/Haeusler/A1213_22-09-2021/20210923 sample A.nsproj",
                                                   "../../nion_data/Haeusler/BTO lamella 2/Idle.nsproj",
                                                   "../../nion_data/Haeusler/Ga2O3_Lamelle_2/02March2023_Ga2O3_4D-STEM/02March2023_Ga2O3_4D-STEM.nsproj",
                                                   "../../nion_data/Haeusler/Ga2O3_Lamelle_2/15Dez2022_Ga2O3.nsproj",
                                                   "../../nion_data/Haeusler/Ga2O3_Lamelle_2/27Feb2023_Ga2O3 Data/27Feb2023_Ga2O3.nsproj",
                                                   "../../nion_data/Haeusler/HU1-2422-GaP/HU1-2422_GaP_05Juni2023.nsproj",
                                                   "../../nion_data/Haeusler/Superlattice_STO_BTO/STO_BTO_Lamelle_July2023/Lamelle_1/STO_BTO_July2023_Lamelle_1.nsproj",
                                                   "../../nion_data/Haeusler/Superlattice_STO_BTO/STO_BTO_Lamelle_July2023/Lamelle_1/STO_BTO_Lamelle_1_18July2023.nsproj",
                                                   "../../nion_data/Haeusler/Superlattice_STO_BTO/STO_BTO_Lamelle_July2023/Lamelle_2/STO_BTO_Lamelle_2_18July2023.nsproj",
                                                   "../../nion_data/Haeusler/Superlattice_STO_BTO/STO_BTO_Lamelle_July2023/Lamelle_2/STO_BTO_Lamelle_2_19July2023.nsproj",
                                                   "../../nion_data/Haeusler/b-Ga2O3_Sam/b-Ga2O3_Sam_29June2023.nsproj",
                                                   "../../nion_data/Hongguang/New folder/20220615 HZO 3nm 01.nsproj",
                                                   "../../nion_data/Nerl/23-05-26_hBN-G-Ag_het_E5_box_737.nsproj",
                                                   "../../nion_data/Nerl/23-11-24_WSe2_box 12634/23-11-24_WSe2_box 12634.nsproj",
                                                   "../../nion_data/Nerl/25-09-19-AlGaBiAs-EELS/AlGaBiAs_EELS_20250919.nsproj",
                                                   "../../nion_data/Nerl/Nerl-20-2-25_d1+2/25-02-20_Al-bowtie_nerl.nsproj",
                                                   "../../nion_data/Nerl/Nerl-20-2-25_d3/25-02-25-al G hbn nerl.nsproj",
                                                   "../../nion_data/Nerl/S14 Franz/25-02-26_Franz_S14.nsproj"):
                # accidentally .ipynb_checkpoints where included and these have sometimes duplicated nsproj files
                """
                # print(f"{remote_nsproj_rel_fpath}")
                # a second nsproj file maps onto a logfile name that is already expected
                print(f"WARNING::{remote_nsproj_rel_fpath} is a duplicated hash")
            del clean_hash
    print(
        f"{len(nsproj_expected)} projects ought to have logfiles as these are the only nsproj file, or a single instance of copies of the same nsproj file in different directories"
    )

In [8]:
if (
    analyze_untouched_projects
):  # identify for which projects we have already collected such a logfile
    # names of the logfiles found below the log directory
    nsproj_processed: set[str] = set()
    cnt = 0  # number of logfiles inspected
    for root, dirs, files in os.walk(f"{directory}{os.sep}log"):
        for file in files:
            if not file.endswith(".log"):
                continue
            # join with the platform separator so that the path is valid everywhere
            fpath = os.path.join(root, file)
            cnt += 1
            # print(cnt)
            # check if logfile has errors, report only the first line starting with ERROR;
            # undecodable bytes are replaced so that they cannot abort the scan
            with open(fpath, encoding="utf-8", errors="replace") as fp:
                for dirty_line in fp:
                    if dirty_line.startswith("ERROR"):
                        print(dirty_line)
                        break
            # the file name without its directory identifies the project
            clean_hash = file
            if clean_hash in nsproj_processed:
                raise ValueError("Duplicated hash")
            nsproj_processed.add(clean_hash)
            del clean_hash
    print(f"{len(nsproj_processed)} projects have logfiles")

In [9]:
if (
    analyze_untouched_projects
):  # given two sets return those from expected which are not in processed
    # Currently a no-op: the branch executes only pass, and the string literal
    # after it is an expression statement whose content is never evaluated.
    pass
    """
    nsproj_missing: str = nsproj_expected.difference(nsproj_processed)
    for entry in nsproj_missing:
        eln_data_yaml_file_missing = f"/scratch/pynxtools-em/{entry.replace('.log', '')}.eln_data.yaml"
        # print(eln_data_yaml_file_missing)
        for key, val in nsproj_to_eln.items():  # naive brute force ok, as not too many entries
            if eln_data_yaml_file_missing == f"{val.replace('.log', '')}":
                print(f">>>> {key} found")
                break
        del eln_data_yaml_file_missing
    """

In [10]:
# Axis-unit tokens that are distinguished as NXdata types. Each token lists the
# units of the axes separated by ";", with "unitless" standing for an empty unit
# (the format that tokenize_axes produces). The order appears to follow the
# frequency-sorted listing printed by the pivot cell further below.
nxdata_types = [
    "nm;nm",
    "eV",
    "nm;nm;eV",
    "unitless;nm;nm",
    "rad;rad",
    "unitless;eV",
    "unitless;nm;nm;eV",
    "unitless;unitless;eV",
    "unitless",
    "rad;eV",
    "nm;nm;unitless;eV",
    "unitless;unitless",
    "1/nm;1/nm",
    "unitless;rad;eV",
    "unitless;unitless;unitless",
    "nm;nm;rad;rad",
    "eV;nm;nm",
    "nm;nm;rad;eV",
    "unitless;rad;rad",
    "nm",
    "rad",
    "unitless;unitless;unitless;unitless",
    "iteration",
    "unitless;nm;nm;unitless;eV",
    "nm;nm;unitless;unitless",
    "1/;1/",
    "unitless;nm",
    "nm;eV",
    "nm;rad;eV",
    "nm;nm;unitless",
    "nm;nm;rad",
    "1/nm",
    "unitless;unitless;rad;rad",
    "unitless;unitless;unitless;unitless;unitless",
    "unitless;nm;nm;rad;rad",
    "1/rad;1/rad",
    "1/eV",
    "rad;rad;rad;rad",
]

In [12]:
# Rank the axis-unit tokens by their number of occurrences (most frequent
# first) and print per token: count, cumulative fraction of all counts, token.
pivot_dims = sorted(
    ((cnts, key) for key, cnts in stats_axes.items()),
    key=lambda pair: pair[0],
    reverse=True,
)
total_cnts = sum(cnts for cnts, _ in pivot_dims)
sum_cnts = 0
for cnts, key in pivot_dims:
    sum_cnts += cnts
    print(f"{cnts}\t{np.around(sum_cnts / total_cnts, decimals=4)}\t{key}")
    # print(f'''"{key}",''')

11275	0.3267	nm;nm
4161	0.4473	eV
3172	0.5392	nm;nm;eV
3138	0.6301	unitless;nm;nm
3053	0.7186	rad;rad
2686	0.7964	unitless;eV
1168	0.8303	unitless;nm;nm;eV
801	0.8535	unitless;unitless;eV
650	0.8723	unitless
633	0.8907	rad;eV
498	0.9051	nm;nm;unitless;eV
485	0.9192	unitless;unitless
328	0.9287	1/nm;1/nm
312	0.9377	unitless;rad;eV
300	0.9464	unitless;unitless;unitless
294	0.9549	nm;nm;rad;rad
276	0.9629	eV;nm;nm
222	0.9693	nm;nm;rad;eV
220	0.9757	unitless;rad;rad
207	0.9817	nm
202	0.9876	rad
183	0.9929	unitless;unitless;unitless;unitless
115	0.9962	iteration
44	0.9975	unitless;nm;nm;unitless;eV
14	0.9979	nm;nm;unitless;unitless
14	0.9983	1/;1/
12	0.9986	unitless;nm
8	0.9989	nm;eV
8	0.9991	nm;rad;eV
8	0.9993	nm;nm;unitless
7	0.9995	nm;nm;rad
4	0.9997	1/nm
4	0.9998	unitless;unitless;rad;rad
2	0.9998	unitless;unitless;unitless;unitless;unitless
2	0.9999	unitless;nm;nm;rad;rad
2	0.9999	1/rad;1/rad
1	1.0	1/eV
1	1.0	rad;rad;rad;rad


In [13]:
### Voting based on user name and atomtypes
user_name_aliases = [
    "March",
    "Kirmse",
    "Haas",
    "Fairman",
    "Kammerer",  # "Kammerer;Jochen",
    "Wagner",
    "Krivanek",
    "AEljarrat",
    "Repa",
    "Pekin",
    "Bruker",
    "McCauley",
    "Haeusler",
    "Coogan",
    "Elgvin",
    "Zhao",
    "Kochovski",
    "Wargulski",
    "Mueller",
    "Gladyshev",
    "Hongguang",
    "Mogilatenko",
]

# Per project (keyed by nsproj_fpath) one dictionary of 0/1 votes, once over
# the user name aliases and once over the chemical element symbols.
voting_based_on_user_name = {}
voting_based_on_atom_types = {}
nsprojects = pd.read_excel(
    f"{directory}{os.sep}step02_nion_data_metadata.ods", engine="odf"
)
project_id = 0
project_id_start = 1
project_id_end = 670
for row in nsprojects.itertuples(index=True):
    # only rows flagged with parse == 1 are counted as projects
    if row.parse != 1:
        continue
    project_id += 1
    # restrict the analysis to the configured window of project ids
    if not (project_id_start <= project_id <= project_id_end):
        continue

    # all aliases start with vote 0, the alias of the row (if known) gets vote 1
    user_votes = dict.fromkeys(user_name_aliases, 0)
    if row.user_name_alias in user_votes:
        user_votes[row.user_name_alias] = 1
    voting_based_on_user_name[row.nsproj_fpath] = user_votes

    # all element symbols (index 0 of chemical_symbols is skipped) start with vote 0
    atom_votes = dict.fromkeys(chemical_symbols[1:], 0)
    if isinstance(row.dirty_atom_types, str):
        # comma-separated entries, question marks removed, blanks ignored
        for raw_symbol in row.dirty_atom_types.replace("?", "").split(","):
            symbol = raw_symbol.strip()
            if symbol:
                atom_votes[symbol] = 1
    voting_based_on_atom_types[row.nsproj_fpath] = atom_votes
# print(voting_based_on_user_name)
# print(voting_based_on_atom_types)

### Evaluate the voting results to identify which projects to use for Christoph's talk on December 11th

In [46]:
# voting results analyzed for sampling one representative of each nxdata type from each colleague
def get_byte_size(nsproj_fpath):
    """Return total_size_bytes of the first nsprojects row with the given nsproj_fpath.

    Returns 0 when no row of nsprojects has that nsproj_fpath.
    """
    for entry in nsprojects.itertuples(index=True):
        if nsproj_fpath == entry.nsproj_fpath:
            return entry.total_size_bytes
    return 0


def byte_to_gib(byte_size: int) -> str:
    """Format a size in bytes as GiB rounded to three decimals, e.g. "1.5 GiB"."""
    gibibytes = byte_size / (1024**3)
    rounded = np.around(gibibytes, decimals=3)
    return f"{rounded} GiB"


nsproj_fpath_selection: set[str] = set()
max_project_bytes = 8 * (1024**3)
for nxdata_type in nxdata_types:
    # Take, per nxdata type, the first project that votes for this type, is
    # not selected yet, has at least one atom type vote, and does not exceed
    # the size limit; types without such a project contribute nothing.
    for nsproj_fpath, dict_with_votes in voting_based_on_expected_nxdata.items():
        if dict_with_votes[nxdata_type] == 0:
            continue
        if nsproj_fpath in nsproj_fpath_selection:
            continue
        if sum(voting_based_on_atom_types[nsproj_fpath].values()) == 0:
            continue
        if get_byte_size(nsproj_fpath) > max_project_bytes:
            continue
        # just pick the first one
        nsproj_fpath_selection.add(nsproj_fpath)
        # optionally also report: byte_to_gib(get_byte_size(nsproj_fpath)), nxdata_type
        print(f"{nsproj_fpath}")
        break
print(len(nsproj_fpath_selection))

../../nion_data/Haas/2020-07-28_HZO_2172/2020-07-28_HZO_2172.nsproj
../../nion_data/Haas/2021-09-16_SmTe3_T3/2021-09-16_SmTe3_T3.nsproj
../../nion_data/Wargulski/2022_12_21_CsPbIBr3_largeNPs_EELS_2ndTry/2022_12_21_CsPbIBr3_largeNPs_EELS_2ndTry.nsproj
../../nion_data/AEljarrat/2021_Au-NRs_Yuhang/20210812 Au-NRs bare.nsproj
../../nion_data/Haas/2021-01-25_Graphite_pumping/2021-01-25_Graphite_pumping.nsproj
../../nion_data/Haas/2021-04-16_AlScN_2nd/2021-04-16_AlScN_2nd.nsproj
../../nion_data/AEljarrat/BaSnO3 IKZ/20201208 PV01.nsproj
../../nion_data/Haas/2020-10-22_Sn-InN_5A/2020-10-22_Sn-InN_5A.nsproj
../../nion_data/Haas/2022-03-24_NbS3_D1_60kV/2022-03-24_NbS3_D1_60kV.nsproj
../../nion_data/Fairman/FeCr2O4_LARBED_13_11_23/FeCr2O4_LARBED_13_11_23.nsproj
../../nion_data/Haas/2025-02-10_N4071_GaN-NWs_60kV/2025-02-10_N4071_GaN-NWs_60kV.nsproj
../../nion_data/March/2021-11-22_Juri_Si_110/2021-11-22_Juri_Si_110.nsproj
../../nion_data/Haas/2023-09-26_CuS_nano-platelets_60kV/2023-09-26_CuS_nano-

In [None]:
### Voting which datasets to take