## Analyze parsed metadata from nionswift to improve mapping onto NXem for the nxs_nion parser of pynxtools-em

In [None]:
import yaml
import numpy as np
import re
import os
import json
import ast

# Show the current working directory so relative paths can be checked.
print(os.getcwd())

# Base directory of the data to analyze; replace CHANGEME before running.
# Forward slashes are mapped onto the separator of the running platform.
directory = "CHANGEME".replace("/", os.sep)
print(directory)
# Sample inputs for trying out the deserialization helpers of this notebook:
# serialized_axes = "[{'offset': -0.07421380365287618, 'scale': 7.247441762976189e-05, 'units': 'rad'}, {'offset': -0.07421380365287618, 'scale': 7.247441762976189e-05, 'units': 'rad'}]"
# serialized_dims = "(1,)"

## Variants of NXdata encountered and how to interpret them

In [None]:
def deserialize_to_list_of_axes(serialized: str) -> list[dict[str, float | int | str]]:
    """Deserialize into a list of dictionaries each representing a physical dimension axis."""
    deserialized = ast.literal_eval(serialized)
    any_dict_not_an_axis = False
    for obj in deserialized:
        if isinstance(obj, dict):
            if (
                all(key in obj for key in ("offset", "scale", "units"))
                and len(obj.keys()) == 3
            ):
                continue
        any_dict_not_an_axis = True
    if not any_dict_not_an_axis:
        return deserialized
    return []


# print(deserialize_to_list_of_axes(serialized_axes))

In [None]:
def deserialize_to_tuple_of_dimensions(serialized: str) -> tuple[int, ...]:
    """Deserialize to a typical return value as of np.shape().

    Args:
        serialized: Text of a Python literal, expected to be a tuple of
            integers such as "(1,)" or "(512, 512)".

    Returns:
        The tuple of integers. An empty tuple is returned when the text is not
        a valid literal, when the literal is not a tuple, or when any entry is
        not an integer.
    """
    try:
        deserialized = ast.literal_eval(serialized)
    except (ValueError, TypeError, SyntaxError):
        # Not a parsable literal, e.g. an empty or truncated string.
        return ()
    if isinstance(deserialized, tuple) and all(
        # bool is a subclass of int but never a valid array extent
        isinstance(value, int) and not isinstance(value, bool)
        for value in deserialized
    ):
        return deserialized
    return ()


# print(deserialize_to_tuple_of_dimensions(serialized_dims))

In [None]:
def tokenize_axes(list_of_axes: list[dict[str, float | int | str]]) -> str:
    """Create a token of the sorted axes to identify the sub-space in phase space."""
    token = []
    for axis in list_of_axes:
        if axis["units"] == "":
            token.append("unitless")
        else:
            token.append(axis["units"])
    return ";".join(token)


# print(tokenize_axes(deserialize_to_list_of_axes(serialized_axes)))

In [None]:
# Scan all *.log files below <directory>/log and count how often each sequence
# of axis units occurs in the logged dimensional calibrations.
logfiles = {}
stats_axes = {}  # statistics of how many datasets use specific combinations of physical dimension axes in sequence
# Compiled once instead of once per line; group 1 captures the serialized
# list of axis dictionaries that follows the dimensional_calibrations keyword.
calibration_pattern = re.compile(
    r"^[A-Z]+\s+\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.[+-]\d{4} ndata, metadata\.json, flat___dimensional_calibrations___*?(\[(?:\s*\{[^{}]*?:[^{}]*?\}\s*,?)+\])"
)
for root, dirs, files in os.walk(os.path.join(directory, "log")):
    for file in files:
        if not file.endswith(".log"):
            continue
        # os.path.join uses the platform separator and leaves the rest of the
        # path untouched (no collapsing of doubled separators).
        fpath = os.path.join(root, file)
        print(fpath)
        with open(fpath) as fp:
            # Iterate the file lazily rather than loading every line up front.
            for dirty_line in fp:
                match = calibration_pattern.search(dirty_line.strip())
                if match is None:
                    continue
                # Axes that fail validation deserialize to an empty list and
                # are therefore counted under the empty token "".
                keyword = tokenize_axes(deserialize_to_list_of_axes(match.group(1)))
                stats_axes[keyword] = stats_axes.get(keyword, 0) + 1
        # Not yet active: register each project log in logfiles.
        # project_log_fpath, mime_type = fpath.split(".")
        # if project_log_fpath not in logfiles:
        #     logfiles[project_log_fpath] = {"log": project_log_fpath}
        # else:
        #     raise KeyError(f"Such a project_log_fpath exists already!")
print("Batch processing successful")

In [52]:
# Rank the unit combinations by frequency and print, per combination, its
# count, the cumulative fraction of all counted datasets, and its token.
total_cnts = sum(stats_axes.values())
pivot_dims = sorted(
    [(count, token) for token, count in stats_axes.items()],
    key=lambda entry: entry[0],
    reverse=True,
)
sum_cnts = 0
for cnts, key in pivot_dims:
    sum_cnts += cnts
    fraction = np.around(sum_cnts / total_cnts, decimals=4)
    print(f"{cnts}\t{fraction}\t{key}")

9694	0.388	nm;nm
4138	0.5537	eV
3049	0.6757	rad;rad
2677	0.7829	unitless;eV
1140	0.8285	unitless;nm;nm
936	0.866	nm;nm;eV
649	0.892	unitless
561	0.9144	rad;eV
341	0.9281	unitless;unitless
327	0.9412	1/nm;1/nm
276	0.9522	eV;nm;nm
207	0.9605	nm
202	0.9686	rad
194	0.9763	unitless;unitless;eV
137	0.9818	unitless;rad;eV
132	0.9871	unitless;unitless;unitless
115	0.9917	iteration
59	0.9941	nm;nm;rad;eV
51	0.9961	unitless;nm;nm;eV
33	0.9974	nm;nm;unitless;eV
14	0.998	1/;1/
12	0.9985	unitless;nm
8	0.9988	nm;eV
7	0.9991	unitless;unitless;unitless;unitless
7	0.9994	nm;nm;rad;rad
6	0.9996	nm;rad;eV
4	0.9998	1/nm
2	0.9998	1/rad;1/rad
1	0.9999	unitless;unitless;unitless;unitless;unitless
1	0.9999	unitless;rad;rad
1	1.0	1/eV
1	1.0	nm;nm;rad
