In [None]:
import os
import sys
import pandas as pd
from pynxtools_apm.utils.versioning import __version__
from pynxtools_apm.examples.get_sha256_of_directories import SEPARATOR
from pynxtools_apm.examples.oasisb_utils import APT_MIME_TYPES, CAMECA_ROOT_MIME_TYPES

# TODO parameterize
# The data root is taken from a small text file next to this notebook; forward
# slashes in it are mapped to the separator of the current platform.
with open("root_directory.txt") as fp:
    root_text = fp.read()
src_directory = root_text.strip().replace("/", os.sep)
print(src_directory)

# Line that marks where the table starts inside a *.sha256.results.csv file.
header = "file_path:archive_path;byte_size;unix_mtime;sha256sum"

In [None]:
# Provenance information about the environment in which this notebook runs.
config: dict[str, str] = {
    "python_version": sys.version.replace(" ", "_"),
    "working_directory": os.getcwd(),
    "pynxtools_apm version": f"{__version__}",
    # "directory": src_directory,  # sys.argv[1]
}

# Overview spreadsheet; the loop further down reads the project_name and parse
# columns of each of its rows.
legacy_overview_path = os.sep.join((src_directory, "data", "aaa_legacy_data.ods"))
ods = pd.read_excel(
    legacy_overview_path,
    sheet_name="aaa_legacy_data",
    engine="odf",
)

# Expected number of entries, accumulated separately per parse value (1 or 2).
cnt: dict[int, int] = dict.fromkeys((1, 2), 0)
# If True, only the names of the hashed files of a project are printed.
just_display_files_from_project = False

# Projects to process in this run. This has to be a real tuple (mind the
# trailing comma): a bare parenthesised string would silently turn the
# membership test below into a substring test.
selected_projects = ("mar_tomographic_hellman",)
# Suffixes of the files that are considered; built once instead of per row.
# NOTE(review): assumes both constants hold lower-case suffixes — confirm in oasisb_utils.
relevant_suffixes = tuple(APT_MIME_TYPES + CAMECA_ROOT_MIME_TYPES)
# Columns of the per-project workflow sheet that may reference a hashed file.
workflow_columns = (
    "str_rraw",
    "rhit_hits",
    "root",
    "pos_epos_apt_ato_csv",
    "rng_rrng_fig_env",
    "hdf_xml_nxs_raw_ops",
)

# For each selected project: load its hash table, map the relevant files to their
# sha256 sums, and count how many distinct entries its workflow sheet would yield.
for row in ods.itertuples(index=True):
    if row.project_name not in selected_projects:
        continue
    if row.parse not in (1, 2):
        continue

    print(f"project{SEPARATOR}{row.project_name}{SEPARATOR}hashing...")
    # NOTE(review): sub_directory is not used in this cell; kept in case later code reads it.
    sub_directory = f"{src_directory}{os.sep}data{os.sep}{row.project_name}"
    project_csv = (
        f"{src_directory}{os.sep}data{os.sep}{row.project_name}.sha256.results.csv"
    )
    # The csv may start with preamble lines; locate the line holding the header
    # so that pandas can skip everything in front of it.
    with open(project_csv, "r") as fp:
        start = next((idx for idx, text in enumerate(fp) if header in text), None)
    if start is None:
        raise ValueError(
            f"{project_csv} does not contain the header line {header!r}"
        )
    df_hash = pd.read_csv(project_csv, sep=";", skiprows=start)
    del start, project_csv
    df_hash.columns = ["path", "size", "mtime", "sha256"]

    # multiple checks for consistency
    # i) which files with atom-probe-specific mimetype are listed in df_hash,
    #    grouped by content hash so that content duplicates end up in one list
    hash_to_file: dict[str, list[str]] = {}
    for line in df_hash.itertuples(index=True):
        if line.path.lower().endswith(relevant_suffixes):
            hash_to_file.setdefault(line.sha256, []).append(line.path)
    del df_hash

    # summarize these uniques, for the demo we only want to have a dataset in
    # NOMAD once per project
    file_to_hash: dict[str, str] = {}
    for hash_value, file_name_list in hash_to_file.items():
        for file_name in file_name_list:
            if file_name not in file_to_hash:
                file_to_hash[file_name] = hash_value
            else:
                # the same path was listed more than once in the hash table
                print(f"{file_name} has already been added before")

    if just_display_files_from_project:
        for file_name in file_to_hash:
            print(file_name)
        continue

    project_ods = f"{src_directory}{os.sep}data{os.sep}{row.project_name}.ods"
    df_workflow = pd.read_excel(
        project_ods,
        sheet_name=row.project_name,
        engine="odf",
    ).fillna("")
    del project_ods

    # ii) are all the files that are referred to in each row, i.e., puzzled
    #     together workflow, in f"{row.project_name}.ods" listed in the hash table?
    # TODO: decide whether files with content duplicates (several paths in one
    #       hash_to_file list) should be replaced by the first path of that list
    n_entries = 0
    composite_hashes: set[str] = set()
    for line in df_workflow.itertuples(index=True):
        hash_parts: list[str] = []
        for col_name in workflow_columns:
            value = getattr(line, col_name)
            if value == "":
                continue
            if value in file_to_hash:
                hash_parts.append(file_to_hash[value])
            else:
                print(f">>>> {value} from column {col_name} has not been hashed")
        if hash_parts:
            # rows that reference files with identical content, in the same
            # column order, produce the same composite hash
            composite_hash = "".join(f"{part}_" for part in hash_parts)
            if composite_hash not in composite_hashes:
                composite_hashes.add(composite_hash)
                n_entries += 1
            else:
                print(
                    f"REMOVE {line.Index} from {row.project_name}.ods as it will likely create an entry containing duplicated data"
                )
            del composite_hash
        del hash_parts
    del composite_hashes

    print(f">>>> {row.project_name} has {n_entries} entries")
    if n_entries == 0:
        # nothing matched: list the hashed files to ease fixing the workflow sheet
        print("\n")
        for file_name in file_to_hash:
            print(f"{file_name}")
        print("\n")
    cnt[row.parse] += n_entries
    del n_entries
    del file_to_hash, hash_to_file
print(f"{cnt} entries expected in total")