In [7]:
# imports
import re
import pandas as pd
import numpy as np
import json
from typing import Any
import subprocess
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [8]:
# Load the dataset description (views and sources) from its JSON file.
# NOTE(review): the path is specific to one machine; adjust it before running elsewhere.
DATASET_JSON = '/home/hellgoth/Documents/work/projects/culture-collections_project/culture-collections/data/new_names/dataset.json'
with open(DATASET_JSON) as dataset_file:
    dataset = json.load(dataset_file)

In [9]:
# General checks on the views table: number of views, presence of the
# "default" view, and number of distinct view names.
views = dataset["views"]
print(len(views))
print("default" in views)
print(len(set(views)))

1636
True
1636


In [10]:
# Load the tab-separated abbreviation table; every cell is read as a string.
with open("abbreviations.csv") as abbreviations_file:
    abbreviations = pd.read_csv(abbreviations_file, sep="\t", dtype=str)

# One list of allowed values per column, with missing entries dropped.
# These lists are what is_valid_name() checks the parts of a name against.
ids, fixes, channels, dates, researchers, extra = (
    abbreviations[column].dropna().tolist()
    for column in ("ids", "fixes", "channels", "dates", "researchers", "extra")
)

def is_valid_name(name: str) -> bool:
    """Check a name against the module-level abbreviation lists.

    The name is split on "_" into: id, fix, "-"-joined channels, date,
    researcher and an optional sixth part. Each part must be contained in the
    matching list (``ids``, ``fixes``, ``channels``, ``dates``,
    ``researchers``, ``extra``). Parts after the sixth are not inspected.

    Args:
        name: The "_"-separated name to validate.

    Returns:
        True when every inspected part is known. Otherwise the first problem
        found is printed and False is returned.
    """

    def reject(message: str) -> bool:
        # Report the problem and hand back the failure value in one step.
        print(message)
        return False

    parts = name.split("_")
    if len(parts) < 5:
        return reject(f"{name} is too short, must be at least 5 parts")

    sample_id, fix, channel_field, date, researcher = parts[:5]
    if sample_id not in ids:
        return reject(f"{name} has invalid id {sample_id}")
    if fix not in fixes:
        return reject(f"{name} has invalid fix {fix}")

    # Only the first unknown channel is reported.
    unknown_channels = [ch for ch in channel_field.split("-") if ch not in channels]
    if unknown_channels:
        return reject(f"{name} has invalid channel {unknown_channels[0]}")

    if date not in dates:
        return reject(f"{name} has invalid date {date}")
    if researcher not in researchers:
        return reject(f"{name} has invalid researcher {researcher}")

    # The sixth part is optional; it is validated only when present.
    if parts[5:] and parts[5] not in extra:
        return reject(f"{name} has invalid extra {parts[5]}")
    return True

In [16]:
# Expected display color for each known channel name.
COLORS = {
    **dict.fromkeys(("nhs", "bod"), "white"),
    "tub": "magenta",
    "cetn": "yellow",
    "dna": "blue",
    "npc": "green",
}
# Colors handed out, in this order, to channels that have no entry in COLORS.
ALT_COLORS = ["orange", "red"]

In [17]:
# List the entries below the ome-zarr prefix with the MinIO command line client (`mc`).
# Number of characters `mc ls` prints in front of each entry name. The value is
# the one this notebook has always stripped.
# NOTE(review): assumes the usual "[date time zone] size " layout; confirm for
# the installed mc version.
MC_LS_PREFIX_WIDTH = 34

mc_listing = subprocess.check_output(
    ["mc", "ls", "culcol_s3_ro/culture-collections/data/single_volumes/images/ome-zarr/"],
    text=True,
)

# Strip the prefix from every line on its own. Replacing the first line's prefix
# throughout the text (as before) left it in place on any line whose
# timestamp differed, and splitting on "/\n" fused entries lacking a trailing "/".
all_files = []
for listing_line in mc_listing.splitlines():
    entry = listing_line[MC_LS_PREFIX_WIDTH:]
    if not entry:
        continue
    # Directory-like entries are printed with a trailing "/", which is not part of the name.
    all_files.append(entry[:-1] if entry.endswith("/") else entry)

print(len(all_files))
len(all_files) == len(set(all_files))  # check for duplicates

1240


True

In [18]:
import os

from minio import Minio


# Client for the S3 endpoint that hosts the culture-collections bucket.
# The credentials can be supplied through the environment so they need not live
# in the notebook. The fallbacks are the values that were hard-coded here before,
# which keeps the notebook runnable without any setup.
# NOTE(review): the fallback key looks like a public read-only key, judging by
# its name. Confirm before sharing, and drop the fallbacks if it is not.
client = Minio(
    "s3.embl.de",
    access_key=os.environ.get("CULCOL_S3_ACCESS_KEY", "CulColROPubKey"),
    secret_key=os.environ.get("CULCOL_S3_SECRET_KEY", "ReadCultureCollections092023"),
)

In [20]:
# Validate every view of the dataset: view and source names, display settings,
# source metadata and the presence of the data in the S3 bucket.
# Each problem found is reported with print() so one run lists them all.

# Contrast limits equal to one of these are reported as "not set".
UNSET_CONTRAST_LIMITS = ([0, 2**8], [0, 2**16])
# A source name ending in "_ch<N>" carries the channel index N of that source.
CHANNEL_SUFFIX = re.compile(r"ch(\d+)")
# Set membership instead of scanning the whole listing once per source.
known_remote_files = set(all_files)

# iterate through views
for view_name, view in dataset["views"].items():
    # skip default view
    if view_name == "default":
        continue

    # check if view name is valid
    if not is_valid_name(view_name):
        print(f"Invalid name found: {view_name}")
        continue

    # The third "_"-separated part of a valid view name lists its channels,
    # joined by "-"; it is the same for every source display of the view.
    channels_from_name = view_name.split("_")[2].split("-")

    # iterate through sourceDisplays
    # color_counter counts the channels without a COLORS entry seen so far in this view.
    color_counter = 0
    for i, source_display in enumerate(view["sourceDisplays"]):
        image_display = source_display["imageDisplay"]

        # check contrast limits
        cl = image_display["contrastLimits"]
        if cl in UNSET_CONTRAST_LIMITS:
            print(f"Contrast limits for {view_name} are not set: {cl}")

        # check channel order: the i-th display must show the i-th channel of the name.
        # Comparing by position (rather than list.index) cannot raise for a channel
        # missing from the name and handles a channel name that occurs twice.
        channel_name_from_dataset = image_display["name"]
        channel_expected_here = (
            channels_from_name[i] if i < len(channels_from_name) else None
        )
        if channel_name_from_dataset != channel_expected_here:
            print(
                f"Channel order mismatch in {view_name}: {channel_name_from_dataset} not at index {i}"
            )

        # check channel color
        if channel_name_from_dataset == "na":
            exp_color = "black"
        elif channel_name_from_dataset == "bod" and "nhs" in channels_from_name:
            exp_color = "orange"
        elif channel_name_from_dataset in COLORS:
            exp_color = COLORS[channel_name_from_dataset]
        elif color_counter < len(ALT_COLORS):
            exp_color = ALT_COLORS[color_counter]
            color_counter += 1
        else:
            # More unknown channels than fallback colors: report it instead of
            # failing with an IndexError, and skip the color comparison.
            exp_color = None
            print(
                f"No fallback color left in ALT_COLORS for channel {channel_name_from_dataset} in {view_name}"
            )
        if exp_color is not None and image_display["color"] != exp_color:
            print(
                f"Color mismatch in {view_name} for channel {channel_name_from_dataset}: expected {exp_color}, got {image_display['color']}"
            )

        # check source name
        source_names = image_display["sources"]
        if len(source_names) != 1:
            print(f"Should have only one source: {view_name}")
            continue
        source_name = source_names[0]
        # Split off an optional "_ch<N>" suffix. Requiring digits after "ch" keeps
        # a last part that merely starts with "ch" from being parsed as an index.
        base_name, _, last_part = source_name.rpartition("_")
        suffix_match = CHANNEL_SUFFIX.fullmatch(last_part)
        exp_channel_idx = int(suffix_match.group(1)) if suffix_match else None
        if not is_valid_name(base_name if suffix_match else source_name):
            print(f"Invalid source name found: {source_name}")

        # jump to source
        source = dataset["sources"].get(source_name)
        if source is None:
            print(f"Source {source_name} of {view_name} not found in dataset sources")
            continue
        local_data = source["image"]["imageData"]["ome.zarr"]
        remote_data = source["image"]["imageData"]["ome.zarr.s3"]

        # check channel idx
        if exp_channel_idx is not None:
            # .get() so that a missing "channel" entry is reported as a mismatch.
            local_channel = local_data.get("channel")
            remote_channel = remote_data.get("channel")
            if not (local_channel == remote_channel == exp_channel_idx):
                print(
                    f"Channel index mismatch in {view_name} for source {source_name}: expected {exp_channel_idx}, got {local_channel} (ome.zarr) and {remote_channel} (ome.zarr.s3)"
                )
        else:  # Hiral's case: source names without a channel suffix
            if "channel" in local_data or "channel" in remote_data:
                print(
                    f"Channel index found in {view_name} for source {source_name} but not expected"
                )

        # check for valid file name
        # The fixed positions below follow the layout of the stored paths:
        # third component of the relative path, ninth and tenth of the S3 address.
        address_parts = remote_data["s3Address"].split("/")
        local_file_name = local_data["relativePath"].split("/")[2]
        remote_file_name = address_parts[8]
        if local_file_name != remote_file_name:
            print(
                f"File name mismatch in {view_name} for source {source_name}: {local_file_name} != {remote_file_name}"
            )
        # TODO: validating the file name itself is currently disabled:
        # if not is_valid_name(local_file_name.replace(".ome.zarr", "")):
        #     print(f"Invalid source name found: {source_name}")

        # check for file existence
        if remote_file_name not in known_remote_files:
            print(f"File {local_file_name} not found in S3 bucket for {view_name}")

        # check that vol exists
        series = address_parts[9]
        resolution = 0
        timepoint = 0
        channel = exp_channel_idx if exp_channel_idx is not None else 0
        volume_prefix = f"data/single_volumes/images/ome-zarr/{remote_file_name}/{series}/{resolution}/{timepoint}/{channel}/"
        # One object under the prefix is enough; do not list the whole volume.
        first_object = next(
            iter(client.list_objects("culture-collections", prefix=volume_prefix)),
            None,
        )
        if first_object is None:
            print(
                f"Volume {remote_file_name}/{series}/{resolution}/{timepoint}/{channel} not found in S3 bucket for {view_name}"
            )

Contrast limits for rcc999rcc2538_pfa_dna-tub-nhs_20230728_csd_04 are not set: [0, 256]
Contrast limits for rcc10709_pfa_dna-tub-nhs_20230829_csd_07 are not set: [0, 256]
