Update license of model cube and data preparation cube #180

Merged: 5 commits, Jun 30, 2022
Changes from all commits
55 changes: 52 additions & 3 deletions Task_2/mlcubes/data_prep/project/prepare.py
@@ -1,5 +1,8 @@
+import csv
 from pathlib import Path
+import random
 import shutil
+from typing import List
 from tqdm import tqdm
 
 
@@ -36,6 +39,51 @@ def copy_subject(subject_dir: Path, output_dir_data: Path, output_dir_labels: Pa
         break
 
 
+def get_validation_subjects(
+    data_path: Path,
+    split_path: Path,
+    val_size: float = 0.2,
+    min_val: int = 10,
+    seed=108493,
+) -> List[Path]:
+    subject_list = []
+    if split_path.exists():
+        with open(split_path, newline="", encoding="utf-8") as csvfile:
+            split_reader = csv.reader(csvfile)
+            for row in split_reader:
+                if str(row[0]) == "data_uid":
+                    continue
+                subject_dir = data_path / str(row[0])
+                if not subject_dir.exists():
+                    raise FileNotFoundError(
+                        f"The data folder {subject_dir} does not exist, but a corresponding subject was found in the validation split file. "
+                        f"Please contact the organizers!"
+                    )
+                subject_list.append(subject_dir)
+    else:
+        print(
+            f"WARNING: The file with data split information is not present. "
+            f"Performing automatic split with val_size={val_size}. "
+            f"Please contact the organizers if this causes errors."
+        )
+        all_subjects = []
+        for x in Path(data_path).iterdir():
+            # just to be sure there are no other folders that don't contain the actual data:
+            if x.is_dir() and len(list(x.glob("*.nii.gz"))) > 0:
+                all_subjects.append(x)
+        min_val = min(min_val, len(all_subjects))
+        num_val = max(int(val_size * len(all_subjects)), min_val)
+        random.seed(seed)
+        subject_list = random.sample(all_subjects, k=num_val)
+    print(
+        "Got {} subjects from the validation split: {}".format(
+            len(subject_list),
+            ", ".join([x.name for x in subject_list])
+        )
+    )
+    return subject_list
+
+
 def run_preparation(
     input_dir: str, output_data_dir: str, output_label_dir: str
 ) -> None:
@@ -44,7 +92,8 @@ def run_preparation(
     output_data_path.mkdir(parents=True, exist_ok=True)
     output_labels_path.mkdir(parents=True, exist_ok=True)
 
-    subject_list = [x for x in Path(input_dir).iterdir() if x.is_dir()]
-    print(f"Preparing {len(subject_list)} subjects...")
-    for subject_dir in tqdm(subject_list):
+    val_split_path = Path(input_dir) / "split_info" / "fets_phase2_split_1" / "val.csv"
+    subject_dir_list = get_validation_subjects(Path(input_dir), val_split_path)
+    print(f"Preparing {len(subject_dir_list)} subjects...")
+    for subject_dir in tqdm(subject_dir_list):
         copy_subject(subject_dir, output_data_path, output_labels_path)
87 changes: 49 additions & 38 deletions Task_2/mlcubes/data_prep/project/sanity_check.py
@@ -5,29 +5,15 @@
 import numpy as np
 
 
-def check_subject_validity(subject_dir: Path, labels_dir: Path) -> List[Path]:
-    """Runs a few checks to ensure data quality and integrity
+def check_subject_validity(
+    subject_dir: Path, labels_dir: Path
+) -> Tuple[List[Path], List[Path], List[Path], List[Path]]:
+    """Checks if all files exist. Also checks size, spacing and label set of images and mask.
     """
-    subject_valid = True
-    files_to_check = [
-        subject_dir / f"{subject_dir.name}_brain_t1.nii.gz",
-        subject_dir / f"{subject_dir.name}_brain_t1ce.nii.gz",
-        subject_dir / f"{subject_dir.name}_brain_t2.nii.gz",
-        subject_dir / f"{subject_dir.name}_brain_flair.nii.gz",
-        labels_dir / f"{subject_dir.name}_final_seg.nii.gz",
-    ]
-
-    # check existance
-    for file_ in files_to_check:
-        if not file_.exists():
-            subject_valid = False
-            print(f"Missing file: {file_}")
-    return subject_valid
-
-
-def check_subject_images(subject_dir: Path, labels_dir: Path) -> Tuple[List[Path], List[Path]]:
+    missing_files = []
     wrong_size = []
     wrong_spacing = []
+    wrong_labels = []
 
     files_to_check = [
         subject_dir / f"{subject_dir.name}_brain_t1.nii.gz",
@@ -37,31 +23,56 @@ def check_subject_images(subject_dir: Path, labels_dir: Path) -> Tuple[List[Path
labels_dir / f"{subject_dir.name}_final_seg.nii.gz",
]
# check image properties
BASE_SIZE = np.array([240, 240, 155])
BASE_SPACING = np.array([1.0, 1.0, 1.0])
EXPECTED_SIZE = np.array([240, 240, 155])
EXPECTED_SPACING = np.array([1.0, 1.0, 1.0])
EXPECTED_LABELS = {0, 1, 2, 4}
for file_ in files_to_check:
if not file_.exists():
missing_files.append(str(file_))
continue
image = sitk.ReadImage(str(file_))
size_array = np.array(image.GetSize())
spacing_array = np.array(image.GetSpacing())

if not (BASE_SIZE == size_array).all():
wrong_size.append(file_)
if not (BASE_SPACING == spacing_array).all():
wrong_spacing.append(file_)
return wrong_size, wrong_spacing
if not (EXPECTED_SIZE == size_array).all():
wrong_size.append(str(file_))
if not (EXPECTED_SPACING == spacing_array).all():
wrong_spacing.append(str(file_))
if file_.name.endswith("seg.nii.gz"):
arr = sitk.GetArrayFromImage(image)
found_labels = np.unique(arr)
if len(set(found_labels).difference(EXPECTED_LABELS)) > 0:
wrong_labels.append(str(file_))
return missing_files, wrong_size, wrong_spacing, wrong_labels


def run_sanity_check(data_path: str, labels_path: str):
check_successful = True
for curr_subject_dir in Path(data_path).iterdir():
if curr_subject_dir.is_dir():
assert check_subject_validity(
curr_subject_dir, Path(labels_path)
), f"Subject {curr_subject_dir.name} does not contain all modalities or segmentation."
wrong_size, wrong_spacing = check_subject_images(curr_subject_dir, Path(labels_path))
assert len(wrong_size) == 0, (
f"Image size is not [240,240,155] for {wrong_size}"
)
assert len(wrong_spacing) == 0, (
f"Image resolution is not [1,1,1] for {wrong_spacing}"
)
print("Finished")
(
missing_files,
wrong_size,
wrong_spacing,
wrong_labels,
) = check_subject_validity(curr_subject_dir, Path(labels_path))
if len(missing_files) > 0:
check_successful = False
print(
f"ERROR Files missing for subject {curr_subject_dir.name}:\n{missing_files}"
)
if len(wrong_size) > 0:
check_successful = False
print(f"ERROR: Image size is not [240,240,155] for:\n{wrong_size}")
if len(wrong_spacing) > 0:
check_successful = False
print(f"ERROR: Image resolution is not [1,1,1] for:\n{wrong_spacing}")
if len(wrong_labels) > 0:
check_successful = False
print(
f"ERROR: There were unexpected label values (not in [0, 1, 2, 4]) for:\n{wrong_labels}"
)
assert (
check_successful
), "The sanity check discovered error(s). Please check the log above for details."
print("Finished. All good!")
54 changes: 47 additions & 7 deletions Task_2/mlcubes/model/README.md

To test your installation, you can run any of the commands in [this section](#task-execution).

## How to modify this project

You can change each file in this project to add your own implementation. In particular, most participants will want to adapt the `Dockerfile`, `requirements.txt` and the code in `project/src`, and they should also add their model checkpoints to the container. Each place where modifications are possible is described in some detail below. We also made a short guide for converting BraTS docker submissions to the format used in FeTS [here](#guide-for-converting-brats-submissions). Here is an overview of the files in this project:

```bash
├── mlcube
└── utilities.py # Python utilities file that stores useful functions.
```

<details><summary><b>Requirements file </b></summary>
<p>

<details><summary><b>MLCube yaml file </b></summary>
<p>

`mlcube.yaml` contains instructions about the docker image and platform that will be used, information about the project (name, description, authors), and also the tasks defined for the project. **Note** that this file is not submitted and changes will hence not have any effect in the official evaluation. We will use the provided template with the name of your docker image instead. To change the name of your docker image, you can use the `docker.image` field in the `mlcube.yaml` or use `docker tag` after building it.

In the existing implementation you will find the `infer` task, which will be executed in the federated evaluation. It takes the following parameters:

</p>
</details>

More information on the internals of MLCube can be found in the official [git repository](https://github.com/mlcommons/mlcube) or [documentation](https://mlcommons.github.io/mlcube/).

## Task execution

Here we describe the simple commands required to build and run individual MLCubes, which is useful for debugging your submission.
Furthermore, predictions for test cases should be placed in an output directory. An example of loading images and saving segmentations is included in [`my_logic.py`](project/src/my_logic.py).
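For illustration, here is a minimal sketch of such input/output handling with SimpleITK. The all-background "prediction" and the output file name are placeholders, not the official convention; check `my_logic.py` for the required patterns. The input naming follows the scheme checked by the data preparation cube.

```python
from pathlib import Path

import numpy as np
import SimpleITK as sitk

MODALITIES = ["t1", "t1ce", "t2", "flair"]


def predict_subject(subject_dir: Path, output_dir: Path) -> None:
    subject_id = subject_dir.name
    # Input files follow the naming scheme <subject_id>_brain_<modality>.nii.gz
    images = {
        mod: sitk.ReadImage(str(subject_dir / f"{subject_id}_brain_{mod}.nii.gz"))
        for mod in MODALITIES
    }
    reference = images["t1"]
    # Placeholder "model": an all-background segmentation in the input geometry.
    seg_array = np.zeros(sitk.GetArrayFromImage(reference).shape, dtype=np.uint8)
    seg = sitk.GetImageFromArray(seg_array)
    seg.CopyInformation(reference)  # keep origin, spacing and direction
    # Assumed output name; consult my_logic.py for the expected pattern.
    sitk.WriteImage(seg, str(output_dir / f"{subject_id}_seg.nii.gz"))
```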


## Guide for converting BraTS submissions

This section is meant to help teams that already created a docker submission for BraTS 2021 convert it into a valid FeTS Task 2 submission. The first step is to download [this folder](.) and copy your code to `project/src`. Then, you will need to modify a few files:

- `mlcube.py`: You can write a simple wrapper that calls your original inference code for each test case. It could look similar to this:
```python
import subprocess
from pathlib import Path

import typer

app = typer.Typer()


@app.command("infer")
def infer(
    data_path: str = typer.Option(..., "--data_path"),
    output_path: str = typer.Option(..., "--output_path"),
    parameters_file: str = typer.Option(..., "--parameters_file"),
    ckpt_path: str = typer.Option(..., "--checkpoint_path"),
):
    # parameters_file is part of the required interface, even if unused here.
    if not Path(ckpt_path).exists():
        # For federated evaluation, the model needs to be stored here.
        print("WARNING: Checkpoint path not specified or doesn't exist. Using default path instead.")
        ckpt_path = "/mlcube_project/model_ckpts"

    for subject_dir in Path(data_path).iterdir():
        if subject_dir.is_dir():
            subject_id = subject_dir.name
            print("Processing subject {}".format(subject_id))
            # Run the code from the original BraTS submission.
            # TODO Make sure your code accepts input/output paths as arguments
            # (--input and --output) and that outputs from previous runs in the
            # output directory are not overwritten.
            single_case_cmd = [
                "<insert_your_entrypoint>",
                "--input", str(subject_dir),
                "--output", str(output_path),
            ]
            subprocess.run(single_case_cmd, check=True)


if __name__ == "__main__":
    app()
```
If your original entrypoint is a python script, you can of course also import it in `mlcube.py` instead of using a subprocess. It is important to keep the interface of the `infer` command unchanged.
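  As a minimal sketch of that variant, assuming (hypothetically) that your submission exposes a function `run_inference(input_dir, output_dir)` in a module `my_brats_model`, the per-subject loop in the wrapper above could become:

```python
# Hypothetical module and function names; substitute whatever your
# BraTS submission actually provides.
from my_brats_model import run_inference

for subject_dir in Path(data_path).iterdir():
    if subject_dir.is_dir():
        # Same per-subject loop as in the wrapper above, without a subprocess.
        run_inference(input_dir=str(subject_dir), output_dir=str(output_path))
```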

- `requirements.txt`: Update the python requirements.

- `Dockerfile`: Merge your Dockerfile with the one provided in [`project/Dockerfile`](./project/Dockerfile). It's important to make `mlcube.py` the entrypoint now, as in our Dockerfile. If possible, you should try to use the base image (`FROM` instruction) we suggest, to guarantee your container runs on various GPU setups.

- `model_ckpts`: Your model checkpoints have to be embedded in the docker image. Copy them here before building the image and make sure they are found by your script inside the container.
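  For instance, your inference code could resolve the embedded checkpoints as in this sketch, which assumes PyTorch and a placeholder file name `model.pt`; the default directory is the one used by the wrapper above:

```python
from pathlib import Path

import torch

# Default checkpoint location inside the container (see the wrapper above).
DEFAULT_CKPT_DIR = Path("/mlcube_project/model_ckpts")


def load_model(ckpt_dir: Path = DEFAULT_CKPT_DIR):
    # "model.pt" is a placeholder; use whatever file(s) you copied into
    # model_ckpts before building the image.
    ckpt_file = ckpt_dir / "model.pt"
    if not ckpt_file.exists():
        raise FileNotFoundError(f"No checkpoint found at {ckpt_file}.")
    return torch.load(ckpt_file, map_location="cpu")
```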

- `mlcube.yaml`: Insert your custom image name in the `docker.image` field.

After these changes, you should be able to run tests using the commands from [this section](#task-execution). Once these run without error, you're ready to [submit](https://www.synapse.org/#!Synapse:syn28546456/wiki/617255)!

## Project workflow

![MLCube workflow](https://i.imgur.com/qXRp3Tb.png)