Update license of model cube and data preparation cube #180

Merged: 5 commits, Jun 30, 2022
Changes from all commits
55 changes: 52 additions & 3 deletions Task_2/mlcubes/data_prep/project/prepare.py
@@ -1,5 +1,8 @@
+import csv
 from pathlib import Path
+import random
 import shutil
+from typing import List
 from tqdm import tqdm
 
 
@@ -36,6 +39,51 @@ def copy_subject(subject_dir: Path, output_dir_data: Path, output_dir_labels: Pa
         break
 
 
+def get_validation_subjects(
+    data_path: Path,
+    split_path: Path,
+    val_size: float = 0.2,
+    min_val: int = 10,
+    seed=108493,
+) -> List[Path]:
+    subject_list = []
+    if split_path.exists():
+        with open(split_path, newline="", encoding="utf-8") as csvfile:
+            split_reader = csv.reader(csvfile)
+            for row in split_reader:
+                if str(row[0]) == "data_uid":
+                    continue
+                subject_dir = data_path / str(row[0])
+                if not subject_dir.exists():
+                    raise FileNotFoundError(
+                        f"The data folder {subject_dir} does not exist, but a corresponding subject was found in the validation split file. "
+                        f"Please contact the organizers!"
+                    )
+                subject_list.append(subject_dir)
+    else:
+        print(
+            f"WARNING: The file with data split information is not present. "
+            f"Performing automatic split with val_size={val_size}. "
+            f"Please contact the organizers if this causes errors."
+        )
+        all_subjects = []
+        for x in Path(data_path).iterdir():
+            # just to be sure there are no other folders that don't contain the actual data:
+            if x.is_dir() and len(list(x.glob("*.nii.gz"))) > 0:
+                all_subjects.append(x)
+        min_val = min(min_val, len(all_subjects))
+        num_val = max(int(val_size * len(all_subjects)), min_val)
+        random.seed(seed)
+        subject_list = random.sample(all_subjects, k=num_val)
+    print(
+        "Got {} subjects from the validation split: {}".format(
+            len(subject_list),
+            ", ".join([x.name for x in subject_list])
+        )
+    )
+    return subject_list
+
+
 def run_preparation(
     input_dir: str, output_data_dir: str, output_label_dir: str
 ) -> None:
@@ -44,7 +92,8 @@ def run_preparation(
     output_data_path.mkdir(parents=True, exist_ok=True)
     output_labels_path.mkdir(parents=True, exist_ok=True)
 
-    subject_list = [x for x in Path(input_dir).iterdir() if x.is_dir()]
-    print(f"Preparing {len(subject_list)} subjects...")
-    for subject_dir in tqdm(subject_list):
+    val_split_path = Path(input_dir) / "split_info" / "fets_phase2_split_1" / "val.csv"
+    subject_dir_list = get_validation_subjects(Path(input_dir), val_split_path)
+    print(f"Preparing {len(subject_dir_list)} subjects...")
+    for subject_dir in tqdm(subject_dir_list):
         copy_subject(subject_dir, output_data_path, output_labels_path)
87 changes: 49 additions & 38 deletions Task_2/mlcubes/data_prep/project/sanity_check.py
@@ -5,29 +5,15 @@
 import numpy as np
 
 
-def check_subject_validity(subject_dir: Path, labels_dir: Path) -> List[Path]:
-    """Runs a few checks to ensure data quality and integrity
+def check_subject_validity(
+    subject_dir: Path, labels_dir: Path
+) -> Tuple[List[Path], List[Path], List[Path], List[Path]]:
+    """Checks if all files exist. Also checks size, spacing and label set of images and mask.
     """
-    subject_valid = True
-    files_to_check = [
-        subject_dir / f"{subject_dir.name}_brain_t1.nii.gz",
-        subject_dir / f"{subject_dir.name}_brain_t1ce.nii.gz",
-        subject_dir / f"{subject_dir.name}_brain_t2.nii.gz",
-        subject_dir / f"{subject_dir.name}_brain_flair.nii.gz",
-        labels_dir / f"{subject_dir.name}_final_seg.nii.gz",
-    ]
-
-    # check existance
-    for file_ in files_to_check:
-        if not file_.exists():
-            subject_valid = False
-            print(f"Missing file: {file_}")
-    return subject_valid
-
-
-def check_subject_images(subject_dir: Path, labels_dir: Path) -> Tuple[List[Path], List[Path]]:
+    missing_files = []
     wrong_size = []
     wrong_spacing = []
+    wrong_labels = []
 
     files_to_check = [
         subject_dir / f"{subject_dir.name}_brain_t1.nii.gz",
@@ -37,31 +23,56 @@ def check_subject_images(subject_dir: Path, labels_dir: Path) -> Tuple[List[Path
labels_dir / f"{subject_dir.name}_final_seg.nii.gz",
]
# check image properties
BASE_SIZE = np.array([240, 240, 155])
BASE_SPACING = np.array([1.0, 1.0, 1.0])
EXPECTED_SIZE = np.array([240, 240, 155])
EXPECTED_SPACING = np.array([1.0, 1.0, 1.0])
EXPECTED_LABELS = {0, 1, 2, 4}
for file_ in files_to_check:
if not file_.exists():
missing_files.append(str(file_))
continue
image = sitk.ReadImage(str(file_))
size_array = np.array(image.GetSize())
spacing_array = np.array(image.GetSpacing())

if not (BASE_SIZE == size_array).all():
wrong_size.append(file_)
if not (BASE_SPACING == spacing_array).all():
wrong_spacing.append(file_)
return wrong_size, wrong_spacing
if not (EXPECTED_SIZE == size_array).all():
wrong_size.append(str(file_))
if not (EXPECTED_SPACING == spacing_array).all():
wrong_spacing.append(str(file_))
if file_.name.endswith("seg.nii.gz"):
arr = sitk.GetArrayFromImage(image)
found_labels = np.unique(arr)
if len(set(found_labels).difference(EXPECTED_LABELS)) > 0:
wrong_labels.append(str(file_))
return missing_files, wrong_size, wrong_spacing, wrong_labels


def run_sanity_check(data_path: str, labels_path: str):
check_successful = True
for curr_subject_dir in Path(data_path).iterdir():
if curr_subject_dir.is_dir():
assert check_subject_validity(
curr_subject_dir, Path(labels_path)
), f"Subject {curr_subject_dir.name} does not contain all modalities or segmentation."
wrong_size, wrong_spacing = check_subject_images(curr_subject_dir, Path(labels_path))
assert len(wrong_size) == 0, (
f"Image size is not [240,240,155] for {wrong_size}"
)
assert len(wrong_spacing) == 0, (
f"Image resolution is not [1,1,1] for {wrong_spacing}"
)
print("Finished")
(
missing_files,
wrong_size,
wrong_spacing,
wrong_labels,
) = check_subject_validity(curr_subject_dir, Path(labels_path))
if len(missing_files) > 0:
check_successful = False
print(
f"ERROR Files missing for subject {curr_subject_dir.name}:\n{missing_files}"
)
if len(wrong_size) > 0:
check_successful = False
print(f"ERROR: Image size is not [240,240,155] for:\n{wrong_size}")
if len(wrong_spacing) > 0:
check_successful = False
print(f"ERROR: Image resolution is not [1,1,1] for:\n{wrong_spacing}")
if len(wrong_labels) > 0:
check_successful = False
print(
f"ERROR: There were unexpected label values (not in [0, 1, 2, 4]) for:\n{wrong_labels}"
)
assert (
check_successful
), "The sanity check discovered error(s). Please check the log above for details."
print("Finished. All good!")
54 changes: 47 additions & 7 deletions Task_2/mlcubes/model/README.md

To test your installation, you can run any of the commands in [this section](#task-execution).

## How to modify this project

You can change each file in this project to add your own implementation. In particular, most participants will want to adapt the `Dockerfile`, `requirements.txt` and the code in `project/src`, and they should also add their model checkpoints to the container. Each place where modifications are possible is described in some detail below. We also made a short guide for converting BraTS docker submissions to the format used in FeTS [here](#guide-for-converting-brats-submissions). Here is an overview of the files in this project:

```bash
├── mlcube
└── utilities.py # Python utilities file that stores useful functions.
```

<details><summary><b>Requirements file </b></summary>
<p>

<details><summary><b>MLCube yaml file </b></summary>
<p>

`mlcube.yaml` contains instructions about the docker image and platform that will be used, information about the project (name, description, authors), and also the tasks defined for the project. **Note** that this file is not submitted and changes will hence not have any effect in the official evaluation. We will use the provided template with the name of your docker image instead. To change the name of your docker image, you can use the `docker.image` field in the `mlcube.yaml` or use `docker tag` after building it.

In the existing implementation you will find the `infer` task, which will be executed in the federated evaluation. It takes the following parameters:

</p>
</details>

More information on the internals of MLCube can be found in the official [git repository](https://github.com/mlcommons/mlcube) or [documentation](https://mlcommons.github.io/mlcube/).

## Task execution

Here we describe the simple commands required to build and run individual MLCubes, which is useful for debugging your submission.
Furthermore, predictions for test cases should be placed in an output directory. An example of loading images and saving segmentations is included in [`my_logic.py`](project/src/my_logic.py).
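For illustration, here is a minimal sketch of such input/output handling with SimpleITK. The all-background "prediction" and the output file name are placeholders, not the official convention; check `my_logic.py` for the required patterns. The input naming follows the scheme checked by the data preparation cube.

```python
from pathlib import Path

import numpy as np
import SimpleITK as sitk

MODALITIES = ["t1", "t1ce", "t2", "flair"]


def predict_subject(subject_dir: Path, output_dir: Path) -> None:
    subject_id = subject_dir.name
    # Input files follow the naming scheme <subject_id>_brain_<modality>.nii.gz
    images = {
        mod: sitk.ReadImage(str(subject_dir / f"{subject_id}_brain_{mod}.nii.gz"))
        for mod in MODALITIES
    }
    reference = images["t1"]
    # Placeholder "model": an all-background segmentation in the input geometry.
    seg_array = np.zeros(sitk.GetArrayFromImage(reference).shape, dtype=np.uint8)
    seg = sitk.GetImageFromArray(seg_array)
    seg.CopyInformation(reference)  # keep origin, spacing and direction
    # Assumed output name; consult my_logic.py for the expected pattern.
    sitk.WriteImage(seg, str(output_dir / f"{subject_id}_seg.nii.gz"))
```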


## Guide for converting BraTS submissions

This section is meant to help teams that already created a docker submission for BraTS 2021 convert it into a valid FeTS Task 2 submission. The first step is to download [this folder](.) and copy your code to `project/src`. Then, you will need to modify a few files:

- `mlcube.py`: You can write a simple wrapper that calls your original inference code for each test case. It could look similar to this:
```python
import subprocess
from pathlib import Path

import typer

app = typer.Typer()


@app.command("infer")
def infer(
    data_path: str = typer.Option(..., "--data_path"),
    output_path: str = typer.Option(..., "--output_path"),
    parameters_file: str = typer.Option(..., "--parameters_file"),
    ckpt_path: str = typer.Option(..., "--checkpoint_path"),
):
    # parameters_file is part of the required interface, even if unused here.
    if not Path(ckpt_path).exists():
        # For federated evaluation, the model needs to be stored here.
        print("WARNING: Checkpoint path not specified or doesn't exist. Using default path instead.")
        ckpt_path = "/mlcube_project/model_ckpts"

    for subject_dir in Path(data_path).iterdir():
        if subject_dir.is_dir():
            subject_id = subject_dir.name
            print("Processing subject {}".format(subject_id))
            # Run the code from the original BraTS submission.
            # TODO Make sure your code accepts input/output paths as arguments
            # (--input and --output) and that outputs from previous runs in the
            # output directory are not overwritten.
            single_case_cmd = [
                "<insert_your_entrypoint>",
                "--input", str(subject_dir),
                "--output", str(output_path),
            ]
            subprocess.run(single_case_cmd, check=True)


if __name__ == "__main__":
    app()
```
If your original entrypoint is a python script, you can of course also import it in `mlcube.py` instead of using a subprocess. It is important to keep the interface of the `infer` command unchanged.
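  As a minimal sketch of that variant, assuming (hypothetically) that your submission exposes a function `run_inference(input_dir, output_dir)` in a module `my_brats_model`, the per-subject loop in the wrapper above could become:

```python
# Hypothetical module and function names; substitute whatever your
# BraTS submission actually provides.
from my_brats_model import run_inference

for subject_dir in Path(data_path).iterdir():
    if subject_dir.is_dir():
        # Same per-subject loop as in the wrapper above, without a subprocess.
        run_inference(input_dir=str(subject_dir), output_dir=str(output_path))
```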

- `requirements.txt`: Update the python requirements.

- `Dockerfile`: Merge your Dockerfile with the one provided in [`project/Dockerfile`](./project/Dockerfile). It's important to make `mlcube.py` the entrypoint now, as in our Dockerfile. If possible, you should try to use the base image (`FROM` instruction) we suggest, to guarantee your container runs on various GPU setups.

- `model_ckpts`: Your model checkpoints have to be embedded in the docker image. Copy them here before building the image and make sure they are found by your script inside the container.
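  For instance, your inference code could resolve the embedded checkpoints as in this sketch, which assumes PyTorch and a placeholder file name `model.pt`; the default directory is the one used by the wrapper above:

```python
from pathlib import Path

import torch

# Default checkpoint location inside the container (see the wrapper above).
DEFAULT_CKPT_DIR = Path("/mlcube_project/model_ckpts")


def load_model(ckpt_dir: Path = DEFAULT_CKPT_DIR):
    # "model.pt" is a placeholder; use whatever file(s) you copied into
    # model_ckpts before building the image.
    ckpt_file = ckpt_dir / "model.pt"
    if not ckpt_file.exists():
        raise FileNotFoundError(f"No checkpoint found at {ckpt_file}.")
    return torch.load(ckpt_file, map_location="cpu")
```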

- `mlcube.yaml`: Insert your custom image name in the `docker.image` field.

After these changes, you should be able to run tests using the commands from [this section](#task-execution). Once these run without error, you're ready to [submit](https://www.synapse.org/#!Synapse:syn28546456/wiki/617255)!

## Project workflow

![MLCube workflow](https://i.imgur.com/qXRp3Tb.png)