Skip to content

Commit

Permalink
Merge pull request #284 from Dana-Farber-AIOS/infer_backend_from_path
Browse files (browse the repository at this point in the history)
Infer backend from path
  • Loading branch information
jacob-rosenthal committed Feb 7, 2022
2 parents (ea9439e + 94ffa23), commit 65ecf6c
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 48 deletions.
59 changes: 23 additions & 36 deletions pathml/core/slide_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,33 +15,29 @@
import pathml.core
import pathml.preprocessing.pipeline
from pathml.core.slide_types import SlideType
from torch.utils.data import Dataset


def get_file_ext(path):
def infer_backend(path):
"""
Return the file extension of an input path.
If zipped with 'gz' or 'bz2' extension, will instead return the second to last extension.
If multiple extensions, will return the last two.
Checks file extensions to try to infer correct backend to use.
Uses the file extensions from the sets contained in this file (pathml/core/slide_data.py)
For file formats which are supported by both openslide and bioformats, will return "bioformats".
Args:
path: path to file
Returns:
str: file extension
str: one of "bioformats", "openslide", "dicom", "h5path"
"""
p = Path(path)
ext = p.suffixes
if not ext:
raise Exception(f"invalid path has no file extension: {path}")
elif len(ext) == 1:
ext = ext[0]
elif len(ext) >= 2:
if ext[-1] in {".gz", ".bz2"}:
ext = ext[-2]
else:
ext = "".join(ext[-2:])
return ext
path = str(path)
for extension_set, name in zip(
[pathmlext, bioformatsext, openslideext, dicomext],
["h5path", "bioformats", "openslide", "dicom"],
):
for ext in extension_set:
if path[-len(ext) :] == ext:
return name
raise ValueError(f"input path {path} doesn't match any supported file extensions")


class SlideData:
Expand All @@ -55,8 +51,11 @@ class SlideData:
tiles (pathml.core.Tiles, optional): object containing {coordinates, tile} pairs
labels (collections.OrderedDict, optional): dictionary containing {key, label} pairs
backend (str, optional): backend to use for interfacing with slide on disk.
Must be one of {"OpenSlide", "BioFormats", "DICOM"} (case-insensitive).
Must be one of {"OpenSlide", "BioFormats", "DICOM", "h5path"} (case-insensitive).
Note that for supported image formats, OpenSlide performance can be significantly better than BioFormats.
Consider specifying ``backend = "openslide"`` when possible.
If ``None``, and a ``filepath`` is provided, tries to infer the correct backend from the file extension.
Defaults to ``None``.
slide_type (pathml.core.SlideType, optional): slide type specification. Must be a
:class:`~pathml.core.SlideType` object. Alternatively, slide type can be specified by using the
parameters ``stain``, ``tma``, ``rgb``, ``volumetric``, and ``time_series``.
Expand Down Expand Up @@ -121,8 +120,8 @@ def __init__(
), f"slide_type is of type {type(slide_type)} but must be of type pathml.core.types.SlideType"
assert backend is None or (
isinstance(backend, str)
and backend.lower() in {"openslide", "bioformats", "dicom"}
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM'] (case-insensitive)."
and backend.lower() in {"openslide", "bioformats", "dicom", "h5path"}
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM', 'h5path'] (case-insensitive)."
assert counts is None or isinstance(
counts, anndata.AnnData
), f"counts is if type {type(counts)} but must be of type anndata.AnnData"
Expand All @@ -146,7 +145,7 @@ def __init__(

# get name from filepath if no name is provided
if name is None and filepath is not None:
name = Path(filepath).stem
name = Path(filepath).name

_load_from_h5path = False

Expand All @@ -155,21 +154,9 @@ def __init__(
backend = backend.lower()
else:
# try to infer the correct backend
ext = get_file_ext(filepath)
if ext in openslideext:
backend = "openslide"
elif ext in bioformatsext:
backend = "bioformats"
elif ext in dicomext:
backend = "dicom"
elif ext in pathmlext:
backend = "h5path"
# load SlideData from h5 or h5path
backend = infer_backend(filepath)
if backend == "h5path":
_load_from_h5path = True
else:
raise ValueError(
f"Backend not specified, but cannot infer correct backend from input path {filepath}"
)

if backend.lower() == "openslide":
backend_obj = pathml.core.OpenSlideBackend(filepath)
Expand Down
20 changes: 9 additions & 11 deletions tests/core_tests/test_slide_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,7 +18,7 @@
BioFormatsBackend,
Tile,
)
from pathml.core.slide_data import get_file_ext
from pathml.core.slide_data import infer_backend
from pathml.preprocessing import Pipeline, BoxBlur


Expand All @@ -29,18 +29,16 @@ def test_repr(slide):


@pytest.mark.parametrize(
"path,ext",
"path,backend",
[
("/test/testing/test.txt", ".txt"),
("/test/testing/test.txt.gz", ".txt"),
("/test/testing/test.txt.bz2", ".txt"),
("/test/testing/test.qptiff", ".qptiff"),
("/test/testing/test.ext1.ext2", ".ext1.ext2"),
("/test/testing/test.qptiff", "bioformats"),
("/test/dot.dot/space space space/File with.spaces and.dots.h5path", "h5path"),
("test.dcm", "dicom"),
("test.file.multiple.exts.jpg.qptiff.tiff.ome.tiff", "bioformats"),
],
)
def test_get_file_ext(path, ext):
result = get_file_ext(path)
assert result == ext
def test_infer_backend(path, backend):
assert infer_backend(path) == backend


def test_write_with_array_labels(tmp_path, example_slide_data):
Expand Down Expand Up @@ -125,7 +123,7 @@ def test_generate_tiles_padding(he_slide, pad):

def test_read_write_heslide(tmp_path, example_slide_data_with_tiles):
slidedata = example_slide_data_with_tiles
path = tmp_path / "testhe.h5"
path = tmp_path / "testhe.test.test.dots space dots.h5"
slidedata.write(path)
readslidedata = SlideData(path)
repr(readslidedata)
Expand Down
2 changes: 1 addition & 1 deletion tests/ml_tests/test_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_dataset(tmp_path, im_path):
else:
assert v == labs[k]

if wsi.name == "small_vectra":
if wsi.name == "small_vectra.qptiff":
# 5-dim images (XYZCT converted to TCZXY for batching)
assert np.array_equal(im, wsi.tiles[0].image.transpose(4, 3, 2, 1, 0))
else:
Expand Down

0 comments on commit 65ecf6c

Please sign in to comment.