Merge pull request #284 from Dana-Farber-AIOS/infer_backend_from_path

Infer backend from path
Dana-Farber-AIOS · Feb 7, 2022 · 65ecf6c
2 parents ea9439e + 94ffa23
commit 65ecf6c
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 48 deletions.
diff --git a/pathml/core/slide_data.py b/pathml/core/slide_data.py
@@ -15,33 +15,29 @@
 import pathml.core
 import pathml.preprocessing.pipeline
 from pathml.core.slide_types import SlideType
-from torch.utils.data import Dataset
 
 
-def get_file_ext(path):
+def infer_backend(path):
     """
-    Return the file extension of an input path.
-    If zipped with 'gz' or 'bz2' extension, will instead return the second to last extension.
-    If multiple extensions, will return the last two.
+    Checks file extensions to try to infer correct backend to use.
+    Uses the file extensions from the sets contained in this file (pathml/core/slide_data.py)
+    For file formats which are supported by both openslide and bioformats, will return "bioformats".
 
     Args:
         path: path to file
 
     Returns:
-        str: file extension
+        str: one of "bioformats", "openslide", "dicom", "h5path"
     """
-    p = Path(path)
-    ext = p.suffixes
-    if not ext:
-        raise Exception(f"invalid path has no file extension: {path}")
-    elif len(ext) == 1:
-        ext = ext[0]
-    elif len(ext) >= 2:
-        if ext[-1] in {".gz", ".bz2"}:
-            ext = ext[-2]
-        else:
-            ext = "".join(ext[-2:])
-    return ext
+    path = str(path)
+    for extension_set, name in zip(
+        [pathmlext, bioformatsext, openslideext, dicomext],
+        ["h5path", "bioformats", "openslide", "dicom"],
+    ):
+        for ext in extension_set:
+            if path[-len(ext) :] == ext:
+                return name
+    raise ValueError(f"input path {path} doesn't match any supported file extensions")
 
 
 class SlideData:
@@ -55,8 +51,11 @@ class SlideData:
         tiles (pathml.core.Tiles, optional): object containing {coordinates, tile} pairs
         labels (collections.OrderedDict, optional): dictionary containing {key, label} pairs
         backend (str, optional): backend to use for interfacing with slide on disk.
-            Must be one of {"OpenSlide", "BioFormats", "DICOM"} (case-insensitive).
+            Must be one of {"OpenSlide", "BioFormats", "DICOM", "h5path"} (case-insensitive).
+            Note that for supported image formats, OpenSlide performance can be significantly better than BioFormats.
+            Consider specifying ``backend = "openslide"`` when possible.
             If ``None``, and a ``filepath`` is provided, tries to infer the correct backend from the file extension.
+            Defaults to ``None``.
         slide_type (pathml.core.SlideType, optional): slide type specification. Must be a
             :class:`~pathml.core.SlideType` object. Alternatively, slide type can be specified by using the
             parameters ``stain``, ``tma``, ``rgb``, ``volumetric``, and ``time_series``.
@@ -121,8 +120,8 @@ def __init__(
         ), f"slide_type is of type {type(slide_type)} but must be of type pathml.core.types.SlideType"
         assert backend is None or (
             isinstance(backend, str)
-            and backend.lower() in {"openslide", "bioformats", "dicom"}
-        ), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM'] (case-insensitive)."
+            and backend.lower() in {"openslide", "bioformats", "dicom", "h5path"}
+        ), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM', 'h5path'] (case-insensitive)."
         assert counts is None or isinstance(
             counts, anndata.AnnData
         ), f"counts is if type {type(counts)} but must be of type anndata.AnnData"
@@ -146,7 +145,7 @@ def __init__(
 
         # get name from filepath if no name is provided
         if name is None and filepath is not None:
-            name = Path(filepath).stem
+            name = Path(filepath).name
 
         _load_from_h5path = False
 
@@ -155,21 +154,9 @@ def __init__(
             backend = backend.lower()
         else:
             # try to infer the correct backend
-            ext = get_file_ext(filepath)
-            if ext in openslideext:
-                backend = "openslide"
-            elif ext in bioformatsext:
-                backend = "bioformats"
-            elif ext in dicomext:
-                backend = "dicom"
-            elif ext in pathmlext:
-                backend = "h5path"
-                # load SlideData from h5 or h5path
+            backend = infer_backend(filepath)
+            if backend == "h5path":
                 _load_from_h5path = True
-            else:
-                raise ValueError(
-                    f"Backend not specified, but cannot infer correct backend from input path {filepath}"
-                )
 
         if backend.lower() == "openslide":
             backend_obj = pathml.core.OpenSlideBackend(filepath)

diff --git a/tests/core_tests/test_slide_data.py b/tests/core_tests/test_slide_data.py
@@ -18,7 +18,7 @@
     BioFormatsBackend,
     Tile,
 )
-from pathml.core.slide_data import get_file_ext
+from pathml.core.slide_data import infer_backend
 from pathml.preprocessing import Pipeline, BoxBlur
 
 
@@ -29,18 +29,16 @@ def test_repr(slide):
 
 
 @pytest.mark.parametrize(
-    "path,ext",
+    "path,backend",
     [
-        ("/test/testing/test.txt", ".txt"),
-        ("/test/testing/test.txt.gz", ".txt"),
-        ("/test/testing/test.txt.bz2", ".txt"),
-        ("/test/testing/test.qptiff", ".qptiff"),
-        ("/test/testing/test.ext1.ext2", ".ext1.ext2"),
+        ("/test/testing/test.qptiff", "bioformats"),
+        ("/test/dot.dot/space space space/File with.spaces and.dots.h5path", "h5path"),
+        ("test.dcm", "dicom"),
+        ("test.file.multiple.exts.jpg.qptiff.tiff.ome.tiff", "bioformats"),
     ],
 )
-def test_get_file_ext(path, ext):
-    result = get_file_ext(path)
-    assert result == ext
+def test_infer_backend(path, backend):
+    assert infer_backend(path) == backend
 
 
 def test_write_with_array_labels(tmp_path, example_slide_data):
@@ -125,7 +123,7 @@ def test_generate_tiles_padding(he_slide, pad):
 
 def test_read_write_heslide(tmp_path, example_slide_data_with_tiles):
     slidedata = example_slide_data_with_tiles
-    path = tmp_path / "testhe.h5"
+    path = tmp_path / "testhe.test.test.dots space dots.h5"
     slidedata.write(path)
     readslidedata = SlideData(path)
     repr(readslidedata)

diff --git a/tests/ml_tests/test_dataset.py b/tests/ml_tests/test_dataset.py
@@ -54,7 +54,7 @@ def test_dataset(tmp_path, im_path):
         else:
             assert v == labs[k]
 
-    if wsi.name == "small_vectra":
+    if wsi.name == "small_vectra.qptiff":
         # 5-dim images (XYZCT converted to TCZXY for batching)
         assert np.array_equal(im, wsi.tiles[0].image.transpose(4, 3, 2, 1, 0))
     else: