
Commit

Merge pull request #290 from Dana-Farber-AIOS/release-2.0.4
Release 2.0.4
jacob-rosenthal committed Feb 7, 2022
2 parents 1010b3a + 2aa4f62 commit c0a30d6
Showing 14 changed files with 149 additions and 103 deletions.
1 change: 1 addition & 0 deletions .github/workflows/tests-conda.yml
@@ -40,6 +40,7 @@ jobs:
# uses: conda-incubator/setup-miniconda@f4c00b0ec69bdc87b1ab4972613558dd9f4f36f3
uses: conda-incubator/setup-miniconda@v2.0.0
with:
add_pip_as_python_dependency: false
environment-file: environment.yml
activate-environment: pathml
python-version: ${{ matrix.python-version }}
21 changes: 17 additions & 4 deletions CONTRIBUTING.rst
@@ -37,6 +37,16 @@ Request a new feature by filing an issue on GitHub. Make sure to include the fol
For developers
==============

Coordinate system conventions
-----------------------------

With multiple tools for interacting with matrices/images, conflicting coordinate systems have been a common source of
bugs, typically caused by mixing up (X, Y) coordinate systems and (i, j) coordinate systems. **To avoid these
issues, we have adopted the (i, j) coordinate convention throughout PathML.** This follows the convention used by
NumPy and many others, where ``A[i, j]`` refers to the element of matrix A in the ith row, jth column.
Developers should be careful about coordinate systems and make the necessary adjustments when using third-party tools
so that users of PathML can rely on a consistent coordinate system when using our tools.
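The (i, j) convention described above can be illustrated with a short standalone NumPy snippet (not part of the diff; names here are purely for illustration):

```python
import numpy as np

# A 2-row, 3-column matrix: axis 0 is rows (i), axis 1 is columns (j).
A = np.arange(6).reshape(2, 3)

# (i, j) indexing: row 1, column 2 -- not the point at x=1, y=2.
assert A[1, 2] == 5

# A tool using (X, Y) order would address the same element as (x=2, y=1);
# converting between the two conventions means swapping the pair.
i, j = 1, 2
x, y = j, i
assert A[y, x] == A[i, j]
```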

Setting up a local development environment
-------------------------------------------

@@ -94,12 +104,15 @@ How to contribute code, documentation, etc.
6. Push your changes and open a pull request on GitHub referencing the corresponding issue
7. Respond to discussion/feedback about the pull request, make changes as necessary

Versioning
----------
Versioning and Distributing
---------------------------

We use `semantic versioning`_. The version is tracked in ``pathml/_version.py`` and should be updated there as required.
When new code is merged to the master branch on GitHub, the version should be incremented and the commit should
be tagged in version format (e.g., "v1.0.0" for version 1.0.0).
When new code is merged to the master branch on GitHub, the version should be incremented and a new release should be
pushed. Releases can be created using the GitHub website interface, and should be tagged in version format
(e.g., "v1.0.0" for version 1.0.0) and include release notes indicating what has changed.
Once a new release is created, GitHub Actions workflows will automatically build and publish the updated package on
PyPI and TestPyPI, as well as build and publish the Docker image to Docker Hub.

Code Quality
------------
2 changes: 1 addition & 1 deletion environment.yml
@@ -5,7 +5,7 @@ channels:
- pytorch

dependencies:
- pip==21.2.2
- pip==21.3.1
- python==3.8
- numpy==1.19.5
- scipy==1.7.3
2 changes: 1 addition & 1 deletion pathml/_version.py
@@ -3,4 +3,4 @@
License: GNU GPL 2.0
"""

__version__ = "2.0.3"
__version__ = "2.0.4"
56 changes: 31 additions & 25 deletions pathml/core/slide_backends.py
@@ -252,6 +252,10 @@ class BioFormatsBackend(SlideBackend):
filename (str): path to image file on disk
dtype (numpy.dtype): data type of image. If ``None``, will use BioFormats to infer the data type from the
image's OME metadata. Defaults to ``None``.
Note:
While the Bio-Formats convention uses XYZCT channel order, we use YXZCT for compatibility with the rest of
PathML, which is based on the (i, j) coordinate system.
"""

def __init__(self, filename, dtype=None):
@@ -281,7 +285,8 @@ def __init__(self, filename, dtype=None):
reader.getSizeC(),
reader.getSizeT(),
)
sizeSeries.append((sizex, sizey, sizez, sizec, sizet))
# use yxzct for compatibility with the rest of PathML which uses i,j coords (not x, y)
sizeSeries.append((sizey, sizex, sizez, sizec, sizet))
s = [s[0] * s[1] for s in sizeSeries]

self.level_count = seriesCount # count of levels
@@ -332,7 +337,7 @@ def get_image_shape(self, level=None):
Defaults to ``None``.
Returns:
Tuple[int, int]: Shape of image (H, W)
Tuple[int, int]: Shape of image (i, j) at target level
"""
if level is None:
return self.shape[:2]
@@ -343,25 +348,29 @@
), f"input level {level} invalid for slide with {self.level_count} levels total"
return self.shape_list[level][:2]

def extract_region(self, location, size, level=0, series_as_channels=False):
def extract_region(
self, location, size, level=0, series_as_channels=False, normalize=True
):
"""
Extract a region of the image. All bioformats images have 5 dimensions representing
(x, y, z, channel, time). Even if an image does not have multiple z-series or time-series,
those dimensions will still be kept. For example, a standard RGB image will be of shape (x, y, 1, 3, 1).
(i, j, z, channel, time). Even if an image does not have multiple z-series or time-series,
those dimensions will still be kept. For example, a standard RGB image will be of shape (i, j, 1, 3, 1).
If a tuple with len < 5 is passed, missing dimensions will be
retrieved in full.
Args:
location (Tuple[int, int]): (X,Y) location of corner of extracted region closest to the origin.
size (Tuple[int, int, ...]): (X,Y) size of each region. If an integer is passed, will convert to a
tuple of (H, W) and extract a square region. If a tuple with len < 5 is passed, missing
location (Tuple[int, int]): (i, j) location of corner of extracted region closest to the origin.
size (Tuple[int, int, ...]): (i, j) size of each region. If an integer is passed, will convert to a
tuple of (i, j) and extract a square region. If a tuple with len < 5 is passed, missing
dimensions will be retrieved in full.
level (int): level from which to extract chunks. Level 0 is highest resolution. Defaults to 0.
series_as_channels (bool): Whether to treat image series as channels. If ``True``, multi-level images
are not supported. Defaults to ``False``.
normalize (bool, optional): Whether to normalize the image to uint8 before returning. Defaults to True.
If False, image will be returned as-is immediately after reading, typically in float64.
Returns:
np.ndarray: image at the specified region. 5-D array of (x, y, z, c, t)
np.ndarray: image at the specified region. 5-D array of (i, j, z, c, t)
"""
if level is None:
level = 0
@@ -412,7 +421,7 @@ def extract_region(self, location, size, level=0, series_as_channels=False):
t=0,
series=level,
rescale=False,
XYWH=(location[0], location[1], 2, 2),
XYWH=(location[1], location[0], 2, 2),
)

# need this part because some facilities output images where the channels are incorrectly stored as series
@@ -426,11 +435,10 @@
t=t,
series=c,
rescale=False,
XYWH=(location[0], location[1], size[0], size[1]),
XYWH=(location[1], location[0], size[1], size[0]),
)
slicearray = np.asarray(slicearray)
# some file formats read x, y out of order, transpose
slicearray = np.transpose(slicearray)
array[:, :, z, c, t] = slicearray

# in this case, channels are correctly stored as channels, and we can support multi-level images as series
@@ -442,26 +450,23 @@
t=t,
series=level,
rescale=False,
XYWH=(location[0], location[1], size[0], size[1]),
XYWH=(location[1], location[0], size[1], size[0]),
)
slicearray = np.asarray(slicearray)
# some file formats read x, y out of order, transpose
if slicearray.shape[:2] != array.shape[:2]:
slicearray = np.transpose(slicearray)
# in 2d undoes transpose
if len(sample.shape) == 3:
slicearray = np.moveaxis(slicearray, 0, -1)
if len(sample.shape) == 3:
array[:, :, z, :, t] = slicearray
else:
array[:, :, z, level, t] = slicearray

# scale array before converting: https://github.com/Dana-Farber-AIOS/pathml/issues/271
# first scale to [0-1]
array_scaled = array / (2 ** (8 * self.pixel_dtype.itemsize))
# then scale to [0-255] and convert to 8 bit
array_scaled = array_scaled * 2 ** 8
return array_scaled.astype(np.uint8)
if not normalize:
return array
else:
# scale array before converting: https://github.com/Dana-Farber-AIOS/pathml/issues/271
# first scale to [0-1]
array_scaled = array / (2 ** (8 * self.pixel_dtype.itemsize))
# then scale to [0-255] and convert to 8 bit
array_scaled = array_scaled * 2 ** 8
return array_scaled.astype(np.uint8)
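The rescaling applied when ``normalize=True`` can be sketched as a standalone helper (the function name ``rescale_to_uint8`` is illustrative, not part of the PathML API; the real method applies this to the full 5-D array):

```python
import numpy as np

def rescale_to_uint8(array):
    # A uint16 image spans [0, 2**16), so dividing by 2**(8 * itemsize)
    # maps values into [0, 1); multiplying by 2**8 maps them into [0, 256)
    # before casting down to 8 bits.
    itemsize = array.dtype.itemsize
    scaled = array / (2 ** (8 * itemsize))  # scale to [0, 1)
    scaled = scaled * 2 ** 8                # scale to [0, 256)
    return scaled.astype(np.uint8)

img = np.array([[0, 256, 65535]], dtype=np.uint16)
out = rescale_to_uint8(img)
assert out.dtype == np.uint8
assert out[0, 0] == 0 and out[0, 2] == 255
```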

def get_thumbnail(self, size=None):
"""
@@ -515,6 +520,7 @@ def generate_tiles(self, shape=3000, stride=None, pad=False, level=0, **kwargs):
pad (bool): How to handle tiles on the edges. If ``True``, these edge tiles will be zero-padded
and yielded with the other chunks. If ``False``, incomplete edge chunks will be ignored.
Defaults to ``False``.
**kwargs: Other arguments passed through to ``extract_region()`` method.
Yields:
pathml.core.tile.Tile: Extracted Tile object
67 changes: 28 additions & 39 deletions pathml/core/slide_data.py
@@ -15,33 +15,29 @@
import pathml.core
import pathml.preprocessing.pipeline
from pathml.core.slide_types import SlideType
from torch.utils.data import Dataset


def get_file_ext(path):
def infer_backend(path):
"""
Return the file extension of an input path.
If zipped with 'gz' or 'bz2' extension, will instead return the second to last extension.
If multiple extensions, will return the last two.
Checks file extensions to try to infer correct backend to use.
Uses the file extensions from the sets contained in this file (pathml/core/slide_data.py)
For file formats which are supported by both openslide and bioformats, will return "bioformats".
Args:
path: path to file
Returns:
str: file extension
str: one of "bioformats", "openslide", "dicom", "h5path"
"""
p = Path(path)
ext = p.suffixes
if not ext:
raise Exception(f"invalid path has no file extension: {path}")
elif len(ext) == 1:
ext = ext[0]
elif len(ext) >= 2:
if ext[-1] in {".gz", ".bz2"}:
ext = ext[-2]
else:
ext = "".join(ext[-2:])
return ext
path = str(path)
for extension_set, name in zip(
[pathmlext, bioformatsext, openslideext, dicomext],
["h5path", "bioformats", "openslide", "dicom"],
):
for ext in extension_set:
if path[-len(ext) :] == ext:
return name
raise ValueError(f"input path {path} doesn't match any supported file extensions")
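The extension-matching logic of the new ``infer_backend`` can be sketched in isolation; the extension sets below are illustrative stand-ins (the real sets in ``pathml/core/slide_data.py`` are longer):

```python
from pathlib import Path

# Hypothetical subsets of the extension sets defined in slide_data.py.
# Bio-Formats is checked before OpenSlide, so formats supported by both
# resolve to "bioformats", matching the documented behavior.
pathmlext = {".h5path"}
bioformatsext = {".ome.tiff", ".tiff"}
openslideext = {".svs", ".ndpi"}
dicomext = {".dcm"}

def infer_backend(path):
    path = str(path)
    for extension_set, name in zip(
        [pathmlext, bioformatsext, openslideext, dicomext],
        ["h5path", "bioformats", "openslide", "dicom"],
    ):
        for ext in extension_set:
            if path.endswith(ext):
                return name
    raise ValueError(f"input path {path} doesn't match any supported file extensions")

assert infer_backend("slide.ome.tiff") == "bioformats"
assert infer_backend(Path("scan.svs")) == "openslide"
```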


class SlideData:
@@ -55,8 +51,11 @@ class SlideData:
tiles (pathml.core.Tiles, optional): object containing {coordinates, tile} pairs
labels (collections.OrderedDict, optional): dictionary containing {key, label} pairs
backend (str, optional): backend to use for interfacing with slide on disk.
Must be one of {"OpenSlide", "BioFormats", "DICOM"} (case-insensitive).
Must be one of {"OpenSlide", "BioFormats", "DICOM", "h5path"} (case-insensitive).
Note that for supported image formats, OpenSlide performance can be significantly better than BioFormats.
Consider specifying ``backend = "openslide"`` when possible.
If ``None``, and a ``filepath`` is provided, tries to infer the correct backend from the file extension.
Defaults to ``None``.
slide_type (pathml.core.SlideType, optional): slide type specification. Must be a
:class:`~pathml.core.SlideType` object. Alternatively, slide type can be specified by using the
parameters ``stain``, ``tma``, ``rgb``, ``volumetric``, and ``time_series``.
@@ -91,6 +90,7 @@ def __init__(
volumetric=None,
time_series=None,
counts=None,
dtype=None,
):
# check inputs
assert masks is None or isinstance(
@@ -120,8 +120,8 @@
), f"slide_type is of type {type(slide_type)} but must be of type pathml.core.types.SlideType"
assert backend is None or (
isinstance(backend, str)
and backend.lower() in {"openslide", "bioformats", "dicom"}
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM'] (case-insensitive)."
and backend.lower() in {"openslide", "bioformats", "dicom", "h5path"}
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM', 'h5path'] (case-insensitive)."
assert counts is None or isinstance(
counts, anndata.AnnData
), f"counts is of type {type(counts)} but must be of type anndata.AnnData"
@@ -145,7 +145,7 @@ def __init__(

# get name from filepath if no name is provided
if name is None and filepath is not None:
name = Path(filepath).stem
name = Path(filepath).name

_load_from_h5path = False

@@ -154,26 +154,14 @@
backend = backend.lower()
else:
# try to infer the correct backend
ext = get_file_ext(filepath)
if ext in openslideext:
backend = "openslide"
elif ext in bioformatsext:
backend = "bioformats"
elif ext in dicomext:
backend = "dicom"
elif ext in pathmlext:
backend = "h5path"
# load SlideData from h5 or h5path
backend = infer_backend(filepath)
if backend == "h5path":
_load_from_h5path = True
else:
raise ValueError(
f"Backend not specified, but cannot infer correct backend from input path {filepath}"
)

if backend.lower() == "openslide":
backend_obj = pathml.core.OpenSlideBackend(filepath)
elif backend.lower() == "bioformats":
backend_obj = pathml.core.BioFormatsBackend(filepath)
backend_obj = pathml.core.BioFormatsBackend(filepath, dtype)
elif backend.lower() == "dicom":
backend_obj = pathml.core.DICOMBackend(filepath)
elif backend.lower() == "h5path":
@@ -279,6 +267,7 @@ def run(
write_dir (str): Path to directory to write the processed slide to. The processed SlideData object
will be written to the directory immediately after the pipeline has completed running.
The filepath will default to "<write_dir>/<slide.name>.h5path. Defaults to ``None``.
**kwargs: Other arguments passed through to ``generate_tiles()`` method of the backend.
"""
assert isinstance(
pipeline, pathml.preprocessing.pipeline.Pipeline
@@ -381,8 +370,8 @@ def extract_region(self, location, size, *args, **kwargs):
location (Tuple[int, int]): Location of top-left corner of tile (i, j)
size (Union[int, Tuple[int, int]]): Size of each tile. May be a tuple of (height, width) or a
single integer, in which case square tiles of that size are generated.
*args: positional arguments passed through
**kwargs: keyword arguments passed through
*args: positional arguments passed through to ``extract_region()`` method of the backend.
**kwargs: keyword arguments passed through to ``extract_region()`` method of the backend.
Returns:
np.ndarray: image at the specified region
2 changes: 1 addition & 1 deletion pathml/core/tile.py
@@ -108,7 +108,7 @@ def __init__(
# remove any Nones
stain_type_dict = {key: val for key, val in stain_type_dict.items() if val}
if stain_type_dict:
slide_type = pathml.core.types.SlideType(**stain_type_dict)
slide_type = pathml.core.slide_types.SlideType(**stain_type_dict)

assert counts is None or isinstance(
counts, anndata.AnnData
2 changes: 1 addition & 1 deletion pathml/datasets/pannuke.py
@@ -249,7 +249,7 @@ def _download_pannuke(self, download_dir):
# don't download if the directory already exists
if not os.path.isdir(p):
print(f"Downloading fold {fold_ix}")
url = f"https://warwick.ac.uk/fac/sci/dcs/research/tia/data/pannuke/fold_{fold_ix}.zip"
url = f"https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_{fold_ix}.zip"
name = os.path.basename(url)
download_from_url(url=url, download_dir=download_dir, name=name)
path = os.path.join(download_dir, name)