1 change: 0 additions & 1 deletion environment-accelerate.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion environment-default.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion environment-intel.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion environment-openblas.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion setup.py
@@ -33,7 +33,6 @@ def read(fname):
         "mrcfile",
         "numpy>=1.21.5",
         "packaging",
-        "pandas>=1.3.5",
         "psutil",
         "pyfftw",
         "PyWavelets",
4 changes: 1 addition & 3 deletions src/aspire/ctf/ctf_estimator.py
@@ -12,7 +12,6 @@
 import mrcfile
 import numpy as np
 from numpy import linalg as npla
-from pandas import DataFrame
 from scipy.optimize import linprog
 from scipy.signal.windows import dpss
 
@@ -693,9 +692,8 @@ def write_star(self, name, params_dict, output_dir):
         data_block["_rlnAmplitudeContrast"] = params_dict["amplitude_contrast"]
         data_block["_rlnVoltage"] = params_dict["voltage"]
         data_block["_rlnMicrographPixelSize"] = params_dict["pixel_size"]
-        df = DataFrame([data_block])
         blocks = OrderedDict()
-        blocks["root"] = df
+        blocks["root"] = data_block
         star = StarFile(blocks=blocks)
         star.write(os.path.join(output_dir, os.path.splitext(name)[0]) + ".star")
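For context on the new `write_star` flow: `StarFile` now consumes the parameter dictionary directly, with no DataFrame wrapper. A minimal sketch of the same pattern, assuming `StarFile` is importable from `aspire.storage` and that block values may be plain dicts, as the hunk above uses; the field values and output path are made up:

```python
from collections import OrderedDict

from aspire.storage import StarFile  # assumed import path

# Build a single-row data block as a plain dict (no DataFrame wrapper needed).
data_block = {
    "_rlnDefocusU": 12345.6,
    "_rlnDefocusV": 12000.1,
    "_rlnVoltage": 300.0,
}

# StarFile is keyed by block name; "root" matches the usage in write_star above.
blocks = OrderedDict()
blocks["root"] = data_block

star = StarFile(blocks=blocks)
star.write("example_ctf.star")  # hypothetical output path
```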
37 changes: 20 additions & 17 deletions src/aspire/source/coordinates.py
@@ -7,7 +7,6 @@
 
 import mrcfile
 import numpy as np
-import pandas as pd
 
 from aspire.image import Image
 from aspire.operators import CTFFilter, IdentityFilter
@@ -226,8 +225,9 @@ def _coords_list_from_star(self, star_file):
         return a list of coordinates in box format.
         :param star_file: A path to a STAR file containing particle centers
         """
-        df = StarFile(star_file).get_block_by_index(0).astype(float)
-        coords = list(zip(df["_rlnCoordinateX"], df["_rlnCoordinateY"]))
+        data_block = StarFile(star_file).get_block_by_index(0)
+        coords = list(zip(data_block["_rlnCoordinateX"], data_block["_rlnCoordinateY"]))
+        coords = [(float(x), float(y)) for x, y in coords]
         return [
             self._box_coord_from_center(coord, self.particle_size) for coord in coords
         ]
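The extra `float()` pass exists because STAR columns arrive as strings once pandas' `.astype(float)` is gone. A self-contained sketch; `box_coord_from_center` here is a hypothetical stand-in for the source's `_box_coord_from_center`, assuming the usual EMAN-style lower-left-corner box convention:

```python
# Hypothetical stand-in for self._box_coord_from_center: converts a particle
# center to box format, assuming (lower-left x, lower-left y, size, size).
def box_coord_from_center(center, particle_size):
    x, y = center
    half = particle_size // 2
    return [x - half, y - half, particle_size, particle_size]

# STAR columns are parsed as strings, hence the explicit float() conversion above.
raw = ["100.0", "250.5"]
center = tuple(float(v) for v in raw)
print(box_coord_from_center(center, particle_size=64))
# [68.0, 218.5, 64, 64]
```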
@@ -318,16 +318,16 @@ def import_aspire_ctf(self, ctf):
                 "Number of CTF STAR files must match number of micrographs."
             )
 
-        # merge DataFrames from CTF files
-        dfs = []
+        # merge dicts from CTF files
+        data_blocks = defaultdict(list)
         for f in ctf:
             # ASPIRE's CTF Estimator produces legacy (<= 3.0) STAR files containing one row
             star = RelionStarFile(f)
-            dfs.append(star.data_block)
+            data_block = star.data_block
+            for k, v in data_block.items():
+                data_blocks[k].append(v)
 
-        df = pd.concat(dfs, ignore_index=True)
-
-        self._extract_ctf(df)
+        self._extract_ctf(data_blocks)
 
     def import_relion_ctf(self, ctf):
         """
@@ -339,18 +339,18 @@ def import_relion_ctf(self, ctf):
         """
         data_block = RelionStarFile(ctf).get_merged_data_block()
 
-        # data_block is a pandas Dataframe containing the micrographs
-        if not len(data_block) == self.num_micrographs:
+        # data_block is a dict containing the micrographs
+        if not len(list(data_block.values())[0]) == self.num_micrographs:
             raise ValueError(
-                f"{ctf} has CTF information for {len(data_block)}",
+                f"{ctf} has CTF information for {len(list(data_block.values())[0])}",
                 f" micrographs but this source has {self.num_micrographs} micrographs.",
             )
 
         self._extract_ctf(data_block)
 
-    def _extract_ctf(self, df):
+    def _extract_ctf(self, data_block):
         """
-        Receives a flattened DataFrame containing micrograph CTF information, and populates
+        Receives a dict containing micrograph CTF information, and populates
         the Source's CTF Filters, filter indices, and metadata.
         """
         # required CTF params excluding pixel size
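A note on the row-count idiom: `len()` of a column-oriented dict counts columns, so the number of micrographs must come from the length of one of its value sequences. A quick illustration, assuming equal-length columns as `get_merged_data_block` should guarantee:

```python
# A column-oriented data block: every value holds one entry per micrograph.
data_block = {
    "_rlnMicrographName": ["mic_0.mrc", "mic_1.mrc", "mic_2.mrc"],
    "_rlnDefocusU": [14000.0, 15000.0, 16000.0],
}

# len(data_block) would count columns (2); the row count is the length
# of any single column (3), assuming all columns are equal length.
num_rows = len(list(data_block.values())[0])
print(num_rows)  # 3
```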
Expand All @@ -366,8 +366,11 @@ def _extract_ctf(self, df):

# get unique ctfs from the data block
# i'th entry of `indices` contains the index of `filter_params` with corresponding CTF params
ctf_data = np.stack(data_block[c] for c in CTF_params).astype(self.dtype).T
filter_params, indices = np.unique(
df[CTF_params].astype(self.dtype).values, return_inverse=True, axis=0
ctf_data,
return_inverse=True,
axis=0,
)

# convert defocus_ang from degrees to radians
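`np.unique` with `return_inverse=True` and `axis=0` is what partitions particles by CTF setting: each row of `ctf_data` is one parameter vector, `filter_params` holds the unique rows, and `indices` maps every row back to its unique representative. A standalone sketch with made-up values:

```python
import numpy as np

# Rows are per-particle CTF parameter vectors; two distinct settings here.
ctf_data = np.array(
    [
        [15000.0, 14800.0],
        [16000.0, 15900.0],
        [15000.0, 14800.0],
    ]
)

filter_params, indices = np.unique(ctf_data, return_inverse=True, axis=0)
print(filter_params)  # the two unique parameter rows
print(indices)        # [0 1 0]: each row's index into filter_params
```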
@@ -643,16 +646,16 @@ def _validate_starfile(self, coord_file):
         """
         Ensures that a STAR file contains numeric particle centers.
         """
-        df = StarFile(coord_file).get_block_by_index(0)
+        data_block = StarFile(coord_file).get_block_by_index(0)
         # We're looking for specific columns for the X and Y coordinates
-        if not all(col in df.columns for col in ["_rlnCoordinateX", "_rlnCoordinateY"]):
+        if not all(col in data_block for col in ["_rlnCoordinateX", "_rlnCoordinateY"]):
             logger.error(f"Problem with coordinate file: {coord_file}")
             raise ValueError(
                 "STAR file does not contain _rlnCoordinateX, _rlnCoordinateY columns."
             )
         # check that all values in each column are numeric
         if not all(
-            all(df[col].apply(self._is_number))
+            all(map(self._is_number, data_block[col]))
             for col in ["_rlnCoordinateX", "_rlnCoordinateY"]
         ):
             logger.error(f"Problem with coordinate file: {coord_file}")
53 changes: 29 additions & 24 deletions src/aspire/source/relion.py
@@ -5,7 +5,6 @@
 
 import mrcfile
 import numpy as np
-import pandas as pd
 
 from aspire.image import Image
 from aspire.operators import CTFFilter, IdentityFilter
@@ -20,7 +19,7 @@ class RelionSource(ImageSource):
     A RelionSource represents a source of picked and cropped particles stored as slices in a `.mrcs` stack.
     It must be instantiated via a STAR file, which--at a minimum--lists the particles in each `.mrcs` stack in the
     `_rlnImageName` column. The STAR file may also contain Relion-specific metadata columns. This information
-    is read into a Pandas DataFrame table containing a row for each particle specifying its location and
+    is read into a dictionary of metadata columns, with an entry for each particle specifying its location and
     its metadata. The metadata table may be augmented or modified via helper methods found in ImageSource. It may
     store, for example, Filter objects added during preprocessing.
@@ -61,12 +60,12 @@ def __init__(
 
         metadata = self.populate_metadata()
 
-        n = len(metadata)
+        n = len(metadata["__mrc_filepath"])
         if n == 0:
             raise RuntimeError("No mrcs files found for starfile!")
 
         # Peek into the first image and populate some attributes
-        first_mrc_filepath = metadata.loc[0]["__mrc_filepath"]
+        first_mrc_filepath = metadata["__mrc_filepath"][0]
         mrc = mrcfile.open(first_mrc_filepath)
 
         # Get the 'mode' (data type) - TODO: There's probably a more direct way to do this.
@@ -106,10 +105,13 @@ def __init__(
             "_rlnAmplitudeContrast",
         ]
         # If these all exist in the STAR file, we may create CTF filters for the source
-        if set(CTF_params).issubset(metadata.columns):
+        if set(CTF_params).issubset(metadata.keys()):
             # partition particles according to unique CTF parameters
+            ctf_data = np.stack([metadata[k] for k in CTF_params]).T
             filter_params, filter_indices = np.unique(
-                metadata[CTF_params].values, return_inverse=True, axis=0
+                ctf_data,
+                return_inverse=True,
+                axis=0,
             )
             filters = []
             # for each unique CTF configuration, create a CTFFilter object
@@ -132,7 +134,7 @@ def __init__(
                 self.filter_indices = filter_indices
 
         # We have provided some, but not all the required params
-        elif any(param in metadata.columns for param in CTF_params):
+        elif any(param in metadata for param in CTF_params):
             logger.warning(
                 f"Found partially populated CTF Params."
                 f" To automatically populate CTFFilters provide {CTF_params}"
Expand All @@ -151,7 +153,7 @@ def __init__(
def populate_metadata(self):
"""
Relion STAR files may contain a large number of metadata columns in addition
to the locations of particles. We read this into a Pandas DataFrame and add some of
to the locations of particles. We read this into a dict and add some of
our own columns for convenience.
"""
if self.data_folder is not None:
@@ -167,25 +169,24 @@ def populate_metadata(self):
         # particle locations are stored as e.g. '000001@first_micrograph.mrcs'
         # in the _rlnImageName column. here, we're splitting this information
         # so we can get the particle's index in the .mrcs stack as an int
-        metadata[["__mrc_index", "__mrc_filename"]] = metadata[
-            "_rlnImageName"
-        ].str.split("@", n=1, expand=True)
+        indices_filenames = [s.split("@", 1) for s in metadata["_rlnImageName"]]
         # __mrc_index corresponds to the integer index of the particle in the __mrc_filename stack
         # Note that this is 1-based indexing
-        metadata["__mrc_index"] = pd.to_numeric(metadata["__mrc_index"])
+        metadata["__mrc_index"] = np.array([int(s[0]) for s in indices_filenames])
+        metadata["__mrc_filename"] = np.array([s[1] for s in indices_filenames])
 
         # Adding a full-filepath field to the metadata helps us save time later
         # Note that os.path.join works as expected when the second argument is an absolute path itself
-        metadata["__mrc_filepath"] = metadata["__mrc_filename"].apply(
-            lambda filename: os.path.join(self.data_folder, filename)
+        metadata["__mrc_filepath"] = np.array(
+            [os.path.join(self.data_folder, p) for p in metadata["__mrc_filename"]]
         )
 
         # finally, chop off the metadata at max_rows
         if self.max_rows is None:
             return metadata
         else:
-            max_rows = min(self.max_rows, len(metadata))
-            return metadata.iloc[:max_rows]
+            max_rows = min(self.max_rows, len(metadata["__mrc_filepath"]))
+            return {k: v[:max_rows] for k, v in metadata.items()}
 
     def __str__(self):
         return f"RelionSource ({self.n} images of size {self.L}x{self.L})"
@@ -209,34 +210,38 @@ def _images(self, indices):
         # Log the indices in case needed to debug a crash
         logger.debug(f"Indices: {indices}")
 
-        def load_single_mrcs(filepath, df):
+        def load_single_mrcs(filepath, indices):
             arr = mrcfile.open(filepath).data
             # if the stack only contains one image, arr will have shape (resolution, resolution)
             # the code below reshapes it to (1, resolution, resolution)
             if len(arr.shape) == 2:
                 arr = arr.reshape((1,) + arr.shape)
             # __mrc_index is the 1-based index of the particle in the stack
-            data = arr[df["__mrc_index"] - 1, :, :]
+            data = arr[self._metadata["__mrc_index"][indices] - 1, :, :]
 
-            return df.index, data
+            return indices, data
 
         n_workers = self.n_workers
         if n_workers < 0:
             n_workers = cpu_count() - 1
 
-        df = self._metadata.loc[indices]
         im = np.empty(
             (len(indices), self._original_resolution, self._original_resolution),
             dtype=self.dtype,
         )
 
-        groups = df.groupby("__mrc_filepath")
-        n_workers = min(n_workers, len(groups))
+        filepaths, filepath_indices = np.unique(
+            self._metadata["__mrc_filepath"], return_inverse=True
+        )
+        n_workers = min(n_workers, len(filepaths))
 
         with futures.ThreadPoolExecutor(n_workers) as executor:
             to_do = []
-            for filepath, _df in groups:
-                future = executor.submit(load_single_mrcs, filepath, _df)
+            for i, filepath in enumerate(filepaths):
+                this_filepath_indices = np.where(filepath_indices == i)[0]
+                future = executor.submit(
+                    load_single_mrcs, filepath, this_filepath_indices
+                )
                 to_do.append(future)
 
         for future in futures.as_completed(to_do):
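The `groupby` replacement works by inverse indices: `filepath_indices[j]` names the unique file that particle `j` belongs to, so `np.where(filepath_indices == i)[0]` recovers each group. A minimal sketch:

```python
import numpy as np

filepaths_per_particle = np.array(
    ["a.mrcs", "b.mrcs", "a.mrcs", "a.mrcs", "b.mrcs"]
)

filepaths, filepath_indices = np.unique(
    filepaths_per_particle, return_inverse=True
)
for i, filepath in enumerate(filepaths):
    # particle indices belonging to this stack, analogous to a groupby group
    print(filepath, np.where(filepath_indices == i)[0])
# a.mrcs [0 2 3]
# b.mrcs [1 4]
```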