1 change: 0 additions & 1 deletion environment-accelerate.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion environment-default.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion environment-intel.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion environment-openblas.yml
@@ -12,7 +12,6 @@ dependencies:
   - pip
   - python=3.8
   - numpy=1.23.5
-  - pandas=1.3.5
   - scipy=1.9.3
   - scikit-learn
   - scikit-image
1 change: 0 additions & 1 deletion setup.py
@@ -33,7 +33,6 @@ def read(fname):
         "mrcfile",
         "numpy>=1.21.5",
         "packaging",
-        "pandas>=1.3.5",
         "psutil",
         "pyfftw",
         "PyWavelets",
4 changes: 1 addition & 3 deletions src/aspire/ctf/ctf_estimator.py
@@ -12,7 +12,6 @@
 import mrcfile
 import numpy as np
 from numpy import linalg as npla
-from pandas import DataFrame
 from scipy.optimize import linprog
 from scipy.signal.windows import dpss
 
@@ -693,9 +692,8 @@ def write_star(self, name, params_dict, output_dir):
         data_block["_rlnAmplitudeContrast"] = params_dict["amplitude_contrast"]
         data_block["_rlnVoltage"] = params_dict["voltage"]
         data_block["_rlnMicrographPixelSize"] = params_dict["pixel_size"]
-        df = DataFrame([data_block])
         blocks = OrderedDict()
-        blocks["root"] = df
+        blocks["root"] = data_block
         star = StarFile(blocks=blocks)
         star.write(os.path.join(output_dir, os.path.splitext(name)[0]) + ".star")
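For context on the new `write_star` flow: `StarFile` now consumes the parameter dictionary directly, with no DataFrame wrapper. A minimal sketch of the same pattern, assuming `StarFile` is importable from `aspire.storage` and that block values may be plain dicts, as the hunk above uses; the field values and output path are made up:

```python
from collections import OrderedDict

from aspire.storage import StarFile  # assumed import path

# Build a single-row data block as a plain dict (no DataFrame wrapper needed).
data_block = {
    "_rlnDefocusU": 12345.6,
    "_rlnDefocusV": 12000.1,
    "_rlnVoltage": 300.0,
}

# StarFile is keyed by block name; "root" matches the usage in write_star above.
blocks = OrderedDict()
blocks["root"] = data_block

star = StarFile(blocks=blocks)
star.write("example_ctf.star")  # hypothetical output path
```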
37 changes: 20 additions & 17 deletions src/aspire/source/coordinates.py
@@ -7,7 +7,6 @@
 
 import mrcfile
 import numpy as np
-import pandas as pd
 
 from aspire.image import Image
 from aspire.operators import CTFFilter, IdentityFilter
@@ -226,8 +225,9 @@ def _coords_list_from_star(self, star_file):
         return a list of coordinates in box format.
         :param star_file: A path to a STAR file containing particle centers
         """
-        df = StarFile(star_file).get_block_by_index(0).astype(float)
-        coords = list(zip(df["_rlnCoordinateX"], df["_rlnCoordinateY"]))
+        data_block = StarFile(star_file).get_block_by_index(0)
+        coords = list(zip(data_block["_rlnCoordinateX"], data_block["_rlnCoordinateY"]))
+        coords = [(float(x), float(y)) for x, y in coords]
         return [
             self._box_coord_from_center(coord, self.particle_size) for coord in coords
         ]
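The extra `float()` pass exists because STAR columns arrive as strings once pandas' `.astype(float)` is gone. A self-contained sketch; `box_coord_from_center` here is a hypothetical stand-in for the source's `_box_coord_from_center`, assuming the usual EMAN-style lower-left-corner box convention:

```python
# Hypothetical stand-in for self._box_coord_from_center: converts a particle
# center to box format, assuming (lower-left x, lower-left y, size, size).
def box_coord_from_center(center, particle_size):
    x, y = center
    half = particle_size // 2
    return [x - half, y - half, particle_size, particle_size]

# STAR columns are parsed as strings, hence the explicit float() conversion above.
raw = ["100.0", "250.5"]
center = tuple(float(v) for v in raw)
print(box_coord_from_center(center, particle_size=64))
# [68.0, 218.5, 64, 64]
```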
@@ -318,16 +318,16 @@ def import_aspire_ctf(self, ctf):
                 "Number of CTF STAR files must match number of micrographs."
             )
 
-        # merge DataFrames from CTF files
-        dfs = []
+        # merge dicts from CTF files
+        data_blocks = defaultdict(list)
         for f in ctf:
             # ASPIRE's CTF Estimator produces legacy (<= 3.0) STAR files containing one row
             star = RelionStarFile(f)
-            dfs.append(star.data_block)
+            data_block = star.data_block
+            for k, v in data_block.items():
+                data_blocks[k].append(v)
 
-        df = pd.concat(dfs, ignore_index=True)
-
-        self._extract_ctf(df)
+        self._extract_ctf(data_blocks)
 
     def import_relion_ctf(self, ctf):
         """
@@ -339,18 +339,18 @@ def import_relion_ctf(self, ctf):
         """
         data_block = RelionStarFile(ctf).get_merged_data_block()
 
-        # data_block is a pandas Dataframe containing the micrographs
-        if not len(data_block) == self.num_micrographs:
+        # data_block is a dict containing the micrographs
+        if not len(list(data_block.values())[0]) == self.num_micrographs:
             raise ValueError(
-                f"{ctf} has CTF information for {len(data_block)}",
+                f"{ctf} has CTF information for {len(list(data_block.values())[0])}",
                 f" micrographs but this source has {self.num_micrographs} micrographs.",
             )
 
         self._extract_ctf(data_block)
 
-    def _extract_ctf(self, df):
+    def _extract_ctf(self, data_block):
         """
-        Receives a flattened DataFrame containing micrograph CTF information, and populates
+        Receives a dict containing micrograph CTF information, and populates
         the Source's CTF Filters, filter indices, and metadata.
         """
         # required CTF params excluding pixel size
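A note on the row-count idiom: `len()` of a column-oriented dict counts columns, so the number of micrographs must come from the length of one of its value sequences. A quick illustration, assuming equal-length columns as `get_merged_data_block` should guarantee:

```python
# A column-oriented data block: every value holds one entry per micrograph.
data_block = {
    "_rlnMicrographName": ["mic_0.mrc", "mic_1.mrc", "mic_2.mrc"],
    "_rlnDefocusU": [14000.0, 15000.0, 16000.0],
}

# len(data_block) would count columns (2); the row count is the length
# of any single column (3), assuming all columns are equal length.
num_rows = len(list(data_block.values())[0])
print(num_rows)  # 3
```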
Expand All @@ -366,8 +366,11 @@ def _extract_ctf(self, df):

# get unique ctfs from the data block
# i'th entry of `indices` contains the index of `filter_params` with corresponding CTF params
ctf_data = np.stack(data_block[c] for c in CTF_params).astype(self.dtype).T
filter_params, indices = np.unique(
df[CTF_params].astype(self.dtype).values, return_inverse=True, axis=0
ctf_data,
return_inverse=True,
axis=0,
)

# convert defocus_ang from degrees to radians
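`np.unique` with `return_inverse=True` and `axis=0` is what partitions particles by CTF setting: each row of `ctf_data` is one parameter vector, `filter_params` holds the unique rows, and `indices` maps every row back to its unique representative. A standalone sketch with made-up values:

```python
import numpy as np

# Rows are per-particle CTF parameter vectors; two distinct settings here.
ctf_data = np.array(
    [
        [15000.0, 14800.0],
        [16000.0, 15900.0],
        [15000.0, 14800.0],
    ]
)

filter_params, indices = np.unique(ctf_data, return_inverse=True, axis=0)
print(filter_params)  # the two unique parameter rows
print(indices)        # [0 1 0]: each row's index into filter_params
```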
@@ -643,16 +646,16 @@ def _validate_starfile(self, coord_file):
         """
         Ensures that a STAR file contains numeric particle centers.
         """
-        df = StarFile(coord_file).get_block_by_index(0)
+        data_block = StarFile(coord_file).get_block_by_index(0)
         # We're looking for specific columns for the X and Y coordinates
-        if not all(col in df.columns for col in ["_rlnCoordinateX", "_rlnCoordinateY"]):
+        if not all(col in data_block for col in ["_rlnCoordinateX", "_rlnCoordinateY"]):
             logger.error(f"Problem with coordinate file: {coord_file}")
             raise ValueError(
                 "STAR file does not contain _rlnCoordinateX, _rlnCoordinateY columns."
             )
         # check that all values in each column are numeric
         if not all(
-            all(df[col].apply(self._is_number))
+            all(map(self._is_number, data_block[col]))
             for col in ["_rlnCoordinateX", "_rlnCoordinateY"]
         ):
             logger.error(f"Problem with coordinate file: {coord_file}")
53 changes: 29 additions & 24 deletions src/aspire/source/relion.py
@@ -5,7 +5,6 @@
 
 import mrcfile
 import numpy as np
-import pandas as pd
 
 from aspire.image import Image
 from aspire.operators import CTFFilter, IdentityFilter
@@ -20,7 +19,7 @@ class RelionSource(ImageSource):
     A RelionSource represents a source of picked and cropped particles stored as slices in a `.mrcs` stack.
     It must be instantiated via a STAR file, which--at a minimum--lists the particles in each `.mrcs` stack in the
     `_rlnImageName` column. The STAR file may also contain Relion-specific metadata columns. This information
-    is read into a Pandas DataFrame table containing a row for each particle specifying its location and
+    is read into a dictionary of metadata columns, with an entry for each particle specifying its location and
     its metadata. The metadata table may be augmented or modified via helper methods found in ImageSource. It may
     store, for example, Filter objects added during preprocessing.
@@ -61,12 +60,12 @@ def __init__(
 
         metadata = self.populate_metadata()
 
-        n = len(metadata)
+        n = len(metadata["__mrc_filepath"])
         if n == 0:
             raise RuntimeError("No mrcs files found for starfile!")
 
         # Peek into the first image and populate some attributes
-        first_mrc_filepath = metadata.loc[0]["__mrc_filepath"]
+        first_mrc_filepath = metadata["__mrc_filepath"][0]
         mrc = mrcfile.open(first_mrc_filepath)
 
         # Get the 'mode' (data type) - TODO: There's probably a more direct way to do this.
@@ -106,10 +105,13 @@ def __init__(
             "_rlnAmplitudeContrast",
         ]
         # If these all exist in the STAR file, we may create CTF filters for the source
-        if set(CTF_params).issubset(metadata.columns):
+        if set(CTF_params).issubset(metadata.keys()):
             # partition particles according to unique CTF parameters
+            ctf_data = np.stack([metadata[k] for k in CTF_params]).T
             filter_params, filter_indices = np.unique(
-                metadata[CTF_params].values, return_inverse=True, axis=0
+                ctf_data,
+                return_inverse=True,
+                axis=0,
             )
             filters = []
             # for each unique CTF configuration, create a CTFFilter object
@@ -132,7 +134,7 @@ def __init__(
                 self.filter_indices = filter_indices
 
         # We have provided some, but not all the required params
-        elif any(param in metadata.columns for param in CTF_params):
+        elif any(param in metadata for param in CTF_params):
             logger.warning(
                 f"Found partially populated CTF Params."
                 f" To automatically populate CTFFilters provide {CTF_params}"
Expand All @@ -151,7 +153,7 @@ def __init__(
def populate_metadata(self):
"""
Relion STAR files may contain a large number of metadata columns in addition
to the locations of particles. We read this into a Pandas DataFrame and add some of
to the locations of particles. We read this into a dict and add some of
our own columns for convenience.
"""
if self.data_folder is not None:
@@ -167,25 +169,24 @@ def populate_metadata(self):
         # particle locations are stored as e.g. '000001@first_micrograph.mrcs'
         # in the _rlnImageName column. here, we're splitting this information
         # so we can get the particle's index in the .mrcs stack as an int
-        metadata[["__mrc_index", "__mrc_filename"]] = metadata[
-            "_rlnImageName"
-        ].str.split("@", n=1, expand=True)
+        indices_filenames = [s.split("@", 1) for s in metadata["_rlnImageName"]]
         # __mrc_index corresponds to the integer index of the particle in the __mrc_filename stack
         # Note that this is 1-based indexing
-        metadata["__mrc_index"] = pd.to_numeric(metadata["__mrc_index"])
+        metadata["__mrc_index"] = np.array([int(s[0]) for s in indices_filenames])
+        metadata["__mrc_filename"] = np.array([s[1] for s in indices_filenames])
 
         # Adding a full-filepath field to the metadata helps us save time later
         # Note that os.path.join works as expected when the second argument is an absolute path itself
-        metadata["__mrc_filepath"] = metadata["__mrc_filename"].apply(
-            lambda filename: os.path.join(self.data_folder, filename)
+        metadata["__mrc_filepath"] = np.array(
+            [os.path.join(self.data_folder, p) for p in metadata["__mrc_filename"]]
         )
 
         # finally, chop off the metadata at max_rows
         if self.max_rows is None:
             return metadata
         else:
-            max_rows = min(self.max_rows, len(metadata))
-            return metadata.iloc[:max_rows]
+            max_rows = min(self.max_rows, len(metadata["__mrc_filepath"]))
+            return {k: v[:max_rows] for k, v in metadata.items()}
 
     def __str__(self):
         return f"RelionSource ({self.n} images of size {self.L}x{self.L})"
@@ -209,34 +210,38 @@ def _images(self, indices):
         # Log the indices in case needed to debug a crash
         logger.debug(f"Indices: {indices}")
 
-        def load_single_mrcs(filepath, df):
+        def load_single_mrcs(filepath, indices):
             arr = mrcfile.open(filepath).data
             # if the stack only contains one image, arr will have shape (resolution, resolution)
             # the code below reshapes it to (1, resolution, resolution)
             if len(arr.shape) == 2:
                 arr = arr.reshape((1,) + arr.shape)
             # __mrc_index is the 1-based index of the particle in the stack
-            data = arr[df["__mrc_index"] - 1, :, :]
+            data = arr[self._metadata["__mrc_index"][indices] - 1, :, :]
 
-            return df.index, data
+            return indices, data
 
         n_workers = self.n_workers
         if n_workers < 0:
             n_workers = cpu_count() - 1
 
-        df = self._metadata.loc[indices]
         im = np.empty(
             (len(indices), self._original_resolution, self._original_resolution),
             dtype=self.dtype,
         )
 
-        groups = df.groupby("__mrc_filepath")
-        n_workers = min(n_workers, len(groups))
+        filepaths, filepath_indices = np.unique(
+            self._metadata["__mrc_filepath"], return_inverse=True
+        )
+        n_workers = min(n_workers, len(filepaths))
 
         with futures.ThreadPoolExecutor(n_workers) as executor:
             to_do = []
-            for filepath, _df in groups:
-                future = executor.submit(load_single_mrcs, filepath, _df)
+            for i, filepath in enumerate(filepaths):
+                this_filepath_indices = np.where(filepath_indices == i)[0]
+                future = executor.submit(
+                    load_single_mrcs, filepath, this_filepath_indices
+                )
                 to_do.append(future)
 
         for future in futures.as_completed(to_do):
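The `groupby` replacement works by inverse indices: `filepath_indices[j]` names the unique file that particle `j` belongs to, so `np.where(filepath_indices == i)[0]` recovers each group. A minimal sketch:

```python
import numpy as np

filepaths_per_particle = np.array(
    ["a.mrcs", "b.mrcs", "a.mrcs", "a.mrcs", "b.mrcs"]
)

filepaths, filepath_indices = np.unique(
    filepaths_per_particle, return_inverse=True
)
for i, filepath in enumerate(filepaths):
    # particle indices belonging to this stack, analogous to a groupby group
    print(filepath, np.where(filepath_indices == i)[0])
# a.mrcs [0 2 3]
# b.mrcs [1 4]
```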