Ticket/2594/dev #2609

Merged · 11 commits · Nov 28, 2022
@@ -1,31 +1,21 @@
import pandas as pd
from typing import Iterable, List
import ast
from typing import Iterable

from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import BehaviorProjectBase # noqa: E501
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import ( # noqa: E501
BehaviorProjectBase,
)
from allensdk.brain_observatory.behavior.behavior_session import (
BehaviorSession)
BehaviorSession,
)
from allensdk.brain_observatory.behavior.behavior_ophys_experiment import (
BehaviorOphysExperiment)
BehaviorOphysExperiment,
)
from allensdk.core.utilities import literal_col_eval
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.data_io.project_cloud_api_base import ( # noqa: E501
ProjectCloudApiBase,
)

from allensdk.brain_observatory.behavior.behavior_project_cache.\
project_apis.data_io.project_cloud_api_base import ProjectCloudApiBase # noqa: E501


Review comment (Contributor Author): Moved this function to utilities.py and removed the default values for columns.

def literal_col_eval(df: pd.DataFrame,
columns: List[str] = ["ophys_experiment_id",
"ophys_container_id",
"driver_line"]) -> pd.DataFrame:
def converter(x):
if isinstance(x, str):
x = ast.literal_eval(x)
return x

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(converter)
return df
COL_EVAL_LIST = ["ophys_experiment_id", "ophys_container_id", "driver_line"]
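For reference, a minimal sketch of what the relocated helper in allensdk.core.utilities plausibly looks like, reconstructed from the removed implementation above (hypothetical, not the verbatim library code; columns is now a required argument):

import ast
from typing import List

import pandas as pd


def literal_col_eval(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Parse str-encoded values (lists, numbers) in the given columns."""
    def converter(x):
        # only strings need parsing; already-parsed values pass through
        if isinstance(x, str):
            x = ast.literal_eval(x)
        return x

    for column in columns:
        if column in df.columns:
            df.loc[df[column].notnull(), column] = \
                df[column][df[column].notnull()].apply(converter)
    return df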


class BehaviorProjectCloudApi(BehaviorProjectBase, ProjectCloudApiBase):
@@ -34,26 +24,32 @@ class BehaviorProjectCloudApi(BehaviorProjectBase, ProjectCloudApiBase):

def _load_manifest_tables(self):

expected_metadata = set(["behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table"])
expected_metadata = set(
[
"behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table",
]
)

cache_metadata = set(self.cache._manifest.metadata_file_names)

if cache_metadata != expected_metadata:
raise RuntimeError("expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}")
raise RuntimeError(
"expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}"
)

self._get_ophys_session_table()
self._get_behavior_session_table()
self._get_ophys_experiment_table()
self._get_ophys_cells_table()

def get_behavior_session(
self,
behavior_session_id: int) -> BehaviorSession:
self, behavior_session_id: int
) -> BehaviorSession:
"""get a BehaviorSession by specifying behavior_session_id

Parameters
@@ -79,25 +75,28 @@ def get_behavior_session(

"""
row = self._behavior_session_table.query(
f"behavior_session_id=={behavior_session_id}")
f"behavior_session_id=={behavior_session_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries."
)
row = row.squeeze()
has_file_id = not pd.isna(row[self.cache.file_id_column])
if not has_file_id:
oeid = row.ophys_experiment_id[0]
row = self._ophys_experiment_table.query(f"index=={oeid}")
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorSession.from_nwb_path(
nwb_path=str(data_path))
return BehaviorSession.from_nwb_path(nwb_path=str(data_path))
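For orientation, typical downstream usage of this getter, assuming a cache built from the public S3 release (hypothetical session id and cache directory):

from allensdk.brain_observatory.behavior.behavior_project_cache import (
    VisualBehaviorOphysProjectCache,
)

# downloads the manifest and metadata tables on first use
cache = VisualBehaviorOphysProjectCache.from_s3_cache(cache_dir="./vbo_cache")
session = cache.get_behavior_session(behavior_session_id=870987812)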

def get_behavior_ophys_experiment(self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
def get_behavior_ophys_experiment(
self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
"""get a BehaviorOphysExperiment by specifying ophys_experiment_id

Parameters
@@ -111,24 +110,29 @@ def get_behavior_ophys_experiment(self, ophys_experiment_id: int

"""
row = self._ophys_experiment_table.query(
f"index=={ophys_experiment_id}")
f"index=={ophys_experiment_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries."
)
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorOphysExperiment.from_nwb_path(
str(data_path))
return BehaviorOphysExperiment.from_nwb_path(str(data_path))

def _get_ophys_session_table(self):
session_table_path = self._get_metadata_path(
fname="ophys_session_table")
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}))
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])
self._ophys_session_table = df.set_index("ophys_session_id")
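A small illustration of why the eval step is needed (toy data; pandas reads list-valued CSV cells back as plain strings):

import io

import pandas as pd

csv = io.StringIO(
    'ophys_session_id,ophys_experiment_id,mouse_id\n'
    '1001,"[11, 12]",457841\n'
)
df = pd.read_csv(csv, dtype={"mouse_id": str})
print(type(df.loc[0, "ophys_experiment_id"]))  # <class 'str'>

df = literal_col_eval(df, columns=COL_EVAL_LIST)
print(type(df.loc[0, "ophys_experiment_id"]))  # <class 'list'>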

def get_ophys_session_table(self) -> pd.DataFrame:
@@ -146,10 +150,13 @@ def get_ophys_session_table(self) -> pd.DataFrame:

def _get_behavior_session_table(self):
session_table_path = self._get_metadata_path(
fname='behavior_session_table')
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}))
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="behavior_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._behavior_session_table = df.set_index("behavior_session_id")

@@ -172,20 +179,27 @@ def get_behavior_session_table(self) -> pd.DataFrame:

def _get_ophys_experiment_table(self):
experiment_table_path = self._get_metadata_path(
fname="ophys_experiment_table")
df = literal_col_eval(pd.read_csv(experiment_table_path,
dtype={'mouse_id': str}))
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_experiment_table"
)
df = literal_col_eval(
pd.read_csv(experiment_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._ophys_experiment_table = df.set_index("ophys_experiment_id")

def _get_ophys_cells_table(self):
ophys_cells_table_path = self._get_metadata_path(
fname="ophys_cells_table")
df = literal_col_eval(pd.read_csv(ophys_cells_table_path))
fname="ophys_cells_table"
)
df = literal_col_eval(
pd.read_csv(ophys_cells_table_path), columns=COL_EVAL_LIST
)
# NaN's for invalid cells force this to float, push to int
df['cell_specimen_id'] = pd.array(df['cell_specimen_id'],
dtype="Int64")
df["cell_specimen_id"] = pd.array(
df["cell_specimen_id"], dtype="Int64"
)
self._ophys_cells_table = df.set_index("cell_roi_id")
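Why the Int64 cast: a single NaN in an integer column silently promotes the whole column to float, while pandas' nullable Int64 extension dtype keeps integer values alongside missing entries. A minimal illustration (hypothetical ids):

import numpy as np
import pandas as pd

vals = [1086496845, np.nan, 1086496911]
print(pd.Series(vals).dtype)          # float64 -- the NaN forced the cast
print(pd.array(vals, dtype="Int64"))  # IntegerArray: [1086496845, <NA>, 1086496911]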

def get_ophys_cells_table(self):
Expand All @@ -205,7 +219,7 @@ def get_ophys_experiment_table(self):
return self._ophys_experiment_table

def get_natural_movie_template(self, number: int) -> Iterable[bytes]:
""" Download a template for the natural movie stimulus. This is the
"""Download a template for the natural movie stimulus. This is the
actual movie that was shown during the recording session.
:param number: identifier for this scene
:type number: int
70 changes: 47 additions & 23 deletions allensdk/brain_observatory/ecephys/ecephys_session.py
@@ -1,22 +1,25 @@
import warnings
from collections.abc import Collection
from collections import defaultdict
from collections.abc import Collection
from typing import Optional

import xarray as xr
import numpy as np
import pandas as pd
import scipy.stats
import xarray as xr

from allensdk.core.lazy_property import LazyPropertyMixin
from allensdk.core.utilities import literal_col_eval, df_list_to_tuple
from allensdk.brain_observatory.ecephys.ecephys_session_api import (
EcephysSessionApi,
EcephysNwbSessionApi,
EcephysNwb1Api)
EcephysNwb1Api,
EcephysNwbSessionApi,
EcephysSessionApi,
)
from allensdk.brain_observatory.ecephys.stimulus_table import naming_utilities
from allensdk.brain_observatory.ecephys.stimulus_table._schemas import (
default_column_renames,
default_stimulus_renames,
default_column_renames)
)
from allensdk.core.lazy_property import LazyPropertyMixin

# stimulus_presentation column names not describing a parameter of a stimulus
NON_STIMULUS_PARAMETERS = tuple([
@@ -356,16 +359,17 @@ def get_current_source_density(self, probe_id):
Returns
-------
xr.DataArray :
dimensions are channel (id) and time (seconds, relative to stimulus
onset). Values are current source density assessed on that
channel at that time (V/m^2)
dimensions are channel (id) and time (seconds, relative to
stimulus onset). Values are current source density assessed
on that channel at that time (V/m^2)

"""

return self.api.get_current_source_density(probe_id)

def get_lfp(self, probe_id, mask_invalid_intervals=True):
''' Load an xarray DataArray with LFP data from channels on a single probe
''' Load an xarray DataArray with LFP data from channels on a
single probe

Parameters
----------
@@ -979,8 +983,8 @@ def get_parameter_values_for_stimulus(
self,
stimulus_name,
drop_nulls=True):
""" For each stimulus parameter, report the unique values taken on by that
parameter while a named stimulus was presented.
""" For each stimulus parameter, report the unique values taken
on by that parameter while a named stimulus was presented.

Parameters
----------
@@ -1004,8 +1008,8 @@ def get_stimulus_parameter_values(
self,
stimulus_presentation_ids=None,
drop_nulls=True):
''' For each stimulus parameter, report the unique values taken on by that
parameter throughout the course of the session.
''' For each stimulus parameter, report the unique values taken
on by that parameter throughout the course of the session.

Parameters
----------
@@ -1036,7 +1040,6 @@ def get_stimulus_parameter_values(

non_null = np.array(uniques[uniques != "null"])
non_null = non_null
non_null = np.sort(non_null)

if not drop_nulls and "null" in uniques:
non_null = np.concatenate([non_null, ["null"]])
@@ -1124,12 +1127,29 @@ def _build_stimulus_presentations(
# pandas groupby ops ignore nans, so we need a new "nonapplicable"
# value that pandas does not recognize as null ...
stimulus_presentations.replace("", nonapplicable, inplace=True)
stimulus_presentations.fillna(nonapplicable, inplace=True)

stimulus_presentations['duration'] = \
stimulus_presentations['stop_time'] - \
stimulus_presentations['start_time']

# pandas does not automatically convert boolean cols for fillna
boolean_colnames = stimulus_presentations.dtypes[
    stimulus_presentations.dtypes == "boolean"].index
col_type_map = {colname: "object" for colname in boolean_colnames}
stimulus_presentations = stimulus_presentations.astype(
    col_type_map).fillna(nonapplicable)

Review comment (Contributor): Pandas 1.1 introduced a dropna argument for groupby, which allows using NA values as keys. If this is not a possibility, then I guess this is ok. I also don't know what the use case is here, but I'm surprised we want to group by a missing key.

Reply (Contributor Author): Good call with the dropna, although this would require a refactor, since several parts of the code and notebook refer to the NA value as 'null'. I changed those to check for NaN instead. Some of these old files can require a good amount of linting once they're touched.

Reply (Contributor Author): Actually, after doing more testing, I discovered that one cell of the notebook had a change in results. Apparently, there's an unresolved pandas bug with dropna in the groupby function when used with MultiIndex groupings: pandas-dev/pandas#36470. I reverted everything back to 'null'.

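The motivation for the cast above, sketched on a toy column: fillna on pandas' nullable "boolean" dtype rejects a string fill value, so the column has to be moved to object dtype first (assumed behavior of the extension dtype):

import pandas as pd

s = pd.Series([True, None, False], dtype="boolean")
# s.fillna("null") raises TypeError: BooleanArray only accepts boolean fills
print(s.astype("object").fillna("null").tolist())  # [True, 'null', False]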
# eval str(numeric) and str(lists)
# convert lists to tuple for hashability
# Rationale: pd dataframe reads values as str from nwb files
# where they are expected to be float
col_list = ["phase, size, spatial_frequency"]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the logic here to eval/tuple by specifying columns instead of creating rules.

stimulus_presentations = literal_col_eval(
stimulus_presentations,
columns=col_list)
stimulus_presentations = df_list_to_tuple(
stimulus_presentations,
columns=col_list)
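df_list_to_tuple is imported from allensdk.core.utilities alongside literal_col_eval; a plausible minimal sketch of that helper, inferred from its use here for hashability (hypothetical, not the verbatim library code):

from typing import List

import pandas as pd


def df_list_to_tuple(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Convert list values to tuples so rows can be hashed and grouped."""
    for column in columns:
        if column in df.columns:
            df[column] = df[column].apply(
                lambda x: tuple(x) if isinstance(x, list) else x
            )
    return df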
stimulus_presentations["duration"] = (
stimulus_presentations["stop_time"]
- stimulus_presentations["start_time"]
)
# TODO: database these
stimulus_conditions = {}
presentation_conditions = []
@@ -1241,7 +1261,10 @@ def _build_mean_waveforms(self, mean_waveforms):

channel_id_lut = defaultdict(lambda: -1)
for cid, row in self.channels.iterrows():
channel_id_lut[(row["local_index"], row["probe_id"])] = cid
channel_id_lut[(
row["probe_channel_number"],
row["probe_id"],
)] = cid
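The LUT maps (probe_channel_number, probe_id) pairs to channel ids, with -1 as the sentinel for channels absent from self.channels; a toy illustration (hypothetical ids):

from collections import defaultdict

channel_id_lut = defaultdict(lambda: -1)
channel_id_lut[(0, 810755797)] = 850126382   # known channel
print(channel_id_lut[(0, 810755797)])    # 850126382
print(channel_id_lut[(383, 810755797)])  # -1 for an unmapped channel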

probe_id_lut = {
uid: row['probe_id'] for uid, row in self._units.iterrows()
@@ -1438,7 +1461,8 @@ def is_distinct_from(left, right):


def array_intervals(array):
""" find interval bounds (bounding consecutive identical values) in an array
""" find interval bounds (bounding consecutive identical values)
in an array

Parameters
-----------
@@ -195,4 +195,3 @@ def map_column_names(table, name_map=None, ignore_case=True):
output = table.rename(columns=name_map)

return output
#