Merge pull request #2609 from AllenInstitute/ticket/2594/dev
Ticket/2594/dev
mikejhuang committed Nov 28, 2022
2 parents 4245587 + 25c055d commit 28e8497
Showing 6 changed files with 140,485 additions and 90,291 deletions.
@@ -1,31 +1,21 @@
import pandas as pd
from typing import Iterable, List
import ast
from typing import Iterable

from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import BehaviorProjectBase # noqa: E501
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import ( # noqa: E501
BehaviorProjectBase,
)
from allensdk.brain_observatory.behavior.behavior_session import (
BehaviorSession)
BehaviorSession,
)
from allensdk.brain_observatory.behavior.behavior_ophys_experiment import (
BehaviorOphysExperiment)
BehaviorOphysExperiment,
)
from allensdk.core.utilities import literal_col_eval
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.data_io.project_cloud_api_base import ( # noqa: E501
ProjectCloudApiBase,
)

from allensdk.brain_observatory.behavior.behavior_project_cache.\
project_apis.data_io.project_cloud_api_base import ProjectCloudApiBase # noqa: E501


def literal_col_eval(df: pd.DataFrame,
columns: List[str] = ["ophys_experiment_id",
"ophys_container_id",
"driver_line"]) -> pd.DataFrame:
def converter(x):
if isinstance(x, str):
x = ast.literal_eval(x)
return x

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(converter)
return df
COL_EVAL_LIST = ["ophys_experiment_id", "ophys_container_id", "driver_line"]


class BehaviorProjectCloudApi(BehaviorProjectBase, ProjectCloudApiBase):
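
A minimal, self-contained sketch of the relocated helper, for reference while reading the hunk above. It mirrors the deleted local body; the import from allensdk.core.utilities is the commit's actual source of truth, and the demo frame and driver-line value below are invented for illustration.

import ast

import pandas as pd

COL_EVAL_LIST = ["ophys_experiment_id", "ophys_container_id", "driver_line"]


def literal_col_eval(df: pd.DataFrame, columns=COL_EVAL_LIST) -> pd.DataFrame:
    """Parse stringified Python literals (e.g. "[1, 2]") back into objects."""

    def converter(x):
        # CSV round-trips turn lists into strings; only parse actual strings
        return ast.literal_eval(x) if isinstance(x, str) else x

    for column in columns:
        if column in df.columns:
            mask = df[column].notnull()
            df.loc[mask, column] = df.loc[mask, column].apply(converter)
    return df


# invented data: pd.read_csv hands a list-valued column back as strings
demo = pd.DataFrame({"driver_line": ["['Slc17a7-IRES2-Cre']", None]})
demo = literal_col_eval(demo)
assert demo.loc[0, "driver_line"] == ["Slc17a7-IRES2-Cre"]
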
@@ -34,26 +24,32 @@ class BehaviorProjectCloudApi(BehaviorProjectBase, ProjectCloudApiBase):

def _load_manifest_tables(self):

expected_metadata = set(["behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table"])
expected_metadata = set(
[
"behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table",
]
)

cache_metadata = set(self.cache._manifest.metadata_file_names)

if cache_metadata != expected_metadata:
raise RuntimeError("expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}")
raise RuntimeError(
"expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}"
)

self._get_ophys_session_table()
self._get_behavior_session_table()
self._get_ophys_experiment_table()
self._get_ophys_cells_table()

def get_behavior_session(
self,
behavior_session_id: int) -> BehaviorSession:
self, behavior_session_id: int
) -> BehaviorSession:
"""get a BehaviorSession by specifying behavior_session_id
Parameters
@@ -79,25 +75,28 @@
"""
row = self._behavior_session_table.query(
f"behavior_session_id=={behavior_session_id}")
f"behavior_session_id=={behavior_session_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries."
)
row = row.squeeze()
has_file_id = not pd.isna(row[self.cache.file_id_column])
if not has_file_id:
oeid = row.ophys_experiment_id[0]
row = self._ophys_experiment_table.query(f"index=={oeid}")
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorSession.from_nwb_path(
nwb_path=str(data_path))
return BehaviorSession.from_nwb_path(nwb_path=str(data_path))

def get_behavior_ophys_experiment(self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
def get_behavior_ophys_experiment(
self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
"""get a BehaviorOphysExperiment by specifying ophys_experiment_id
Parameters
@@ -111,24 +110,29 @@
"""
row = self._ophys_experiment_table.query(
f"index=={ophys_experiment_id}")
f"index=={ophys_experiment_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries."
)
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorOphysExperiment.from_nwb_path(
str(data_path))
return BehaviorOphysExperiment.from_nwb_path(str(data_path))

def _get_ophys_session_table(self):
session_table_path = self._get_metadata_path(
fname="ophys_session_table")
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}))
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])
self._ophys_session_table = df.set_index("ophys_session_id")

def get_ophys_session_table(self) -> pd.DataFrame:
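
As an aside on the read pattern used by these loaders, here is a standalone sketch of why mouse_id is pinned to str and date_of_acquisition re-parsed; the CSV content, IDs, and timestamp are invented for illustration.

import io

import pandas as pd

# invented CSV: with default dtype inference, mouse_id would load as int64
# (dropping any leading zero) and the date would stay a plain string
csv = io.StringIO(
    "ophys_session_id,mouse_id,date_of_acquisition\n"
    "951410079,0457841,2019-09-20 09:12:00\n"
)
df = pd.read_csv(csv, dtype={"mouse_id": str})
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

assert df.loc[0, "mouse_id"] == "0457841"             # string id preserved
assert df.loc[0, "date_of_acquisition"].year == 2019  # real Timestamp
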
@@ -146,10 +150,13 @@ def get_ophys_session_table(self) -> pd.DataFrame:

def _get_behavior_session_table(self):
session_table_path = self._get_metadata_path(
fname='behavior_session_table')
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}))
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="behavior_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._behavior_session_table = df.set_index("behavior_session_id")

@@ -172,20 +179,27 @@ def get_behavior_session_table(self) -> pd.DataFrame:

def _get_ophys_experiment_table(self):
experiment_table_path = self._get_metadata_path(
fname="ophys_experiment_table")
df = literal_col_eval(pd.read_csv(experiment_table_path,
dtype={'mouse_id': str}))
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_experiment_table"
)
df = literal_col_eval(
pd.read_csv(experiment_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._ophys_experiment_table = df.set_index("ophys_experiment_id")

def _get_ophys_cells_table(self):
ophys_cells_table_path = self._get_metadata_path(
fname="ophys_cells_table")
df = literal_col_eval(pd.read_csv(ophys_cells_table_path))
fname="ophys_cells_table"
)
df = literal_col_eval(
pd.read_csv(ophys_cells_table_path), columns=COL_EVAL_LIST
)
# NaN's for invalid cells force this to float, push to int
df['cell_specimen_id'] = pd.array(df['cell_specimen_id'],
dtype="Int64")
df["cell_specimen_id"] = pd.array(
df["cell_specimen_id"], dtype="Int64"
)
self._ophys_cells_table = df.set_index("cell_roi_id")

def get_ophys_cells_table(self):
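
The comment above names a classic pandas pitfall; the following standalone snippet, with an invented id, demonstrates why cell_specimen_id is pushed to the nullable Int64 extension dtype.

import numpy as np
import pandas as pd

# a single NaN forces an otherwise-integer column over to float64
ids = pd.Series([1086496245, np.nan])
assert ids.dtype == np.float64

# the nullable extension dtype keeps true integers alongside <NA>
ids = pd.array([1086496245, np.nan], dtype="Int64")
assert ids.dtype.name == "Int64"
assert ids[0] == 1086496245 and ids[1] is pd.NA
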
@@ -205,7 +219,7 @@ def get_ophys_experiment_table(self):
return self._ophys_experiment_table

def get_natural_movie_template(self, number: int) -> Iterable[bytes]:
""" Download a template for the natural movie stimulus. This is the
"""Download a template for the natural movie stimulus. This is the
actual movie that was shown during the recording session.
:param number: identifier for this scene
:type number: int
70 changes: 47 additions & 23 deletions allensdk/brain_observatory/ecephys/ecephys_session.py
@@ -1,22 +1,25 @@
import warnings
from collections.abc import Collection
from collections import defaultdict
from collections.abc import Collection
from typing import Optional

import xarray as xr
import numpy as np
import pandas as pd
import scipy.stats
import xarray as xr

from allensdk.core.lazy_property import LazyPropertyMixin
from allensdk.core.utilities import literal_col_eval, df_list_to_tuple
from allensdk.brain_observatory.ecephys.ecephys_session_api import (
EcephysSessionApi,
EcephysNwbSessionApi,
EcephysNwb1Api)
EcephysNwb1Api,
EcephysNwbSessionApi,
EcephysSessionApi,
)
from allensdk.brain_observatory.ecephys.stimulus_table import naming_utilities
from allensdk.brain_observatory.ecephys.stimulus_table._schemas import (
default_column_renames,
default_stimulus_renames,
default_column_renames)
)
from allensdk.core.lazy_property import LazyPropertyMixin

# stimulus_presentation column names not describing a parameter of a stimulus
NON_STIMULUS_PARAMETERS = tuple([
@@ -356,16 +359,17 @@ def get_current_source_density(self, probe_id):
Returns
-------
xr.DataArray :
dimensions are channel (id) and time (seconds, relative to stimulus
onset). Values are current source density assessed on that
channel at that time (V/m^2)
dimensions are channel (id) and time (seconds, relative to
stimulus onset). Values are current source density assessed
on that channel at that time (V/m^2)
"""

return self.api.get_current_source_density(probe_id)

def get_lfp(self, probe_id, mask_invalid_intervals=True):
''' Load an xarray DataArray with LFP data from channels on a single probe
''' Load an xarray DataArray with LFP data from channels on a
single probe
Parameters
----------
@@ -979,8 +983,8 @@ def get_parameter_values_for_stimulus(
self,
stimulus_name,
drop_nulls=True):
""" For each stimulus parameter, report the unique values taken on by that
parameter while a named stimulus was presented.
""" For each stimulus parameter, report the unique values taken
on by that parameter while a named stimulus was presented.
Parameters
----------
@@ -1004,8 +1008,8 @@ def get_stimulus_parameter_values(
self,
stimulus_presentation_ids=None,
drop_nulls=True):
''' For each stimulus parameter, report the unique values taken on by that
parameter throughout the course of the session.
''' For each stimulus parameter, report the unique values taken
on by that parameter throughout the course of the session.
Parameters
----------
@@ -1036,7 +1040,6 @@ def get_stimulus_parameter_values(

non_null = np.array(uniques[uniques != "null"])
non_null = non_null
non_null = np.sort(non_null)

if not drop_nulls and "null" in uniques:
non_null = np.concatenate([non_null, ["null"]])
@@ -1124,12 +1127,29 @@ def _build_stimulus_presentations(
# pandas groupby ops ignore nans, so we need a new "nonapplicable"
# value that pandas does not recognize as null ...
stimulus_presentations.replace("", nonapplicable, inplace=True)
stimulus_presentations.fillna(nonapplicable, inplace=True)

stimulus_presentations['duration'] = \
stimulus_presentations['stop_time'] - \
stimulus_presentations['start_time']

# pandas does not automatically convert boolean cols for fillna
boolean_colnames = stimulus_presentations.dtypes[
stimulus_presentations.dtypes == "boolean"].index
col_type_map = {colname: "object" for colname in boolean_colnames}
stimulus_presentations = stimulus_presentations.astype(
col_type_map).fillna(nonapplicable)

# eval str(numeric) and str(lists)
# convert lists to tuple for hashability
# Rationale: pd dataframe reads values as str from nwb files
# where they are expected to be float
col_list = ["phase, size, spatial_frequency"]
stimulus_presentations = literal_col_eval(
stimulus_presentations,
columns=col_list)
stimulus_presentations = df_list_to_tuple(
stimulus_presentations,
columns=col_list)
stimulus_presentations["duration"] = (
stimulus_presentations["stop_time"]
- stimulus_presentations["start_time"]
)
# TODO: database these
stimulus_conditions = {}
presentation_conditions = []
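
Two pandas behaviors motivate the new code in this hunk. The sketch below reproduces both in isolation; all values are invented, and the exact exception pandas raises for the boolean fillna case may vary by version, hence the broad except.

import pandas as pd

# 1) fillna with a string sentinel is rejected on a "boolean" extension
#    column, so the diff casts such columns to object first
flags = pd.Series([True, None], dtype="boolean")
try:
    flags = flags.fillna("nonapplicable")
except (TypeError, ValueError):
    flags = flags.astype("object").fillna("nonapplicable")

# 2) list-valued cells are unhashable, which breaks groupby and
#    drop_duplicates; converting them to tuples restores hashability
df = pd.DataFrame({"size": [[20.0, 20.0], [20.0, 20.0]], "phase": [0.0, 0.0]})
try:
    df.groupby(["size", "phase"]).size()
except TypeError:  # unhashable type: 'list'
    df["size"] = df["size"].apply(tuple)
print(df.groupby(["size", "phase"]).size())  # two rows collapse to one group
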
@@ -1241,7 +1261,10 @@ def _build_mean_waveforms(self, mean_waveforms):

channel_id_lut = defaultdict(lambda: -1)
for cid, row in self.channels.iterrows():
channel_id_lut[(row["local_index"], row["probe_id"])] = cid
channel_id_lut[(
row["probe_channel_number"],
row["probe_id"],
)] = cid

probe_id_lut = {
uid: row['probe_id'] for uid, row in self._units.iterrows()
@@ -1438,7 +1461,8 @@ def is_distinct_from(left, right):


def array_intervals(array):
""" find interval bounds (bounding consecutive identical values) in an array
""" find interval bounds (bounding consecutive identical values)
in an array
Parameters
-----------
@@ -195,4 +195,3 @@ def map_column_names(table, name_map=None, ignore_case=True):
output = table.rename(columns=name_map)

return output
#