create utils in allensdk/core for df processing
change to double quotes for docstring

lint

add inline notes for eval/tuple rationale

resolve merge conflicts

resolve merge conflict
mikejhuang committed Nov 28, 2022
1 parent 03ac5a7 commit 25c055d
Showing 4 changed files with 140 additions and 146 deletions.
@@ -1,47 +1,55 @@
import pandas as pd
from typing import Iterable

from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import BehaviorProjectBase # noqa: E501
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import ( # noqa: E501
BehaviorProjectBase,
)
from allensdk.brain_observatory.behavior.behavior_session import (
BehaviorSession)
BehaviorSession,
)
from allensdk.brain_observatory.behavior.behavior_ophys_experiment import (
BehaviorOphysExperiment)
from allensdk.brain_observatory.behavior.swdb.utilities import literal_col_eval
from allensdk.brain_observatory.behavior.behavior_project_cache.\
project_apis.data_io.project_cloud_api_base import ProjectCloudApiBase # noqa: E501
BehaviorOphysExperiment,
)
from allensdk.core.utilities import literal_col_eval
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.data_io.project_cloud_api_base import ( # noqa: E501
ProjectCloudApiBase,
)

COL_EVAL_LIST = ["ophys_experiment_id", "ophys_container_id", "driver_line"]


COL_EVAL_LIST = ["ophys_experiment_id",
"ophys_container_id",
"driver_line"]

class BehaviorProjectCloudApi(BehaviorProjectBase, ProjectCloudApiBase):

MANIFEST_COMPATIBILITY = ["1.0.0", "2.0.0"]

def _load_manifest_tables(self):

expected_metadata = set(["behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table"])
expected_metadata = set(
[
"behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table",
]
)

cache_metadata = set(self.cache._manifest.metadata_file_names)

if cache_metadata != expected_metadata:
raise RuntimeError("expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}")
raise RuntimeError(
"expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}"
)

self._get_ophys_session_table()
self._get_behavior_session_table()
self._get_ophys_experiment_table()
self._get_ophys_cells_table()

def get_behavior_session(
self,
behavior_session_id: int) -> BehaviorSession:
self, behavior_session_id: int
) -> BehaviorSession:
"""get a BehaviorSession by specifying behavior_session_id
Parameters
@@ -67,25 +75,28 @@ def get_behavior_session(
"""
row = self._behavior_session_table.query(
f"behavior_session_id=={behavior_session_id}")
f"behavior_session_id=={behavior_session_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries."
)
row = row.squeeze()
has_file_id = not pd.isna(row[self.cache.file_id_column])
if not has_file_id:
oeid = row.ophys_experiment_id[0]
row = self._ophys_experiment_table.query(f"index=={oeid}")
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorSession.from_nwb_path(
nwb_path=str(data_path))
return BehaviorSession.from_nwb_path(nwb_path=str(data_path))

def get_behavior_ophys_experiment(self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
def get_behavior_ophys_experiment(
self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
"""get a BehaviorOphysExperiment by specifying ophys_experiment_id
Parameters
@@ -99,25 +110,29 @@ def get_behavior_ophys_experiment(self, ophys_experiment_id: int
"""
row = self._ophys_experiment_table.query(
f"index=={ophys_experiment_id}")
f"index=={ophys_experiment_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries."
)
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorOphysExperiment.from_nwb_path(
str(data_path))
return BehaviorOphysExperiment.from_nwb_path(str(data_path))

def _get_ophys_session_table(self):
session_table_path = self._get_metadata_path(
fname="ophys_session_table")
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])
self._ophys_session_table = df.set_index("ophys_session_id")

def get_ophys_session_table(self) -> pd.DataFrame:
@@ -135,11 +150,13 @@ def get_ophys_session_table(self) -> pd.DataFrame:

def _get_behavior_session_table(self):
session_table_path = self._get_metadata_path(
fname='behavior_session_table')
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="behavior_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._behavior_session_table = df.set_index("behavior_session_id")

@@ -162,22 +179,27 @@ def get_behavior_session_table(self) -> pd.DataFrame:

def _get_ophys_experiment_table(self):
experiment_table_path = self._get_metadata_path(
fname="ophys_experiment_table")
df = literal_col_eval(pd.read_csv(experiment_table_path,
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_experiment_table"
)
df = literal_col_eval(
pd.read_csv(experiment_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._ophys_experiment_table = df.set_index("ophys_experiment_id")

def _get_ophys_cells_table(self):
ophys_cells_table_path = self._get_metadata_path(
fname="ophys_cells_table")
df = literal_col_eval(pd.read_csv(ophys_cells_table_path),
columns=COL_EVAL_LIST)
fname="ophys_cells_table"
)
df = literal_col_eval(
pd.read_csv(ophys_cells_table_path), columns=COL_EVAL_LIST
)
# NaN's for invalid cells force this to float, push to int
df['cell_specimen_id'] = pd.array(df['cell_specimen_id'],
dtype="Int64")
df["cell_specimen_id"] = pd.array(
df["cell_specimen_id"], dtype="Int64"
)
self._ophys_cells_table = df.set_index("cell_roi_id")

def get_ophys_cells_table(self):
@@ -197,7 +219,7 @@ def get_ophys_experiment_table(self):
return self._ophys_experiment_table

def get_natural_movie_template(self, number: int) -> Iterable[bytes]:
""" Download a template for the natural movie stimulus. This is the
"""Download a template for the natural movie stimulus. This is the
actual movie that was shown during the recording session.
:param number: identifier for this scene
:type number: int
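For reference, the _get_*_table methods above all follow the same pattern: read a metadata CSV with mouse_id pinned to str, then pass the frame through literal_col_eval over COL_EVAL_LIST so that columns serialized as stringified lists come back as Python lists. A minimal sketch of that pattern, using a made-up in-memory CSV rather than one of the real metadata files:

import io

import pandas as pd

from allensdk.core.utilities import literal_col_eval  # new location used in the imports above

COL_EVAL_LIST = ["ophys_experiment_id", "ophys_container_id", "driver_line"]

# Made-up stand-in for one of the metadata CSVs; all values are illustrative only.
csv_text = io.StringIO(
    "ophys_session_id,mouse_id,ophys_experiment_id,driver_line,date_of_acquisition\n"
    '1,457841,"[111, 222]","[""Sst-IRES-Cre""]",2020-01-01\n'
)

df = literal_col_eval(
    pd.read_csv(csv_text, dtype={"mouse_id": str}),  # keep mouse_id as a string
    columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])
df = df.set_index("ophys_session_id")

print(type(df.loc[1, "ophys_experiment_id"]))  # <class 'list'>, not str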
91 changes: 17 additions & 74 deletions allensdk/brain_observatory/behavior/swdb/utilities.py
@@ -3,8 +3,6 @@
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from typing import List
import ast

'''
This file contains a set of functions that are useful in analyzing visual behavior data
@@ -19,22 +17,17 @@ def save_figure(fig, figsize, save_dir, folder, filename, formats=['.png']):
fig: a figure object
figsize: tuple of desired figure size
save_dir: string, the directory to save the figure
folder: string, the sub-folder to save the figure in. if the
folder does not exist, it will be created
folder: string, the sub-folder to save the figure in. if the folder does not exist, it will be created
filename: string, the desired name of the saved figure
formats: a list of file formats as strings to save the figure
as, ex: ['.png','.pdf']
formats: a list of file formats as strings to save the figure as, ex: ['.png','.pdf']
'''
fig_dir = os.path.join(save_dir, folder)
if not os.path.exists(fig_dir):
os.mkdir(fig_dir)
mpl.rcParams['pdf.fonttype'] = 42
fig.set_size_inches(figsize)
for f in formats:
fig.savefig(
os.path.join(fig_dir, filename + f),
transparent=True,
orientation='landscape')
fig.savefig(os.path.join(fig_dir, filename + f), transparent=True, orientation='landscape')


def get_dff_matrix(session):
@@ -53,38 +46,27 @@ def get_mean_df(response_df, conditions=['cell_specimen_id', 'image_name']):

def get_mean_df(response_df, conditions=['cell_specimen_id', 'image_name']):
'''
Computes an analysis on a selection of responses (either flashes
or trials). Computes mean_response, sem_response, the pref_stim,
fraction_active_responses.
Computes an analysis on a selection of responses (either flashes or trials). Computes mean_response, sem_response, the pref_stim, fraction_active_responses.
INPUTS
response_df: the dataframe to group
conditions: the conditions to group by, the first entry should be
'cell_specimen_id', the second could be 'image_name' or
'change_image_name'
conditions: the conditions to group by, the first entry should be 'cell_specimen_id', the second could be 'image_name' or 'change_image_name'
OUTPUTS:
mdf: a dataframe with the following columns:
mean_response: the average mean_response for each condition
sem_response: the sem of the mean_response
mean_trace: the average dff trace for each condition
sem_trace: the sem of the mean_trace
mean_responses: the list of mean_responses for each element
of each group
pref_stim: if conditions includes image_name or
change_image_name, sets a boolean column for whether
that was the cell's preferred stimulus
fraction_significant_responses: the fraction of
individual image presentations or trials that were
significant (p_value > 0.05)
mean_responses: the list of mean_responses for each element of each group
pref_stim: if conditions includes image_name or change_image_name, sets a boolean column for whether that was the cell's preferred stimulus
fraction_significant_responses: the fraction of individual image presentations or trials that were significant (p_value > 0.05)
'''

# Group by conditions
rdf = response_df.copy()
mdf = rdf.groupby(conditions).apply(get_mean_sem_trace)
mdf = mdf[
['mean_response', 'sem_response', 'mean_trace',
'sem_trace', 'mean_responses']]
mdf = mdf[['mean_response', 'sem_response', 'mean_trace', 'sem_trace', 'mean_responses']]
mdf = mdf.reset_index()

# Add preferred stimulus if we can
@@ -109,8 +91,7 @@ def get_mean_sem_trace(group):
group: a pandas groupby object
OUTPUT:
a pandas series with the mean_response, sem_response,
mean_trace, sem_trace, and mean_responses computed for the group.
a pandas series with the mean_response, sem_response, mean_trace, sem_trace, and mean_responses computed for the group.
'''
mean_response = np.mean(group['mean_response'])
mean_responses = group['mean_response'].values
@@ -124,18 +105,13 @@ def annotate_mean_df_with_pref_stim(mean_df):

def annotate_mean_df_with_pref_stim(mean_df):
'''
Computes the preferred stimulus for each cell/trial or
cell/flash combination. Preferred image is computed by seeing
which image evoked the largest average mean_response across
all images.
Computes the preferred stimulus for each cell/trial or cell/flash combination. Preferred image is computed by seeing which image evoked the largest average mean_response across all images.
INPUTS:
mean_df: the mean_df to be annotated
OUTPUTS:
mean_df with a new column appended 'pref_stim' which is a
boolean TRUE/FALSE for whether that image was that cell's
preferred image.
mean_df with a new column appended 'pref_stim' which is a boolean TRUE/FALSE for whether that image was that cell's preferred image.
ASSERTS:
Each cell has one unique preferred stimulus
@@ -155,13 +131,10 @@ def annotate_mean_df_with_pref_stim(mean_df):
for cell in mdf['cell_specimen_id'].unique():
mc = mdf[(mdf['cell_specimen_id'] == cell)]
mc = mc[mc[image_name] != 'omitted']
temp = mc[
(mc.mean_response == np.max(mc.mean_response.values))
][image_name].values
temp = mc[(mc.mean_response == np.max(mc.mean_response.values))][image_name].values
if len(temp) > 0: # need this test if the mean_response was nan
pref_image = temp[0]
# PROBLEM, this is slow, and sets on slice,
# better to use mdf.at[test, 'pref_stim']
# PROBLEM, this is slow, and sets on slice, better to use mdf.at[test, 'pref_stim']
row = mdf[(mdf['cell_specimen_id'] == cell) & (mdf[image_name] == pref_image)].index
mdf.loc[row, 'pref_stim'] = True

@@ -382,41 +355,11 @@ def get_active_cell_indices(dff_traces):


def compute_lifetime_sparseness(image_responses):
# image responses should be an array of the trial averaged responses
# to each image
# sparseness = 1-(sum of trial averaged responses to images / N)squared /
# (sum of (squared mean responses / n)) / (1-(1/N))
# image responses should be an array of the trial averaged responses to each image
# sparseness = 1-(sum of trial averaged responses to images / N)squared / (sum of (squared mean responses / n)) / (1-(1/N))
# N = number of images
# after Vinje & Gallant, 2000; Froudarakis et al., 2014
N = float(len(image_responses))
ls = ((1 - (1 / N) * ((np.power(image_responses.sum(axis=0), 2)) /
(np.power(image_responses, 2).sum(axis=0)))) / (
ls = ((1 - (1 / N) * ((np.power(image_responses.sum(axis=0), 2)) / (np.power(image_responses, 2).sum(axis=0)))) / (
1 - (1 / N)))
return ls
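Written out, the expression above is the lifetime sparseness of Vinje & Gallant (2000): S = (1 - (sum_i r_i)^2 / (N * sum_i r_i^2)) / (1 - 1/N), where r_i is the trial-averaged response to image i and N is the number of images. A quick sanity check of the two extremes, assuming the function can be imported from the file shown in this diff:

import numpy as np

from allensdk.brain_observatory.behavior.swdb.utilities import (
    compute_lifetime_sparseness,  # import path assumed from the file path shown above
)

# A cell that responds to exactly one of eight images is maximally sparse.
one_hot = np.array([1.0, 0, 0, 0, 0, 0, 0, 0])
# A cell that responds equally to all eight images is not sparse at all.
uniform = np.ones(8)

print(compute_lifetime_sparseness(one_hot))  # 1.0
print(compute_lifetime_sparseness(uniform))  # 0.0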

def literal_col_eval(df: pd.DataFrame,
columns: List[str]) -> pd.DataFrame:
''' Eval string entries of specified columns
'''

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(
lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
return df


def df_list_to_tuple(df: pd.DataFrame,
columns: List[str]) -> pd.DataFrame:
''' convert list to tuple
'''

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(
lambda x: tuple(x) if isinstance(x, list) else x
)
return df
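Per the commit title, these two helpers move to allensdk/core: the new import of literal_col_eval from allensdk.core.utilities is visible in the first file, and df_list_to_tuple is assumed to land in the same module. A small usage sketch of how they chain together: literal_col_eval turns stringified lists read from CSV back into real lists, and df_list_to_tuple then converts those lists to tuples, which are hashable and therefore work with operations such as drop_duplicates and groupby:

import pandas as pd

from allensdk.core.utilities import (
    literal_col_eval,   # confirmed by the new import in the first file
    df_list_to_tuple,   # assumed to live in the same module
)

df = pd.DataFrame(
    {
        "ophys_experiment_id": ["[111, 222]", "[111, 222]"],  # stringified lists, as read from CSV
        "driver_line": ['["Sst-IRES-Cre"]', '["Sst-IRES-Cre"]'],
    }
)

df = literal_col_eval(df, columns=["ophys_experiment_id", "driver_line"])   # str -> list
df = df_list_to_tuple(df, columns=["ophys_experiment_id", "driver_line"])   # list -> tuple

# Tuples are hashable, so row-level operations like drop_duplicates now succeed.
print(df.drop_duplicates())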