refactor eval/tuple functions
small fix to literal_col_eval call
mikejhuang committed Nov 23, 2022
1 parent feec27b commit 989582d
Showing 4 changed files with 97 additions and 76 deletions.
@@ -1,32 +1,20 @@
import pandas as pd
from typing import Iterable, List
import ast
from typing import Iterable

from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import BehaviorProjectBase # noqa: E501
from allensdk.brain_observatory.behavior.behavior_session import (
BehaviorSession)
from allensdk.brain_observatory.behavior.behavior_ophys_experiment import (
BehaviorOphysExperiment)

from allensdk.brain_observatory.behavior.swdb.utilities import literal_col_eval
from allensdk.brain_observatory.behavior.behavior_project_cache.\
project_apis.data_io.project_cloud_api_base import ProjectCloudApiBase # noqa: E501


def literal_col_eval(df: pd.DataFrame,
columns: List[str] = ["ophys_experiment_id",
"ophys_container_id",
"driver_line"]) -> pd.DataFrame:
def converter(x):
if isinstance(x, str):
x = ast.literal_eval(x)
return x

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(converter)
return df

COL_EVAL_LIST = ["ophys_experiment_id",
"ophys_container_id",
"driver_line"]

class BehaviorProjectCloudApi(BehaviorProjectBase, ProjectCloudApiBase):

@@ -127,7 +115,8 @@ def _get_ophys_session_table(self):
session_table_path = self._get_metadata_path(
fname="ophys_session_table")
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}))
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
self._ophys_session_table = df.set_index("ophys_session_id")

@@ -148,7 +137,8 @@ def _get_behavior_session_table(self):
session_table_path = self._get_metadata_path(
fname='behavior_session_table')
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}))
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])

self._behavior_session_table = df.set_index("behavior_session_id")
@@ -174,15 +164,17 @@ def _get_ophys_experiment_table(self):
experiment_table_path = self._get_metadata_path(
fname="ophys_experiment_table")
df = literal_col_eval(pd.read_csv(experiment_table_path,
dtype={'mouse_id': str}))
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])

self._ophys_experiment_table = df.set_index("ophys_experiment_id")

def _get_ophys_cells_table(self):
ophys_cells_table_path = self._get_metadata_path(
fname="ophys_cells_table")
df = literal_col_eval(pd.read_csv(ophys_cells_table_path))
df = literal_col_eval(pd.read_csv(ophys_cells_table_path),
columns=COL_EVAL_LIST)
# NaN's for invalid cells force this to float, push to int
df['cell_specimen_id'] = pd.array(df['cell_specimen_id'],
dtype="Int64")
91 changes: 74 additions & 17 deletions allensdk/brain_observatory/behavior/swdb/utilities.py
@@ -3,6 +3,8 @@
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from typing import List
import ast

'''
This file contains a set of functions that are useful in analyzing visual behavior data
@@ -17,17 +19,22 @@ def save_figure(fig, figsize, save_dir, folder, filename, formats=['.png']):
fig: a figure object
figsize: tuple of desired figure size
save_dir: string, the directory to save the figure
folder: string, the sub-folder to save the figure in. if the folder does not exist, it will be created
folder: string, the sub-folder to save the figure in. if the
folder does not exist, it will be created
filename: string, the desired name of the saved figure
formats: a list of file formats as strings to save the figure as, ex: ['.png','.pdf']
formats: a list of file formats as strings to save the figure
as, ex: ['.png','.pdf']
'''
fig_dir = os.path.join(save_dir, folder)
if not os.path.exists(fig_dir):
os.mkdir(fig_dir)
mpl.rcParams['pdf.fonttype'] = 42
fig.set_size_inches(figsize)
for f in formats:
fig.savefig(os.path.join(fig_dir, fig_title + f), transparent=True, orientation='landscape')
        fig.savefig(
            os.path.join(fig_dir, filename + f),
            transparent=True,
            orientation='landscape')
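A usage sketch (hypothetical figure and paths): with filename used for the output name as above, this would write example.png and example.pdf under /tmp/figures:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])
save_figure(fig, figsize=(6, 4), save_dir='/tmp', folder='figures',
            filename='example', formats=['.png', '.pdf'])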


def get_dff_matrix(session):
@@ -46,27 +53,38 @@ def get_mean_df(response_df, conditions=['cell_specimen_id', 'image_name']):

def get_mean_df(response_df, conditions=['cell_specimen_id', 'image_name']):
'''
Computes an analysis on a selection of responses (either flashes or trials). Computes mean_response, sem_response, the pref_stim, fraction_active_responses.
Computes an analysis on a selection of responses (either flashes
or trials). Computes mean_response, sem_response, the pref_stim,
fraction_active_responses.
INPUTS
response_df: the dataframe to group
conditions: the conditions to group by, the first entry should be 'cell_specimen_id', the second could be 'image_name' or 'change_image_name'
conditions: the conditions to group by, the first entry should be
'cell_specimen_id', the second could be 'image_name' or
'change_image_name'
OUTPUTS:
mdf: a dataframe with the following columns:
mean_response: the average mean_response for each condition
sem_response: the sem of the mean_response
mean_trace: the average dff trace for each condition
sem_trace: the sem of the mean_trace
mean_responses: the list of mean_responses for each element of each group
pref_stim: if conditions includes image_name or change_image_name, sets a boolean column for whether that was the cell's preferred stimulus
fraction_significant_responses: the fraction of individual image presentations or trials that were significant (p_value > 0.05)
mean_responses: the list of mean_responses for each element
of each group
pref_stim: if conditions includes image_name or
change_image_name, sets a boolean column for whether
that was the cell's preferred stimulus
fraction_significant_responses: the fraction of
individual image presentations or trials that were
significant (p_value > 0.05)
'''

# Group by conditions
rdf = response_df.copy()
mdf = rdf.groupby(conditions).apply(get_mean_sem_trace)
mdf = mdf[['mean_response', 'sem_response', 'mean_trace', 'sem_trace', 'mean_responses']]
mdf = mdf[
['mean_response', 'sem_response', 'mean_trace',
'sem_trace', 'mean_responses']]
mdf = mdf.reset_index()

# Add preferred stimulus if we can
@@ -91,7 +109,8 @@ def get_mean_sem_trace(group):
group: a pandas groupby object
OUTPUT:
a pandas series with the mean_response, sem_response, mean_trace, sem_trace, and mean_responses computed for the group.
a pandas series with the mean_response, sem_response,
mean_trace, sem_trace, and mean_responses computed for the group.
'''
mean_response = np.mean(group['mean_response'])
mean_responses = group['mean_response'].values
@@ -105,13 +124,18 @@

def annotate_mean_df_with_pref_stim(mean_df):
'''
Computes the preferred stimulus for each cell/trial or cell/flash combination. Preferred image is computed by seeing which image evoked the largest average mean_response across all images.
Computes the preferred stimulus for each cell/trial or
cell/flash combination. Preferred image is computed by seeing
which image evoked the largest average mean_response across
all images.
INPUTS:
mean_df: the mean_df to be annotated
OUTPUTS:
mean_df with a new column appended 'pref_stim' which is a boolean TRUE/FALSE for whether that image was that cell's preferred image.
mean_df with a new column appended 'pref_stim' which is a
boolean TRUE/FALSE for whether that image was that cell's
preferred image.
ASSERTS:
Each cell has one unique preferred stimulus
@@ -131,10 +155,13 @@ def annotate_mean_df_with_pref_stim(mean_df):
for cell in mdf['cell_specimen_id'].unique():
mc = mdf[(mdf['cell_specimen_id'] == cell)]
mc = mc[mc[image_name] != 'omitted']
temp = mc[(mc.mean_response == np.max(mc.mean_response.values))][image_name].values
temp = mc[
(mc.mean_response == np.max(mc.mean_response.values))
][image_name].values
if len(temp) > 0: # need this test if the mean_response was nan
pref_image = temp[0]
# PROBLEM, this is slow, and sets on slice, better to use mdf.at[test, 'pref_stim']
# PROBLEM, this is slow, and sets on slice,
# better to use mdf.at[test, 'pref_stim']
row = mdf[(mdf['cell_specimen_id'] == cell) & (mdf[image_name] == pref_image)].index
mdf.loc[row, 'pref_stim'] = True
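    # A vectorized alternative to this loop (a sketch of the fix the
    # PROBLEM comment asks for; assumes each cell has at least one
    # non-NaN mean_response):
    #     non_omitted = mdf[mdf[image_name] != 'omitted']
    #     pref_rows = non_omitted.groupby(
    #         'cell_specimen_id')['mean_response'].idxmax()
    #     mdf['pref_stim'] = False
    #     mdf.loc[pref_rows.values, 'pref_stim'] = True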

@@ -355,11 +382,41 @@ def get_active_cell_indices(dff_traces):


def compute_lifetime_sparseness(image_responses):
# image responses should be an array of the trial averaged responses to each image
# sparseness = 1-(sum of trial averaged responses to images / N)squared / (sum of (squared mean responses / n)) / (1-(1/N))
# image responses should be an array of the trial averaged responses
# to each image
# sparseness = (1 - (sum of mean responses)^2 /
# (N * sum of squared mean responses)) / (1 - (1/N))
# N = number of images
# after Vinje & Gallant, 2000; Froudarakis et al., 2014
N = float(len(image_responses))
ls = ((1 - (1 / N) * ((np.power(image_responses.sum(axis=0), 2)) / (np.power(image_responses, 2).sum(axis=0)))) / (
ls = ((1 - (1 / N) * ((np.power(image_responses.sum(axis=0), 2)) /
(np.power(image_responses, 2).sum(axis=0)))) / (
1 - (1 / N)))
return ls
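For reference, the quantity computed above is lifetime sparseness after Vinje & Gallant (2000); with r_i the trial-averaged response to image i out of N images, in LaTeX form:

S = \frac{1 - \left(\sum_i r_i\right)^2 / \left(N \sum_i r_i^2\right)}{1 - 1/N}

which is what the code computes, since (1/N)(\sum_i r_i)^2 / \sum_i r_i^2 = (\sum_i r_i / N)^2 / (\sum_i r_i^2 / N).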

def literal_col_eval(df: pd.DataFrame,
columns: List[str]) -> pd.DataFrame:
    '''Evaluate string entries of the specified columns with
    ast.literal_eval; non-string entries pass through unchanged.
    '''

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(
lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
return df
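A minimal sketch of the intended behavior (hypothetical values): string entries are parsed with ast.literal_eval, while non-string entries and NaNs pass through untouched:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "driver_line": ["['Sst-IRES-Cre', 'Ai148']", np.nan],
    "ophys_experiment_id": ["[111, 222]", "[333]"],
})
df = literal_col_eval(df, columns=["driver_line", "ophys_experiment_id"])
print(df.loc[0, "driver_line"])          # ['Sst-IRES-Cre', 'Ai148']
print(df.loc[0, "ophys_experiment_id"])  # [111, 222]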


def df_list_to_tuple(df: pd.DataFrame,
columns: List[str]) -> pd.DataFrame:
    '''Convert list entries of the specified columns to tuples
    (e.g. so they can be used as dict keys); non-list entries pass
    through unchanged.
    '''

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(
lambda x: tuple(x) if isinstance(x, list) else x
)
return df
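And a sketch of why the tuple conversion matters: lists are unhashable, so converted entries can serve as dict keys, which is what the ecephys change below relies on (hypothetical data):

import pandas as pd

df = pd.DataFrame({"size": [[20.0, 20.0], None]})
df = df_list_to_tuple(df, columns=["size"])
print(df.loc[0, "size"])                    # (20.0, 20.0)
lookup = {df.loc[0, "size"]: "full-field"}  # tuples are hashable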
27 changes: 10 additions & 17 deletions allensdk/brain_observatory/ecephys/ecephys_session.py
@@ -8,6 +8,7 @@
import scipy.stats
import xarray as xr

from allensdk.brain_observatory.behavior.swdb.utilities import literal_col_eval, df_list_to_tuple
from allensdk.brain_observatory.ecephys.ecephys_session_api import (
EcephysNwb1Api,
EcephysNwbSessionApi,
@@ -1180,26 +1181,18 @@ def _build_stimulus_presentations(
stimulus_presentations, default_column_renames, ignore_case=False
)

# pandas groupby ops ignore nans, so we need a new "nonapplicable"
# value that pandas does not recognize as null ...
# stimulus_presentations.replace("", nonapplicable, inplace=True)

# pandas does not automatically convert boolean cols for fillna
# boolean_colnames = stimulus_presentations.dtypes[
# stimulus_presentations.dtypes == "boolean"].index
# col_type_map = {colname: "object" for colname in boolean_colnames}
# stimulus_presentations = stimulus_presentations.astype(
# col_type_map).fillna(nonapplicable)
stimulus_presentations.replace("", nonapplicable, inplace=True)

# eval str(numeric) and str(lists), convert lists to tuple for
# dict key compatibility
exclude_columns = ["stimulus_name"]
for colname in stimulus_presentations.columns:
if colname not in exclude_columns:
stimulus_presentations[colname] = stimulus_presentations[
colname
].apply(naming_utilities.eval_str)


        col_list = ["phase", "size", "spatial_frequency"]
stimulus_presentations = literal_col_eval(
stimulus_presentations,
columns=col_list)
stimulus_presentations = df_list_to_tuple(
stimulus_presentations,
columns=col_list)
stimulus_presentations["duration"] = (
stimulus_presentations["stop_time"]
- stimulus_presentations["start_time"]
@@ -195,24 +195,3 @@ def map_column_names(table, name_map=None, ignore_case=True):
output = table.rename(columns=name_map)

return output


def eval_str(val):
"""Evaluates value if its type==str(numeric) or type==str(list)
Parameters
----------
val: any
Returns
-------
val: evaluated val if its type==str(numeric) or type==str(list)
or passes the val through if it does not meet the condition
"""

if isinstance(val, str):
if val.replace(".", "").isdigit(): # checks if val is numeric
val = eval(val)
elif val[0] == "[" and val[-1] == "]": # checks if val is list
val = tuple(eval(val))
return val
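Removing eval_str is also a safety improvement: eval executes arbitrary expressions, while ast.literal_eval (used by the replacement literal_col_eval) only accepts Python literals. A sketch of the difference:

import ast

ast.literal_eval("[1, 2, 3]")  # [1, 2, 3]
ast.literal_eval("1.5")        # 1.5
ast.literal_eval("__import__('os')")  # raises ValueError
# eval("__import__('os')") would happily import os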
