create utils in allensdk/core for df processing
change to double quotes for docstring

lint

add inline notes for eval/tuple rationale

resolve merge conflicts

resolve merge conflict
mikejhuang committed Nov 28, 2022
1 parent 03ac5a7 commit 25c055d
Showing 4 changed files with 140 additions and 146 deletions.
@@ -1,47 +1,55 @@
import pandas as pd
from typing import Iterable

from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import BehaviorProjectBase # noqa: E501
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.abcs import ( # noqa: E501
BehaviorProjectBase,
)
from allensdk.brain_observatory.behavior.behavior_session import (
BehaviorSession)
BehaviorSession,
)
from allensdk.brain_observatory.behavior.behavior_ophys_experiment import (
BehaviorOphysExperiment)
from allensdk.brain_observatory.behavior.swdb.utilities import literal_col_eval
from allensdk.brain_observatory.behavior.behavior_project_cache.\
project_apis.data_io.project_cloud_api_base import ProjectCloudApiBase # noqa: E501
BehaviorOphysExperiment,
)
from allensdk.core.utilities import literal_col_eval
from allensdk.brain_observatory.behavior.behavior_project_cache.project_apis.data_io.project_cloud_api_base import ( # noqa: E501
ProjectCloudApiBase,
)

COL_EVAL_LIST = ["ophys_experiment_id", "ophys_container_id", "driver_line"]


COL_EVAL_LIST = ["ophys_experiment_id",
"ophys_container_id",
"driver_line"]

class BehaviorProjectCloudApi(BehaviorProjectBase, ProjectCloudApiBase):

MANIFEST_COMPATIBILITY = ["1.0.0", "2.0.0"]

def _load_manifest_tables(self):

expected_metadata = set(["behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table"])
expected_metadata = set(
[
"behavior_session_table",
"ophys_session_table",
"ophys_experiment_table",
"ophys_cells_table",
]
)

cache_metadata = set(self.cache._manifest.metadata_file_names)

if cache_metadata != expected_metadata:
raise RuntimeError("expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}")
raise RuntimeError(
"expected S3CloudCache object to have "
f"metadata file names: {expected_metadata} "
f"but it has {cache_metadata}"
)

self._get_ophys_session_table()
self._get_behavior_session_table()
self._get_ophys_experiment_table()
self._get_ophys_cells_table()

def get_behavior_session(
self,
behavior_session_id: int) -> BehaviorSession:
self, behavior_session_id: int
) -> BehaviorSession:
"""get a BehaviorSession by specifying behavior_session_id
Parameters
@@ -67,25 +75,28 @@ def get_behavior_session(
"""
row = self._behavior_session_table.query(
f"behavior_session_id=={behavior_session_id}")
f"behavior_session_id=={behavior_session_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_session_table should have "
"1 and only 1 entry for a given "
"behavior_session_id. For "
f"{behavior_session_id} "
f" there are {row.shape[0]} entries."
)
row = row.squeeze()
has_file_id = not pd.isna(row[self.cache.file_id_column])
if not has_file_id:
oeid = row.ophys_experiment_id[0]
row = self._ophys_experiment_table.query(f"index=={oeid}")
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorSession.from_nwb_path(
nwb_path=str(data_path))
return BehaviorSession.from_nwb_path(nwb_path=str(data_path))

def get_behavior_ophys_experiment(self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
def get_behavior_ophys_experiment(
self, ophys_experiment_id: int
) -> BehaviorOphysExperiment:
"""get a BehaviorOphysExperiment by specifying ophys_experiment_id
Parameters
@@ -99,25 +110,29 @@ def get_behavior_ophys_experiment(self, ophys_experiment_id: int
"""
row = self._ophys_experiment_table.query(
f"index=={ophys_experiment_id}")
f"index=={ophys_experiment_id}"
)
if row.shape[0] != 1:
raise RuntimeError("The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries.")
raise RuntimeError(
"The behavior_ophys_experiment_table should "
"have 1 and only 1 entry for a given "
f"ophys_experiment_id. For "
f"{ophys_experiment_id} "
f" there are {row.shape[0]} entries."
)
file_id = str(int(row[self.cache.file_id_column]))
data_path = self._get_data_path(file_id=file_id)
return BehaviorOphysExperiment.from_nwb_path(
str(data_path))
return BehaviorOphysExperiment.from_nwb_path(str(data_path))

def _get_ophys_session_table(self):
session_table_path = self._get_metadata_path(
fname="ophys_session_table")
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])
self._ophys_session_table = df.set_index("ophys_session_id")

def get_ophys_session_table(self) -> pd.DataFrame:
@@ -135,11 +150,13 @@ def get_ophys_session_table(self) -> pd.DataFrame:

def _get_behavior_session_table(self):
session_table_path = self._get_metadata_path(
fname='behavior_session_table')
df = literal_col_eval(pd.read_csv(session_table_path,
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="behavior_session_table"
)
df = literal_col_eval(
pd.read_csv(session_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._behavior_session_table = df.set_index("behavior_session_id")

@@ -162,22 +179,27 @@ def get_behavior_session_table(self) -> pd.DataFrame:

def _get_ophys_experiment_table(self):
experiment_table_path = self._get_metadata_path(
fname="ophys_experiment_table")
df = literal_col_eval(pd.read_csv(experiment_table_path,
dtype={'mouse_id': str}),
columns=COL_EVAL_LIST)
df['date_of_acquisition'] = pd.to_datetime(df['date_of_acquisition'])
fname="ophys_experiment_table"
)
df = literal_col_eval(
pd.read_csv(experiment_table_path, dtype={"mouse_id": str}),
columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])

self._ophys_experiment_table = df.set_index("ophys_experiment_id")

def _get_ophys_cells_table(self):
ophys_cells_table_path = self._get_metadata_path(
fname="ophys_cells_table")
df = literal_col_eval(pd.read_csv(ophys_cells_table_path),
columns=COL_EVAL_LIST)
fname="ophys_cells_table"
)
df = literal_col_eval(
pd.read_csv(ophys_cells_table_path), columns=COL_EVAL_LIST
)
# NaN's for invalid cells force this to float, push to int
df['cell_specimen_id'] = pd.array(df['cell_specimen_id'],
dtype="Int64")
df["cell_specimen_id"] = pd.array(
df["cell_specimen_id"], dtype="Int64"
)
self._ophys_cells_table = df.set_index("cell_roi_id")

def get_ophys_cells_table(self):
@@ -197,7 +219,7 @@ def get_ophys_experiment_table(self):
return self._ophys_experiment_table

def get_natural_movie_template(self, number: int) -> Iterable[bytes]:
""" Download a template for the natural movie stimulus. This is the
"""Download a template for the natural movie stimulus. This is the
actual movie that was shown during the recording session.
:param number: identifier for this scene
:type number: int
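For reference, the _get_*_table methods above all follow the same pattern: read a metadata CSV with mouse_id pinned to str, then pass the frame through literal_col_eval over COL_EVAL_LIST so that columns serialized as stringified lists come back as Python lists. A minimal sketch of that pattern, using a made-up in-memory CSV rather than one of the real metadata files:

import io

import pandas as pd

from allensdk.core.utilities import literal_col_eval  # new location used in the imports above

COL_EVAL_LIST = ["ophys_experiment_id", "ophys_container_id", "driver_line"]

# Made-up stand-in for one of the metadata CSVs; all values are illustrative only.
csv_text = io.StringIO(
    "ophys_session_id,mouse_id,ophys_experiment_id,driver_line,date_of_acquisition\n"
    '1,457841,"[111, 222]","[""Sst-IRES-Cre""]",2020-01-01\n'
)

df = literal_col_eval(
    pd.read_csv(csv_text, dtype={"mouse_id": str}),  # keep mouse_id as a string
    columns=COL_EVAL_LIST,
)
df["date_of_acquisition"] = pd.to_datetime(df["date_of_acquisition"])
df = df.set_index("ophys_session_id")

print(type(df.loc[1, "ophys_experiment_id"]))  # <class 'list'>, not str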
91 changes: 17 additions & 74 deletions allensdk/brain_observatory/behavior/swdb/utilities.py
@@ -3,8 +3,6 @@
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from typing import List
import ast

'''
This file contains a set of functions that are useful in analyzing visual behavior data
@@ -19,22 +17,17 @@ def save_figure(fig, figsize, save_dir, folder, filename, formats=['.png']):
fig: a figure object
figsize: tuple of desired figure size
save_dir: string, the directory to save the figure
folder: string, the sub-folder to save the figure in. if the
folder does not exist, it will be created
folder: string, the sub-folder to save the figure in. if the folder does not exist, it will be created
filename: string, the desired name of the saved figure
formats: a list of file formats as strings to save the figure
as, ex: ['.png','.pdf']
formats: a list of file formats as strings to save the figure as, ex: ['.png','.pdf']
'''
fig_dir = os.path.join(save_dir, folder)
if not os.path.exists(fig_dir):
os.mkdir(fig_dir)
mpl.rcParams['pdf.fonttype'] = 42
fig.set_size_inches(figsize)
for f in formats:
fig.savefig(
os.path.join(fig_dir, filename + f),
transparent=True,
orientation='landscape')
fig.savefig(os.path.join(fig_dir, filename + f), transparent=True, orientation='landscape')


def get_dff_matrix(session):
@@ -53,38 +46,27 @@ def get_mean_df(response_df, conditions=['cell_specimen_id', 'image_name']):

def get_mean_df(response_df, conditions=['cell_specimen_id', 'image_name']):
'''
Computes an analysis on a selection of responses (either flashes
or trials). Computes mean_response, sem_response, the pref_stim,
fraction_active_responses.
Computes an analysis on a selection of responses (either flashes or trials). Computes mean_response, sem_response, the pref_stim, fraction_active_responses.
INPUTS
response_df: the dataframe to group
conditions: the conditions to group by, the first entry should be
'cell_specimen_id', the second could be 'image_name' or
'change_image_name'
conditions: the conditions to group by, the first entry should be 'cell_specimen_id', the second could be 'image_name' or 'change_image_name'
OUTPUTS:
mdf: a dataframe with the following columns:
mean_response: the average mean_response for each condition
sem_response: the sem of the mean_response
mean_trace: the average dff trace for each condition
sem_trace: the sem of the mean_trace
mean_responses: the list of mean_responses for each element
of each group
pref_stim: if conditions includes image_name or
change_image_name, sets a boolean column for whether
that was the cell's preferred stimulus
fraction_significant_responses: the fraction of
individual image presentations or trials that were
significant (p_value > 0.05)
mean_responses: the list of mean_responses for each element of each group
pref_stim: if conditions includes image_name or change_image_name, sets a boolean column for whether that was the cell's preferred stimulus
fraction_significant_responses: the fraction of individual image presentations or trials that were significant (p_value > 0.05)
'''

# Group by conditions
rdf = response_df.copy()
mdf = rdf.groupby(conditions).apply(get_mean_sem_trace)
mdf = mdf[
['mean_response', 'sem_response', 'mean_trace',
'sem_trace', 'mean_responses']]
mdf = mdf[['mean_response', 'sem_response', 'mean_trace', 'sem_trace', 'mean_responses']]
mdf = mdf.reset_index()

# Add preferred stimulus if we can
@@ -109,8 +91,7 @@ def get_mean_sem_trace(group):
group: a pandas groupby object
OUTPUT:
a pandas series with the mean_response, sem_response,
mean_trace, sem_trace, and mean_responses computed for the group.
a pandas series with the mean_response, sem_response, mean_trace, sem_trace, and mean_responses computed for the group.
'''
mean_response = np.mean(group['mean_response'])
mean_responses = group['mean_response'].values
@@ -124,18 +105,13 @@ def annotate_mean_df_with_pref_stim(mean_df):

def annotate_mean_df_with_pref_stim(mean_df):
'''
Computes the preferred stimulus for each cell/trial or
cell/flash combination. Preferred image is computed by seeing
which image evoked the largest average mean_response across
all images.
Computes the preferred stimulus for each cell/trial or cell/flash combination. Preferred image is computed by seeing which image evoked the largest average mean_response across all images.
INPUTS:
mean_df: the mean_df to be annotated
OUTPUTS:
mean_df with a new column appended 'pref_stim' which is a
boolean TRUE/FALSE for whether that image was that cell's
preferred image.
mean_df with a new column appended 'pref_stim' which is a boolean TRUE/FALSE for whether that image was that cell's preferred image.
ASSERTS:
Each cell has one unique preferred stimulus
@@ -155,13 +131,10 @@ def annotate_mean_df_with_pref_stim(mean_df):
for cell in mdf['cell_specimen_id'].unique():
mc = mdf[(mdf['cell_specimen_id'] == cell)]
mc = mc[mc[image_name] != 'omitted']
temp = mc[
(mc.mean_response == np.max(mc.mean_response.values))
][image_name].values
temp = mc[(mc.mean_response == np.max(mc.mean_response.values))][image_name].values
if len(temp) > 0: # need this test if the mean_response was nan
pref_image = temp[0]
# PROBLEM, this is slow, and sets on slice,
# better to use mdf.at[test, 'pref_stim']
# PROBLEM, this is slow, and sets on slice, better to use mdf.at[test, 'pref_stim']
row = mdf[(mdf['cell_specimen_id'] == cell) & (mdf[image_name] == pref_image)].index
mdf.loc[row, 'pref_stim'] = True

@@ -382,41 +355,11 @@ def get_active_cell_indices(dff_traces):


def compute_lifetime_sparseness(image_responses):
# image responses should be an array of the trial averaged responses
# to each image
# sparseness = 1-(sum of trial averaged responses to images / N)squared /
# (sum of (squared mean responses / n)) / (1-(1/N))
# image responses should be an array of the trial averaged responses to each image
# sparseness = 1-(sum of trial averaged responses to images / N)squared / (sum of (squared mean responses / n)) / (1-(1/N))
# N = number of images
# after Vinje & Gallant, 2000; Froudarakis et al., 2014
N = float(len(image_responses))
ls = ((1 - (1 / N) * ((np.power(image_responses.sum(axis=0), 2)) /
(np.power(image_responses, 2).sum(axis=0)))) / (
ls = ((1 - (1 / N) * ((np.power(image_responses.sum(axis=0), 2)) / (np.power(image_responses, 2).sum(axis=0)))) / (
1 - (1 / N)))
return ls
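Written out, the expression above is the lifetime sparseness of Vinje & Gallant (2000): S = (1 - (sum_i r_i)^2 / (N * sum_i r_i^2)) / (1 - 1/N), where r_i is the trial-averaged response to image i and N is the number of images. A quick sanity check of the two extremes, assuming the function can be imported from the file shown in this diff:

import numpy as np

from allensdk.brain_observatory.behavior.swdb.utilities import (
    compute_lifetime_sparseness,  # import path assumed from the file path shown above
)

# A cell that responds to exactly one of eight images is maximally sparse.
one_hot = np.array([1.0, 0, 0, 0, 0, 0, 0, 0])
# A cell that responds equally to all eight images is not sparse at all.
uniform = np.ones(8)

print(compute_lifetime_sparseness(one_hot))  # 1.0
print(compute_lifetime_sparseness(uniform))  # 0.0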

def literal_col_eval(df: pd.DataFrame,
columns: List[str]) -> pd.DataFrame:
''' Eval string entries of specified columns
'''

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(
lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
return df


def df_list_to_tuple(df: pd.DataFrame,
columns: List[str]) -> pd.DataFrame:
''' convert list to tuple
'''

for column in columns:
if column in df.columns:
df.loc[df[column].notnull(), column] = \
df[column][df[column].notnull()].apply(
lambda x: tuple(x) if isinstance(x, list) else x
)
return df
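Per the commit title, these two helpers move to allensdk/core: the new import of literal_col_eval from allensdk.core.utilities is visible in the first file, and df_list_to_tuple is assumed to land in the same module. A small usage sketch of how they chain together: literal_col_eval turns stringified lists read from CSV back into real lists, and df_list_to_tuple then converts those lists to tuples, which are hashable and therefore work with operations such as drop_duplicates and groupby:

import pandas as pd

from allensdk.core.utilities import (
    literal_col_eval,   # confirmed by the new import in the first file
    df_list_to_tuple,   # assumed to live in the same module
)

df = pd.DataFrame(
    {
        "ophys_experiment_id": ["[111, 222]", "[111, 222]"],  # stringified lists, as read from CSV
        "driver_line": ['["Sst-IRES-Cre"]', '["Sst-IRES-Cre"]'],
    }
)

df = literal_col_eval(df, columns=["ophys_experiment_id", "driver_line"])   # str -> list
df = df_list_to_tuple(df, columns=["ophys_experiment_id", "driver_line"])   # list -> tuple

# Tuples are hashable, so row-level operations like drop_duplicates now succeed.
print(df.drop_duplicates())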