# Purpose:
- Dev codes to search data assets not processed yet with specific parameters
- dff_long_baseline
- Also check decrosstalk capsule id in nextflow
- Search docDB
## Background information
- Main ophys processing pipeline did not use correct decrosstalk capsule
    - from 2025-02-03 to 2025-08-31?
    - Also, it is supposed to be changed to a newer version (when?)
    - All processing was done with dff_baseline_window 60
- my own pipeline was run, but it failed from time to time.
    - Need to filter out which one is run, which one didn't.
    - Also going to have both 60 and 1800 baseline window
## Procedure
1. Find out a way to check correct decrosstalk. Ideally decrosstalk capsule information, but range of dates will do for now.
2. Check how to filter based on dff_baseline_window

In [1]:
import csv
import logging
import os
import time
import re
from dataclasses import dataclass
from typing import Union
import numpy as np
import pandas as pd
from pathlib import Path
import json

from aind_codeocean_pipeline_monitor.models import (CaptureSettings,
                                                    PipelineMonitorSettings)
from aind_data_access_api.document_db import MetadataDbClient
from codeocean import CodeOcean
from codeocean.computation import (ComputationState, DataAssetsRunParam,
                                   RunParams)
from dataclasses_json import dataclass_json

from lamf_analysis.code_ocean import docdb_utils
from lamf_analysis.code_ocean import capsule_data_utils as cdu

%load_ext autoreload
%autoreload 2

logging.basicConfig(
    filename="batch.log",
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Set environment variables
API_GATEWAY_HOST = "api.allenneuraldynamics.org"
DATABASE = "metadata_index"
COLLECTION = "data_assets"
docdb_api_client = MetadataDbClient(
    host=API_GATEWAY_HOST,
    database=DATABASE,
    collection=COLLECTION,
)

# domain = os.getenv("CODEOCEAN_DOMAIN")
domain = "https://codeocean.allenneuraldynamics.org/"
token = os.getenv("API_SECRET")
co_client = CodeOcean(domain=domain, token=token)


# monitor_pipeline_capsule_id = os.getenv("CO_MONITOR_PIPELINE")
monitor_pipeline_capsule_id = os.getenv("CO_CAPSULE_ID")

DATE_FORMAT = '[0-9]{4}-[0-9]{2}-[0-9]{2}'
TIME_FORMAT = '[0-9]{2}-[0-9]{2}-[0-9]{2}'
SUBJECT_ID_FORMAT = '[0-9]{6}'

COMMIT_IDS_TO_KEEP = ['e61e887fbe5491035b35f8fd223101ad276622f1']

def get_asset_ids(docdb_api_client, asset_name, max_retry=100) -> str:
    """Get the asset ID from the data access api
    Parameters
    ----------
    docdb_api_client : MetadataDbClient
        The data access api client
    asset_name : str
        The asset name
    max_retry : int
        Maximum number of retries for querying the data access api
    Returns
    -------
    list
        List of the asset IDs
        Rarely, there are multiple assets with the same name
    """
    query = {"name": asset_name}
    projection = {"external_links": 1}
    retry_count = 0
    while retry_count < max_retry:
        try:
            response = docdb_api_client.retrieve_docdb_records(
                filter_query=query, projection=projection
            )
            break
        except Exception as e:
            logging.warning(f"Retry {retry_count+1}/{max_retry} after error: {e}")
            time.sleep(1)
            retry_count += 1
    external_links_list = []
    for res in response:        
        external_links = res.get("external_links", None)
        if type(external_links) is str:
            external_links = json.loads(external_links)
            external_links = external_links.get("Code Ocean", None)
        if type(external_links) is list and len(external_links) > 1:
            external_links = external_links[0]
            external_links = external_links.get("Code Ocean", None)
        if type(external_links) is dict:
            try:
                external_links = external_links.get("Code Ocean", None)[0]
            except IndexError:
                external_links = "None"
        if type(external_links) is list:
            try:
                external_links = external_links[0]
            except IndexError:
                external_links = "None"
        external_links_list.append(external_links)
    return external_links_list

In [55]:
query = {'subject.subject_id': '804363',
        'data_description.data_level': 'derived'}

projection = {"external_links": 1,
              "name": 1,
              "processing.processing_pipeline.data_processes": 1,
              "subject.subject_id": 1,}
max_retry = 5
retry_count = 0
while retry_count < max_retry:
    try:
        response = docdb_api_client.retrieve_docdb_records(
            filter_query=query, projection=projection
        )
        break
    except Exception as e:
        logging.warning(f"Retry {retry_count+1}/{max_retry} after error: {e}")
        time.sleep(1)
        retry_count += 1


# Search docDB for ophys processing and return long_window
- Filter 'processed' in name and 'dF/F estimation' in 'processing.processing_pipeline.data_processes'
    - Camstim Behavior processing also has 'processed' in name
    - Number of total 'data_processes' vary, 40 or 48 (with or without 'Image cell classification')

In [2]:
mouse_id = 788406

agg_pipeline = [
    # Match documents with 'processed' in the name (case-insensitive)
    # And ensure processing.processing_pipeline.data_processes exists
    {
        '$match': {
            'name': {'$regex': 'processed', '$options': 'i'},
            'processing.processing_pipeline.data_processes': {'$exists': True},
            "processing.processing_pipeline.data_processes": {
                "$elemMatch": {
                    "name": "dF/F estimation",
                }
            },
            'subject.subject_id': str(mouse_id),
        }
    },
    # Project to include name and count of data_processes
    {
        '$project': {
            'name': 1,
            # 'data_processes_count': {'$size': '$processing.processing_pipeline.data_processes'},
            '_id': 1,
            'external_links': 1,
            'long_window': {
                '$let': {
                    'vars': {
                        'df_processes': {
                            '$filter': {
                                'input': '$processing.processing_pipeline.data_processes',
                                'as': 'process',
                                'cond': {'$eq': ['$$process.name', 'dF/F estimation']}
                            }
                        }
                    },
                    'in': {'$arrayElemAt': ['$$df_processes.parameters.long_window', 0]}
                }
            }
        }
    },
    # Filter to only include documents with exactly 40 data processes
    # {
    #     '$match': {
    #         'data_processes_count': 40
    #     }
    # },
    {
        '$limit': 1000
    }
]

results = docdb_api_client.aggregate_docdb_records(pipeline=agg_pipeline)

results_df = pd.DataFrame(results)
results_df['data_asset_id'] = results_df['external_links'].apply(lambda x: x['Code Ocean'][0])
results_df['processed_date'] = results_df['name'].str.split('_').str[-2]
results_df['raw_name'] = results_df['name'].str.split('_processed_').str[0]

results_df = results_df[['raw_name', 'long_window', 'data_asset_id', 'processed_date', 'name' ]]
results_df.sort_values(by='processed_date', ascending=False, inplace=True)
results_df

Unnamed: 0,raw_name,long_window,data_asset_id,processed_date,name
35,multiplane-ophys_788406_2025-06-11_08-54-38,60.0,c2ecd44b-2704-439a-84cf-dda2f6a3f4a2,2025-10-06,multiplane-ophys_788406_2025-06-11_08-54-38_pr...
34,multiplane-ophys_788406_2025-07-14_12-20-34,60.0,e3e9a0de-0bf1-4655-a46c-4d5c88e46288,2025-10-06,multiplane-ophys_788406_2025-07-14_12-20-34_pr...
33,multiplane-ophys_788406_2025-06-20_10-38-42,60.0,46375e95-b3f8-41d3-a56d-68b98486a84e,2025-10-06,multiplane-ophys_788406_2025-06-20_10-38-42_pr...
32,multiplane-ophys_788406_2025-05-31_12-37-36,60.0,cdbe1ff3-055d-4361-8551-d2cd25b4ee04,2025-10-06,multiplane-ophys_788406_2025-05-31_12-37-36_pr...
31,multiplane-ophys_788406_2025-06-04_07-29-18,60.0,a8018917-9188-4ffa-a851-81243cfd1a29,2025-10-06,multiplane-ophys_788406_2025-06-04_07-29-18_pr...
30,multiplane-ophys_788406_2025-05-29_11-29-10,60.0,54fe8aa9-107f-43f7-8abf-8a70d099aefe,2025-10-06,multiplane-ophys_788406_2025-05-29_11-29-10_pr...
24,multiplane-ophys_788406_2025-07-18_12-17-32,60.0,fd63fa7a-0fce-4510-878f-c811ce624db7,2025-10-01,multiplane-ophys_788406_2025-07-18_12-17-32_pr...
20,multiplane-ophys_788406_2025-07-21_11-16-45,60.0,ebab4a9c-0550-4a1a-b82d-c372ef1013aa,2025-10-01,multiplane-ophys_788406_2025-07-21_11-16-45_pr...
29,multiplane-ophys_788406_2025-07-23_12-31-02,60.0,e0998980-c437-4638-a4b2-7d30aa20b5fd,2025-10-01,multiplane-ophys_788406_2025-07-23_12-31-02_pr...
28,multiplane-ophys_788406_2025-07-28_12-22-00,60.0,4ab68a1c-4ab6-4ab5-9a82-ad64c60b0a6d,2025-10-01,multiplane-ophys_788406_2025-07-28_12-22-00_pr...


In [113]:
results_df.groupby('long_window').size()

long_window
60.0      27
1800.0     3
dtype: int64

In [4]:
results_df.query('long_window==60 and processed_date >= "2025-09-02"')

Unnamed: 0,raw_name,long_window,data_asset_id,processed_date,name
26,multiplane-ophys_782149_2025-03-25_09-46-08,60.0,ef74c3c1-8710-4dab-8216-3f74d58528a9,2025-10-04,multiplane-ophys_782149_2025-03-25_09-46-08_pr...
21,multiplane-ophys_782149_2025-05-06_09-18-12,60.0,d4f96b83-0948-4389-b2d3-b81f11892e1d,2025-09-12,multiplane-ophys_782149_2025-05-06_09-18-12_pr...
16,multiplane-ophys_782149_2025-04-30_11-37-48,60.0,6a78ed92-134e-4a5f-8d04-8fc9e79e2e5c,2025-09-12,multiplane-ophys_782149_2025-04-30_11-37-48_pr...
22,multiplane-ophys_782149_2025-05-07_09-10-44,60.0,8d4cffb5-712a-4389-8eda-db79a0225d32,2025-09-12,multiplane-ophys_782149_2025-05-07_09-10-44_pr...
20,multiplane-ophys_782149_2025-05-02_10-55-09,60.0,5ccc2272-8792-45b4-bbd5-b221e70af744,2025-09-12,multiplane-ophys_782149_2025-05-02_10-55-09_pr...
19,multiplane-ophys_782149_2025-05-05_11-39-29,60.0,aa46df0b-24f9-4894-9677-171d5bae9210,2025-09-12,multiplane-ophys_782149_2025-05-05_11-39-29_pr...
18,multiplane-ophys_782149_2025-05-01_09-20-00,60.0,05e59e9b-a2ed-40ad-8b98-434636bc2764,2025-09-12,multiplane-ophys_782149_2025-05-01_09-20-00_pr...
17,multiplane-ophys_782149_2025-04-25_09-14-05,60.0,870dff62-2075-4bb1-bf12-70d883ddebcf,2025-09-12,multiplane-ophys_782149_2025-04-25_09-14-05_pr...
9,multiplane-ophys_782149_2025-04-14_12-01-21,60.0,25a4331a-fceb-427e-ae06-e7a6acbda06e,2025-09-12,multiplane-ophys_782149_2025-04-14_12-01-21_pr...
10,multiplane-ophys_782149_2025-04-11_09-47-54,60.0,386f0acc-17bc-4536-9fab-4298e94c702b,2025-09-12,multiplane-ophys_782149_2025-04-11_09-47-54_pr...


# Check if the data assets are in code ocean

In [5]:
for _, row in results_df.iterrows():
    data_asset_id = row['data_asset_id']
    data_asset_name = row['name']
    data_asset = client.data_assets.get_data_asset(data_asset_id)
    assert data_asset.name == data_asset_name

NameError: name 'client' is not defined

In [117]:
client.data_assets.get_data_asset(data_asset_id)

DataAsset(id='cc90fb44-5a83-4166-9b77-7079d9c9c957', created=1731775108, name='multiplane-ophys_755252_2024-11-15_10-49-40_processed_2024-11-16_16-38-28', mount='multiplane-ophys_755252_2024-11-15_10-49-40_processed_2024-11-16_16-38-28', state=<DataAssetState.Ready: 'ready'>, type=<DataAssetType.Result: 'result'>, last_used=1759264083, files=2249, size=756662885339, description='', tags=['derived', '755252', 'multiplane-ophys'], provenance=Provenance(commit=None, run_script='code/run', docker_image=None, capsule='56bf687b-dbcd-4b93-a650-21b8584036ff', data_assets=['95ba6dcc-c32a-4812-a234-27ae729cc497'], computation='9ebdd908-9b7f-46c9-9d9f-e3b7f84b72f7'), source_bucket=None, custom_metadata={'data level': 'derived', 'experiment type': 'multiplane-ophys', 'subject id': '755252'}, app_parameters=None, nextflow_profile=None, contained_data_assets=None, last_transferred=None, transfer_error=None, failure_reason=None)

# Comparing with session info
- Using lamf-analysis codes

In [3]:
mouse_id = 790322
target_long_window = 60

session_infos = docdb_utils.get_session_infos_from_docdb(mouse_id, docdb_api_client=docdb_api_client)
processed_infos = docdb_utils.get_processed_data_info(mouse_id, docdb_api_client=docdb_api_client)
processed_infos = docdb_utils.filter_data_asset_info_by_date(processed_infos)
processed_infos = docdb_utils.filter_data_asset_info_by_long_window(processed_infos, target_long_window)
session_infos = session_infos.merge(processed_infos, right_on='raw_name', left_on='raw_asset_name', how='left')
# session_infos.sort_values(by='acquisition_date', ascending=True, inplace=True)
sessions_to_process = session_infos[session_infos['data_asset_id'].isna()].copy()
sessions_to_process

Unnamed: 0,acquisition_date,session_type,reward_consumed,rig_id,session_name,raw_asset_name,session_type_exposures,raw_name,long_window,data_asset_id,processed_date,name


In [4]:
session_infos

Unnamed: 0,acquisition_date,session_type,reward_consumed,rig_id,session_name,raw_asset_name,session_type_exposures,raw_name,long_window,data_asset_id,processed_date,name
0,2025-06-11,TRAINING_0_gratings_autorewards_15min,,MESO.2,790322_2025-06-11,multiplane-ophys_790322_2025-06-11_15-09-29,1,multiplane-ophys_790322_2025-06-11_15-09-29,60.0,299635d1-e9aa-4368-b7cd-7462c707d2b2,2025-10-06,multiplane-ophys_790322_2025-06-11_15-09-29_pr...
1,2025-06-12,TRAINING_1_gratings,,MESO.2,790322_2025-06-12,multiplane-ophys_790322_2025-06-12_15-23-11,1,multiplane-ophys_790322_2025-06-12_15-23-11,60.0,b5c61aaf-c522-4794-9044-056ca02cf333,2025-09-30,multiplane-ophys_790322_2025-06-12_15-23-11_pr...
2,2025-06-13,TRAINING_1_gratings,,MESO.2,790322_2025-06-13,multiplane-ophys_790322_2025-06-13_14-57-49,2,multiplane-ophys_790322_2025-06-13_14-57-49,60.0,7e5cbd02-1035-49f9-8295-7c4f8f9a0f9c,2025-09-30,multiplane-ophys_790322_2025-06-13_14-57-49_pr...
3,2025-06-16,TRAINING_1_gratings,,MESO.2,790322_2025-06-16,multiplane-ophys_790322_2025-06-16_12-24-56,3,multiplane-ophys_790322_2025-06-16_12-24-56,60.0,f3285111-8a41-453a-8081-315a22a2eaf9,2025-10-06,multiplane-ophys_790322_2025-06-16_12-24-56_pr...
4,2025-06-17,TRAINING_2_gratings_flashed,,MESO.2,790322_2025-06-17,multiplane-ophys_790322_2025-06-17_15-44-27,1,multiplane-ophys_790322_2025-06-17_15-44-27,60.0,5a55c883-8d2e-4884-be6b-36251b7e0598,2025-10-06,multiplane-ophys_790322_2025-06-17_15-44-27_pr...
5,2025-06-18,TRAINING_2_gratings_flashed,,MESO.2,790322_2025-06-18,multiplane-ophys_790322_2025-06-18_14-34-44,2,multiplane-ophys_790322_2025-06-18_14-34-44,60.0,711f1652-96d6-4c74-bdd3-a6b35914c5f7,2025-10-06,multiplane-ophys_790322_2025-06-18_14-34-44_pr...
6,2025-06-23,TRAINING_3_images_A_10uL_reward,,MESO.2,790322_2025-06-23,multiplane-ophys_790322_2025-06-23_12-18-18,1,multiplane-ophys_790322_2025-06-23_12-18-18,60.0,9df968b1-a132-4b32-ab37-105f2759e798,2025-09-30,multiplane-ophys_790322_2025-06-23_12-18-18_pr...
7,2025-06-24,TRAINING_3_images_A_10uL_reward,,MESO.2,790322_2025-06-24,multiplane-ophys_790322_2025-06-24_12-14-55,2,multiplane-ophys_790322_2025-06-24_12-14-55,60.0,2ce56f25-40a8-422b-a75c-80aaf41ebd43,2025-10-06,multiplane-ophys_790322_2025-06-24_12-14-55_pr...
8,2025-06-25,TRAINING_3_images_A_10uL_reward,,MESO.2,790322_2025-06-25,multiplane-ophys_790322_2025-06-25_14-02-09,3,multiplane-ophys_790322_2025-06-25_14-02-09,60.0,b4bd7a11-f820-446e-b69a-cfba81dbea7f,2025-10-06,multiplane-ophys_790322_2025-06-25_14-02-09_pr...
9,2025-06-26,TRAINING_4_images_A_training,,MESO.2,790322_2025-06-26,multiplane-ophys_790322_2025-06-26_14-34-26,1,multiplane-ophys_790322_2025-06-26_14-34-26,60.0,8962212d-06ff-45a4-a615-b8a4dd569ebf,2025-10-06,multiplane-ophys_790322_2025-06-26_14-34-26_pr...


In [25]:
query = {"subject.subject_id": str(mouse_id), "data_description.data_level": "raw"}
subject_response = docdb_api_client.retrieve_docdb_records(
                filter_query=query,                
                )

In [41]:


mouse_id = 755252
session_infos = docdb_utils.get_session_infos_from_docdb(str(mouse_id))

_, mouse_df = cdu.get_mouse_session_df(mouse_id,
                        include_pupil=False)
# Filter based on target_dff_long_baseline_window (until we decide which one to use)
target_dff_long_baseline_window = 60
mouse_df = cdu.add_dff_long_baseline_window_to_mouse_df(mouse_df)
mouse_df = mouse_df.query(f"dff_long_window == {target_dff_long_baseline_window}").copy()

# choose session indices to analyze (from session_infos df)
session_inds_to_attach = np.arange(len(session_infos)) # in this example, just attach them all
session_acq_dates_to_attach = session_infos['acquisition_date'].values[session_inds_to_attach]
raw_asset_names = session_infos['raw_asset_name'].values[session_inds_to_attach]
mouse_df_to_attach = mouse_df.query(f"raw_data_date in {list(session_acq_dates_to_attach)}")
mouse_df_to_attach = mouse_df_to_attach.merge(session_infos, left_on='raw_data_date', right_on='acquisition_date', how='left')
mouse_df_to_attach.sort_values('acquisition_date', inplace=True)
assert len(mouse_df_to_attach) == len(session_inds_to_attach)

In [33]:
mouse_df_to_attach

Unnamed: 0,raw_data_date,processed_data_date,capsule_id,commit_id,processed_data_asset_id,raw_data_asset_id,num_provenence_data_assets,num_raw_data_asset_ids,dff_long_window,acquisition_date,session_type,reward_consumed,rig_id,session_name,raw_asset_name,session_type_exposures
0,2025-01-22,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,cd6d37c7-65ec-43ec-b713-87b85c2687ac,[378c6783-8e2e-4b5f-aacf-3c5cf33df36d],3,1,60.0,2025-01-22,TRAINING_0_gratings_autorewards_15min,,MESO.2,767018_2025-01-22,multiplane-ophys_767018_2025-01-22_14-04-11,1
1,2025-01-23,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,79afe780-9e72-44d6-b597-8364ada75f16,[437334f6-45bc-4f61-b5db-a1197332a527],3,1,60.0,2025-01-23,TRAINING_1_gratings,,MESO.2,767018_2025-01-23,multiplane-ophys_767018_2025-01-23_14-25-52,1
2,2025-01-24,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,56546189-9eaf-4094-8c5d-91c4e9d2b528,[68af0c36-6669-4f75-aaac-0dfc16940534],3,1,60.0,2025-01-24,TRAINING_1_gratings,,MESO.2,767018_2025-01-24,multiplane-ophys_767018_2025-01-24_14-04-10,2
3,2025-01-27,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,816baa59-d947-4864-8d78-dd051955e33e,[76050a1b-b666-4634-888c-e02be9f2b0fb],3,1,60.0,2025-01-27,TRAINING_2_gratings_flashed,,MESO.2,767018_2025-01-27,multiplane-ophys_767018_2025-01-27_12-15-34,1
4,2025-01-28,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,f9b5d666-69a1-4479-97cc-a821f5a3f9d1,[239689ee-f5c7-4438-822b-a656258bd002],3,1,60.0,2025-01-28,TRAINING_3_images_A_10uL_reward,,MESO.2,767018_2025-01-28,multiplane-ophys_767018_2025-01-28_12-16-57,1
5,2025-01-29,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,65e3ed33-d8e6-4f85-8fbc-a96b57065f6e,[2e492aa7-e4f1-4b56-9f87-b2ce43e8d8fb],3,1,60.0,2025-01-29,TRAINING_3_images_A_10uL_reward,,MESO.2,767018_2025-01-29,multiplane-ophys_767018_2025-01-29_12-10-38,2
6,2025-01-31,2025-10-05,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,,0bf230d3-ab7c-4d0d-af4a-94c450a2ce04,[6e2b3661-e0e4-46ef-873d-6ed5b5ba636c],3,1,60.0,2025-01-31,TRAINING_4_images_A_training,,MESO.2,767018_2025-01-31,multiplane-ophys_767018_2025-01-31_14-09-45,1
7,2025-02-03,2025-09-09,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,27c81cd9-a0b2-44e6-a725-a5eb69ce12a5,[0c627278-3a4c-46f8-9713-64849c4d4862],3,1,60.0,2025-02-03,TRAINING_5_images_A_epilogue,,MESO.2,767018_2025-02-03,multiplane-ophys_767018_2025-02-03_12-55-02,1
8,2025-02-04,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,db89768d-d2f1-48b9-88f0-d5183fc19aa2,[ca3f7e4b-f3d0-4333-83af-5b1bf48b98b2],3,1,60.0,2025-02-04,TRAINING_5_images_A_handoff_ready,,MESO.2,767018_2025-02-04,multiplane-ophys_767018_2025-02-04_12-10-51,1
9,2025-02-05,2025-09-07,4cbb3e55-83df-40ad-83ff-6d65839dc9c0,6fab6e96466e8f136b5233bacb74d059263f2c79,3740d4ab-946f-4775-974a-5702376fe330,[21e7833d-5192-42af-8786-c332c6b107f0],3,1,60.0,2025-02-05,OPHYS_1_images_A,,MESO.2,767018_2025-02-05,multiplane-ophys_767018_2025-02-05_14-53-53,1


In [22]:
processed_data

[DataAsset(id='a8afc036-18c1-4180-9abd-0c40cc09935c', created=1740071409, name='multiplane-ophys_767018_2025-02-19_14-02-36_processed_2025-02-20_16-35-44', mount='multiplane-ophys_767018_2025-02-19_14-02-36_processed_2025-02-20_16-35-44', state=<DataAssetState.Ready: 'ready'>, type=<DataAssetType.Result: 'result'>, last_used=1758840989, files=3422, size=344375999379, description='', tags=['derived', '767018', 'multiplane-ophys'], provenance=Provenance(commit='c9b29805712a72f30da70d42b2c0bfe0231f635a', run_script='code/run', docker_image=None, capsule='e3ac6a41-c578-4798-b251-3b316674dce2', data_assets=['35d1284e-4dfa-4ac3-9ba8-5ea1ae2fdaeb', 'b15eb0a6-9c2e-482b-b2e4-9cdc547e4e75', 'fb4b5cef-4505-4145-b8bd-e41d6863d7a9'], computation='2665fffd-8c14-414e-acc8-f1c830ee7dcd'), source_bucket=None, custom_metadata={'data level': 'derived', 'experiment type': 'multiplane-ophys', 'subject id': '767018'}, app_parameters=None, nextflow_profile=None, contained_data_assets=None, last_transferred=N

In [34]:
DEFAULT_MOUNT_TO_IGNORE = cdu.DEFAULT_MOUNT_TO_IGNORE

mouse = 755252

processed_date_after = None
processed_date_before = None
include_pupil = False
mouse_sessions = cdu.get_mouse_sessions_by_filters(mouse_id=mouse_id)
# to prevent errors (happens when adding faulty tags)
mouse_sessions = tuple([ms for ms in mouse_sessions if ms.subject_id == str(mouse_id)])

raw_data_date_list = []
processed_data_date_list = []
capsule_ids_list = []
commit_ids_list = []
processed_data_asset_ids_list = []
raw_data_asset_ids_list = []

num_provenence_data_assets_list = []
if include_pupil:
    pupil_data_asset_ids_list = []
for session in mouse_sessions:
    try: 
        data_name = session.raw_data_asset.name
    except:
        continue
    if 'multiplane-ophys' not in data_name:
        continue
    raw_date = session.raw_data_asset.name.split('_')[2] 
    processed_data = [da for da in session.data_assets if '_processed_' in da.name]
    # processed_data = [da for da in processed_data if (da.provenance.commit is not None)]
    if include_pupil:
        pupil_data = [da for da in session.data_assets if 'dlc-eye' in da.name]
        pupil_raw_data = [np.setdiff1d(da.provenance.data_assets, DEFAULT_MOUNT_TO_IGNORE) for da in pupil_data]

    processed_data_dates = [da.name.split('_processed_')[1].split('_')[0] for da in processed_data]
    capsule_ids = [da.provenance.capsule for da in processed_data]
    commit_ids = [da.provenance.commit for da in processed_data]
    data_asset_ids = [da.id for da in processed_data]
    raw_data_asset_ids = [np.setdiff1d(da.provenance.data_assets, DEFAULT_MOUNT_TO_IGNORE) for da in processed_data]
    num_provenence_data_assets = [len(da.provenance.data_assets) for da in processed_data]
    for i in range(len(processed_data)):
        if processed_date_after is not None:
            if processed_data_dates[i] < processed_date_after:
                continue
        if processed_date_before is not None:
            if processed_data_dates[i] > processed_date_before:
                continue

        raw_data_asset_id = raw_data_asset_ids[i]

        if include_pupil:
            matching_pupil_data_ind = np.where([raw_data_asset_id in pupil_raw_data[j] for j in range(len(pupil_raw_data))])[0]
            if len(matching_pupil_data_ind) == 1:
                pupil_data_asset_ids_list.append(pupil_data[matching_pupil_data_ind[0]].id)
            elif len(matching_pupil_data_ind) == 0:
                pupil_data_asset_ids_list.append(0)
            else:
                raise ValueError(f'More than one matching pupil data asset found for {raw_data_asset_id} from {session}')

        raw_data_date_list.append(raw_date)
        capsule_ids_list.append(capsule_ids[i])
        commit_ids_list.append(commit_ids[i])
        processed_data_asset_ids_list.append(data_asset_ids[i])
        processed_data_date_list.append(processed_data_dates[i])
        raw_data_asset_ids_list.append(raw_data_asset_id)
        num_provenence_data_assets_list.append(num_provenence_data_assets[i])
mouse_session_df = pd.DataFrame({'raw_data_date': raw_data_date_list,
                                    'processed_data_date': processed_data_date_list,
                                    'capsule_id': capsule_ids_list,
                                    'commit_id': commit_ids_list,
                                    'processed_data_asset_id': processed_data_asset_ids_list,
                                    'raw_data_asset_id': raw_data_asset_ids_list,
                                    'num_provenence_data_assets': num_provenence_data_assets_list})

In [36]:
mouse_session_df['num_raw_data_asset_ids'] = mouse_session_df['raw_data_asset_id'].apply(len)

In [37]:
np.all(mouse_session_df['num_raw_data_asset_ids'].values == 1)

np.False_

In [39]:
temp_df = mouse_session_df.query('num_raw_data_asset_ids > 1').copy()
temp_df

Unnamed: 0,raw_data_date,processed_data_date,capsule_id,commit_id,processed_data_asset_id,raw_data_asset_id,num_provenence_data_assets,num_raw_data_asset_ids
26,2025-02-10,2025-06-28,e3ac6a41-c578-4798-b251-3b316674dce2,e20602d7bf64b4c6096acc5cf1def1d0526898c6,91c5d82a-c3d6-4616-b453-2c008baf6ccc,"[1d41a92d-2f10-40d1-830d-83ba23aca8bd, 8c8b2ff...",4,2
27,2025-02-10,2025-06-28,e3ac6a41-c578-4798-b251-3b316674dce2,e20602d7bf64b4c6096acc5cf1def1d0526898c6,363dcf0e-3639-4b0d-ba21-13b80a60599d,"[1d41a92d-2f10-40d1-830d-83ba23aca8bd, 9603e3b...",4,2
28,2025-02-10,2025-06-28,e3ac6a41-c578-4798-b251-3b316674dce2,e20602d7bf64b4c6096acc5cf1def1d0526898c6,4f63cb02-f418-4327-b770-277469c75022,"[1d41a92d-2f10-40d1-830d-83ba23aca8bd, b4fded8...",4,2
29,2025-02-10,2025-06-28,e3ac6a41-c578-4798-b251-3b316674dce2,e20602d7bf64b4c6096acc5cf1def1d0526898c6,f47596a5-4964-45ab-828d-be80fbf60fea,"[1d41a92d-2f10-40d1-830d-83ba23aca8bd, ef0ee62...",4,2
30,2025-02-10,2025-06-28,e3ac6a41-c578-4798-b251-3b316674dce2,e20602d7bf64b4c6096acc5cf1def1d0526898c6,7b4955b6-68ac-4cd4-99bf-41d6051630c7,"[1d41a92d-2f10-40d1-830d-83ba23aca8bd, f1f2552...",4,2
31,2025-02-10,2025-06-28,e3ac6a41-c578-4798-b251-3b316674dce2,e20602d7bf64b4c6096acc5cf1def1d0526898c6,b1b54094-2788-4b48-95e2-dcdcd1f7b31e,"[0e4262b9-2772-41f8-a73f-45a9822715d3, 1d41a92...",4,2
32,2025-02-10,2025-06-28,e3ac6a41-c578-4798-b251-3b316674dce2,e20602d7bf64b4c6096acc5cf1def1d0526898c6,04f6ff3a-6df5-4326-bfc8-beb76dee46e5,"[1d41a92d-2f10-40d1-830d-83ba23aca8bd, 3c170a8...",4,2


In [None]:
add_dff_long_baseline_window_to_mouse_df


In [25]:
[np.setdiff1d(da.provenance.data_assets, DEFAULT_MOUNT_TO_IGNORE) for da in processed_data]

[array(['6204745b-4903-4c81-8519-8d87853d35e8'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd',
        '8c8b2ff5-d5a7-4ca2-bb19-e3e9e5caa8e6'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd',
        '9603e3b3-fca3-4829-bc07-e2a913456c64'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd',
        'b4fded8a-4c27-4f5f-8ad5-18fb8a195b62'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd',
        'ef0ee62d-0ca1-43aa-9125-a2a95ed2d158'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd',
        'f1f2552c-5e5d-4dda-8660-cb5d641e0aa0'], dtype='<U36'),
 array(['0e4262b9-2772-41f8-a73f-45a9822715d3',
        '1d41a92d-2f10-40d1-830d-83ba23aca8bd'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd',
        '3c170a80-3e21-4eae-aefd-9000e23c0703'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd'], dtype='<U36')]

In [6]:
success = cdu.attach_mouse_data_assets(mouse_df_to_attach)

AssertionError: raw data asset ids must be str

In [18]:
[s.raw_data_asset for s in mouse_sessions]

[DataAsset(id='378c6783-8e2e-4b5f-aacf-3c5cf33df36d', created=1742490470, name='multiplane-ophys_767018_2025-01-22_14-04-11', mount='multiplane-ophys_767018_2025-01-22_14-04-11', state=<DataAssetState.Ready: 'ready'>, type=<DataAssetType.Dataset: 'dataset'>, last_used=0, files=54, size=82792925542, description='', tags=['767018', 'multiplane-ophys', 'raw'], provenance=None, source_bucket=SourceBucket(origin=<DataAssetOrigin.AWS: 'aws'>, bucket='aind-private-data-prod-o5171v', prefix='multiplane-ophys_767018_2025-01-22_14-04-11', external=True), custom_metadata={'data level': 'raw', 'experiment type': 'multiplane-ophys', 'subject id': '767018'}, app_parameters=None, nextflow_profile=None, contained_data_assets=None, last_transferred=None, transfer_error=None, failure_reason=None),
 DataAsset(id='437334f6-45bc-4f61-b5db-a1197332a527', created=1742490855, name='multiplane-ophys_767018_2025-01-23_14-25-52', mount='multiplane-ophys_767018_2025-01-23_14-25-52', state=<DataAssetState.Ready: '

In [7]:
[raw_id for raw_id in mouse_df_to_attach.raw_data_asset_id.values]

[array(['378c6783-8e2e-4b5f-aacf-3c5cf33df36d'], dtype='<U36'),
 array(['437334f6-45bc-4f61-b5db-a1197332a527'], dtype='<U36'),
 array(['68af0c36-6669-4f75-aaac-0dfc16940534'], dtype='<U36'),
 array(['76050a1b-b666-4634-888c-e02be9f2b0fb'], dtype='<U36'),
 array(['239689ee-f5c7-4438-822b-a656258bd002'], dtype='<U36'),
 array(['2e492aa7-e4f1-4b56-9f87-b2ce43e8d8fb'], dtype='<U36'),
 array(['6e2b3661-e0e4-46ef-873d-6ed5b5ba636c'], dtype='<U36'),
 array(['0c627278-3a4c-46f8-9713-64849c4d4862'], dtype='<U36'),
 array(['ca3f7e4b-f3d0-4333-83af-5b1bf48b98b2'], dtype='<U36'),
 array(['21e7833d-5192-42af-8786-c332c6b107f0'], dtype='<U36'),
 array(['eda0f4fe-becf-442e-bfb4-ac325afa1a08'], dtype='<U36'),
 array(['1d41a92d-2f10-40d1-830d-83ba23aca8bd'], dtype='<U36'),
 array(['64d6c911-3c3a-47e6-b3e8-6a5e2873cf37'], dtype='<U36'),
 array(['066fc304-0efc-4036-b60d-a1ed02a7df5d'], dtype='<U36'),
 array(['fcb16545-a50f-44c2-9882-2a20f7f89bb8'], dtype='<U36'),
 array(['445ffd12-f441-4372-a096-7249504

In [26]:
data_type='multiplane-ophys'
session_infos = pd.DataFrame()
for response in subject_response:
    # schema_version = response['schema_version']
    # if schema_version == '1.1.1': # '1.1.1' and '1.0.2' tested
    if response['name'].startswith(data_type):
        acquisition_date = response['session']['session_start_time'][:10]
        session_name = str(mouse_id) + "_" + acquisition_date
        session_type = response['session']['session_type']
        reward_consumed = response['session']['reward_consumed_total']
        rig_id = response['session']['rig_id']
        data_asset_name = response['name']
        temp_info = {"acquisition_date": acquisition_date,
                    "session_type": session_type,
                    "reward_consumed": reward_consumed,
                    "rig_id": rig_id,
                    "session_name": session_name,
                    "raw_asset_name": data_asset_name,
                    }
        session_infos = pd.concat([session_infos, pd.DataFrame(temp_info, index=[0])], ignore_index=True)
        # else:
        #     print(f"Schema version {schema_version} not handled.")

session_infos.sort_values(by='acquisition_date', inplace=True)

In [27]:
session_infos

Unnamed: 0,acquisition_date,session_type,reward_consumed,rig_id,session_name,raw_asset_name
11,2025-01-22,TRAINING_0_gratings_autorewards_15min,,MESO.2,767018_2025-01-22,multiplane-ophys_767018_2025-01-22_14-04-11
12,2025-01-23,TRAINING_1_gratings,,MESO.2,767018_2025-01-23,multiplane-ophys_767018_2025-01-23_14-25-52
4,2025-01-24,TRAINING_1_gratings,,MESO.2,767018_2025-01-24,multiplane-ophys_767018_2025-01-24_14-04-10
18,2025-01-27,TRAINING_2_gratings_flashed,,MESO.2,767018_2025-01-27,multiplane-ophys_767018_2025-01-27_12-15-34
2,2025-01-28,TRAINING_3_images_A_10uL_reward,,MESO.2,767018_2025-01-28,multiplane-ophys_767018_2025-01-28_12-16-57
8,2025-01-29,TRAINING_3_images_A_10uL_reward,,MESO.2,767018_2025-01-29,multiplane-ophys_767018_2025-01-29_12-10-38
6,2025-01-31,TRAINING_4_images_A_training,,MESO.2,767018_2025-01-31,multiplane-ophys_767018_2025-01-31_14-09-45
15,2025-02-03,TRAINING_5_images_A_epilogue,,MESO.2,767018_2025-02-03,multiplane-ophys_767018_2025-02-03_12-55-02
16,2025-02-04,TRAINING_5_images_A_handoff_ready,,MESO.2,767018_2025-02-04,multiplane-ophys_767018_2025-02-04_12-10-51
17,2025-02-05,OPHYS_1_images_A,,MESO.2,767018_2025-02-05,multiplane-ophys_767018_2025-02-05_14-53-53


# Searching docDB using data asset ID

In [92]:
data_asset_id = 'ebf5eaad-4365-4dc8-92a1-dac5b38b2150'

filter_query = {
    "external_links.Code Ocean": data_asset_id
}

# Define what fields you want to retrieve
projection = {
    "name": 1,
    "subject.subject_id": 1,
    "data_description.data_level": 1,
    "external_links": 1,
    "_id": 1
}

# Execute the query
results = docdb_api_client.retrieve_docdb_records(
    filter_query=filter_query,
    projection=projection
)


In [93]:
results

[{'_id': '22a6d50a-5d5d-4ea6-a77e-c246fa91ded0',
  'name': 'multiplane-ophys_755252_2025-01-14_12-09-40_processed_2025-09-05_20-42-15',
  'external_links': {'Code Ocean': ['ebf5eaad-4365-4dc8-92a1-dac5b38b2150']},
  'subject': {'subject_id': '755252'},
  'data_description': {'data_level': 'derived'}}]

# Searching conditioned-mean-response results
- with dff_long_window

In [105]:
def get_session_info_df_to_run(docdb_api_client, subject_id, conditions, max_retry=100):
    """Get the raw asset names to run ophys processing on
    Parameters
    ----------
    docdb_api_client : MetadataDbClient
        The data access api client
    subject_id : str
        The subject ID
    conditions : dict
        Conditions to filter the raw asset names
        Keys can be 'raw_start_date', 'raw_end_date', 'processed_start_date', 'processed_end_date', 'dff_long_window'
        Values should be in the format 'YYYY-MM-DD', except for 'dff_long_window' which is int
    max_retry : int
        Maximum number of retries for querying the data access api
    Returns
    -------
    pd.DataFrame
        DataFrame of the session infos that meet the conditions
    """
    
    target_long_window = conditions['dff_long_window']
    session_infos = docdb_utils.get_session_infos_from_docdb(subject_id, docdb_api_client=docdb_api_client)
    processed_infos = docdb_utils.get_processed_data_info(subject_id, docdb_api_client=docdb_api_client)
    processed_infos = docdb_utils.filter_data_asset_info_by_date(processed_infos)
    processed_infos = docdb_utils.filter_data_asset_info_by_long_window(processed_infos, target_long_window)
    dlc_infos = docdb_utils.get_dlc_eye_data_info(subject_id, docdb_api_client=docdb_api_client)
    session_infos = (
        session_infos
        .merge(processed_infos, left_on='raw_asset_name', right_on='raw_name', how='left')
        .merge(dlc_infos, left_on='raw_asset_name', right_on='raw_name', how='left', suffixes=('_proc', '_dlc'))
    )

    # Drop the join key columns originating from 'raw_name'
    session_infos = session_infos.drop(
        columns=[c for c in session_infos.columns if c.startswith('raw_name_')]
    )
    return session_infos

In [None]:
suffix = 'CTL-conditioned-mean-response'
subject_id = 782149
conditions = {'dff_long_window': 60}

session_infos = get_session_info_df_to_run(docdb_api_client, subject_id, conditions)
# get previously run sessions with the same conditions
run_infos = docdb_utils.get_derived_data_assets(subject_id, suffix, conditions, docdb_api_client=docdb_api_client)
if run_infos is None:
    to_run_infos = session_infos
else:
    to_run_infos = session_infos.merge(run_infos, left_on='raw_asset_name', right_on='raw_name', how='left')
    # select rows with nan values
    to_run_infos = to_run_infos[to_run_infos['derived_name'].isna()]

No previously run sessions found.


In [102]:
suffix = 'CTL-conditioned-mean-response'
mouse_id = 782149
agg_pipeline = [
    {        
        '$match': {
            'name': {'$regex': suffix, '$options': 'i'},
            'subject.subject_id': str(mouse_id),
        }
    },
    {
        '$project': {
            "_id": 1,
            "name": 1,
            "first_process": {
                "$arrayElemAt": ["$processing.processing_pipeline.data_processes", 0]
            }
        }
    },
    {
        '$match': {
            # "first_process.parameters.dff_long_window": {"$exists": True, "$ne": None}
            "first_process.parameters.dff_long_window": 60
        }
    },
    # Project to include name and count of data_processes
    {
        '$project': {
            'name': 1,
            '_id': 1,
            'external_links': 1,
            "dff_long_window": "$first_process.parameters.dff_long_window",
        }
    },

    {
        '$limit': 1000
    }
]

results = docdb_api_client.aggregate_docdb_records(pipeline=agg_pipeline)

In [103]:
results

[]

In [73]:
parameters = {'dff_long_window': 60}
agg_pipeline = [
        {        
            '$match': {
                'name': {'$regex': suffix, '$options': 'i'},
                'subject.subject_id': str(mouse_id),
            }
        },
        {
            '$project': {
                "_id": 1,
                "name": 1,
                "code_ocean_id": {"$arrayElemAt": ["$external_links.Code Ocean", 0]},
                "process": {
                    "$arrayElemAt": ["$processing.processing_pipeline.data_processes", 0]
                }
            }
        },
    ]
if parameters is not None:
    assert isinstance(parameters, dict), "parameters should be a dictionary"
    match_params = {f"process.parameters.{k}": v for k, v in parameters.items()}
    project_params = {f"{k}": f"$process.parameters.{k}" for k in parameters.keys()}
    base_project = {'name': 1,
            '_id': 1,
            'code_ocean_id': 1,}
    updated_project = {**base_project, **project_params}
    append_pipeline = [
        {
            '$match': match_params
        },
        # Project to include name and count of data_processes
        {
            '$project': updated_project
        }]
    agg_pipeline.extend(append_pipeline)
agg_pipeline.append({'$limit': 1000})

In [91]:
results = docdb_api_client.aggregate_docdb_records(pipeline=agg_pipeline)

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,_id,name,code_ocean_id,dff_long_window
0,b1bb83e3-41b8-4288-a7c5-c24c396fde76,multiplane-ophys_767018_2025-01-27_12-15-34_CT...,e3ff482e-fca0-49ae-b1e3-51fc78dfc2e2,60
1,ae526570-d867-461b-88bf-85859b02ce94,multiplane-ophys_767018_2025-01-24_14-04-10_CT...,56ba592e-5ecf-4e8f-8f2c-a2aa280548fc,60
2,037f8644-a17c-4950-8847-b1c891898d85,multiplane-ophys_767018_2025-01-22_14-04-11_CT...,62a89b1a-6f69-40af-ba02-2d56e2116798,60


In [92]:
suffix = 'CTL-conditioned-mean-response'
mouse_id = 767018

ddb_results = docdb_utils.get_derived_data_assets(mouse_id, suffix, parameters, docdb_api_client=docdb_api_client)
ddb_results

Unnamed: 0,derived_name,derived_asset_id,dff_long_window,derived_date,raw_name
0,multiplane-ophys_767018_2025-01-27_12-15-34_CT...,e3ff482e-fca0-49ae-b1e3-51fc78dfc2e2,60,2025-10-13,multiplane-ophys_767018_2025-01-27_12-15-34
1,multiplane-ophys_767018_2025-01-24_14-04-10_CT...,56ba592e-5ecf-4e8f-8f2c-a2aa280548fc,60,2025-10-13,multiplane-ophys_767018_2025-01-24_14-04-10
2,multiplane-ophys_767018_2025-01-22_14-04-11_CT...,62a89b1a-6f69-40af-ba02-2d56e2116798,60,2025-10-13,multiplane-ophys_767018_2025-01-22_14-04-11
