In [35]:
import csv
import json
import logging
import os
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd
from aind_codeocean_pipeline_monitor.models import (CaptureSettings,
                                                    PipelineMonitorSettings)
from aind_data_access_api.document_db import MetadataDbClient
from codeocean import CodeOcean
from codeocean.computation import (ComputationState, DataAssetsRunParam,
                                   RunParams)
from dataclasses_json import dataclass_json

# Send all log records (INFO and above) to a local file named batch.log.
logging.basicConfig(
    filename="batch.log",
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Connection settings passed to MetadataDbClient (host / database / collection).
# These are plain module constants, not environment variables.
API_GATEWAY_HOST = "api.allenneuraldynamics.org"
DATABASE = "metadata_index"
COLLECTION = "data_assets"
# The Code Ocean domain is hard-coded; the environment-based lookup is kept
# below for reference.
# domain = os.getenv("CODEOCEAN_DOMAIN")
domain = "https://codeocean.allenneuraldynamics.org/"
# os.getenv returns None when API_SECRET is not set.
token = os.getenv("API_SECRET")

# Capsule ID read from CO_CAPSULE_ID (None when unset); the previous variable
# name is kept below for reference.
# monitor_pipeline_capsule_id = os.getenv("CO_MONITOR_PIPELINE")
monitor_pipeline_capsule_id = os.getenv("CO_CAPSULE_ID")

# Code Ocean API client built from the domain and token above.
client = CodeOcean(domain=domain, token=token)

# Regex fragments that are combined into asset-name patterns:
# YYYY-MM-DD date, HH-MM-SS time, and a six-digit subject ID.
DATE_FORMAT = '[0-9]{4}-[0-9]{2}-[0-9]{2}'
TIME_FORMAT = '[0-9]{2}-[0-9]{2}-[0-9]{2}'
SUBJECT_ID_FORMAT = '[0-9]{6}'

# NOTE(review): not referenced in the visible cells; presumably a list of
# pipeline commit hashes whose results should be kept - confirm against usage.
COMMIT_IDS_TO_KEEP = ['e61e887fbe5491035b35f8fd223101ad276622f1']

def _extract_code_ocean_id(external_links):
    """Reduce one record's ``external_links`` value to a single Code Ocean ID.

    Handles the shapes the value is seen in: a JSON string, a dict of the
    form ``{"Code Ocean": [...]}`` (or with a plain string value), and a
    list whose first element is either such a dict or an ID itself.

    Returns the first ID, ``None`` when there is no ``external_links`` value
    or no "Code Ocean" entry, and the string ``"None"`` when the list of
    links/IDs is present but empty (sentinel kept for backward compatibility).
    """
    if isinstance(external_links, str):
        # Some records store the links as a JSON-encoded string.
        external_links = json.loads(external_links)
    if isinstance(external_links, list):
        if not external_links:
            return "None"
        first = external_links[0]
        if not isinstance(first, dict):
            # Already a list of IDs: the first one is the answer.
            return first
        external_links = first
    if isinstance(external_links, dict):
        code_ocean_ids = external_links.get("Code Ocean", None)
        if isinstance(code_ocean_ids, (list, tuple)):
            return code_ocean_ids[0] if code_ocean_ids else "None"
        # A bare string ID is returned whole (indexing it would yield only
        # its first character); a missing entry yields None.
        return code_ocean_ids
    return external_links


def get_asset_ids(docdb_api_client, asset_name, max_retry=100) -> list:
    """Get the asset IDs from the data access api
    Parameters
    ----------
    docdb_api_client : MetadataDbClient
        The data access api client
    asset_name : str
        The asset name
    max_retry : int
        Maximum number of attempts for querying the data access api
    Returns
    -------
    list
        One entry per matching record: the first Code Ocean ID of that
        record, ``None`` if the record has no Code Ocean link information,
        or the string ``"None"`` if its list of links is empty.
        Rarely, there are multiple assets with the same name
    Raises
    ------
    RuntimeError
        If the query still fails after ``max_retry`` attempts.
    """
    query = {"name": asset_name}
    projection = {"external_links": 1}
    last_error = None
    for attempt in range(1, max_retry + 1):
        try:
            response = docdb_api_client.retrieve_docdb_records(
                filter_query=query, projection=projection
            )
            break
        except Exception as e:
            last_error = e
            logging.warning(f"Retry {attempt}/{max_retry} after error: {e}")
            time.sleep(1)
    else:
        # Every attempt failed (or max_retry <= 0): fail loudly instead of
        # hitting an unbound `response` below.
        raise RuntimeError(
            f"Querying asset '{asset_name}' failed after {max_retry} attempts"
        ) from last_error
    return [
        _extract_code_ocean_id(res.get("external_links", None))
        for res in response
    ]

In [None]:
# Select records whose processing pipeline lists this processor_full_name.
query = {"processing.processing_pipeline.processor_full_name": "Jinho Kim"}

# Only return the fields needed downstream.
projection = {
    "external_links": 1,
    "name": 1,
    "processing.processing_pipeline.data_processes": 1,
    "subject.subject_id": 1,
}
# Regex for processed multiplane-ophys asset names:
# <raw session name>_processed_<date>_<time>. Not used yet in this cell.
processed_pattern = f'multiplane-ophys_{SUBJECT_ID_FORMAT}_{DATE_FORMAT}_{TIME_FORMAT}_processed_{DATE_FORMAT}_{TIME_FORMAT}'

# Defined here so the cell also runs on a fresh kernel; previously both names
# were only created by cells further down the notebook.
max_retry = 100
docdb_api_client = MetadataDbClient(
    host=API_GATEWAY_HOST,
    database=DATABASE,
    collection=COLLECTION,
)

last_error = None
for retry_count in range(max_retry):
    try:
        response = docdb_api_client.retrieve_docdb_records(
            filter_query=query, projection=projection
        )
        break
    except Exception as e:
        last_error = e
        logging.warning(f"Retry {retry_count+1}/{max_retry} after error: {e}")
        time.sleep(1)
else:
    # All attempts failed: stop here rather than leaving `response` undefined.
    raise RuntimeError(
        f"DocDB query failed after {max_retry} attempts"
    ) from last_error

# for res in response:
#     if 

In [52]:
# Subjects whose records should be returned.
subject_ids = [
    '755252', '767018', '767022', '775682', '779891', '782149',
    '783551', '785054', '788406', '790322', '797371',
]

# A record must satisfy all three conditions: it has a "dF/F estimation"
# data process, belongs to one of the subjects above, and was processed by
# the named processor.
match_condition = {
    "processing.processing_pipeline.data_processes": {
        "$elemMatch": {
            "name": "dF/F estimation",
            # "parameters.long_window": True
        }
    },
    "subject.subject_id": {"$in": subject_ids},
    "processing.processing_pipeline.processor_full_name": "Jinho Kim",
}

# Initialize the client
docdb_api_client = MetadataDbClient(
    host=API_GATEWAY_HOST,
    database=DATABASE,
    collection=COLLECTION,
)

# Two-stage pipeline: filter the records, then keep only the asset name and
# the name of the input data it was derived from.
# Alternative projection stages kept for reference:
# {'$project': {'name': 1, 'data_processes': {'$filter': {'input': '$processing.processing_pipeline.data_processes', 'as': 'process', 'cond': {'$eq': ['$$process.name', 'dF/F estimation']}}}}},
# {'$project': {'name': 1, 'output_location': {'$arrayElemAt': ['$data_processes.output_location', 0]}, 'long_window': {'$arrayElemAt': ['$data_processes.parameters.long_window', 0]}}},
# {'$match': {'long_window': {'$exists': True}}}
agg_pipeline = [
    {'$match': match_condition},
    {
        '$project': {
            'name': 1,
            'input_data_name': '$data_description.input_data_name',
        }
    },
]

# Retrieve records
results = docdb_api_client.aggregate_docdb_records(agg_pipeline)

In [53]:
# Preview only the first few records; displaying the full list floods the
# notebook output.
results[:5]

[{'_id': 'd113c13d-1513-4266-9db6-a1ebfd050608',
  'name': 'multiplane-ophys_755252_2024-12-16_11-32-36_processed_2025-09-19_11-41-18',
  'input_data_name': 'multiplane-ophys_755252_2024-12-16_11-32-36'},
 {'_id': '46609520-8b15-48e1-b210-1630ff6e35ed',
  'name': 'multiplane-ophys_755252_2024-12-19_11-34-11_processed_2025-09-19_12-26-36',
  'input_data_name': 'multiplane-ophys_755252_2024-12-19_11-34-11'},
 {'_id': 'bdea2d4a-5e73-4e5d-a276-8e94daabc94b',
  'name': 'multiplane-ophys_755252_2024-12-17_08-56-37_processed_2025-09-19_12-14-39',
  'input_data_name': 'multiplane-ophys_755252_2024-12-17_08-56-37'},
 {'_id': '315c6f9e-6ef4-43bc-bfdd-c38b2e534ac2',
  'name': 'multiplane-ophys_755252_2024-11-12_09-43-51_processed_2025-09-05_06-06-29',
  'input_data_name': 'multiplane-ophys_755252_2024-11-12_09-43-51'},
 {'_id': 'ee1a88cb-bcae-4e22-af38-6ec407130380',
  'name': 'multiplane-ophys_755252_2024-12-18_11-24-13_processed_2025-09-05_14-20-54',
  'input_data_name': 'multiplane-ophys_75525

In [54]:
# Show the aggregation results as a table (one row per returned record).
pd.DataFrame(results)

Unnamed: 0,_id,name,input_data_name
0,d113c13d-1513-4266-9db6-a1ebfd050608,multiplane-ophys_755252_2024-12-16_11-32-36_pr...,multiplane-ophys_755252_2024-12-16_11-32-36
1,46609520-8b15-48e1-b210-1630ff6e35ed,multiplane-ophys_755252_2024-12-19_11-34-11_pr...,multiplane-ophys_755252_2024-12-19_11-34-11
2,bdea2d4a-5e73-4e5d-a276-8e94daabc94b,multiplane-ophys_755252_2024-12-17_08-56-37_pr...,multiplane-ophys_755252_2024-12-17_08-56-37
3,315c6f9e-6ef4-43bc-bfdd-c38b2e534ac2,multiplane-ophys_755252_2024-11-12_09-43-51_pr...,multiplane-ophys_755252_2024-11-12_09-43-51
4,ee1a88cb-bcae-4e22-af38-6ec407130380,multiplane-ophys_755252_2024-12-18_11-24-13_pr...,multiplane-ophys_755252_2024-12-18_11-24-13
...,...,...,...
188,fc1304f8-ca87-42f7-a6dc-c28d41c73525,multiplane-ophys_797371_2025-07-02_12-06-16_pr...,multiplane-ophys_797371_2025-07-02_12-06-16
189,1fb1c85a-cbcd-4b20-b2ca-0556961890d3,multiplane-ophys_797371_2025-07-11_09-10-19_pr...,multiplane-ophys_797371_2025-07-11_09-10-19
190,479689db-178a-44a4-9b3b-af2f80af48aa,multiplane-ophys_797371_2025-06-30_12-13-32_pr...,multiplane-ophys_797371_2025-06-30_12-13-32
191,b24ad851-e5b9-458e-955e-b441c283c599,multiplane-ophys_797371_2025-07-01_09-31-17_pr...,multiplane-ophys_797371_2025-07-01_09-31-17


In [55]:
# convert results to df and save as csv
# Convert the aggregation results to a DataFrame and write them to CSV.
save_dir = Path('/root/capsule/scratch')
# to_csv does not create missing parent directories, so make sure it exists.
save_dir.mkdir(parents=True, exist_ok=True)
save_fn = save_dir / 'jinho_ophys_sessions.csv'
df = pd.DataFrame(results)
df.to_csv(save_fn, index=False)

In [41]:
# Display the frame that was written to CSV.
# NOTE(review): the output saved under this cell comes from an earlier
# execution (In [41]) and shows different rows/columns than the frame built
# above - re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,_id,name
0,d113c13d-1513-4266-9db6-a1ebfd050608,multiplane-ophys_755252_2024-12-16_11-32-36_pr...
1,46609520-8b15-48e1-b210-1630ff6e35ed,multiplane-ophys_755252_2024-12-19_11-34-11_pr...
2,bdea2d4a-5e73-4e5d-a276-8e94daabc94b,multiplane-ophys_755252_2024-12-17_08-56-37_pr...
3,40c924b5-fa89-4edc-b376-25bfbbd4357c,multiplane-ophys_755252_2024-12-05_11-34-40_ct...
4,b79c300d-c1bc-403d-b283-de95f97ad7fc,multiplane-ophys_755252_2024-12-20_11-11-23_ct...
...,...,...
522,4a72a319-88ce-4a91-ab2c-eea09361c57a,multiplane-ophys_797371_2025-08-04_09-37-46_ct...
523,b2aa4200-b340-4742-b3bc-9c2ff3011bbe,multiplane-ophys_797371_2025-07-02_12-06-16_ct...
524,662ab58e-3727-4351-918c-69a71b5d4abe,multiplane-ophys_797371_2025-07-10_10-00-18_ct...
525,bf2f2e33-41de-4ef8-a87b-aa0e766b1c06,multiplane-ophys_797371_2025-07-31_09-36-34_ct...


In [None]:
subject_id = 755252
max_retry = 100
# Upper bound for a single backoff sleep; without it 2 ** retry_count grows
# without limit as retries accumulate.
max_backoff_seconds = 60

docdb_api_client = MetadataDbClient(
        host=API_GATEWAY_HOST,
        database=DATABASE,
        collection=COLLECTION,
    )

# The query below matches subject IDs as strings.
subject_id = str(subject_id)
query = {"subject.subject_id": subject_id}

# Fetch the names of all records for this subject, retrying on failure.
last_error = None
for retry_count in range(max_retry):
    try:
        subject_response = docdb_api_client.retrieve_docdb_records(
            filter_query=query,
            projection={"name": 1}
        )
        break
    except Exception as e:
        last_error = e
        logging.warning(f"Retry {retry_count+1}/{max_retry} after error: {e}")
        # Exponential backoff, capped
        time.sleep(min(2 ** retry_count, max_backoff_seconds))
else:
    # All attempts failed: stop here rather than leaving `subject_response`
    # undefined for the lines below.
    raise RuntimeError(
        f"DocDB query for subject {subject_id} failed after {max_retry} attempts"
    ) from last_error
all_names = [item['name'] for item in subject_response]

# Raw sessions are the names that consist of exactly
# multiplane-ophys_<subject>_<date>_<time>; fullmatch rejects names that
# carry any extra suffix.
raw_data_pattern = f'multiplane-ophys_{subject_id}_{DATE_FORMAT}_{TIME_FORMAT}'
raw_data_names = [name for name in all_names if re.fullmatch(raw_data_pattern, name)]

In [None]:
# Peek at the first returned record to check which fields it carries.
subject_response[0]

{'_id': 'd5642233-7454-4615-b55d-8d7069789b84',
 'name': 'multiplane-ophys_755252_2024-12-05_11-34-40_processed_2024-12-07_20-15-05'}