In [1]:
from codeocean import CodeOcean
from codeocean.data_asset import DataAssetAttachParams
import os
from aind_dynamic_foraging_data_utils.code_ocean_utils import get_assets, attach_data, add_data_asset_path 
import re


In [3]:
# --- Inputs you edit ---
session_input = "behavior_751181_2025-02-27_11-24-44"            # full or prefix; e.g. "..._11-57-33" or without time
modifier = None                                          # e.g. "videoprocessed" or None
processed = False                                        # False to include raw names like "..._videoprocessed"
token_env = "API_SECRET"                                 # your CO token env var


In [4]:
import re
from typing import Optional

def _session_prefix(s: str) -> str:
    return re.sub(r'_\d{2}-\d{2}-\d{2}$', '', s)

def _name_regex(session_input: str, modifier: Optional[str]) -> str:
    """
    Matches:
      ^<prefix>(_<HH-MM-SS>)?(_<modifier>)?(_.*)?$
    Works for both exact session and prefix, with or without extra suffixes.
    """
    pref = _session_prefix(session_input)
    time_opt = r'(?:_\d{2}-\d{2}-\d{2})?'          # optional time
    mod_opt  = f'(?:_{re.escape(modifier)})?' if modifier else ''
    tail_opt = r'(?:_.*)?'                         # optional trailing stuff
    return f'^{re.escape(pref)}{time_opt}{mod_opt}{tail_opt}$'

# Then continue with:
name_re = _name_regex(session_input, modifier)

df = get_assets(
    subjects=['751181'],                   
    processed=processed,           
    modality=["behavior"],
    extra_filter={"name": {"$regex": name_re}},
)

if df is None or df.empty:
    asset_ids = []
else:
    df = df[df["name"].str.match(name_re, na=False)]
    asset_ids = [x for x in df["code_ocean_asset_id"].tolist() if x]

asset_ids


HTTPError: 503 Server Error: Service Unavailable for url: https://api.allenneuraldynamics.org/v1/metadata_index/data_assets/find?limit=0&skip=0&filter=%7B%22name%22%3A+%7B%22%24regex%22%3A+%22%5Ebehavior_751181_2025%5C%5C-02%5C%5C-27%28%3F%3A_%5C%5Cd%7B2%7D-%5C%5Cd%7B2%7D-%5C%5Cd%7B2%7D%29%3F%28%3F%3A_.%2A%29%3F%24%22%7D%2C+%22session.session_type%22%3A+%7B%22%24regex%22%3A+%22%5E%28Uncoupled%7CCoupled%29%28+Without%29%3F+Baiting%22%7D%2C+%22%24and%22%3A+%5B%7B%22data_description.modality.abbreviation%22%3A+%7B%22%24regex%22%3A+%22behavior%22%7D%7D%5D%7D

In [6]:
asset_ids

['75f7920a-17c9-4e0c-8259-aee69e0544b2',
 '2efd0064-8eb1-4124-8232-37ff7c7b96e1',
 'd4f4b00a-6603-41ef-99e1-7277e844bca5',
 'c22f4d4e-12aa-4ad1-82ed-1301c279c270',
 '6a851057-6523-446c-8108-e57924bdca92',
 'e740d882-8503-4a30-a764-7340aa250b73']

In [8]:
df = get_assets(
    subjects=[],                   
    processed=processed,           
    modality=["behavior"],
    extra_filter={"name": {"$regex": name_re}},
)

if df is None or df.empty:
    asset_ids2 = []
else:
    df = df[df["name"].str.match(name_re, na=False)]
    asset_ids2 = [x for x in df["code_ocean_asset_id"].tolist() if x]

asset_ids2

Query will be slow without explicit subject ids


['75f7920a-17c9-4e0c-8259-aee69e0544b2',
 '2efd0064-8eb1-4124-8232-37ff7c7b96e1',
 'd4f4b00a-6603-41ef-99e1-7277e844bca5',
 'c22f4d4e-12aa-4ad1-82ed-1301c279c270',
 '6a851057-6523-446c-8108-e57924bdca92',
 'e740d882-8503-4a30-a764-7340aa250b73']

In [9]:
processed

True

In [2]:
import re
import pandas as pd
from typing import Optional, List
from aind_data_access_api.document_db import MetadataDbClient

def get_assets_by_session(
    session_input: str,
    modifier: Optional[str] = None,
    modality: List[str] = ["behavior"],
    subjects: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Simplified: get *all* data assets whose name matches
    session prefix (with or without _HH-MM-SS) and optional _modifier.
    Returns full list of matches (no deduplication).
    """

    # --- Helpers ---
    def extract_subject_id(session_input: str) -> Optional[str]:
        m = re.match(r"^behavior_(\d+)_\d{4}-\d{2}-\d{2}", session_input)
        return m.group(1) if m else None

    def session_prefix(s: str) -> str:
        return re.sub(r'_\d{2}-\d{2}-\d{2}$', '', s)

    def name_regex(session_input: str, modifier: Optional[str]) -> str:
        pref = session_prefix(session_input)
        time_opt = r'(?:_\d{2}-\d{2}-\d{2})?'           # optional time
        mod_opt  = f'(?:_{re.escape(modifier)})' if modifier else ''
        tail_opt = r'(?:_.*)?'                          # allow trailing stuff (e.g., hashes)
        return f'^{re.escape(pref)}{time_opt}{mod_opt}{tail_opt}$'

    # --- Build query ---
    name_re = name_regex(session_input, modifier)

    subj = subjects or [extract_subject_id(session_input)] if extract_subject_id(session_input) else []
    subject_filter = {
        "name": {"$regex": "^behavior_(" + "|".join(subj) + r")_.*$"}
    } if subj else {}

    modality_filter = {"$and": [{"data_description.modality.abbreviation": {"$regex": m}} for m in modality]} if modality else {}

    extra_filter = {"name": {"$regex": name_re}}

    # --- Query docDB ---
    client = MetadataDbClient(
        host="api.allenneuraldynamics.org",
        database="metadata_index",
        collection="data_assets"
    )
    df = pd.DataFrame(
        client.retrieve_docdb_records(
            filter_query={**subject_filter, **modality_filter, **extra_filter},
            projection={"name": 1, "external_links": 1, "session_name": 1, "subject.subject_id": 1},
        )
    )

    if df.empty:
        return df

    # --- Add Code Ocean asset IDs ---
    def co_id(links):
        if isinstance(links, dict) and "Code Ocean" in links:
            vals = links["Code Ocean"]
            if isinstance(vals, list) and vals:
                return vals[0]
            elif isinstance(vals, str):
                return vals
        return ""

    df["code_ocean_asset_id"] = df["external_links"].apply(co_id)
    df = df[df["name"].str.match(name_re, na=False)]
    return df.reset_index(drop=True)


In [4]:
df = get_assets_by_session("behavior_751181_2025-02-27_11-24-44",modality=[])


HTTPError: 503 Server Error: Service Unavailable for url: https://api.allenneuraldynamics.org/v1/metadata_index/data_assets/find?limit=0&skip=0&filter=%7B%22name%22%3A+%7B%22%24regex%22%3A+%22%5Ebehavior_751181_2025%5C%5C-02%5C%5C-27%28%3F%3A_%5C%5Cd%7B2%7D-%5C%5Cd%7B2%7D-%5C%5Cd%7B2%7D%29%3F%28%3F%3A_.%2A%29%3F%24%22%7D%7D&projection=%7B%22name%22%3A+1%2C+%22external_links%22%3A+1%2C+%22session_name%22%3A+1%2C+%22subject.subject_id%22%3A+1%7D

In [None]:
print(df['name'].tolist())


['behavior_758017_2025-02-04_11-57-33_curated-ZhixiaoSu_2025-02-19_16-51-34', 'behavior_758017_2025-02-04_11-57-33_videoprocessed_2025-07-17', 'behavior_758017_2025-02-04_11-57-33', 'behavior_758017_2025-02-04_11-57-33_sorted_2025-02-05_03-42-01', 'behavior_758017_2025-02-04_11-57-33_sorted_opto_bandpass_2025-02-18_05-10-02', 'behavior_758017_2025-02-04_11-57-33_sorted-opto_2025-02-06_01-17-42', 'behavior_758017_2025-02-04_11-57-33_sorted_curated_2025-02-25_12-16-48']


In [8]:
df

Unnamed: 0,_id,external_links,name,subject,code_ocean_asset_id
0,659c847f-fc80-41da-8c21-af87ba8a5ac6,{'Code Ocean': ['f11b62e9-9ecb-441b-97ba-a7daa...,behavior_758017_2025-02-04_11-57-33_curated-Zh...,,f11b62e9-9ecb-441b-97ba-a7daa92e8060
1,2716f46e-3ea6-4b6e-965c-b8f6d3a975ac,{'Code Ocean': ['e740d882-8503-4a30-a764-7340a...,behavior_758017_2025-02-04_11-57-33_videoproce...,{'subject_id': '758017'},e740d882-8503-4a30-a764-7340aa250b73
2,7318a879-6aa9-48a1-bae7-73e3e1e2eb0a,{'Code Ocean': ['75f7920a-17c9-4e0c-8259-aee69...,behavior_758017_2025-02-04_11-57-33,{'subject_id': '758017'},75f7920a-17c9-4e0c-8259-aee69e0544b2
3,10a0d897-e2ea-4945-a656-c3b900f283d1,{'Code Ocean': ['d4f4b00a-6603-41ef-99e1-7277e...,behavior_758017_2025-02-04_11-57-33_sorted_202...,{'subject_id': '758017'},d4f4b00a-6603-41ef-99e1-7277e844bca5
4,8147f8a8-c42c-4bcf-a88f-1d1632e7309e,{'Code Ocean': ['6a851057-6523-446c-8108-e5792...,behavior_758017_2025-02-04_11-57-33_sorted_opt...,{'subject_id': '758017'},6a851057-6523-446c-8108-e57924bdca92
5,9f9d8fff-776e-4e29-a5bb-12c7555388fd,{'Code Ocean': ['2efd0064-8eb1-4124-8232-37ff7...,behavior_758017_2025-02-04_11-57-33_sorted-opt...,{'subject_id': '758017'},2efd0064-8eb1-4124-8232-37ff7c7b96e1
6,9413c97d-a45b-4e76-91af-0c86eab66ba5,{'Code Ocean': ['c22f4d4e-12aa-4ad1-82ed-1301c...,behavior_758017_2025-02-04_11-57-33_sorted_cur...,{'subject_id': '758017'},c22f4d4e-12aa-4ad1-82ed-1301c279c270


In [15]:
df2 = get_assets_by_session(
    session_input="behavior_758017_2025-02-04_11-57-33",
    modifier="videoprocessed",     # ← this restricts results to names containing "_videoprocessed"
    modality=[],                   # no modality filtering
)


In [16]:
df2.name.tolist()

['behavior_758017_2025-02-04_11-57-33_videoprocessed_2025-07-17']

In [17]:
import pandas as pd
from aind_data_access_api.document_db import MetadataDbClient

exact_name = "behavior_758017_2025-02-04_11-57-33_videoprocessed_2025-10-24_21-43-19"

client = MetadataDbClient(
    host="api.allenneuraldynamics.org",
    database="metadata_index",
    collection="data_assets",
)

df_by_name = pd.DataFrame(client.retrieve_docdb_records(
    filter_query={"name": exact_name},   # exact string match
    projection=None                      # get full doc (avoid losing fields)
))

print(len(df_by_name), "match(es) for exact name")
df_by_name[["name", "_id", "external_links"]]


0 match(es) for exact name


KeyError: "None of [Index(['name', '_id', 'external_links'], dtype='object')] are in the [columns]"

In [22]:
asset_id = "e740d882-8503-4a30-a764-7340aa250b73"

df_by_id = pd.DataFrame(client.retrieve_docdb_records(
    filter_query={
        "$or": [
            {"external_links.Code Ocean": asset_id},               # string form
            {"external_links.Code Ocean": {"$in": [asset_id]}},    # list form
        ]
    },
    projection=None
))

print(len(df_by_id), "match(es) for asset_id")
df_by_id[["name", "_id", "external_links"]]


1 match(es) for asset_id


Unnamed: 0,name,_id,external_links
0,behavior_758017_2025-02-04_11-57-33_videoproce...,2716f46e-3ea6-4b6e-965c-b8f6d3a975ac,{'Code Ocean': ['e740d882-8503-4a30-a764-7340a...


In [2]:
from typing import Optional, List
import os
import warnings
import pandas as pd
from aind_data_access_api.document_db import MetadataDbClient
from codeocean import CodeOcean
from codeocean.data_asset import DataAssetAttachParams


def get_assets_by_session(
    session_id: str,
    modifier: Optional[str] = None,
    modality: Optional[List[str]] = None,
    processed: Optional[bool] = None,
):
    """
    Return ALL data assets in docDB that belong to the given session_id.
    """
    modality = modality or []

    client = MetadataDbClient(
        host="api.allenneuraldynamics.org",
        database="metadata_index",
        collection="data_assets",
    )

    name_regex = rf"^{session_id}_.+"
    name_filter = {"name": {"$regex": name_regex}}

    if processed is True:
        proc_filter = {"name": {"$regex": r"_processed_"}}
    elif processed is False:
        proc_filter = {"name": {"$not": {"$regex": r"_processed_"}}}
    else:
        proc_filter = {}

    if modifier:
        mod_filter = {"name": {"$regex": modifier}}
    else:
        mod_filter = {}

    if modality:
        modality_filter = {
            "$or": [{"data_description.modality.abbreviation": {"$regex": m}} for m in modality]
        }
    else:
        modality_filter = {}

    filter_query = {**name_filter, **proc_filter, **modality_filter, **mod_filter}

    projection = {
        "name": 1,
        "_id": 1,
        "session": 1,
        "session_name": 1,
        "external_links": 1,
        "data_description": 1,
    }

    results = pd.DataFrame(
        client.retrieve_docdb_records(filter_query=filter_query, projection=projection)
    )

    if results.empty:
        return results

    results["code_ocean_asset_id"] = results["external_links"].apply(
        lambda link: (link.get("Code Ocean", [""])[0] if isinstance(link, dict) and "Code Ocean" in link else "")
    )

    results = results.sort_values("name").reset_index(drop=True)
    return results


In [3]:
# 1) All assets for the session (raw + processed)
df_all = get_assets_by_session("behavior_751181_2025-02-27_11-24-44")

HTTPError: 503 Server Error: Service Unavailable for url: https://api.allenneuraldynamics.org/v1/metadata_index/data_assets/find?limit=0&skip=0&filter=%7B%22name%22%3A+%7B%22%24regex%22%3A+%22%5Ebehavior_751181_2025-02-27_11-24-44_.%2B%22%7D%7D&projection=%7B%22name%22%3A+1%2C+%22_id%22%3A+1%2C+%22session%22%3A+1%2C+%22session_name%22%3A+1%2C+%22external_links%22%3A+1%2C+%22data_description%22%3A+1%7D

In [5]:
from typing import Optional, List
import pandas as pd
from aind_dynamic_foraging_data_utils.code_ocean_utils import get_assets, get_subject_assets


def get_assets_by_session_via_original(
    session_id: str,
    modifier: str = "",                # e.g., "videoprocessed"
    processed: Optional[bool] = None,  # True -> only *_processed_..., False -> only raw, None -> both
    modality: Optional[List[str]] = None,  # e.g., ["behavior", "behavior-videos"]
    task: Optional[List[str]] = None,
    stage: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Wrapper that calls the ORIGINAL get_assets() from aind_dynamic_foraging_data_utils,
    but filters directly by session_id (e.g., 'behavior_751181_2025-02-27_11-24-44').
    """
    modality = modality or []
    task = task or []
    stage = stage or []

    # Build regex filter for session-based matching
    name_regex = rf"^{session_id}_.+"

    # Add processed/raw constraints
    if processed is True:
        name_regex = rf"{name_regex}.*_processed_.*"
    elif processed is False:
        name_regex = rf"{name_regex}(?!.*_processed_).*"

    # Add optional substring constraint
    if modifier:
        name_regex = rf"{name_regex}.*{modifier}.*"

    # Query using the original get_assets (with regex filter only)
    df = get_assets(
        subjects=[],  # leave empty for general query
        processed=True if processed is True else (False if processed is False else True),
        task=task,
        modality=modality,
        stage=stage,
        extra_filter={"name": {"$regex": name_regex}},
    )

    if df is None:
        return pd.DataFrame()

    return df.reset_index(drop=True)


In [6]:
df_all = get_assets_by_session_via_original("behavior_751181_2025-02-27_11-24-44")


Query will be slow without explicit subject ids


HTTPError: 503 Server Error: Service Unavailable for url: https://api.allenneuraldynamics.org/v1/metadata_index/data_assets/find?limit=0&skip=0&filter=%7B%22name%22%3A+%7B%22%24regex%22%3A+%22%5Ebehavior_751181_2025-02-27_11-24-44_.%2B%22%7D%2C+%22session.session_type%22%3A+%7B%22%24regex%22%3A+%22%5E%28Uncoupled%7CCoupled%29%28+Without%29%3F+Baiting%22%7D%7D&projection=%7B%22name%22%3A+1%2C+%22_id%22%3A+1%2C+%22session%22%3A+1%2C+%22session_name%22%3A+1%2C+%22external_links%22%3A+1%2C+%22subject.subject_id%22%3A+1%7D