In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

Our metadata is hosted in a DocDB database that we can access and query using the aind_data_access_api. The cell below creates a client that points to that database.

In [1]:
from aind_data_access_api.document_db import MetadataDbClient

# Connection settings for the metadata DocDB API.
API_GATEWAY_HOST = "api.allenneuraldynamics.org"
DATABASE = "metadata_index"
COLLECTION = "data_assets"

# Client used for the metadata query in this notebook.
client_settings = dict(
    host=API_GATEWAY_HOST,
    database=DATABASE,
    collection=COLLECTION,
)
docdb_api_client = MetadataDbClient(**client_settings)

# Show the resolved endpoint as a quick sanity check.
# NOTE(review): `_base_url` has a leading underscore, so it is presumably a
# private attribute of the client and may change between versions.
print(docdb_api_client._base_url)

https://api.allenneuraldynamics.org/v1/metadata_index/data_assets


MongoDB queries can be powerful, but they can also get convoluted. Here we are going to pull the metadata properties we think are most relevant and create a pandas dataframe.
Step one is to build an aggregation pipeline that selects the relevant records and projects the metadata fields we need:

In [3]:
# Stage 1 ($match): keep records of the project that have a 'Nanoject injection'
# procedure and an 'Overall tissue quality' QC evaluation.
match_stage = {
    'data_description.project_name': 'Thalamus in the middle',
    'procedures.subject_procedures.procedures.procedure_type': 'Nanoject injection',
    'quality_control.evaluations.name': 'Overall tissue quality',
}

# Sub-expression for the 'qc_evaluations' field: evaluations named
# 'Overall tissue quality' whose status compares equal to 'Pass'.
# NOTE(review): the helper functions in this notebook index 'metrics' and
# 'status_history' as lists ([0], [-1]); if so, the dotted path below resolves
# to nested arrays and the $eq against the scalar 'Pass' may never be true.
# 'qc_evaluations' is not read anywhere else in this notebook - confirm intent.
passing_tissue_evaluations = {
    '$filter': {
        'input': '$quality_control.evaluations',
        'as': 'evaluation',
        'cond': {
            '$and': [
                {'$eq': ['$$evaluation.name', 'Overall tissue quality']},
                {'$eq': ['$$evaluation.metrics.status_history.status', 'Pass']},
            ]
        },
    }
}

# Stage 2 ($project): the fields that are unpacked into the dataframe later on.
project_stage = {
    'qc_evaluations': passing_tissue_evaluations,
    'evaluations': '$quality_control.evaluations',
    'name': '$name',
    'data_level': '$data_description.data_level',
    'subject_id': '$data_description.subject_id',
    'genotype': '$subject.genotype',
    'sex': '$subject.sex',
    'date_of_birth': '$subject.date_of_birth',
    'date_of_surgery': '$procedures.subject_procedures.start_date',
    'procedures': '$procedures.subject_procedures',
}

aggregate = [
    {'$match': match_stage},
    {'$project': project_stage},
]

# Run the pipeline against the DocDB collection.
records = docdb_api_client.aggregate_docdb_records(pipeline=aggregate)

Now we will unpack that projection into a dataframe. First, we define a few helper functions that extract the values we need from each record.

In [4]:
# Lookup table: injected material name -> channel label used in the
# per-channel QC evaluation names.
virus_dict = {
    'AAVrg-Syn-H2B-Turquoise': '445',
    'AAVrg-Syn-H2B-EGFP': '488',
    'AAVrg-Syn-H2B-tdTomato': '561',
    'AAVrg-Syn-iCre': '488',
    'AAVrg-Syn-Flpo': '561',
    'CVS N2cdG-H2B-GFP': '488',
    'CVS N2cdG-H2B-tdTomato': '561',
}

def get_qc_channel(evaluations, channel):
    """Look up the per-channel QC result and injection coordinates.

    Parameters
    ----------
    evaluations : list of dict
        QC evaluation entries; each one is read through its 'name' and
        'metrics' keys.
    channel : str
        Channel label appended to the evaluation-name prefixes (e.g. '488').

    Returns
    -------
    tuple
        ``(qc_channel, ng_channel, coord)`` where

        * ``qc_channel`` is the latest status of the first metric of the
          'Cell detection in channel: <channel>' evaluation,
        * ``ng_channel`` is that metric's 'reference' entry,
        * ``coord`` is the value of the first metric of the
          'Histology coordinates channel: <channel>' evaluation.

        Any element whose evaluation is absent is ``None``.
    """
    # Start from None so a missing evaluation yields None instead of an
    # UnboundLocalError (the original fallback branch referenced names that
    # were never assigned when no cell-detection evaluation matched).
    qc_channel = ng_channel = coord = None
    detection_name = 'Cell detection in channel: ' + channel
    coordinates_name = 'Histology coordinates channel: ' + channel

    # `evaluation` rather than `eval`, to avoid shadowing the builtin.
    for evaluation in evaluations:
        if evaluation['name'] == detection_name:
            first_metric = evaluation['metrics'][0]
            # The last entry of the status history is the current status.
            qc_channel = first_metric['status_history'][-1]['status']
            ng_channel = first_metric['reference']
        if evaluation['name'] == coordinates_name:
            coord = evaluation['metrics'][0]['value']

    return (qc_channel, ng_channel, coord)

def get_qc_overall(evaluations):
    """Return the overall tissue-quality QC status and its reference link.

    Parameters
    ----------
    evaluations : list of dict
        QC evaluation entries; each one is read through its 'name' and
        'metrics' keys.

    Returns
    -------
    tuple
        ``(qc_tissue, ng_link)``: the latest status and the 'reference' entry
        of the first metric of the evaluation whose name contains
        "tissue quality". If several evaluations match, the last one wins;
        if none matches, both elements are ``None``.
    """
    # Defaults prevent an UnboundLocalError when no evaluation matches.
    qc_tissue = ng_link = None

    # `evaluation` rather than `eval`, to avoid shadowing the builtin.
    for evaluation in evaluations:
        if "tissue quality" in evaluation['name']:
            first_metric = evaluation['metrics'][0]
            # The last entry of the status history is the current status.
            qc_tissue = first_metric['status_history'][-1]['status']
            ng_link = first_metric['reference']

    return (qc_tissue, ng_link)

def int_to_exponential(number):
    """Format `number` in scientific notation with two decimals (e.g. '4.80e+13')."""
    formatted = f"{number:.2e}"
    return formatted

Now make the data frame

In [15]:
# Build one row per injected material: subject info, injection parameters and
# the QC results for the tissue and for the material's imaging channel.
injection_columns = ['name', 'subject_id', 'genotype', 'sex', 'virus', 'titer', 'ap', 'ml', 'dv',
                     'volume', 'age_days', 'days_to_perfusion', 'qc_tissue', 'ng_link',
                     'channel', 'qc_channel', 'ng_channel', 'inj_coordinates']

# Rows are collected in a list and turned into a frame once at the end, which
# avoids growing the dataframe row by row.
injection_rows = []
for record in records:
    name = record['name']
    subject_id = record['subject_id']
    genotype = record['genotype']
    sex = record['sex']
    dob = datetime.strptime(record['date_of_birth'], '%Y-%m-%d').date()

    # Overall tissue QC applies to every injection of this record.
    qc_tissue, ng_link = get_qc_overall(record['evaluations'])

    # Resolve the perfusion date first, so it neither depends on the order of
    # the procedures list nor leaks over from a previously processed record.
    perfusion_date = None
    for proc in record['procedures']:
        if proc['procedures'][0]['procedure_type'] == 'Perfusion':
            perfusion_date = datetime.strptime(proc['start_date'], '%Y-%m-%d').date()

    for proc in record['procedures']:
        if proc['procedures'][0]['procedure_type'] != "Nanoject injection":
            continue
        surgery_date = datetime.strptime(proc['start_date'], '%Y-%m-%d').date()
        age = (surgery_date - dob).days
        if perfusion_date is None:
            # No perfusion recorded for this subject.
            days_to_perfusion = np.nan
        else:
            days_to_perfusion = (perfusion_date - surgery_date).days

        for inj in proc['procedures']:
            for material in inj['injection_materials']:
                virus = material['name']
                # An unknown material raises KeyError here on purpose: it means
                # virus_dict needs a new entry.
                channel = virus_dict[virus]
                qc_channel, ng_channel, coord = get_qc_channel(record['evaluations'], channel)

                ap = float(inj['injection_coordinate_ap'])
                ml = float(inj['injection_coordinate_ml'])
                # Depth and volume are stored as lists; the first entry is used.
                dv = float(inj['injection_coordinate_depth'][0])
                volume = float(inj['injection_volume'][0])

                titer = np.nan
                if material['material_type'] == 'Virus':
                    try:
                        titer = int_to_exponential(int(material['titer']))
                    except (KeyError, TypeError, ValueError):
                        # Missing or non-numeric titer: keep NaN instead of
                        # silently reusing the previous row's value.
                        titer = np.nan

                injection_rows.append([name, subject_id, genotype, sex, virus, titer, ap, ml, dv,
                                       volume, age, days_to_perfusion, qc_tissue, ng_link,
                                       channel, qc_channel, ng_channel, coord])

df = pd.DataFrame(injection_rows, columns=injection_columns)


In [16]:
# Preview the first rows of the injection table.
df.head()

Unnamed: 0,name,subject_id,genotype,sex,virus,titer,ap,ml,dv,volume,age_days,days_to_perfusion,qc_tissue,ng_link,channel,qc_channel,ng_channel,inj_coordinates
0,SmartSPIM_678704_2023-06-20_20-49-52_stitched_...,678704,wt/wt,Male,AAVrg-Syn-H2B-Turquoise,48000000000000.0,2.8,1.8,1.0,50.0,45,28,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,445,Fail,,
1,SmartSPIM_678704_2023-06-20_20-49-52_stitched_...,678704,wt/wt,Male,AAVrg-Syn-H2B-tdTomato,51000000000000.0,2.4,1.8,0.8,50.0,45,28,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,561,Pending,,
2,SmartSPIM_678703_2023-06-20_17-18-27_stitched_...,678703,wt/wt,Male,AAVrg-Syn-H2B-Turquoise,48000000000000.0,1.6,0.2,2.0,50.0,45,28,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,445,Pending,,
3,SmartSPIM_678703_2023-06-20_17-18-27_stitched_...,678703,wt/wt,Male,AAVrg-Syn-H2B-tdTomato,51000000000000.0,2.0,0.6,0.6,50.0,45,28,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,561,Pending,,
4,SmartSPIM_678706_2023-06-28_16-43-04_stitched_...,678706,wt/wt,Female,AAVrg-Syn-H2B-Turquoise,48000000000000.0,2.8,1.0,1.2,50.0,45,28,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,445,Fail,,


Each row of this dataframe is one injection. It includes information about the subject (id, genotype, sex); the injection (virus, titer, targeted stereotactic coordinates, volume, age at injection); quality control of the overall tissue quality (assessed in the thalamus only); quality control of the cell counting in that injection's channel; the validated injection coordinates in CCF coordinates; and links to the relevant neuroglancer images.

We can limit this to only injections that pass both qc evaluations:

In [17]:
# Keep only injections whose tissue QC and channel QC are both 'Pass'.
tissue_passed = df['qc_tissue'] == 'Pass'
channel_passed = df['qc_channel'] == 'Pass'
df[tissue_passed & channel_passed].head()

Unnamed: 0,name,subject_id,genotype,sex,virus,titer,ap,ml,dv,volume,age_days,days_to_perfusion,qc_tissue,ng_link,channel,qc_channel,ng_channel,inj_coordinates
9,SmartSPIM_679519_2023-08-15_11-43-09_stitched_...,679519,Ai224(TICL-NLS-EGFP-ICF-NLS-dT)-hyg/wt,Male,AAVrg-Syn-Flpo,102000000000000.0,2.8,1.0,0.6,100.0,53,31,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,561,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,"{'AP': 115, 'ML': 275, 'DV': 114}"
10,SmartSPIM_689237_2023-08-30_19-04-37_stitched_...,689237,wt/wt,Female,AAVrg-Syn-H2B-Turquoise,296000000000000.0,1.6,0.2,1.0,50.0,52,21,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,445,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,"{'AP': 139, 'ML': 244, 'DV': 96}"
11,SmartSPIM_689237_2023-08-30_19-04-37_stitched_...,689237,wt/wt,Female,AAVrg-Syn-H2B-tdTomato,51000000000000.0,1.6,1.4,0.8,50.0,52,21,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,561,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,"{'AP': 140, 'ML': 295, 'DV': 86}"
12,SmartSPIM_689238_2023-08-31_01-56-52_stitched_...,689238,wt/wt,Female,AAVrg-Syn-H2B-Turquoise,296000000000000.0,2.0,0.6,0.6,50.0,52,28,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,445,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,"{'AP': 126, 'ML': 259, 'DV': 92}"
13,SmartSPIM_689238_2023-08-31_01-56-52_stitched_...,689238,wt/wt,Female,AAVrg-Syn-H2B-tdTomato,51000000000000.0,2.0,0.2,1.8,50.0,52,28,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,561,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,"{'AP': 122, 'ML': 237, 'DV': 138}"


Find unique genotype or virus values:

In [9]:
# Distinct genotype values present in the table.
df.genotype.unique()

array(['wt/wt', 'Ai224(TICL-NLS-EGFP-ICF-NLS-dT)-hyg/wt'], dtype=object)

In [10]:
# Distinct injected materials present in the table.
df.virus.unique()

array(['AAVrg-Syn-H2B-Turquoise', 'AAVrg-Syn-H2B-tdTomato',
       'AAVrg-Syn-iCre', 'AAVrg-Syn-Flpo', 'AAVrg-Syn-H2B-EGFP',
       'CVS N2cdG-H2B-GFP', 'CVS N2cdG-H2B-tdTomato'], dtype=object)

Find all rows with the 'Ai224...' genotype without having to type that long complicated name:

In [11]:
# Substring match on the genotype column. na=False treats rows with a missing
# genotype as non-matching; without it str.contains yields NaN for those rows
# and the boolean indexing fails.
df[df.genotype.str.contains('Ai224', na=False)].head()

Unnamed: 0,name,subject_id,genotype,sex,virus,titer,ap,ml,dv,volume,age_days,days_to_perfusion,qc_tissue,ng_link,channel,qc_channel,ng_channel,channel_ng_link,inj_coordinates
6,SmartSPIM_679518_2023-08-25_12-08-11_stitched_...,679518,Ai224(TICL-NLS-EGFP-ICF-NLS-dT)-hyg/wt,Male,AAVrg-Syn-iCre,75000000000000.0,1.6,0.2,1.0,50.0,60,28,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,488,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,https://aind-neuroglancer-sauujisjxq-uw.a.run....,
7,SmartSPIM_679518_2023-08-25_12-08-11_stitched_...,679518,Ai224(TICL-NLS-EGFP-ICF-NLS-dT)-hyg/wt,Male,AAVrg-Syn-Flpo,102000000000000.0,1.6,0.6,0.8,50.0,60,28,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,561,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,https://aind-neuroglancer-sauujisjxq-uw.a.run....,
8,SmartSPIM_679519_2023-08-15_11-43-09_stitched_...,679519,Ai224(TICL-NLS-EGFP-ICF-NLS-dT)-hyg/wt,Male,AAVrg-Syn-iCre,75000000000000.0,2.8,0.2,0.6,100.0,53,31,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,488,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,https://aind-neuroglancer-sauujisjxq-uw.a.run....,
9,SmartSPIM_679519_2023-08-15_11-43-09_stitched_...,679519,Ai224(TICL-NLS-EGFP-ICF-NLS-dT)-hyg/wt,Male,AAVrg-Syn-Flpo,102000000000000.0,2.8,1.0,0.6,100.0,53,31,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,561,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,https://aind-neuroglancer-sauujisjxq-uw.a.run....,"{'AP': 115, 'ML': 275, 'DV': 114}"
15,SmartSPIM_679521_2023-08-15_17-08-13_stitched_...,679521,Ai224(TICL-NLS-EGFP-ICF-NLS-dT)-hyg/wt,Female,AAVrg-Syn-iCre,75000000000000.0,2.8,0.2,0.6,200.0,53,31,Pass,https://aind-neuroglancer-sauujisjxq-uw.a.run....,488,Fail,https://aind-neuroglancer-sauujisjxq-uw.a.run....,https://aind-neuroglancer-sauujisjxq-uw.a.run....,


Save this dataframe so we can load it into other notebooks in this capsule.

In [18]:
# Persist the injection table for the other notebooks in this capsule.
# NOTE(review): to_csv writes the row index by default, so readers get an extra
# unnamed first column unless they load the file with index_col=0 - confirm
# that downstream notebooks expect this before changing it.
# NOTE(review): the absolute '/scratch' path presumably only exists inside the
# capsule environment.
df.to_csv('/scratch/metadata.csv')