##  Getting started with Cell Types Database

Using data from the Allen Cell Types Database requires downloading multiple files and metadata from the API. 

Use the [CellTypesCache](http://alleninstitute.github.io/AllenSDK/allensdk.core.html#allensdk.core.cell_types_cache.CellTypesCache) class to fetch data from the API and to help organize the metadata and the downloaded files.

In [2]:
from allensdk.core.cell_types_cache import CellTypesCache

# !! update this path so that it points to your external hard drive !!
ctc = CellTypesCache(manifest_file='D:/cell_types/manifest.json')

# get metadata for all the cells
cells_sdk = ctc.get_cells()

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print("There are %d cells in the cache" % len(cells_sdk))

There are 248 cells in the cache


## Munge and process data to create a pandas dataframe for downstream analysis





In [3]:
import re

# helper function to break up structure into regions and layer
def split_region_layer(cell) :
    """Split a cell's structure acronym into its region and layer parts.

    The acronym is read from cell['structure']['acronym'] and cut at the
    first digit: everything before it is the region, the digit and
    everything after it is the layer (e.g. 'VISp2/3' -> 'VISp' and '2/3').

    An acronym that contains no digit is returned entirely as the region
    with an empty layer, rather than being reported as a layer with an
    empty region.

    Returns a dict with the keys 'region' and 'layer'.
    """
    acronym = cell['structure']['acronym']
    # position of the first digit; falls back to the end of the string so that
    # a digit-free acronym ends up in 'region' instead of 'layer'
    split_at = next(
        (idx for idx, ch in enumerate(acronym) if ch.isdigit()),
        len(acronym)
    )
    return {'region': acronym[:split_at], 'layer': acronym[split_at:]}
    
# show the helper's result for the first cell; the call form of print works on
# both Python 2 and Python 3
print(split_region_layer(cells_sdk[0]))

{'region': u'VISp', 'layer': u'4'}


In [4]:
# helper function to get the mouse line
def transgenic_drivers( cell ) :
    """Summarise the donor's driver lines as a single 'mouse_line' entry.

    Walks cell['donor']['transgenic_lines'], keeps the entries whose
    'transgenic_line_type_name' is 'driver', and joins their names with ';'
    in the order they appear.

    Returns a dict with the single key 'mouse_line'; the value is an empty
    string when no entry is a driver.
    """
    driver_names = []
    for line in cell['donor']['transgenic_lines'] :
        if line['transgenic_line_type_name'] == 'driver' :
            driver_names.append( str(line['name']) )
    return {'mouse_line': ";".join( driver_names )}

# show the helper's result for the first cell; the call form of print works on
# both Python 2 and Python 3
print(transgenic_drivers(cells_sdk[0]))

{'mouse_line': 'Sst-IRES-Cre'}


In [5]:
# Names of the metadata fields kept for every cell, grouped by kind.
# The groups are concatenated (together with the nested-record keys) to form
# the dataframe column list.

# field that identifies a cell
index_key = [
    'id',
]

# sample description: mouse line, anatomical region/layer, hemisphere
sample_keys = [
    'mouse_line',
    'region',
    'layer',
    'hemisphere',
]

# morphology annotations
annotation_keys = [
    'dendrite_type',
    'apical',
]

# flags telling which data modalities exist for the cell
modality_keys = [
    'has_morphology',
    'has_reconstruction',
]

def filter_list(keys,exclude_keys) :
    """Return the items of `keys` that do not appear in `exclude_keys`.

    The original order of `keys` is preserved and duplicates are kept.
    """
    kept = []
    for key in keys :
        if key in exclude_keys :
            continue
        kept.append( key )
    return kept

def clean_keys(keys) :
    """Return a new list holding str() of every item in `keys`, in order."""
    return list( map( str, keys ) )

# The field names of each nested record type are read from one example entry;
# this presumes every entry of that type carries the same fields.

# cell soma location fields (identifiers dropped), taken from the first cell
exclude_csl_keys = ['id','specimen_id']
csl_keys = clean_keys(filter_list( cells_sdk[0]['cell_soma_locations'][0].keys(), exclude_csl_keys ))

# ephys feature fields (identifiers dropped), taken from the first cell
exclude_ef_keys = ['id','specimen_id']
ef_keys = clean_keys(filter_list( cells_sdk[0]['ephys_features'][0].keys(), exclude_ef_keys ))

# neuron reconstruction fields (identifiers and tags dropped), taken from the
# first cell that is flagged as having a reconstruction
c = [ x for x in cells_sdk if x['has_reconstruction']][0]
exclude_nr_keys = ['id','specimen_id','tags']
nr_keys = clean_keys(filter_list( c['neuron_reconstructions'][0].keys(), exclude_nr_keys ))

# complete, ordered list of dataframe columns: concatenation of all key groups
columns = sum(
    [index_key, sample_keys, annotation_keys, modality_keys, csl_keys, ef_keys, nr_keys],
    []
)


In [6]:
def filter_dictonary_by_keys( d, k ) :
    """Return a new dict holding only the entries of `d` whose keys are in `k`.

    Every key listed in `k` must exist in `d`; a missing key raises KeyError.
    """
    subset = {}
    for key in k :
        subset[key] = d[key]
    return subset
    
def fetch_data( c ) :
    """Flatten one cell's nested metadata into a single flat record dict.

    The record starts from the cell's top-level fields (index, annotation
    and modality keys plus 'hemisphere') and is then extended, in this
    order, with: region/layer, mouse line, the first cell soma location,
    the first ephys feature set and - only when the cell is flagged
    'has_reconstruction' - the first neuron reconstruction. Because the
    parts are merged in order, a later part wins if two parts share a key.

    Cells without a reconstruction simply lack the reconstruction fields.
    """
    top_level_keys = index_key + annotation_keys + modality_keys + ['hemisphere']
    record = filter_dictonary_by_keys( c, top_level_keys )

    # parts to merge into the record, in merge order
    parts = [
        split_region_layer(c),
        transgenic_drivers(c),
        filter_dictonary_by_keys( c['cell_soma_locations'][0], csl_keys ),
        filter_dictonary_by_keys( c['ephys_features'][0], ef_keys ),
    ]
    if c['has_reconstruction'] :
        parts.append( filter_dictonary_by_keys( c['neuron_reconstructions'][0], nr_keys ) )

    for part in parts :
        record.update( part )
    return record

# build one flat record per cell
cells_records = [fetch_data(c) for c in cells_sdk]

# spot-check the first record; the call form of print works on both
# Python 2 and Python 3
print(cells_records[0]['mouse_line'])

Sst-IRES-Cre


In [7]:
# create pandas dataframe
import pandas as pd

# One row per record, indexed by the cell 'id'. `columns` names the record
# fields to use as columns; records without reconstruction fields show up
# with missing values in those columns.
cells_df = pd.DataFrame.from_records(
    cells_records,
    columns=columns,
    index='id'
)

# preview the first rows
cells_df.head()

Unnamed: 0_level_0,mouse_line,region,layer,hemisphere,dendrite_type,apical,has_morphology,has_reconstruction,normalized_depth,reference_space_id,...,number_stems,number_branches,average_fragmentation,average_contraction,average_bifurcation_angle_remote,hausdorff_dimension,total_surface,max_branch_order,soma_surface,overall_height
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
324257146,Sst-IRES-Cre,VISp,4,left,aspiny,,True,True,0.410398,9,...,6.0,30.0,34.2333,0.776329,77.4101,1.12107,1416.64,6.0,111.175,215.758
469622566,Scnn1a-Tg3-Cre,VISp,5,right,spiny,truncated,True,False,0.423992,9,...,,,,,,,,,,
328876201,Sst-IRES-Cre,VISp,5,left,aspiny,,False,False,0.510874,9,...,,,,,,,,,,
466431949,Scnn1a-Tg3-Cre,VISl,4,left,spiny,truncated,True,False,0.46447,9,...,,,,,,,,,,
396903227,Scnn1a-Tg3-Cre,VISp,5,right,spiny,truncated,False,False,0.497223,9,...,,,,,,,,,,


In [49]:
# list the name of every column in the dataframe as an array
cells_df.columns.values

array(['mouse_line', 'region', 'layer', 'hemisphere', 'dendrite_type',
       'apical', 'has_morphology', 'has_reconstruction',
       'normalized_depth', 'reference_space_id', 'y', 'x', 'z', 'tau',
       'upstroke_downstroke_ratio_short_square', 'thumbnail_sweep_id',
       'has_delay', 'threshold_v_ramp', 'peak_v_short_square', 'avg_isi',
       'sag', 'blowout_voltage', 'trough_t_ramp', 'slow_trough_v_ramp',
       'slow_trough_v_long_square', 'has_pause', 'electrode_0_pa',
       'trough_v_long_square', 'input_resistance_mohm', 'latency',
       'fast_trough_v_ramp', 'trough_t_long_square',
       'threshold_t_long_square', 'rheobase_sweep_id', 'peak_t_ramp',
       'threshold_t_short_square', 'has_burst',
       'slow_trough_t_long_square', 'fast_trough_t_long_square', 'ri',
       'threshold_v_short_square', 'upstroke_downstroke_ratio_ramp',
       'vm_for_sag', 'threshold_i_long_square',
       'initial_access_resistance', 'peak_t_long_square',
       'threshold_i_short_square'

## Take a look at some dataset statistics using the dataframe

In [21]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# how many cells belong to each mouse line
cells_df.groupby(by=['mouse_line']).size()

mouse_line
Gad2-IRES-Cre      12
Htr3a-Cre_NO152    15
Nr5a1-Cre          30
Ntsr1-Cre           8
Pvalb-IRES-Cre     51
Rbp4-Cre_KL100     12
Rorb-IRES2-Cre     31
Scnn1a-Tg2-Cre     17
Scnn1a-Tg3-Cre     36
Sst-IRES-Cre       36
dtype: int64

In [27]:
# how many cells fall in each (mouse line, layer) combination
cells_df.groupby(by=['mouse_line', 'layer']).size()

mouse_line       layer
Gad2-IRES-Cre    4         3
                 5         8
                 6a        1
Htr3a-Cre_NO152  2/3       9
                 4         4
                 5         2
Nr5a1-Cre        2/3       3
                 4        24
                 5         2
                 6a        1
Ntsr1-Cre        5         1
                 6a        7
Pvalb-IRES-Cre   2/3       5
                 4        10
                 5        31
                 6a        5
Rbp4-Cre_KL100   4         1
                 5         9
                 6a        2
Rorb-IRES2-Cre   2/3       2
                 4        17
                 5        10
                 6a        2
Scnn1a-Tg2-Cre   4         9
                 5         8
Scnn1a-Tg3-Cre   4        25
                 5        11
Sst-IRES-Cre     2/3       5
                 4         2
                 5        22
                 6a        7
dtype: int64

In [48]:
# how many cells fall in each (mouse line, dendrite type) combination
cells_df.groupby(by=['mouse_line', 'dendrite_type']).size()

mouse_line       dendrite_type 
Gad2-IRES-Cre    aspiny            12
Htr3a-Cre_NO152  aspiny            15
Nr5a1-Cre        aspiny             2
                 spiny             28
Ntsr1-Cre        spiny              8
Pvalb-IRES-Cre   aspiny            51
Rbp4-Cre_KL100   spiny             12
Rorb-IRES2-Cre   aspiny             1
                 spiny             30
Scnn1a-Tg2-Cre   aspiny             1
                 spiny             16
Scnn1a-Tg3-Cre   aspiny             1
                 spiny             35
Sst-IRES-Cre     aspiny            33
                 sparsely spiny     2
                 spiny              1
dtype: int64

In [30]:
# mean of the upstroke_downstroke_ratio_short_square column within each mouse line
cells_df.groupby(by=['mouse_line'])['upstroke_downstroke_ratio_short_square'].mean()

mouse_line
Gad2-IRES-Cre      1.594569
Htr3a-Cre_NO152    2.336137
Nr5a1-Cre          3.439860
Ntsr1-Cre          2.764741
Pvalb-IRES-Cre     1.433088
Rbp4-Cre_KL100     3.325245
Rorb-IRES2-Cre     3.501382
Scnn1a-Tg2-Cre     3.523081
Scnn1a-Tg3-Cre     3.529249
Sst-IRES-Cre       1.861222
Name: upstroke_downstroke_ratio_short_square, dtype: float64

In [47]:
# mean of the max_euclidean_distance column within each mouse line
cells_df.groupby(by=['mouse_line'])['max_euclidean_distance'].mean()

mouse_line
Gad2-IRES-Cre      229.198000
Htr3a-Cre_NO152    193.325000
Nr5a1-Cre          316.766917
Ntsr1-Cre          555.668667
Pvalb-IRES-Cre     238.118222
Rbp4-Cre_KL100     539.851500
Rorb-IRES2-Cre     415.419400
Scnn1a-Tg2-Cre     465.101833
Scnn1a-Tg3-Cre     439.672375
Sst-IRES-Cre       244.610545
Name: max_euclidean_distance, dtype: float64