## Import available `cdapython` functions

In [96]:
from cdapython.data_exploration import tables, columns, column_values, column_data_types
from cdapython.fetch import fetch_rows
from cdapython.query import summary_counts

## Get a list of searchable CDA tables

In [97]:
tables()

['diagnosis',
 'file',
 'researchsubject',
 'somatic_mutation',
 'specimen',
 'subject',
 'treatment']

## Explore CDA tables' columns in detail

In [98]:
columns()

Unnamed: 0,table,column,data_type,nullable,description
0,diagnosis,diagnosis_id,text,False,The logical identifier of the entity in the re...
1,diagnosis,age_at_diagnosis,integer,True,The age in days of the individual at the time ...
2,diagnosis,grade,text,True,"The degree of abnormality of cancer cells, a m..."
3,diagnosis,method_of_diagnosis,text,True,The method used to confirm the subjects malign...
4,diagnosis,morphology,text,True,Code that represents the histology of the dise...
...,...,...,...,...,...
200,treatment,treatment_anatomic_site,text,True,The anatomical site that the treatment targets.
201,treatment,treatment_effect,text,True,The effect of a treatment on the diagnosis or ...
202,treatment,treatment_end_reason,text,True,The reason the treatment ended.
203,treatment,treatment_outcome,text,True,The final outcome of the treatment.


## See what values are populated in a given column

In [101]:
column_values( 'primary_diagnosis_site' )

Unnamed: 0,primary_diagnosis_site,count
0,Chest,28221
1,Breast,24682
2,,16424
3,Bronchus and lung,13401
4,Hematopoietic and reticuloendothelial systems,11590
...,...,...
211,"Skin, NOS (excludes skin of labia majora C51.0...",1
212,Small Intestine,1
213,"Soft palate, NOS (excludes nasopharyngeal surf...",1
214,Spinal meninges,1


## Fetch subject row summary information for a column value

In [129]:
fetch_rows( table= 'subject', match_all = 'primary_diagnosis_site = kid*', count_only=True )

{'distinct_subject_rows': 3806, 'total_result_rows': 3806}

## Fetch subject rows for a column value

In [130]:
fetch_rows( table= 'subject', match_all = 'primary_diagnosis_site = kid*')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_dataframe[column].fillna( '', inplace=True )


Unnamed: 0,subject_id,cause_of_death,days_to_birth,days_to_death,ethnicity,race,sex,species,vital_status
0,c4kc_kits.KiTS-00000,,,,,,F,Homo sapiens,
1,c4kc_kits.KiTS-00001,,,,,,M,Homo sapiens,
2,c4kc_kits.KiTS-00002,,,,,,M,Homo sapiens,
3,c4kc_kits.KiTS-00003,,,,,,F,Homo sapiens,
4,c4kc_kits.KiTS-00004,,,,,,M,Homo sapiens,
...,...,...,...,...,...,...,...,...,...
3801,TCGA.TCGA-Y8-A898,,-25344,,not hispanic or latino,white,male,Homo sapiens,Alive
3802,TCGA.TCGA-Y8-A8RY,,-23192,,not hispanic or latino,white,male,Homo sapiens,Alive
3803,TCGA.TCGA-Y8-A8RZ,,-20386,,not hispanic or latino,white,male,Homo sapiens,Alive
3804,TCGA.TCGA-Y8-A8S0,,-21383,,not hispanic or latino,white,male,Homo sapiens,Alive


### Fetch rows from any table for a column value



<div class="tabbed-set tabbed-alternate" data-tabs="1:6"><input checked="checked" id="__tabbed_1_1" name="__tabbed_1" type="radio" /><input id="__tabbed_1_2" name="__tabbed_1" type="radio" /><input id="__tabbed_1_3" name="__tabbed_1" type="radio" /><input id="__tabbed_1_4" name="__tabbed_1" type="radio" /><input id="__tabbed_1_5" name="__tabbed_1" type="radio" /><input id="__tabbed_1_6" name="__tabbed_1" type="radio" /><div class="tabbed-labels"><label for="__tabbed_1_1">subject data</label><label for="__tabbed_1_2">research subject data</label><label for="__tabbed_1_3">specimen data</label><label for="__tabbed_1_4">diagnosis data</label><label for="__tabbed_1_5">treatment data</label><label for="__tabbed_1_6">mutation data</label></div>
<div class="tabbed-content">
<div class="tabbed-block">
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="n">fetch_rows</span><span class="p">(</span> <span class="n">table</span><span class="o">=</span> <span class="s2">&quot;subject&quot;</span><span class="p">,</span> <span class="n">match_all</span><span class="o">=</span><span class="s1">&#39;primary_diagnosis_site = kid*&#39;</span> <span class="p">)</span>
</code></pre></div></td></tr></table></div>
</div>
<div class="tabbed-block">
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="n">fetch_rows</span><span class="p">(</span> <span class="n">table</span><span class="o">=</span> <span class="s2">&quot;researchsubject&quot;</span><span class="p">,</span> <span class="n">match_all</span><span class="o">=</span><span class="s1">&#39;primary_diagnosis_site = kid*&#39;</span> <span class="p">)</span>
</code></pre></div></td></tr></table></div>
</div>
<div class="tabbed-block">
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="n">fetch_rows</span><span class="p">(</span> <span class="n">table</span><span class="o">=</span> <span class="s2">&quot;specimen&quot;</span><span class="p">,</span> <span class="n">match_all</span><span class="o">=</span><span class="s1">&#39;primary_diagnosis_site = kid*&#39;</span> <span class="p">)</span>
</code></pre></div></td></tr></table></div>
</div>
<div class="tabbed-block">
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="n">fetch_rows</span><span class="p">(</span> <span class="n">table</span><span class="o">=</span> <span class="s2">&quot;diagnosis&quot;</span><span class="p">,</span> <span class="n">match_all</span><span class="o">=</span><span class="s1">&#39;primary_diagnosis_site = kid*&#39;</span> <span class="p">)</span>
</code></pre></div></td></tr></table></div>
</div>
<div class="tabbed-block">
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="n">fetch_rows</span><span class="p">(</span> <span class="n">table</span><span class="o">=</span> <span class="s2">&quot;treatment&quot;</span><span class="p">,</span> <span class="n">match_all</span><span class="o">=</span><span class="s1">&#39;primary_diagnosis_site = kid*&#39;</span> <span class="p">)</span>
</code></pre></div></td></tr></table></div>
</div>
<div class="tabbed-block">
<div class="highlight"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre><span></span><span class="normal">1</span></pre></div></td><td class="code"><div><pre><span></span><code><span class="n">fetch_rows</span><span class="p">(</span> <span class="n">table</span><span class="o">=</span> <span class="s2">&quot;somatic_mutation&quot;</span><span class="p">,</span> <span class="n">match_all</span><span class="o">=</span><span class="s1">&#39;primary_diagnosis_site = kid*&#39;</span> <span class="p">)</span>
</code></pre></div></td></tr></table></div>
</div>
</div>
</div>


In [99]:
# Narrow the column catalogue with several filters at once. Judging by the rows returned
# below, a column must satisfy every keyword argument, while the entries inside a single
# list act as alternatives; '*' behaves as a wildcard.
columns(
        # the 'file' table, or any table whose name contains 'subject'
        table=[ 'file', '*subject*' ],
        # column-name patterns: contains 'data', starts with 'days', ends with 'size' or '_id'
        column=[ '*data*', 'days*', '*size', '*_id' ],
        # '*int*' matches both 'integer' and 'bigint' in the result
        data_type=[ 'boolean', '*int*' ],
        # keep only columns flagged as nullable
        nullable=True, 
        # description must include at least one of these phrases
        description_contains=[ 'date used', 'file in bytes' ],
        # descending by table name, then by column name within each table
        sort_by=[ 'table:desc', 'column:desc' ]
)

Unnamed: 0,table,column,data_type,nullable,description
0,subject,days_to_death,integer,True,Number of days between the date used for index...
1,subject,days_to_birth,integer,True,Number of days between the date used for index...
2,file,byte_size,bigint,True,Size of the file in bytes. Maps to dcat:byteSize.


Look for anatomical information:

In [100]:
columns( column='*site*' )

Unnamed: 0,table,column,data_type,nullable,description
0,researchsubject,primary_diagnosis_site,text,True,The text term used to describe the primary sit...
1,somatic_mutation,primary_site,text,True,Anatomical site of the cancer under investigat...
2,specimen,anatomical_site,text,True,"Per GDC Dictionary, the text term that represe..."
3,treatment,treatment_anatomic_site,text,True,The anatomical site that the treatment targets.


Profile and count values in a column of interest:

Refine `columns()` search:

Refine `column_values()` search:

In [102]:
column_values( 'primary_diagnosis_site', sort_by='value:desc', filters=[ '*lung*', '*kidney*' ] )

Unnamed: 0,primary_diagnosis_site,count
0,"Lung/Bronchus, Unknown",38
1,Lung/Bronchus,18
2,"Lung, NOS",2
3,Lung Phantom,8
4,Lung,4964
5,"Kidney, Unknown",42
6,"Kidney, NOS",3
7,Kidney,5734
8,Bronchus and lung,13401
9,"Abdomen, Arm, Bladder, Chest, Head-Neck, Kidne...",88


Explore CDA records, filtering on columns of interest:

In [103]:
fetch_rows( table= "subject", count_only=True )

{'distinct_subject_rows': 160442, 'total_result_rows': 160442}

Refine record search as we go:

In [104]:
fetch_rows( table= "subject", match_some=[ 'subject_id = 4d*', 'subject_id = acrin*' ], count_only=True )

{'distinct_subject_rows': 1714, 'total_result_rows': 1714}

Now we have a manageable dataset. Stop counting and start looking at data:

In [105]:
fetch_rows( table= "subject", match_some=[ 'subject_id = 4d*', 'subject_id = acrin*' ] )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_dataframe[column].fillna( '', inplace=True )


Unnamed: 0,subject_id,cause_of_death,days_to_birth,days_to_death,ethnicity,race,sex,species,vital_status
0,4d_lung.100_HM10395,,,,,,M,Homo sapiens,
1,4d_lung.101_HM10395,,,,,,F,Homo sapiens,
2,4d_lung.102_HM10395,,,,,,M,Homo sapiens,
3,4d_lung.103_HM10395,,,,,,M,Homo sapiens,
4,4d_lung.104_HM10395,,,,,,F,Homo sapiens,
...,...,...,...,...,...,...,...,...,...
1709,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-247,,,,,,M,Homo sapiens,
1710,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-248,,,,,,M,Homo sapiens,
1711,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-249,,,,,,F,Homo sapiens,
1712,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-250,,,,,,M,Homo sapiens,


In [106]:
# Subjects whose ID starts with '4d' or 'acrin' (match_some: at least one must hold),
# further restricted by the 'sex = f' condition (match_all: every condition must hold).
fetch_rows(
    table="subject",
    match_some=[ 'subject_id = 4d*', 'subject_id = acrin*' ],
    match_all=[ 'sex = f' ],
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_dataframe[column].fillna( '', inplace=True )


Unnamed: 0,subject_id,cause_of_death,days_to_birth,days_to_death,ethnicity,race,sex,species,vital_status
0,4d_lung.101_HM10395,,,,,,F,Homo sapiens,
1,4d_lung.104_HM10395,,,,,,F,Homo sapiens,
2,4d_lung.106_HM10395,,,,,,F,Homo sapiens,
3,4d_lung.107_HM10395,,,,,,F,Homo sapiens,
4,4d_lung.108_HM10395,,,,,,F,Homo sapiens,
...,...,...,...,...,...,...,...,...,...
1526,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-236,,,,,,F,Homo sapiens,
1527,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-239,,,,W,,F,Homo sapiens,
1528,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-242,,,,,,F,Homo sapiens,
1529,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-249,,,,,,F,Homo sapiens,


In [107]:
# Same ID prefixes as before (either may match), now requiring both that sex starts
# with 'f' and that the primary diagnosis site is not 'breast'.
fetch_rows(
    table="subject",
    match_some=[ 'subject_id = 4d*', 'subject_id = acrin*' ],
    match_all=[ 'sex = f*', 'primary_diagnosis_site != breast' ],
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_dataframe[column].fillna( '', inplace=True )


Unnamed: 0,subject_id,cause_of_death,days_to_birth,days_to_death,ethnicity,race,sex,species,vital_status
0,4d_lung.101_HM10395,,,,,,F,Homo sapiens,
1,4d_lung.104_HM10395,,,,,,F,Homo sapiens,
2,4d_lung.106_HM10395,,,,,,F,Homo sapiens,
3,4d_lung.107_HM10395,,,,,,F,Homo sapiens,
4,4d_lung.108_HM10395,,,,,,F,Homo sapiens,
...,...,...,...,...,...,...,...,...,...
89,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-236,,,,,,F,Homo sapiens,
90,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-239,,,,W,,F,Homo sapiens,
91,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-242,,,,,,F,Homo sapiens,
92,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-249,,,,,,F,Homo sapiens,


Extensive debug information!

In [108]:
# Identical query to the previous cell, with debug=True so that the parsed filter tree
# and the API call details are printed alongside the result.
fetch_rows(
    table="subject",
    match_some=[ 'subject_id = 4d*', 'subject_id = acrin*' ],
    match_all=[ 'sex = f*', 'primary_diagnosis_site != breast' ],
    debug=True,
)

{
    "node_type": "AND",
    "l": {
        "node_type": "AND",
        "l": {
            "node_type": "LIKE",
            "l": {
                "node_type": "column",
                "value": "sex"
            },
            "r": {
                "node_type": "quoted",
                "value": "f%"
            }
        },
        "r": {
            "node_type": "!=",
            "l": {
                "node_type": "column",
                "value": "primary_diagnosis_site"
            },
            "r": {
                "node_type": "quoted",
                "value": "breast"
            }
        }
    },
    "r": {
        "node_type": "OR",
        "l": {
            "node_type": "LIKE",
            "l": {
                "node_type": "column",
                "value": "subject_id"
            },
            "r": {
                "node_type": "quoted",
                "value": "4d%"
            }
        },
        "r": {
            "node_type": "LIKE",
            "l": {


--------------------------------------------------------------------------------
BEGIN DEBUG MESSAGE: fetch_rows(): Loaded CDA API URL from default config
--------------------------------------------------------------------------------

[{'url': 'http://localhost:8080', 'description': 'URL of CDA REST API service'}]

--------------------------------------------------------------------------------
END  DEBUG MESSAGE: fetch_rows(): Loaded CDA API URL from default config
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
BEGIN DEBUG MESSAGE: fetch_rows(): Querying CDA API 'subject' endpoint (fetching all results)
--------------------------------------------------------------------------------

Organizing result data...
   -- filtering API columns: ['subject_identifier', 'subject_associated_project']
Handling missing values...
The behavior will change in pandas 3.0. This inplace 

Unnamed: 0,subject_id,cause_of_death,days_to_birth,days_to_death,ethnicity,race,sex,species,vital_status
0,4d_lung.101_HM10395,,,,,,F,Homo sapiens,
1,4d_lung.104_HM10395,,,,,,F,Homo sapiens,
2,4d_lung.106_HM10395,,,,,,F,Homo sapiens,
3,4d_lung.107_HM10395,,,,,,F,Homo sapiens,
4,4d_lung.108_HM10395,,,,,,F,Homo sapiens,
...,...,...,...,...,...,...,...,...,...
89,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-236,,,,,,F,Homo sapiens,
90,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-239,,,,W,,F,Homo sapiens,
91,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-242,,,,,,F,Homo sapiens,
92,acrin_nsclc_fdg_pet.ACRIN-NSCLC-FDG-PET-249,,,,,,F,Homo sapiens,


There is handy help!

In [109]:
help( columns )

Help on function columns in module cdapython.data_exploration:

columns(*, return_data_as='', output_file='', sort_by='', debug=False, **filter_arguments)
    Get structured metadata describing searchable CDA columns.
    
    Arguments:
        return_data_as ( string; optional: 'dataframe' or 'list' or 'tsv' ):
            Specify how columns() should return results: as a pandas DataFrame,
            a Python list, or as output written to a TSV file named by the user.
            If this argument is omitted, columns() will default to returning
            results as a DataFrame.
    
        output_file( string; optional ):
            If return_data_as='tsv' is specified, output_file should contain a
            resolvable path to a file into which columns() will write
            tab-delimited results.
    
        sort_by( string or list of strings; optional:
                    any combination of 'table', 'column', 'data_type',
                    and/or 'nullable'):
           

In [110]:
help( column_values )

Help on function column_values in module cdapython.data_exploration:

column_values(column='', *, return_data_as='dataframe', output_file='', sort_by='', filters='', data_source='', force=False, debug=False)
    Show all distinct values present in `column`, along with a count
    of occurrences for each value.
    
    Arguments:
        column ( string; required ):
            The column to fetch values from.
    
        return_data_as ( string; optional:
                'dataframe' (default) or 'list' or 'tsv' ):
            Specify how column_values() should return results: as a pandas
            DataFrame, a Python list, or as output written to a TSV file named
            by the user.
    
        output_file( string; optional ):
            If return_data_as='tsv' is specified, output_file should contain a
            resolvable path to a file into which column_values will write
            tab-delimited results.
    
        sort_by( string; optional:
                'count' (

In [111]:
# `subjects` is not among the names imported in the notebook's first cell, so this cell
# raised NameError on a fresh kernel (Restart & Run All). Import it here from the module
# that the help banner below reports it lives in.
from cdapython.query import subjects

help( subjects )

Help on function subjects in module cdapython.query:

subjects(*, match_all=[], match_some=[], link_to_table='', add_columns=[], data_source=[], count_only=False, debug=False)
    Get CDA subject records (rows from the CDA `subject` table).
    
    Arguments:
        match_all ( string or list of strings; optional ):
            One or more conditions, expressed as filter strings (see below),
            ALL of which must be met by all result rows.
    
        match_some ( string or list of strings; optional ):
            One or more conditions, expressed as filter strings (see below),
            AT LEAST ONE of which must be met by all result rows.
    
        link_to_table ( string; optional ):
            A non-`subject` table from which to fetch rows related to the
            `subject` row results that this function produces. `link_to_table`
            results will be appended to `subject` rows to which they're related:
            any `subject` row related to more than one 

In [112]:
help( column_data_types )

Help on function column_data_types in module cdapython.data_exploration:

column_data_types()
    Get a list of all data types stored in searchable CDA columns.
    
    Returns:
        list of strings: names of data types stored in searchable CDA columns.



In [113]:
help( tables )

Help on function tables in module cdapython.data_exploration:

tables()
    Get a list of all searchable CDA data tables.
    
    Returns:
        list of strings: names of searchable CDA tables.



In [114]:
help( summary_counts )

Help on function summary_counts in module cdapython.query:

summary_counts(table='', *, match_all=[], match_some=[], data_source=[], debug=False)
    For a set of rows in a user-specified table that all match a user-specified set of filters, get
    a report showing counts of values present in that set of rows, profiled across a small set of
    pre-selected columns.
    
    Arguments:
        table ( string; required ):
            The table whose rows are to be filtered and counted. (Run the tables()
            function to get a list.)
    
        match_all ( string or list of strings; optional ):
            One or more conditions, expressed as filter strings (see below),
            ALL of which must be met by all result rows.
    
        match_some ( string or list of strings; optional ):
            One or more conditions, expressed as filter strings (see below),
            AT LEAST ONE of which must be met by all result rows.
    
        data_source ( string or list of str

There are many useful error messages awaiting you on your journey!

In [115]:
fetch_rows( table= "subject", match_all='primary_disease_site = kid*ey' )

fetch_rows(): ERROR: match_all: requested column 'primary_disease_site' is not a searchable CDA column.


In [116]:
fetch_rows( table= "subject", match_all='primary_diagnosis_site = kid*ey' )

fetch_rows(): ERROR: match_all: wildcards (*) are only allowed at the ends of string values; string 'kid*ey' is noncompliant (it has one in the middle). Please fix.


You can ask about missing values if you like!

In [117]:
fetch_rows( table= "subject", match_all=[ 'sex = NULL', 'ethnicity != NULL' ] )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_dataframe[column].fillna( '', inplace=True )


Unnamed: 0,subject_id,cause_of_death,days_to_birth,days_to_death,ethnicity,race,sex,species,vital_status
0,cmb_crc.CMB-CRC-MSB-02381,,,,Non-Hispanic,,,Homo sapiens,
1,cmb_crc.CMB-CRC-MSB-03032,,,,1,,,Homo sapiens,
2,cmb_gec.CMB-GEC-MSB-06857,,,,Non-Hispanic [9],,,Homo sapiens,
3,cmb_lca.CMB-LCA-MSB-05837,,,,Non-Hispanic [9],,,Homo sapiens,
4,cmb_lca.CMB-LCA-MSB-09117,,,,Non-Hispanic [9],,,Homo sapiens,
5,cmb_lca.CMB-LCA-MSB-09977,,,,Non-Hispanic [9],,,Homo sapiens,
6,cmb_mel.CMB-MEL-MSB-05412,,,,W,,,Homo sapiens,
7,cmb_mel.CMB-MEL-MSB-07612,,,,Not Hispanic Lat,,,Homo sapiens,
8,cmb_mel.CMB-MEL-MSB-09286,,,,W,,,Homo sapiens,
9,cmb_pca.CMB-PCA-MSB-07483,,,,Non-Hispanic [9],,,Homo sapiens,


You can filter by data source!

In [118]:
fetch_rows( table= "subject", data_source=[ 'GDC', 'PDC', 'IDC' ] )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_dataframe[column].fillna( '', inplace=True )


Unnamed: 0,subject_id,cause_of_death,days_to_birth,days_to_death,ethnicity,race,sex,species,vital_status
0,CPTAC.01BR001,,,,not hispanic or latino,black or african american,female,Homo sapiens,Not Reported
1,CPTAC.01BR008,,,,not reported,black or african american,not reported,Homo sapiens,Not Reported
2,CPTAC.01BR009,,,,not reported,black or african american,not reported,Homo sapiens,Not Reported
3,CPTAC.01BR010,,,,not reported,black or african american,not reported,Homo sapiens,Not Reported
4,CPTAC.01BR015,,,,not hispanic or latino,white,female,Homo sapiens,Not Reported
...,...,...,...,...,...,...,...,...,...
1677,TCGA.TCGA-E2-A150,,-17580,,not hispanic or latino,white,female,Homo sapiens,Alive
1678,TCGA.TCGA-E2-A154,,-24999,,not hispanic or latino,white,female,Homo sapiens,Alive
1679,TCGA.TCGA-E2-A158,,-15903,,not hispanic or latino,white,female,Homo sapiens,Alive
1680,TCGA.TCGA-E2-A159,,-18621,,not hispanic or latino,white,female,Homo sapiens,Alive


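You can join columns from other tables! (Note: in the run captured below, the CDA API rejected this query with a 'bad SQL grammar' error, so no joined rows are shown.)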

In [119]:
# TCGA subjects, restricted by sex, with non-missing hugo_symbol and days_to_death,
# limited to the listed data sources; add_columns requests two extra columns that are
# not part of the subject table's default output.
fetch_rows(
    table="subject",
    match_all=[
        'subject_id = TCGA*',
        'sex = f*',
        'hugo_symbol != NULL',
        'days_to_death != NULL',
    ],
    data_source=[ 'GDC', 'PDC', 'IDC' ],
    add_columns=[ 'hugo_symbol', 'hotspot' ],
)

fetch_rows(): ERROR: error message from API: 'PreparedStatementCallback; bad SQL grammar [SELECT row_to_json(json) FROM (SELECT subject.id AS subject_id, subject.cause_of_death AS cause_of_death, subject.days_to_birth AS days_to_birth, subject.days_to_death AS days_to_death, subject.ethnicity AS ethnicity, subject.race AS race, subject.sex AS sex, subject.species AS species, subject.vital_status AS vital_status, somatic_mutation.hugo_symbol AS hugo_symbol, somatic_mutation.hotspot AS hotspot FROM subject AS subject  LEFT JOIN subject_data_source AS subject_data_source ON subject.integer_id_alias = subject_data_source.subject_alias WHERE ((((subject_data_source.subject_from_gdc = ?) AND (subject_data_source.subject_from_pdc = ?)) AND (subject_data_source.subject_from_idc = ?)) AND ((((COALESCE(UPPER(subject.id), '') LIKE UPPER(?)) AND (COALESCE(UPPER(subject.sex), '') LIKE UPPER(?))) AND (somatic_mutation.hugo_symbol IS NOT NULL)) AND (subject.days_to_death IS NOT NULL))) GROUP BY subje

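You can join entire other tables! (Note: in the run captured below, the CDA API rejected this query with a 'bad SQL grammar' error, so no joined rows are shown.)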

In [120]:
# Same filters as the add_columns example, but link_to_table asks for the related
# researchsubject rows to be attached to each matching subject row.
fetch_rows(
    table="subject",
    match_all=[
        'subject_id = TCGA*',
        'sex = f*',
        'hugo_symbol != NULL',
        'days_to_death != NULL',
    ],
    data_source=[ 'GDC', 'PDC', 'IDC' ],
    link_to_table='researchsubject',
)

fetch_rows(): ERROR: error message from API: 'PreparedStatementCallback; bad SQL grammar [SELECT row_to_json(json) FROM (SELECT subject.id AS subject_id, subject.cause_of_death AS cause_of_death, subject.days_to_birth AS days_to_birth, subject.days_to_death AS days_to_death, subject.ethnicity AS ethnicity, subject.race AS race, subject.sex AS sex, subject.species AS species, subject.vital_status AS vital_status, researchsubject.id AS researchsubject_id, researchsubject.member_of_research_project AS member_of_research_project, researchsubject.primary_diagnosis_condition AS primary_diagnosis_condition, researchsubject.primary_diagnosis_site AS primary_diagnosis_site FROM subject AS subject  LEFT JOIN subject_researchsubject AS subject_researchsubject ON subject.integer_id_alias = subject_researchsubject.subject_alias  LEFT JOIN researchsubject AS researchsubject ON subject_researchsubject.researchsubject_alias = researchsubject.integer_id_alias  LEFT JOIN subject_data_source AS subject_d

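You can get value-count summaries for things, optionally based on matches to custom filters (note: in the runs captured below, `summary_counts()` stopped with an "unexpected return type" error instead of returning counts):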

In [121]:
summary_counts( table='subject', match_all='subject_id = CPTAC.01BR*' )

summary_counts(): ERROR: unexpected return type 'float64' observed in result column 'subject_id'; please inform the CDA devs of this event.


In [122]:
summary_counts( table='diagnosis', match_all='primary_diagnosis = Adeno*' )

summary_counts(): ERROR: unexpected return type 'float64' observed in result column 'diagnosis_id'; please inform the CDA devs of this event.


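You can also count things based on filters applied to related tables (note: the run captured below also ended in the "unexpected return type" error rather than a summary):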

In [123]:
summary_counts( table='diagnosis', match_all='subject_id = CPTAC.01BR*' )

summary_counts(): ERROR: unexpected return type 'float64' observed in result column 'diagnosis_id'; please inform the CDA devs of this event.


This particular bit doesn't work and has been humorously disabled!

In [124]:
summary_counts( table='somatic_mutation', match_all=[ 'hugo_symbol = DOK5' ] )

summary_counts(): ERROR_WITH_APOLOGIES: summary counts for somatic_mutation are not available at present. Please select any of our other fine tables.
