In [1]:
import os
import json
from pprint import pprint
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', -1)

from dataservice.util.data_import.utils import (
    read_json, 
    write_json, 
    cols_to_lower, 
    dropna_rows_cols,
    reformat_column_names
)

DATA_DIR = '/Users/singhn4/Projects/kids_first/data/Chung'
DBGAP_DIR = os.path.join(DATA_DIR, 'dbgap')
MANIFESTS_DIR = os.path.join(DATA_DIR, 'manifests')

In [2]:
# Create study
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001110',
    'study_version': 'v1.p1',
    'study_name': 'Genomic Analysis of Congenital Diaphragmatic Hernia and Associated Congenital Anomalies',
    'attribution': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001110.v1.p1'
}
study_df = pd.DataFrame([study])
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'))

# Create investigator
invest = {
    'investigator_name': 'Wendy Chung',
    'institution': 'Columbia University Health Sciences'
}
inv_df = pd.DataFrame([invest])
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'))

In [35]:
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Read in raw study files
    """
    if not filepaths:
        filepaths = os.listdir(DBGAP_DIR)
        filepaths.extend(os.listdir(MANIFESTS_DIR))

    study_files = [{"study_file_name": f}
                   for f in filepaths]
    return pd.DataFrame(study_files)

@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Read study data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'study.txt')
    df = pd.read_csv(filepath)

    return df

@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Read investigator data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'investigator.txt')
    df = pd.read_csv(filepath)

    return df

@reformat_column_names
@dropna_rows_cols
def read_subject_data(filepath=None):
    """
    Read subject data file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '4a_dbGaP_SubjectDS_corrected_7-16.xlsx')
    df = pd.read_excel(filepath, dtype={'SUBJECT_ID': str})
    # Decode consent ints to consent strings
    def func(row): 
        _map = {1: "Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)"}
        return _map.get(row['CONSENT'])
    df['CONSENT'] = df.apply(func, axis=1)

    # Decode affected status ints to strings
    def func(row): 
        _map = {0:'unknown', 1: "affected", 2: "unaffected"}
        return _map.get(row['AFFECTED_STATUS'])
    df['AFFECTED_STATUS'] = df.apply(func, axis=1)
    return df

@reformat_column_names
@dropna_rows_cols
def read_subject_attr_data(filepath=None):
    """
    Read subject attributes file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '3a_dbGaP_SubjectAttributesDS_corrected.6.12.xlsx')
    df = pd.read_excel(filepath, dtype={'SUBJECT_ID': str})
    # Decode body_site chars to strings
    def func(row): 
        _map = {'B':'blood', 'SK': 'skin', 'D': 'diaphragm', 'SV': 'saliva', 'A': 'amniocytes', 'M': 'amniocytes'}
        return _map.get(row['body_site'])
    df['body_site'] = df.apply(func, axis=1)
    return df

@reformat_column_names
@dropna_rows_cols
def read_family_data(filepath=None):
    """
    Read pedigree data
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '6a_dbGaP_PedigreeDS_corrected.6.12.xlsx')
    df = pd.read_excel(filepath)
    del df['SEX']
    
    return df

@reformat_column_names
# @dropna_rows_cols
def read_sample_manifests():
    """
    Read and combine all sample manifest sheets
    """
    # Sample manifests
    # Combine all sample manifest sheets
    dfs = [pd.read_excel(os.path.join(MANIFESTS_DIR, filename))

           for filename in os.listdir(MANIFESTS_DIR)

           ]
    df = pd.concat(dfs)
    df = df[df['Sample ID'].notnull()]
    df.rename(columns={'Alias.2': 'is_proband'}, inplace=True)
    df['is_proband'] = df['is_proband'].apply(lambda x: True if x == 'Proband' else False)
    
    def func(row):
        val = str(row['Volume']).strip("uL")
        try:
            val = int(val)
        except ValueError:
            val = np.NaN
        return val
    
    df['Volume'] = df.apply(func, axis=1)
    
    def func(row):
        val = str(row['Concentration']).strip("ng/uL")
        try:
            val = int(val)
        except ValueError:
            val = np.NaN
        return val
    
    df['Concentration'] = df.apply(func, axis=1)
    
    df = df[pd.notnull(df.Concentration)]
    df = df[pd.notnull(df.Volume)]
    df.Concentration = df.Concentration.astype('int')
    df.Volume = df.Volume.astype('int')
    
    return df[['Concentration', 'Volume', 'Sample ID', 'Sample Type', 'is_proband']]

@reformat_column_names
@dropna_rows_cols
def read_subject_sample_data(filepath=None):
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '5a_dbGaP_SubjectSampleMappingDS cumulative.xlsx')
    return pd.read_excel(filepath, delimiter='\t')

@reformat_column_names
@dropna_rows_cols
def read_demographic_data(filepath=None):
    """
    Read demographic data from phenotype file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '2a_dbGaP_SubjectPhenotypesDS.xlsx')
    df = pd.read_excel(filepath)
    # Make all values lower case
    for col in ['Ethnicity', 'Race']:
        df[col] = df[col].apply(lambda x: str(x).lower().strip())
        
    return df[['SUBJECT_ID', 'SEX', 'Ethnicity', 'Race']]

@reformat_column_names
@dropna_rows_cols
def read_phenotype_data(filepath=None):
    """
    Read phenotype file and insert HPO IDs
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '2a_dbGaP_SubjectPhenotypesDS.xlsx')

    df = pd.read_excel(filepath)
    df.drop(['Ethnicity', 'Race', 'SEX', 'discharge_status', 'ISOLATED'], inplace=True, axis=1)
    # Reshape to build the phenotypes df
    cols = df.columns.tolist()[1:]
    phenotype_df = pd.melt(df, id_vars='SUBJECT_ID', value_vars=cols,
                           var_name='phenotype', value_name='value')

    # Drop rows where value is NaN
    phenotype_df = phenotype_df[pd.notnull(phenotype_df['value'])]

    # Decode phenotypes to descriptive strings
    def func(row):
        _map = {0: 'no', 1: 'yes'}
        return _map.get(row['value'], row['value'])
    phenotype_df['value'] = phenotype_df.apply(func, axis=1)

    # Decode phenotypes to descriptive strings
    def func(row):
        # Always take most specific value
        if row['value'] not in ['yes', 'no']:
            val = row['value']
        else:
            _map = {'CHD': 'congenital heart defect', 'CNS': 'central nervous system defect', 
                'GI': 'gastrointestinal defect'}
            val = _map.get(row['phenotype'], 'congenital birth defect')
        return val
    phenotype_df['phenotype'] = phenotype_df.apply(func, axis=1)

    # Set observed
    phenotype_df['observed'] = phenotype_df['value'].apply(lambda x: 'positive' if x != 'no' else 'negative')
    del phenotype_df['value']
    
    # Add HPOs
    from dataservice.util.data_import.etl.hpo import mapper
    hpo_mapper = mapper.HPOMapper(DATA_DIR)
    phenotype_df = hpo_mapper.add_hpo_id_col(phenotype_df)
    
    # Add unique col
    def func(row): return "_".join(['phenotype', str(row.name)])
    phenotype_df['phenotype_id'] = phenotype_df.apply(func, axis=1)
    
    return phenotype_df

@reformat_column_names
@dropna_rows_cols
def read_outcome_data(filepath=None):
    """
    Read outcome data from phenotype file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '2a_dbGaP_SubjectPhenotypesDS.xlsx')
    df = pd.read_excel(filepath)
    
    # Replace NaN values with None
    df['discharge_status'] = df['discharge_status'].where((pd.notnull(df['discharge_status'])),999)
    
    # Map discharge status
    # 1=Alive 4=Deceased 0=Fetal sample 8=unknown NA=Not applicable
    def func(row): 
        _map = {0:'alive', 1: 'deceased', 4:'fetal sample', 8: 'unknown'}
        return _map.get(int(row['discharge_status']), 'not applicable')
    df['discharge_status'] = df.apply(func, axis=1)
    
    # Add unique col
    def func(row): return "_".join(['outcome', str(row.name)])
    df['outcome_id'] = df.apply(func, axis=1)
    
    return df[['SUBJECT_ID', 'discharge_status', 'outcome_id']]

@reformat_column_names
@dropna_rows_cols
def read_genomic_file_manifest(filepath=None):
    """
    Read genomic file manifest (ties subjects to genomic files)
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'sample.txt')

    df = pd.read_csv(filepath, delimiter='\t')
    return df[['entity:sample_id', 'aligned_reads', 'crai_or_bai_path', 'cram_or_bam_path', 
               'library-1_name', 'library-2_name', 'max_insert_size', 'mean_depth', 'mean_insert_size', 
               'mean_read_length','min_insert_size', 'sample_alias', 'total_reads']]

def read_genomic_files_info(filepath=None):
    """
    Read genomic file info
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'genomic_files_by_uuid.json')
    data = read_json(filepath)
    df = pd.DataFrame(list(data.values()))
    # Reformat
    df['md5sum'] = df['hashes'].apply(lambda x: x['md5'])
    df['file_url'] = df['urls'].apply(lambda x: x[0])
    df['file_format'] = df['file_name'].apply(lambda x: '.'.join(x.split('.')[1:]))
    df.rename(columns={'did':'uuid', 'size':'file_size'}, inplace=True)
    def func(x):
        x = x.strip()
        if x == 'cram':
            val = 'submitted aligned reads'
        elif x.endswith('crai'):
            val = 'submitted aligned reads index'
        elif 'vcf' in x:
            val = 'variant calling'
        else:
            val = None
        return val
    df['data_type'] = df['file_format'].apply(func)
    df['subject_id'] = df['file_name'].apply(lambda file_name: file_name.split('.')[0])
    return df


## Explore

In [4]:
# Db gap files
files = {f:os.path.join(DBGAP_DIR, f) for f in os.listdir(DBGAP_DIR)}
pprint(list(files.keys()))

['2a_dbGaP_SubjectPhenotypesDS.xlsx',
 '2b_dbGaP_SubjectPhenotypesDD.xlsx',
 '3a_dbGaP_SubjectAttributesDS_corrected.6.12.xlsx',
 '3b_dbGaP_SubjectAttributesDD.xlsx',
 '4a_dbGaP_SubjectDS_corrected_7-16.xlsx',
 '4b_dbGaP_SubjectDD_corrected_6_12.xlsx',
 '5a_dbGaP_SubjectSampleMappingDS cumulative.xlsx',
 '5b_dbGaP_SubjectSampleMappingDD.xlsx',
 '6a_dbGaP_PedigreeDS_corrected.6.12.xlsx',
 '6b_dbGaP_PedigreeDD.xlsx']


### Subject 

In [5]:
# Description of data
df1 = pd.read_excel(files['4b_dbGaP_SubjectDD_corrected_6_12.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df1

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8
0,SUBJECT_ID,De-identified Subject ID,,encoded value,,,Collected at enrollment,,
1,CONSENT,Consent group as determined by DAC,,encoded value,,,Collected at enrollment,"1=Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",
2,AFFECTED_STATUS,Case control status of the subject for congenital diaphragmatic hernia/defects,,encoded value,,,Collected at enrollment,1=affected,2=unaffected


In [6]:
# Subject data
df = pd.read_excel(files['4a_dbGaP_SubjectDS_corrected_7-16.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,SUBJECT_ID,CONSENT,AFFECTED_STATUS
0,06-0015,1,1
1,07-0016,1,1
2,09-0019,1,1
3,6-15F,1,2
4,6-15M,1,2


In [7]:
print(df['CONSENT'].unique())
print(df['AFFECTED_STATUS'].unique())
df.describe(include=['O']).T.sort_values('unique', ascending=False)

[1]
[1 2]


Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,c1110WISa,1


In [8]:
# Subject attributes data description
df = pd.read_excel(files['3b_dbGaP_SubjectAttributesDD.xlsx'])
df

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,SAMPLE_ID,De-identified Sample ID,,string,,,Collected at enrollment,,,,,,
1,body_site,Body site where sample was collected,,encoded value,,,Collected at enrollment,B=blood,SK=skin,D=diaphragm,SV=saliva,A=amniocytes,M=muscle
2,analyte_type,Analyte Type,,encoded value,,,Collected at enrollment,,,,,,
3,is_tumor,Tumor status,,encoded value,,,Collected at enrollment,Y=Is Tumor,N=Is not a tumor,,,,
4,SUBJECT_ID,De-identified Subject ID,,string,,,,,,,,,


In [9]:
# Subject attributes
df = pd.read_excel(files['3a_dbGaP_SubjectAttributesDS_corrected.6.12.xlsx'], delimiter='\t')
df.head()

Unnamed: 0,SUBJECT_ID,SAMPLE_ID,body_site,analyte_type,is_tumor
0,06-0015,SM-DH4NH,D,DNA,N
1,07-0016,SM-DH4NJ,B,DNA,N
2,09-0019,SM-DH4NK,SK,DNA,N
3,6-15F,SM-DH4NF,B,DNA,N
4,6-15M,SM-DH4NG,B,DNA,N


In [10]:
print(df.body_site.unique())
df.describe(include=['O']).T.sort_values('unique', ascending=False)

['D' 'B' 'SK' 'M' 'A']


Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,c1110WISa,1
SAMPLE_ID,962,962,SM-DI7P6,1
body_site,962,5,B,945
analyte_type,962,1,DNA,962
is_tumor,962,1,N,962


### Family/Pedigree

In [11]:
# Data description
df = pd.read_excel(files['6b_dbGaP_PedigreeDD.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8,Unnamed: 9
0,FAMILY_ID,Family_ID,,encoded value,,,Collected at enrollment,,,
1,SUBJECT_ID,De-identified Subject ID,,encoded value,,,Collected at enrollment,,,
2,Father,ID of father,,encoded value,,,Collected at enrollment,,,
3,Mother,ID of mother,,encoded value,,,Collected at enrollment,,,
4,SEX,Gender of participant,,encoded value,,,Collected at enrollment,M=male,F=female,U=unknown


In [12]:
df = pd.read_excel(files['6a_dbGaP_PedigreeDS_corrected.6.12.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,Family_id,SUBJECT_ID,Father,Mother,SEX
0,06-0015,06-0015,6-15F,6-15M,F
1,07-0016,07-0016,CDH1377,7-16M,M
2,09-0019,09-0019,CDH1247,CDH1246,M
3,06-0015,6-15F,,,M
4,06-0015,6-15M,,,F


In [13]:
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,c1110WISa,1
Family_id,962,321,06-0014,3
Father,320,320,CDH1382,1
Mother,320,320,CDH141,1
SEX,962,3,M,518


### Phenotypes

In [14]:
# Data description
df = pd.read_excel(files['2b_dbGaP_SubjectPhenotypesDD.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,SUBJECT_ID,De-identified Subject ID,,string,,,Collected at enrollment,,,,,
1,SEX,Gender of participant,,encoded value,,,Collected at enrollment,M=Male,F=Female,U=unknown,,
2,ISOLATED,If the subject has a second birth defect,,encoded value,,,Collected at enrollment,1=Isolated,2=Non-isolated,U=unknown,NA=Not applicable,
3,CHD,If the subject has a congenital heart defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
4,CNS,If the subject has a central nervous system defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
5,GI,If the subject has a gastrointestinal defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
6,other_cong_malf_1,Other birth defects (1),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
7,other_cong_malf_2,Other birth defects (2),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
8,other_cong_malf_3,Other birth defects (3),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
9,discharge_status,Discharge status,,encoded value,,,Collected at enrollment,1=Alive,4=Deceased,0=Fetal sample,8=unknown,NA=Not applicable


In [15]:
# Data
df = pd.read_excel(files['2a_dbGaP_SubjectPhenotypesDS.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,SUBJECT_ID,SEX,ISOLATED,CHD,CNS,GI,other_cong_malf_1,other_cong_malf_2,other_cong_malf_3,discharge_status,Ethnicity,Race
0,06-0015,F,2.0,0.0,0.0,0.0,congenital pulmonary sequestration malformation,0,0,1.0,non-Hispanic,white
1,07-0016,M,1.0,0.0,0.0,0.0,0,0,0,1.0,non-Hispanic,white
2,09-0019,M,2.0,1.0,1.0,0.0,pulmonary hypoplasia,"Facial & Body deformities suggestive of Potter sequence: wide inter brain weight, canthal distance; flattened, widened nose; micrognathia; flattened pinnae w/ somewhat bulbous earlobes; varus deformity with metatarsus adductus, feet, severe",small kidneys; renal proximal tubular epithelial degeneration and sloughing with cortical medullary junction congestion,4.0,non-Hispanic,white
3,6-15F,M,,,,,,,,,non-Hispanic,white
4,6-15M,F,,,,,,,,,non-Hispanic,white


In [16]:
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,c1110WISa,1
other_cong_malf_1,321,61,0,254
other_cong_malf_2,320,25,0,296
other_cong_malf_3,319,10,0,310
Race,962,9,White,450
Ethnicity,962,5,Non-Hispanic,412
SEX,962,3,M,518
ISOLATED,323,3,1,183


### Samples

In [17]:
# Subject sample data description
filepath = os.path.join(DBGAP_DIR, '5b_dbGaP_SubjectSampleMappingDD.xlsx')
df = pd.read_excel(filepath, delimiter='\t')
df.head()

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES
0,SUBJECT_ID,De-identified Subject ID,,encoded value,,,Collected at enrollment,
1,SAMPLE_ID,De-identified Sample ID,,encoded value,,,Collected at enrollment,
2,SAMPLE_USE,Sample use,,encoded value,,,Collected at enrollment,


In [18]:
# Subject sample mapping
filepath = os.path.join(DBGAP_DIR, '5a_dbGaP_SubjectSampleMappingDS cumulative.xlsx')
df = pd.read_excel(filepath, delimiter='\t')
df.head()

Unnamed: 0,SUBJECT_ID,SAMPLE_ID,SAMPLE_USE
0,1,1,Seq_DNA_WholeGenome
1,2,2,Seq_DNA_WholeGenome
2,7,7,Seq_DNA_WholeGenome
3,8,8,Seq_DNA_WholeGenome
4,10,10,Seq_DNA_WholeGenome


In [19]:
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,1556,1556,h1251YOXb,1
SAMPLE_ID,1556,1556,SM-DI7P6,1
SAMPLE_USE,1556,1,Seq_DNA_WholeGenome,1556


In [20]:
# Sample manifests
# Combine all sample manifest sheets
dfs = [pd.read_excel(os.path.join(MANIFESTS_DIR, filename))

       for filename in os.listdir(MANIFESTS_DIR)

       ]
df = pd.concat(dfs)
df = df[df['Sample ID'].notnull()]
print(df.shape)
df.head()

(908, 15)


Unnamed: 0,Alias,Alias.1,Alias.2,Alias.3,Collected After 01/25/2015,Concentration,Gender,RIN Number,Sample ID,Sample Type,Unnamed: 12,Unnamed: 13,Unnamed: 8,Volume,Well
1,h1258FBBc,h1258FBBc1,Father,,,50,Male,,SM-DJT8D,DNA,,,,70,A01
2,m1021LEMa,m1021LEMa1,Proband,,,50,Female,,SM-DJT8E,DNA,,,,70,A02
3,m1029QENa,m1029QENa1,Proband,,,50,Unknown,,SM-DJT8F,DNA,,,,30,A03
4,m1008SAWb,m1008SAWb1,Mother,,,50,Female,,SM-DJT8G,DNA,,,,33,A04
5,m1038MWLb,m1038MWLb1,Mother,,,50,Female,,SM-DJT8H,DNA,,,,49,A05


In [21]:
# Phenotype sheets
dfs = [pd.read_excel(os.path.join(MANIFESTS_DIR, filename), sheet_name=1)

       for filename in os.listdir(MANIFESTS_DIR)

       ]
df = pd.concat(dfs)
df = df[df['Sample ID'].notnull()]
# Make all values lower case
df['Primary Disease'] = df['Primary Disease'].apply(lambda x: str(x).lower())

print(df.shape)
df.head()

(908, 10)


Unnamed: 0,Age,BSP Notes PT,Ethnicity,Primary Disease,Race,Sample ID,Unnamed: 7,Unnamed: 8,Well,multiple races
1,,,non-Hispanic,unaffected parent,White,SM-DJT8D,h1258FBBc,h1258FBBc1,A01,
2,,"hypoplastic aortic arch, peristant L SVC, 2 vessel cord",non-Hispanic,congenital diaphragmatic hernia - complex,White,SM-DJT8E,m1021LEMa,m1021LEMa1,A02,
3,,,unknown,"congenital diaphragmatic hernia - complex, poc",White,SM-DJT8F,m1029QENa,m1029QENa1,A03,
4,,,non-Hispanic,unaffected parent,White,SM-DJT8G,m1008SAWb,m1008SAWb1,A04,
5,,,non-Hispanic,unaffected parent,White,SM-DJT8H,m1038MWLb,m1038MWLb1,A05,


## Extract

#### Participants, Family Relationships

In [22]:
subject_df = read_subject_data()
subject_df.head()

Unnamed: 0,subject_id,consent,affected_status
0,06-0015,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected
1,07-0016,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected
2,09-0019,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected
3,6-15F,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected
4,6-15M,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected


In [23]:
subject_attr_df = read_subject_attr_data()
subject_attr_df.head()

Unnamed: 0,subject_id,sample_id,body_site,analyte_type,is_tumor
0,06-0015,SM-DH4NH,diaphragm,DNA,N
1,07-0016,SM-DH4NJ,blood,DNA,N
2,09-0019,SM-DH4NK,skin,DNA,N
3,6-15F,SM-DH4NF,blood,DNA,N
4,6-15M,SM-DH4NG,blood,DNA,N


In [24]:
family_df = read_family_data()
family_df.head()

Unnamed: 0,family_id,subject_id,father,mother
0,06-0015,06-0015,6-15F,6-15M
1,07-0016,07-0016,CDH1377,7-16M
2,09-0019,09-0019,CDH1247,CDH1246
3,06-0015,6-15F,,
4,06-0015,6-15M,,


In [25]:
sample_manifest_df = read_sample_manifests()
sample_manifest_df.head()
sample_manifest_df.concentration.unique()
sample_manifest_df.volume.unique()

array([70, 30, 33, 49, 48, 28, 20, 41, 32, 42, 35])

In [26]:
# Participant df
# Merge subject + subject attributes
df1 = pd.merge(subject_df, subject_attr_df, on='subject_id')
df1.head()

# Merge family
df2 = pd.merge(df1, family_df, on='subject_id')
print('{} Participants w/o samples merged'.format(df2.shape))

# Merge proband from sample manifests
participant_df = pd.merge(df2, sample_manifest_df[['sample_id', 'is_proband']], on='sample_id')
print('{} Participants w samples merged'.format(participant_df.shape))
participant_df.head()

(959, 10) Participants w/o samples merged
(863, 11) Participants w samples merged


Unnamed: 0,subject_id,consent,affected_status,sample_id,body_site,analyte_type,is_tumor,family_id,father,mother,is_proband
0,06-0015,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True
1,07-0016,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NJ,blood,DNA,N,07-0016,CDH1377,7-16M,True
2,09-0019,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NK,skin,DNA,N,09-0019,CDH1247,CDH1246,True
3,6-15F,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NF,blood,DNA,N,06-0015,,,False
4,6-15M,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NG,blood,DNA,N,06-0015,,,False


#### Demographic

In [27]:
demographic_df = read_demographic_data()
demographic_df = pd.merge(demographic_df, participant_df, on='subject_id')
demographic_df.head()

Unnamed: 0,subject_id,sex,ethnicity,race,consent,affected_status,sample_id,body_site,analyte_type,is_tumor,family_id,father,mother,is_proband
0,06-0015,F,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True
1,07-0016,M,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NJ,blood,DNA,N,07-0016,CDH1377,7-16M,True
2,09-0019,M,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NK,skin,DNA,N,09-0019,CDH1247,CDH1246,True
3,6-15F,M,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NF,blood,DNA,N,06-0015,,,False
4,6-15M,F,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NG,blood,DNA,N,06-0015,,,False


#### Samples

In [28]:
# Subject sample mappings
subject_sample_df = read_subject_sample_data()
subject_sample_df.head()

# Merge with subject data
df3 = pd.merge(participant_df, subject_sample_df[['subject_id', 'sample_use']], on='subject_id')
print(df3.shape)
df3.head()

# Merge with sample manifests
sample_df = pd.merge(df3, sample_manifest_df, on='sample_id')
print(sample_df.shape)
sample_df.head()

(863, 12)
(863, 16)


Unnamed: 0,subject_id,consent,affected_status,sample_id,body_site,analyte_type,is_tumor,family_id,father,mother,is_proband_x,sample_use,concentration,volume,sample_type,is_proband_y
0,06-0015,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True,Seq_DNA_WholeGenome,50,30,DNA,True
1,07-0016,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NJ,blood,DNA,N,07-0016,CDH1377,7-16M,True,Seq_DNA_WholeGenome,50,30,DNA,True
2,09-0019,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NK,skin,DNA,N,09-0019,CDH1247,CDH1246,True,Seq_DNA_WholeGenome,50,30,DNA,True
3,6-15F,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NF,blood,DNA,N,06-0015,,,False,Seq_DNA_WholeGenome,50,30,DNA,False
4,6-15M,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NG,blood,DNA,N,06-0015,,,False,Seq_DNA_WholeGenome,50,30,DNA,False


#### Aliquots, Seq Experiments
No aliquots for this cohort.
Not sure how to identify unique seq experiments. Use sample id for now

In [29]:
# Merge with aliquots
gf_manifest_df = read_genomic_file_manifest()
seq_exp_df = pd.merge(gf_manifest_df, sample_df[['subject_id', 'sample_id', 'sample_use']], left_on='sample_alias', right_on='subject_id')
# Add unique col
def func(row): return "_".join(['seq_exp_id', str(row.name)])
seq_exp_df['seq_exp_id'] = seq_exp_df.apply(func, axis=1)
seq_exp_df.head()

Unnamed: 0,entity:sample_id,aligned_reads,crai_or_bai_path,cram_or_bam_path,library-1_name,library-2_name,max_insert_size,mean_depth,mean_insert_size,mean_read_length,min_insert_size,sample_alias,total_reads,subject_id,sample_id,sample_use,seq_exp_id
0,RP-1370_06-0015_v2_WGS_GCP,752191693,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/06-0015/v2/06-0015.cram.crai,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/06-0015/v2/06-0015.cram,0221646737_Illumina_P5-Hewij_P7-Liyik,,246055736,19.88,380.82334,151,1,06-0015,752537620,06-0015,SM-DH4NH,Seq_DNA_WholeGenome,seq_exp_id_0
1,RP-1370_07-0016_v2_WGS_GCP,790690515,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/07-0016/v2/07-0016.cram.crai,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/07-0016/v2/07-0016.cram,0221645466_Illumina_P5-Kahiz_P7-Yobex,0207550124_Illumina_P5-Ponox_P7-Xijoy,246001344,19.86,372.97413,151,1,07-0016,788328522,07-0016,SM-DH4NJ,Seq_DNA_WholeGenome,seq_exp_id_1
2,RP-1370_09-0019_v2_WGS_GCP,868284797,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/09-0019/v2/09-0019.cram.crai,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/09-0019/v2/09-0019.cram,0221646782_Illumina_P5-Fezex_P7-Halex,,245816040,19.9,387.47247,151,1,09-0019,869990198,09-0019,SM-DH4NK,Seq_DNA_WholeGenome,seq_exp_id_2
3,RP-1370_6-15F_v2_WGS_GCP,778327664,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/6-15F/v2/6-15F.cram.crai,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/6-15F/v2/6-15F.cram,0221646735_Illumina_P5-Hilec_P7-Hawix,,246870780,19.82,381.81597,151,1,6-15F,776249338,6-15F,SM-DH4NF,Seq_DNA_WholeGenome,seq_exp_id_3
4,RP-1370_6-15M_v2_WGS_GCP,745295971,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/6-15M/v2/6-15M.cram.crai,gs://fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/6-15M/v2/6-15M.cram,0221646736_Illumina_P5-Nawoh_P7-Layet,,246875577,19.91,360.694,151,1,6-15M,740223414,6-15M,SM-DH4NG,Seq_DNA_WholeGenome,seq_exp_id_4


#### Phenotype

In [30]:
# Read phenotype
phenotype_df = read_phenotype_data()
phenotype_df.head()

# Merge with participant df
phenotype_df = pd.merge(phenotype_df, participant_df, on='subject_id')
phenotype_df.head()

Unnamed: 0,subject_id,phenotype,observed,hpo_id,phenotype_id,consent,affected_status,sample_id,body_site,analyte_type,is_tumor,family_id,father,mother,is_proband
0,06-0015,congenital heart defect,negative,,phenotype_0,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True
1,06-0015,central nervous system defect,negative,,phenotype_962,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True
2,06-0015,gastrointestinal defect,negative,,phenotype_1924,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True
3,06-0015,congenital pulmonary sequestration malformation,positive,,phenotype_2886,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True
4,06-0015,congenital birth defect,negative,,phenotype_3848,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,06-0015,6-15F,6-15M,True


In [39]:
phenos = phenotype_df[phenotype_df.observed == 'positive']
phenos.groupby(['phenotype']).count().sort_values(['phenotype_id'], ascending=False)

Unnamed: 0_level_0,subject_id,observed,hpo_id,phenotype_id,consent,affected_status,sample_id,body_site,analyte_type,is_tumor,family_id,father,mother,is_proband
phenotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
congenital heart defect,68,68,0,68,68,68,68,68,68,68,68,68,68,68
gastrointestinal defect,21,21,0,21,21,21,21,21,21,21,21,21,21,21
central nervous system defect,16,16,0,16,16,16,16,16,16,16,16,16,16,16
hypospadias,3,3,3,3,3,3,3,3,3,3,3,3,3,3
congenital pulmonary sequestration malformation,2,2,0,2,2,2,2,2,2,2,2,2,2,2
2 vessel cord,2,2,0,2,2,2,2,2,2,2,2,2,2,2
extralobar pulmonary sequestration,2,2,0,2,2,2,2,2,2,2,2,2,2,2
intrauterine growth restriction,2,2,0,2,2,2,2,2,2,2,2,2,2,2
hydrocele,1,1,0,1,1,1,1,1,1,1,1,1,1,1
hydrocephelus,1,1,0,1,1,1,1,1,1,1,1,1,1,1


In [31]:
outcome_df = read_outcome_data()
outcome_df.head()

Unnamed: 0,subject_id,discharge_status,outcome_id
0,06-0015,deceased,outcome_0
1,07-0016,deceased,outcome_1
2,09-0019,fetal sample,outcome_2
3,6-15F,not applicable,outcome_3
4,6-15M,not applicable,outcome_4


In [32]:
outcome_df = outcome_df[outcome_df.discharge_status != 'not applicable']
outcome_df.discharge_status.unique()
outcome_df.shape

(322, 3)

### Genomic Files

In [34]:
# Merge with sequencing experiments
gf_file_info_df = read_genomic_files_info()
genomic_file_df = pd.merge(gf_file_info_df, seq_exp_df, on='subject_id')
genomic_file_df.head()

Unnamed: 0,uuid,file_name,form,hashes,metadata,file_size,urls,md5sum,file_url,file_format,...,max_insert_size,mean_depth,mean_insert_size,mean_read_length,min_insert_size,sample_alias,total_reads,sample_id,sample_use,seq_exp_id
0,00297e3e-1e19-4056-b5ec-c59226d09dd8,CDH7-34M.cram,object,{'md5': '6e2fe205efc9d82fbd8c24a842873ec6'},{'acls': 'phs001110'},20100318976,[s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH7-34M/v2/CDH7-34M.cram],6e2fe205efc9d82fbd8c24a842873ec6,s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH7-34M/v2/CDH7-34M.cram,cram,...,245913006,19.96,382.62611,151,1,CDH7-34M,831117088,SM-DH75Y,Seq_DNA_WholeGenome,seq_exp_id_584
1,c3fc366f-281e-41d9-8f24-206f7cd0f094,CDH7-34M.cram.crai,object,{'md5': 'd9755b829048bffd091aeee99f77f662'},{'acls': 'phs001110'},1470352,[s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH7-34M/v2/CDH7-34M.cram.crai],d9755b829048bffd091aeee99f77f662,s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH7-34M/v2/CDH7-34M.cram.crai,cram.crai,...,245913006,19.96,382.62611,151,1,CDH7-34M,831117088,SM-DH75Y,Seq_DNA_WholeGenome,seq_exp_id_584
2,0032bad7-90ce-4363-be9c-d73c2fdcb878,CDH1314.cram,object,{'md5': 'e00d2c7e5feac91f6247b7863f97143b'},{'acls': 'phs001110'},19263951402,[s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH1314/v1/CDH1314.cram],e00d2c7e5feac91f6247b7863f97143b,s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH1314/v1/CDH1314.cram,cram,...,243734535,19.95,381.3132,151,1,CDH1314,819844500,SM-DI7HG,Seq_DNA_WholeGenome,seq_exp_id_284
3,e0511ab2-f0e6-4460-8e31-d4b211d7b759,CDH1314.cram.crai,object,{'md5': 'c2a692bf74b2e1cf7c93a67e9443fad1'},{'acls': 'phs001110'},1452924,[s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH1314/v1/CDH1314.cram.crai],c2a692bf74b2e1cf7c93a67e9443fad1,s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH1314/v1/CDH1314.cram.crai,cram.crai,...,243734535,19.95,381.3132,151,1,CDH1314,819844500,SM-DI7HG,Seq_DNA_WholeGenome,seq_exp_id_284
4,003d79b1-0e98-440d-99aa-d0512ccd8d20,CDH09-0022.cram,object,{'md5': 'da9079035a143f50d9c62a8b45d48709'},{'acls': 'phs001110'},17783570533,[s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH09-0022/v2/CDH09-0022.cram],da9079035a143f50d9c62a8b45d48709,s3://kf-seq-data-broad/fc-ff4e8f53-e153-4c78-b630-0ebe66030d80/GMKF_Chung_CDH_WGS_V1/RP-1370/WGS/CDH09-0022/v2/CDH09-0022.cram,cram,...,243785317,19.89,366.04037,151,1,CDH09-0022,813011942,SM-DI8WJ,Seq_DNA_WholeGenome,seq_exp_id_133
