In [2]:
import os
import json
from pprint import pprint
import pandas as pd
pd.set_option('display.max_colwidth', -1)

DATA_DIR = '/Users/singhn4/Projects/kids_first/data/Chung'
DBGAP_DIR = os.path.join(DATA_DIR, 'dbgap')
MANIFESTS_DIR = os.path.join(DATA_DIR, 'manifests')

In [3]:
# Helper functions
def read_json(filepath):
    with open(filepath, 'r') as json_file:
        return json.load(json_file)

def write_json(data, filepath):
    with open(filepath, 'w') as json_file:
        json.dump(data, json_file, sort_keys=True, indent=4, separators=(',', ':'))
        
def cols_to_lower(df):
    df.columns = map((lambda x: x.replace(" ", "_").lower()), df.columns)
        
def dropna_rows_cols(df_func):
    """
    Decorator to drop rows and cols w all nan values
    Replace NaN values with None
    """

    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)

        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df

        # Rows
        df.dropna(how="all", inplace=True)
        # Cols
        df.dropna(how="all", axis=1, inplace=True)
        # Replace NaN values with None
        df = df.where((pd.notnull(df)), None)
        return df

    return wrapper

def reformat_column_names(df_func):
    """
    Decorator to reformat DataFrame column names.

    Replace all column names having whitespace with underscore
    and make lowercase
    """

    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)
        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df
        
        cols_to_lower(df)
        
        return df

    return wrapper

In [4]:
# Create study
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001110',
    'study_version': 'v1.p1',
    'study_name': 'Genomic Analysis of Congenital Diaphragmatic Hernia and Associated Congenital Anomalies',
    'attribution': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs001110.v1.p1'
}
study_df = pd.DataFrame([study])
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'))

# Create investigator
invest = {
    'investigator_name': 'Wendy Chung',
    'institution': 'Columbia University Health Sciences'
}
inv_df = pd.DataFrame([invest])
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'))

In [26]:
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Read in raw study files
    """
    if not filepaths:
        filepaths = os.listdir(DBGAP_DIR)
        filepaths.extend(os.listdir(MANIFESTS_DIR))

    study_files = [{"study_file_name": f}
                   for f in filepaths]
    return pd.DataFrame(study_files)

@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Read study data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'study.txt')
    df = pd.read_csv(filepath)

    return df

@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Read investigator data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'investigator.txt')
    df = pd.read_csv(filepath)

    return df

@reformat_column_names
@dropna_rows_cols
def read_subject_data(filepath=None):
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '4a_dbGaP_SubjectDS_corrected_7-16.xlsx')
    df = pd.read_csv(filepath, delimiter='\t', dtype={'SUBJID': str})
    
    # Decode consent ints to consent strings
    def func(row): 
        _map = {1: "Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)"}
        return _map.get(row.get('CONSENT'))
    df['CONSENT'] = df.apply(func, axis=1)
    
    # Decode affected status ints to strings
    def func(row): 
        _map = {1: True, 2: False}
        return _map.get(row.get('AFFECTED_STATUS'))
    df['AFFECTED_STATUS'] = df.apply(func, axis=1)
    
    return df

@reformat_column_names
@dropna_rows_cols
def read_phenotype_data(filepath=None):
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectPhenotypesDS.txt')
    df = pd.read_csv(filepath, 
                    delimiter='\t', 
                    dtype={'SUBJID': str})
    
    # Decode sex ints to gender strings
    def func(row): 
        _map = {1: "male", 2: "female"}
        return _map[row['Sex']]
    df['Sex'] = df.apply(func, axis=1)

    # Decode affected status ints to strings
    def func(row): 
        _map = {0:'unknown', 1: "not affected", 2: "affected"}
        return _map[row['AFFSTAT']]
    df['AFFSTAT'] = df.apply(func, axis=1)

    # Decode proband ints to booleans
    def func(row): 
        _map = {1: True, 2: False}
        return _map[row['Proband']]
    df['Proband'] = df.apply(func, axis=1)
    
    # Create ethnicity column
    _map = {'Hispanic': 'hispanic or latino'}
    df['ethnicity'] = df['Race'].apply(lambda x: _map.get(x, 'not hispanic or latino'))
    
    
    return df

@reformat_column_names
@dropna_rows_cols
def read_family_data(filepath=None):
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_PedgreeDS.txt')
    df = pd.read_csv(filepath, delimiter='\t', dtype={'SUBJID': str})
    del df['SEX']
    return df

@reformat_column_names
@dropna_rows_cols
def create_participant_data():
    """
    Create participant data from 
    """
    # Subject file
    subject_df = read_subject_data()
    # Phenotype file
    phenotypes_df = read_phenotype_data()
    # Family file
    family_df = read_family_data()
    
    # Merge subject + phenotype
    df1 = pd.merge(subject_df, phenotypes_df, on='subject_id')
    
    # Merge family
    df = pd.merge(df1, family_df, on='subject_id')
    
    return df

def create_diagnosis_df(phenotype_df):
    """
    Create diagnosis df from phenotype df
    """
    def func(row): 
        _map = {'affected':'adolescent idiopathic scoliosis', 
                'not affected': None}
        return _map.get(row['affstat'], row['affstat'])
    phenotype_df['diagnosis'] = phenotype_df.apply(func, axis=1)
    
    return phenotype_df[['subject_id', 'diagnosis']]

def create_phenotype_df(phenotype_df):
    """
    Create phenotype df from original phenotype_df
    """
    # Extract columns
    phenotype_df = phenotype_df[['subject_id', 'affstat']]
    # Drop unknowns
    phenotype_df = phenotype_df[phenotype_df.affstat != 'unknown']
    
    # Add columns
    def func(row): 
        _map = {'affected':'positive', 
                'not affected': 'negative'}
        return _map.get(row['affstat'], row['affstat'])
    
    phenotype_df['observed'] = phenotype_df.apply(func, axis=1)
    phenotype_df['hpo_id'] = 'HP:0002650'
    phenotype_df['phenotype'] = 'adolescent idiopathic scoliosis'
    return phenotype_df
df = create_diagnosis_df(read_phenotype_data())
df.diagnosis.unique()

FileNotFoundError: File b'/Users/singhn4/Projects/kids_first/data/Chung/dbgap/HL132375-01A1_V2_SubjectPhenotypesDS.txt' does not exist

In [96]:
@reformat_column_names
@dropna_rows_cols
def read_sample_attr_data(filepath=None):
    """
    Read sample attributes file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SampleAttributesDS.txt')
    return pd.read_csv(filepath, delimiter='\t')

@reformat_column_names
@dropna_rows_cols
def read_subject_sample_data(filepath=None):
    """
    Read subject sample mapping file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectSampleMappingDS.txt')
    return pd.read_csv(filepath, delimiter='\t')

# Sample attributes file
sample_attr_df = read_sample_attr_data()
sample_attr_df.shape
# Subject sample file
subject_sample_df = read_subject_sample_data()
# Subject file
subject_df = read_subject_data()

# Merge sample attributes w subject sample
df1 = pd.merge(sample_attr_df, subject_sample_df, on='sample_id')
# Merge sample with subject
sample_df = pd.merge(df1, subject_df, on='subject_id')
sample_df

@reformat_column_names
@dropna_rows_cols
def read_seq_exp_data(filepath=None):
    """
    Read sequencing experiment data
    """
    if not filepath:
        filepath = os.path.join(MANIFESTS_DIR, 'manifest_171210.csv')

    df = pd.read_csv(filepath)
    df['Sample Description'] = df['Sample Description'].apply(lambda x: x.split(':')[-1].strip())
    df.describe(include=['O']).T.sort_values('unique', ascending=False)
    
    # Subject sample mapping
    filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectSampleMappingDS.txt')
    subject_sample_df = pd.read_csv(filepath, delimiter='\t')

    # Merge with subject samples
    df = pd.merge(subject_sample_df, df, left_on='SAMPLE_ID', right_on='Sample Description')

    # Add unique col
    def func(row): return "_".join(['seq_exp', str(row.name)])
    df['seq_exp_id'] = df.apply(func, axis=1)

    return df
df = read_seq_exp_data()
df.shape

(395, 11)

## Explore

In [12]:
# Db gap files
files = {f:os.path.join(DBGAP_DIR, f) for f in os.listdir(DBGAP_DIR)}
pprint(list(files.keys()))

['2a_dbGaP_SubjectPhenotypesDS.xlsx',
 '2b_dbGaP_SubjectPhenotypesDD.xlsx',
 '3a_dbGaP_SubjectAttributesDS_corrected.6.12.xlsx',
 '3b_dbGaP_SubjectAttributesDD.xlsx',
 '4a_dbGaP_SubjectDS_corrected_7-16.xlsx',
 '4b_dbGaP_SubjectDD_corrected_6_12.xlsx',
 '5a_dbGaP_SubjectSampleMappingDS cumulative.xlsx',
 '5b_dbGaP_SubjectSampleMappingDD.xlsx',
 '6a_dbGaP_PedigreeDS_corrected.6.12.xlsx',
 '6b_dbGaP_PedigreeDD.xlsx']


### Subject 

In [13]:
# Description of data
df1 = pd.read_excel(files['4b_dbGaP_SubjectDD_corrected_6_12.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df1

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8
0,SUBJECT_ID,De-identified Subject ID,,encoded value,,,Collected at enrollment,,
1,CONSENT,Consent group as determined by DAC,,encoded value,,,Collected at enrollment,"1=Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",
2,AFFECTED_STATUS,Case control status of the subject for congenital diaphragmatic hernia/defects,,encoded value,,,Collected at enrollment,1=affected,2=unaffected


In [15]:
# Subject data
df = pd.read_excel(files['4a_dbGaP_SubjectDS_corrected_7-16.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,SUBJECT_ID,CONSENT,AFFECTED_STATUS
0,06-0015,1,1
1,07-0016,1,1
2,09-0019,1,1
3,6-15F,1,2
4,6-15M,1,2


In [25]:
print(df['CONSENT'].unique())
print(df['AFFECTED_STATUS'].unique())
df.describe(include=['O']).T.sort_values('unique', ascending=False)

[1]
[1 2]


Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,CDH02-0069,1


In [49]:
# Subject attributes data description
df = pd.read_excel(files['3b_dbGaP_SubjectAttributesDD.xlsx'])
df

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,SAMPLE_ID,De-identified Sample ID,,string,,,Collected at enrollment,,,,,,
1,body_site,Body site where sample was collected,,encoded value,,,Collected at enrollment,B=blood,SK=skin,D=diaphragm,SV=saliva,A=amniocytes,M=muscle
2,analyte_type,Analyte Type,,encoded value,,,Collected at enrollment,,,,,,
3,is_tumor,Tumor status,,encoded value,,,Collected at enrollment,Y=Is Tumor,N=Is not a tumor,,,,
4,SUBJECT_ID,De-identified Subject ID,,string,,,,,,,,,


In [46]:
# Subject attributes
df = pd.read_excel(files['3a_dbGaP_SubjectAttributesDS_corrected.6.12.xlsx'], delimiter='\t')
df.head()

Unnamed: 0,SUBJECT_ID,SAMPLE_ID,body_site,analyte_type,is_tumor
0,06-0015,SM-DH4NH,D,DNA,N
1,07-0016,SM-DH4NJ,B,DNA,N
2,09-0019,SM-DH4NK,SK,DNA,N
3,6-15F,SM-DH4NF,B,DNA,N
4,6-15M,SM-DH4NG,B,DNA,N


In [48]:
print(df.body_site.unique())
df.describe(include=['O']).T.sort_values('unique', ascending=False)

['D' 'B' 'SK' 'M' 'A']


Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,CDH02-0069,1
SAMPLE_ID,962,962,SM-DH5A6,1
body_site,962,5,B,945
analyte_type,962,1,DNA,962
is_tumor,962,1,N,962


### Family/Pedigree

In [31]:
# Data description
df = pd.read_excel(files['6b_dbGaP_PedigreeDD.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8,Unnamed: 9
0,FAMILY_ID,Family_ID,,encoded value,,,Collected at enrollment,,,
1,SUBJECT_ID,De-identified Subject ID,,encoded value,,,Collected at enrollment,,,
2,Father,ID of father,,encoded value,,,Collected at enrollment,,,
3,Mother,ID of mother,,encoded value,,,Collected at enrollment,,,
4,SEX,Gender of participant,,encoded value,,,Collected at enrollment,M=male,F=female,U=unknown


In [32]:
df = pd.read_excel(files['6a_dbGaP_PedigreeDS_corrected.6.12.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,Family_id,SUBJECT_ID,Father,Mother,SEX
0,06-0015,06-0015,6-15F,6-15M,F
1,07-0016,07-0016,CDH1377,7-16M,M
2,09-0019,09-0019,CDH1247,CDH1246,M
3,06-0015,6-15F,,,M
4,06-0015,6-15M,,,F


In [33]:
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,CDH02-0069,1
Family_id,962,321,02-0050,3
Father,320,320,CDH957,1
Mother,320,320,CDH7-34M,1
SEX,962,3,M,518


### Phenotypes

In [36]:
# Data description
df = pd.read_excel(files['2b_dbGaP_SubjectPhenotypesDD.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,SUBJECT_ID,De-identified Subject ID,,string,,,Collected at enrollment,,,,,
1,SEX,Gender of participant,,encoded value,,,Collected at enrollment,M=Male,F=Female,U=unknown,,
2,ISOLATED,If the subject has a second birth defect,,encoded value,,,Collected at enrollment,1=Isolated,2=Non-isolated,U=unknown,NA=Not applicable,
3,CHD,If the subject has a congenital heart defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
4,CNS,If the subject has a central nervous system defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
5,GI,If the subject has a gastrointestinal defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
6,other_cong_malf_1,Other birth defects (1),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
7,other_cong_malf_2,Other birth defects (2),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
8,other_cong_malf_3,Other birth defects (3),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
9,discharge_status,Discharge status,,encoded value,,,Collected at enrollment,1=Alive,4=Deceased,0=Fetal sample,8=unknown,NA=Not applicable


In [38]:
# Data
df = pd.read_excel(files['2a_dbGaP_SubjectPhenotypesDS.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,SUBJECT_ID,SEX,ISOLATED,CHD,CNS,GI,other_cong_malf_1,other_cong_malf_2,other_cong_malf_3,discharge_status,Ethnicity,Race
0,06-0015,F,2.0,0.0,0.0,0.0,congenital pulmonary sequestration malformation,0,0,1.0,non-Hispanic,white
1,07-0016,M,1.0,0.0,0.0,0.0,0,0,0,1.0,non-Hispanic,white
2,09-0019,M,2.0,1.0,1.0,0.0,pulmonary hypoplasia,"Facial & Body deformities suggestive of Potter sequence: wide inter brain weight, canthal distance; flattened, widened nose; micrognathia; flattened pinnae w/ somewhat bulbous earlobes; varus deformity with metatarsus adductus, feet, severe",small kidneys; renal proximal tubular epithelial degeneration and sloughing with cortical medullary junction congestion,4.0,non-Hispanic,white
3,6-15F,M,,,,,,,,,non-Hispanic,white
4,6-15M,F,,,,,,,,,non-Hispanic,white


In [39]:
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,CDH02-0069,1
other_cong_malf_1,321,61,0,254
other_cong_malf_2,320,25,0,296
other_cong_malf_3,319,10,0,310
Race,962,9,White,450
Ethnicity,962,5,Non-Hispanic,412
SEX,962,3,M,518
ISOLATED,323,3,1,183


### Samples

In [50]:
# Sample attributes data description
filepath = os.path.join(DBGAP_DIR, '5b_dbGaP_SubjectSampleMappingDD.xlsx')
df = pd.read_excel(filepath, delimiter='\t')
df.head()

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES
0,SUBJECT_ID,De-identified Subject ID,,encoded value,,,Collected at enrollment,
1,SAMPLE_ID,De-identified Sample ID,,encoded value,,,Collected at enrollment,
2,SAMPLE_USE,Sample use,,encoded value,,,Collected at enrollment,


In [51]:
# Sample attributes data
filepath = os.path.join(DBGAP_DIR, '5a_dbGaP_SubjectSampleMappingDS cumulative.xlsx')
df = pd.read_excel(filepath, delimiter='\t')
df.head()

Unnamed: 0,SUBJECT_ID,SAMPLE_ID,SAMPLE_USE
0,1,1,Seq_DNA_WholeGenome
1,2,2,Seq_DNA_WholeGenome
2,7,7,Seq_DNA_WholeGenome
3,8,8,Seq_DNA_WholeGenome
4,10,10,Seq_DNA_WholeGenome


In [52]:
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,1556,1556,4-73m,1
SAMPLE_ID,1556,1556,4-73m,1
SAMPLE_USE,1556,1,Seq_DNA_WholeGenome,1556


## Extract

In [61]:
@reformat_column_names
@dropna_rows_cols
def read_subject_data(filepath=None):
    """
    Read subject data file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '4a_dbGaP_SubjectDS_corrected_7-16.xlsx')
    df = pd.read_excel(filepath, dtype={'SUBJECT_ID': str})
    # Decode consent ints to consent strings
    def func(row): 
        _map = {1: "Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)"}
        return _map.get(row['CONSENT'])
    df['CONSENT'] = df.apply(func, axis=1)

    # Decode affected status ints to strings
    def func(row): 
        _map = {0:'unknown', 1: "affected", 2: "unaffected"}
        return _map.get(row['AFFECTED_STATUS'])
    df['AFFECTED_STATUS'] = df.apply(func, axis=1)
    return df
subject_df = read_subject_data()
subject_df.head()

Unnamed: 0,subject_id,consent,affected_status
0,06-0015,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected
1,07-0016,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected
2,09-0019,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected
3,6-15F,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected
4,6-15M,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected


In [69]:
@reformat_column_names
@dropna_rows_cols
def read_subject_attr_data(filepath=None):
    """
    Read subject attributes file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '3a_dbGaP_SubjectAttributesDS_corrected.6.12.xlsx')
    df = pd.read_excel(filepath, dtype={'SUBJECT_ID': str})
    # Decode body_site chars to strings
    def func(row): 
        _map = {'B':'blood', 'SK': 'skin', 'D': 'diaphragm', 'SV': 'saliva', 'A': 'amniocytes', 'M': 'amniocytes'}
        return _map.get(row['body_site'])
    df['body_site'] = df.apply(func, axis=1)
    return df
subject_attr_df = read_subject_attr_data()
subject_attr_df.head()

Unnamed: 0,subject_id,sample_id,body_site,analyte_type,is_tumor
0,06-0015,SM-DH4NH,diaphragm,DNA,N
1,07-0016,SM-DH4NJ,blood,DNA,N
2,09-0019,SM-DH4NK,skin,DNA,N
3,6-15F,SM-DH4NF,blood,DNA,N
4,6-15M,SM-DH4NG,blood,DNA,N


In [81]:
@reformat_column_names
@dropna_rows_cols
def read_family_data(filepath=None):
    """
    Read pedigree data
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '6a_dbGaP_PedigreeDS_corrected.6.12.xlsx')
    df = pd.read_excel(filepath)
    del df['SEX']
    
    # Add is_proband column
    def func(row): return pd.notnull(row['Mother']) and pd.notnull(row['Father'])
    df['is_proband'] = df.apply(func, axis=1)
    
    return df
family_df = read_family_data()
family_df.head()

Unnamed: 0,family_id,subject_id,father,mother,is_proband
0,06-0015,06-0015,6-15F,6-15M,True
1,07-0016,07-0016,CDH1377,7-16M,True
2,09-0019,09-0019,CDH1247,CDH1246,True
3,06-0015,6-15F,,,False
4,06-0015,6-15M,,,False


In [84]:
# Participant df
# Merge subject + subject attributes
df1 = pd.merge(subject_df, subject_attr_df, on='subject_id')

# Merge family
participant_df = pd.merge(df1, family_df[['subject_id', 'is_proband', 'family_id']], on='subject_id')
participant_df.head()

Unnamed: 0,subject_id,consent,affected_status,sample_id,body_site,analyte_type,is_tumor,is_proband,family_id
0,06-0015,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,True,06-0015
1,07-0016,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NJ,blood,DNA,N,True,07-0016
2,09-0019,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NK,skin,DNA,N,True,09-0019
3,6-15F,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NF,blood,DNA,N,False,06-0015
4,6-15M,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NG,blood,DNA,N,False,06-0015


In [99]:
@reformat_column_names
@dropna_rows_cols
def read_demographic_data(filepath=None):
    """
    Read demographic data from phenotype file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '2a_dbGaP_SubjectPhenotypesDS.xlsx')
    df = pd.read_excel(filepath)
    # Make all values lower case
    for col in ['Ethnicity', 'Race']:
        df[col] = df[col].apply(lambda x: str(x).lower().strip())
    return df[['SUBJECT_ID', 'SEX', 'Ethnicity', 'Race']]
demographic_df = read_demographic_data()
demographic_df = pd.merge(demographic_df, participant_df, on='subject_id')
demographic_df.head()

Unnamed: 0,subject_id,sex,ethnicity,race,consent,affected_status,sample_id,body_site,analyte_type,is_tumor,is_proband,family_id
0,06-0015,F,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NH,diaphragm,DNA,N,True,06-0015
1,07-0016,M,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NJ,blood,DNA,N,True,07-0016
2,09-0019,M,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",affected,SM-DH4NK,skin,DNA,N,True,09-0019
3,6-15F,M,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NF,blood,DNA,N,False,06-0015
4,6-15M,F,non-hispanic,white,"Disease-Specific (Congenital Diaphragmatic Hernia, COL, GSO, RD) (DS-CDH-COL-GSO-RD)",unaffected,SM-DH4NG,blood,DNA,N,False,06-0015


In [97]:
# Phenotype data description
df = pd.read_excel(files['2b_dbGaP_SubjectPhenotypesDD.xlsx'], delimiter='\t', dtype={'SUBJID': str})
df

Unnamed: 0,VARNAME,VARDESC,DOCFILE,TYPE,VARIABLE_SOURCE,SOURCE_VARIABLE_ID,COLLINTERVAL,VALUES,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,SUBJECT_ID,De-identified Subject ID,,string,,,Collected at enrollment,,,,,
1,SEX,Gender of participant,,encoded value,,,Collected at enrollment,M=Male,F=Female,U=unknown,,
2,ISOLATED,If the subject has a second birth defect,,encoded value,,,Collected at enrollment,1=Isolated,2=Non-isolated,U=unknown,NA=Not applicable,
3,CHD,If the subject has a congenital heart defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
4,CNS,If the subject has a central nervous system defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
5,GI,If the subject has a gastrointestinal defect,,encoded value,,,Collected at enrollment,1=yes,0=No,NA=Not applicable,,
6,other_cong_malf_1,Other birth defects (1),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
7,other_cong_malf_2,Other birth defects (2),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
8,other_cong_malf_3,Other birth defects (3),,encoded value,,,Collected at enrollment,text,0=No,NA=Not applicable,,
9,discharge_status,Discharge status,,encoded value,,,Collected at enrollment,1=Alive,4=Deceased,0=Fetal sample,8=unknown,NA=Not applicable


In [158]:
# Phenotype
filepath = os.path.join(DBGAP_DIR, '2a_dbGaP_SubjectPhenotypesDS.xlsx')
df = pd.read_excel(filepath)
df.drop(['Ethnicity', 'Race', 'SEX', 'discharge_status', 'ISOLATED'], inplace=True, axis=1)
# Reshape to build the phenotypes df
cols = df.columns.tolist()[1:]
phenotype_df = pd.melt(df, id_vars='SUBJECT_ID', value_vars=cols,
                       var_name='phenotype', value_name='value')

# Drop rows where value is NaN
phenotype_df = phenotype_df[pd.notnull(phenotype_df['value'])]

# Decode phenotypes to descriptive strings
def func(row):
    _map = {0: 'no', 1: 'yes'}
    return _map.get(row['value'], row['value'])
phenotype_df['value'] = phenotype_df.apply(func, axis=1)

# Decode phenotypes to descriptive strings
def func(row):
    # Always take most specific value
    if row['value'] not in ['yes', 'no']:
        val = row['value']
    else:
        _map = {'CHD': 'congenital heart defect', 'CNS': 'central nervous system defect', 
            'GI': 'gastrointestinal defect'}
        val = _map.get(row['phenotype'], row['phenotype'])
    return val
phenotype_df['phenotype'] = phenotype_df.apply(func, axis=1)

# Set observed
phenotype_df['observed'] = phenotype_df['value'].apply(lambda x: 'positive' if x != 'no' else 'negative')
del phenotype_df['value']
phenotype_df

Unnamed: 0,SUBJECT_ID,phenotype,observed
0,06-0015,congenital heart defect,negative
1,07-0016,congenital heart defect,negative
2,09-0019,congenital heart defect,positive
6,c1068ANDa,congenital heart defect,negative
9,c1072HICa,congenital heart defect,negative
12,c1086GARa,congenital heart defect,negative
15,c1087AITa,congenital heart defect,negative
18,c1088SIDa,congenital heart defect,negative
21,c1094WELa,congenital heart defect,negative
24,c1100KENa,congenital heart defect,negative


In [123]:
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,962,962,CDH02-0069,1
other_cong_malf_1,321,61,0,254
other_cong_malf_2,320,25,0,296
other_cong_malf_3,319,10,0,310
Race,962,9,White,450
Ethnicity,962,5,Non-Hispanic,412
SEX,962,3,M,518
ISOLATED,323,3,1,183


In [122]:
@reformat_column_names
@dropna_rows_cols
def read_outcome_data(filepath=None):
    """
    Read outcome data from phenotype file
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, '2a_dbGaP_SubjectPhenotypesDS.xlsx')
    df = pd.read_excel(filepath)
    
    # Replace NaN values with None
    df['discharge_status'] = df['discharge_status'].where((pd.notnull(df['discharge_status'])),999)
    
    # Map discharge status
    # 1=Alive 4=Deceased 0=Fetal sample 8=unknown NA=Not applicable
    def func(row): 
        _map = {0:'alive', 1: 'deceased', 4:'fetal sample', 8: 'unknown'}
        return _map.get(int(row['discharge_status']), 'not applicable')
    df['discharge_status'] = df.apply(func, axis=1)
    return df[['SUBJECT_ID', 'discharge_status']]
outcome_df = read_outcome_data()
outcome_df.head()

Unnamed: 0,subject_id,discharge_status
0,06-0015,deceased
1,07-0016,deceased
2,09-0019,fetal sample
3,6-15F,not applicable
4,6-15M,not applicable
