In [1]:
import os
import pandas as pd

DATA_DIR = '/Users/singhn4/Projects/kids_first/data/Seidman_2015'
DBGAP_DIR = os.path.join(DATA_DIR, 'dbgap')
ALIQUOT_SHIP_DIR = os.path.join(DATA_DIR, 'manifests', 'shipping')

### Extraction - Methods

In [2]:
# Helper functions
def dropna_rows_cols(df_func):
    """
    Decorator to drop rows and cols w all nan values
    """

    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)

        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df

        # Rows
        df.dropna(how="all", inplace=True)
        # Cols
        df.dropna(how="all", axis=1, inplace=True)
        return df

    return wrapper

def reformat_column_names(df_func):
    """
    Decorator to reformat DataFrame column names.

    Replace all column names having whitespace with underscore
    and make lowercase
    """

    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)
        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df
        df.columns = map((lambda x: x.replace(" ", "_").lower()),
                         df.columns)
        return df

    return wrapper

In [3]:
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Read in raw study files
    """
    if not filepaths:
        filepaths = os.listdir(DBGAP_DIR)

    study_files = [{"study_file_name": f}
                   for f in filepaths if 'dbGaP' in f]
    return pd.DataFrame(study_files)

@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Read study data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'study.txt')
    df = pd.read_csv(filepath)

    return df

@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Read investigator data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'investigator.txt')
    df = pd.read_csv(filepath)

    return df

@reformat_column_names
@dropna_rows_cols
def read_family_data(filepath=None):
    """
    Read family data for all participants
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR,
                                '7a_dbGaP_PedigreeDS.txt')
    df = pd.read_csv(filepath,
                     delimiter='\t',
                     dtype={'SUBJID': str})
    # Subset of columns
    df.drop(['SEX'], axis=1, inplace=True)

    # Add proband column
    def func(row): return bool(row['MOTHER'] and row['FATHER'])
    df['is_proband'] = df.apply(func, axis=1)

    return df

In [4]:
# Find the potential negative/positive observed vaue
from pandas.api.types import is_numeric_dtype
filepath = os.path.join(DBGAP_DIR, '3a_dbGaP_SubjectPhenotypes_ExtracardiacFindingsDS.txt')

# Read csv
df = pd.read_csv(filepath,
             delimiter='\t',
             dtype={'SUBJID': str})

cols = df.columns.tolist()[1:]
for col in cols:
    if not is_numeric_dtype(df[col]):
        print(df[col].unique())


['No' 'Yes']
['None' 'No' 'Yes']
['None' 'No' 'Yes']
['Unknown' 'None' 'Yes' 'No/Not checked']
['None' 'abnormal fetal calvarium'
 'Dolichocephaly with bony synostosis of sagittal structure.'
 'dolichochocehaly' 'unilateral coronal synostosis']
['Unknown' 'None' 'Yes' 'No/Not checked']
['Unknown' 'None' 'Yes' 'No/Not checked']
['Unknown' 'None' 'Yes']
['Unknown' 'None' 'Yes' 'No/Not checked']
['None' 'frontal bossing (occipital protuberance)'
 'cephalohematoma, Caput Succedaneum'
 'short, sloping forehead with prominent occiput' 'bitemporal hallowing'
 'small anterior fontanelle' 'congenital scar from eye to scalp per Mom'
 'brachycephaly' 'parietal encephalocele, plagiocephaly'
 'overriding coronal sutures'
 'medical record indicates that she had a cyst in her brain'
 'narrow forehead' 'metopic prominence' 'plagiocephaly' 'Plagiocephaly'
 'sloped forehead, narrow' 'flat face' '3 fontanelles'
 'Plagiocephaly, triangular shaped in coronal plane' 'bitemporal narrowing']
['None' 'No' 'Yes

In [140]:
@reformat_column_names
@dropna_rows_cols
def read_phenotype_data(filepath=None):
    """
    Read phenotype data
    """
    filepath = os.path.join(
    DBGAP_DIR,
    '3a_dbGaP_SubjectPhenotypes_ExtracardiacFindingsDS.txt')

    # Read csv
    df = pd.read_csv(filepath,
                     delimiter='\t',
                     dtype={'SUBJID': str})

    # Convert age years to days
    df['LATEST_EXAM_AGE'] = df["LATEST_EXAM_AGE"].apply(
        lambda x: float(x) * 365)
    age_at_event_days = df[['LATEST_EXAM_AGE', 'SUBJID']]

    # Select string based phenotypes
    df = df.select_dtypes(include='object')

    # Make all values lower case
    for col in df.columns.tolist():
        df[col] = df[col].apply(lambda x: str(x).lower())

    # Reshape to build the phenotypes df
    cols = df.columns.tolist()[2:]
    phenotype_cols = [col for col in cols if not col.startswith('OTHER')]
    phenotype_df = pd.melt(df, id_vars='SUBJID', value_vars=phenotype_cols,
                           var_name='phenotype', value_name='observed')

    # Merge with HPO mapping
    mapping_filepath = os.path.join(DATA_DIR, 'phenotype_hpo_mapping.txt')
    if os.path.isfile(mapping_filepath):
        hpo_df = pd.read_csv(mapping_filepath)
        new_df = pd.merge(hpo_df, phenotype_df, on='phenotype')
        new_df.rename(columns={"Yes": "hpo_id"}, inplace=True)
        phenotype_df = new_df[['hpo_id', 'phenotype', 'observed','SUBJID']]

    # Remove unkonwns
    unknown_values = ['none', 'unknown', 'no/not checked' 'not applicable', 'absent']
    phenotype_df = phenotype_df[phenotype_df['observed'].apply(lambda x: x not in unknown_values)]

    # Map to positive/negative
    def func(row): 
        return 'negative' if row['observed'] == 'no' else 'positive'
    phenotype_df['observed'] = phenotype_df.apply(func, axis=1)

    # Clean up hpo_id
    phenotype_df.loc[(phenotype_df.hpo_id == 'None') | (phenotype_df.hpo_id == '--'), 'hpo_id'] = None

    # Merge back in age at event in days
    phenotype_df = pd.merge(phenotype_df, age_at_event_days, on='SUBJID')

    # Add unique col
    def func(row): return "_".join(['phenotype', str(row.name)])
    phenotype_df['phenotype_id'] = phenotype_df.apply(func, axis=1)

    return phenotype_df

['none' 'no' 'yes' 'not applicable' 'no/not checked' 'unknown'
 'congenital hypoglycemia' 'hypothyroidism, vit d deficiency' 'hypothyroid'
 'late onset of puberty/growth hormone deficiency'
 'hypothyroidism (update )' 'congenital hypothyroidism' 'short stature'
 'hypothyroidism' 'central hypothyrodism' 'hypercholesterolemia'
 'hypothyroidism, hyperbilirubinemia' 'hypoglycemia, hypoparathyroidism'
 'delayed puberty-no facial hair,pubic or underarm hair.'
 'low growth factor/idiopathic short stature' 'pheochromocytoma   surgery '
 'hypoglycemia' 'immune globulin deficiency for unknown reasons'
 'exposed to toxoplasma in utero - born negative'
 'juvenile rheumatoid arthritis' 'abnormal fetal calvarium'
 'dolichocephaly with bony synostosis of sagittal structure.'
 'dolichochocehaly' 'unilateral coronal synostosis' 'hypoplastic' 'absent'
 'other']


In [6]:
# Gender
@reformat_column_names
@dropna_rows_cols
def read_gender_data(filepath=None):
    """
    Read gender data for all subjects
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR,
                                '3a_dbGaP_SubjectPhenotypes_GenderDS.txt')
    df = pd.read_csv(filepath,
                     delimiter='\t',
                     dtype={'SUBJID': str})

    return df

In [7]:
# Demographic
@reformat_column_names
@dropna_rows_cols
def read_demographic_data(filepaths=None):
    """
    Read demographic data for all subjects (child, mother, father)
    """
    if not filepaths:
        filenames = ['3a_dbGaP_SubjectPhenotypes_DemographicsDS.txt',
                     '3a_dbGaP_SubjectPhenotypes_MaternalDemographicsDS.txt',
                     '3a_dbGaP_SubjectPhenotypes_PaternalDemographicsDS.txt']

        filepaths = [os.path.join(DBGAP_DIR, filename)
                     for filename in filenames
                     ]

    child_demo_df = pd.read_csv(os.path.join(filepaths[0]),
                                delimiter='\t',
                                dtype={'SUBJID': str})

    mother_demo_df = pd.read_csv(os.path.join(filepaths[1]),
                                 delimiter='\t',
                                 dtype={'SUBJID': str})

    father_demo_df = pd.read_csv(os.path.join(filepaths[2]),
                                 delimiter='\t',
                                 dtype={'SUBJID': str})

    # Combine demographics of all subjects
    subject_demo_df = pd.concat(
        [child_demo_df, mother_demo_df, father_demo_df])

    subject_demo_df.drop_duplicates('SUBJID', inplace=True)
    
    # Subset of columns
    subject_demo_df = subject_demo_df[['RACE', 'ETHNICITY', 'SUBJID']]

    def func(row): return "_".join(['demographic', str(row.name)])
    subject_demo_df['demographic_id'] = subject_demo_df.apply(func, axis=1)


    return subject_demo_df

In [8]:
# Diagnosis
@reformat_column_names
@dropna_rows_cols
def read_diagnosis_data(filepath=None):
    """
    Read diagnoses data for all subjects
    """
    if not filepath:
        filename = '3a_dbGaP_SubjectPhenotypes_PatientDiagnosisDS.txt'
        filepath = os.path.join(DBGAP_DIR, filename)

    diagnosis_df = pd.read_csv(filepath,
                               delimiter='\t',
                               dtype={'SUBJID': str})

    def func(row): return "_".join(['diagnosis', str(row.name)])
    diagnosis_df['diagnosis_id'] = diagnosis_df.apply(func, axis=1)

    return diagnosis_df

In [9]:
# Sample
@reformat_column_names
@dropna_rows_cols
def read_subject_sample_data(filepath=None):
    """
    Read sample metadata for all subjects
    """
    if not filepath:
        filename = '6a_dbGaP_SubjectSampleMappingDS.txt'
        filepath = os.path.join(DBGAP_DIR, filename)

    subject_sample_df = pd.read_csv(filepath,
                                    delimiter='\t',
                                    dtype={'SUBJID': str})
    subject_sample_df.drop_duplicates('SUBJID', inplace=True)

    return subject_sample_df

In [10]:
# Aliquot
@reformat_column_names
@dropna_rows_cols
def read_sample_shipping_manifest_data(*filepaths):
    """
    Read shipping manifest for samples (from PI/sample source center)
    """
    if not filepaths:
        filepaths = [os.path.join(ALIQUOT_SHIP_DIR, filename)

                     for filename in os.listdir(ALIQUOT_SHIP_DIR)
                     ]

    # Combine all manifest files
    dfs = [pd.read_excel(filepath,
                         delimiter='/t',
                         dtype={'*barcode': str},
                         skiprows=[0, 1],
                         header=[6])

           for filepath in filepaths

           if os.path.basename(filepath).startswith("PCGC")

           ]
    df = pd.concat(dfs)

    # Rename columns
    df.columns = map((lambda x: x.lower().lstrip("*")), df.columns)
    
    # Subset of columns
    df = df[['barcode',
             'external_id',
             'sample_collection_site',
             'sample_role',
             'concentration_ng_per_ul',
             'initial_volume_microliters']]

    # Drop rows where id cols are nan
    id_cols = [col for col in df.columns if "id" in col]
    df.dropna(subset=id_cols, inplace=True)

    return df

In [11]:
# Sequencing experiment (from read group metadata)
@reformat_column_names
@dropna_rows_cols
def read_seq_experiment_data(filepath=None):
    if not filepath:
        filepath = os.path.join(DATA_DIR, "seidman_metadata.xlsx")

    df = pd.read_excel(filepath, dtype={"date": str})
    # Rename some columns
    df.rename(columns={"library_name (in original BAM header)":
                       "library_name",
                       "barcode": "rg_barcode"}, inplace=True)
    df["read_length"] = df["read_length"].apply(
        lambda x: int(x.split("x")[0]))
    
    # Create new columns
    df['max_insert_size'] = df['insert_size'].max()
    df['mean_insert_size'] = df['insert_size'].mean()
    df['mean_read_length'] = df['read_length'].mean()
    df['total_reads'] = df['read_length'].count()
    
    # Subset of columns
    df = df[['sample_name',
             'library_name',
             'rg_barcode',
             'run_name',
             'read_length',
             'date',
             'library_strategy',
             'library_source',
             'library_selection',
             'insert_size',
             'instrument',
             'library_layout',
             'max_insert_size',
             'mean_insert_size',
             'mean_read_length',
             'total_reads']]
    
    return df

### Extraction - Execution

In [12]:
# Study files

In [13]:
study_files_df = read_study_file_data()
study_files_df.head()

Unnamed: 0,study_file_name
0,3a_dbGaP_SubjectPhenotypes_CardiacAbnormalitie...
1,3a_dbGaP_SubjectPhenotypes_CardiacSurgeriesPro...
2,3a_dbGaP_SubjectPhenotypes_CatheterizationDS.txt
3,3a_dbGaP_SubjectPhenotypes_CopynumberResultsDS...
4,3a_dbGaP_SubjectPhenotypes_DemographicsDS.txt


In [14]:
# Family 
family_df = read_family_data()
family_df.head()

Unnamed: 0,famid,subjid,mother,father,is_proband
0,1,210,211,212,True
1,1,211,0,0,False
2,1,212,0,0,False
3,2,272,273,274,True
4,2,273,0,0,False


In [15]:
# Phenotypes
phenotype_df = read_phenotype_data()
phenotype_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,hpo_id,phenotype,observed,latest_exam_age,subjid,phenotype_id
0,,ABDOMINAL_ABNORMALITIES_PRESENT,none,,3673,phenotype_0
1,,ABDOMINAL_ABNORMALITIES_PRESENT,no,,21296,phenotype_1
2,,ABDOMINAL_ABNORMALITIES_PRESENT,no,,9653,phenotype_2
3,,ABDOMINAL_ABNORMALITIES_PRESENT,none,,15386,phenotype_3
4,,ABDOMINAL_ABNORMALITIES_PRESENT,none,,26312,phenotype_4


In [16]:
# Gender
gender_df = read_gender_data()
gender_df.head()

Unnamed: 0,subjid,sex
0,5787,M
1,16252,F
2,26290,F
3,14665,F
4,2248,M


In [17]:
# Demographic
demographic_df = read_demographic_data()
demographic_df.head()

Unnamed: 0,race,ethnicity,subjid,demographic_id
0,White,No,3673,demographic_0
1,Black or African American,No,21296,demographic_1
2,White,No,9653,demographic_2
3,White,No,15386,demographic_3
4,White,Yes,26312,demographic_4


In [18]:
# Diagnosis data
diagnosis_df = read_diagnosis_data()
diagnosis_df.head()
# diagnosis_df[diagnosis_df['subjid'] == '279']

Unnamed: 0,subjid,diagnosis,diagnosis_id
0,3673,L-loop corrected transposition of the great ar...,diagnosis_0
1,3673,Complete heart block,diagnosis_1
2,21296,Tetralogy of Fallot with pulmonary atresia,diagnosis_2
3,9653,Right aortic arch with mirror image branching ...,diagnosis_3
4,9653,"Ventricular septal defect, membranous",diagnosis_4


In [19]:
# Sample data
subject_sample_df = read_subject_sample_data()
subject_sample_df.head()

Unnamed: 0,subjid,sampid,samp_source,source_sampid,sample_use
0,22921,CG0019-7254,CORIELL,CG0019-7254,Seq_DNA_WholeGenome
1,16165,CG0012-4474,CORIELL,CG0012-4474,Seq_DNA_WholeGenome
2,9800,CG0009-6098,CORIELL,CG0009-6098,Seq_DNA_WholeGenome
3,15386,CG0011-4770,CORIELL,CG0011-4770,Seq_DNA_WholeGenome
4,570,CG0000-1854,CORIELL,CG0000-1854,Seq_DNA_WholeGenome


In [20]:
# Aliquot/Sample Shipping data
shipping_manifest_df = read_sample_shipping_manifest_data()
shipping_manifest_df.head()

Unnamed: 0,barcode,external_id,sample_collection_site,sample_role,concentration_ng_per_ul,initial_volume_microliters
0,1125743864,CG0000-6676,Blood,Affected,50.0,66.1
1,1125743887,CG0000-6978,Blood,Parent,50.0,65.7
2,1125743888,CG0000-6989,Blood,Parent,50.0,66.0
3,1125743911,CG0000-6731,Blood,Affected,50.0,65.7
4,1125743912,CG0000-6738,Blood,Parent,50.0,66.2


In [21]:
# Sequencing experiments
seq_exp_df = read_seq_experiment_data()
seq_exp_df.head()

Unnamed: 0,sample_name,library_name,rg_barcode,run_name,read_length,date,library_strategy,library_source,library_selection,insert_size,instrument,library_layout,max_insert_size,mean_insert_size,mean_read_length,total_reads
0,CG0023-9153,GMKFCS.ST-00000936-1_2AMP,HMNMFCCXX-1,160423_ST-E00212_0248_AHMNMFCCXX_1,150,2016-05-10 00:00:00,WGS,GENOMIC,RANDOM,361,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
1,CG0002-6325,GMKFCS.ST-00000953-1_2AMP,HMNMFCCXX-2,160423_ST-E00212_0248_AHMNMFCCXX_2,150,2016-05-10 00:00:00,WGS,GENOMIC,RANDOM,369,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
2,CG0002-6931,GMKFCS.ST-00000954-1_2AMP,HMNMFCCXX-3,160423_ST-E00212_0248_AHMNMFCCXX_3,150,2016-05-10 00:00:00,WGS,GENOMIC,RANDOM,360,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
3,CG0002-7519,GMKFCS.ST-00003538-1_2AMP,HMNMFCCXX-4,160423_ST-E00212_0248_AHMNMFCCXX_4,150,2016-05-10 00:00:00,WGS,GENOMIC,RANDOM,365,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
4,CG0002-7976,GMKFCS.ST-00003539-1_2AMP,HMNMFCCXX-5,160423_ST-E00212_0248_AHMNMFCCXX_5,150,2016-05-10 00:00:00,WGS,GENOMIC,RANDOM,352,HiSeq X Ten,Paired-End,428,392.684444,150.0,900


### Explore

In [22]:
# Participants
print("Family")
print(family_df.nunique())
print("\nDemographics")
print(demographic_df.nunique())
print("\nGender")
print(demographic_df.nunique())
print("\nDiagnosis")
print(demographic_df.nunique())

Family
famid          736
subjid        2208
mother         737
father         737
is_proband       2
dtype: int64

Demographics
race                6
ethnicity           2
subjid            736
demographic_id    736
dtype: int64

Gender
race                6
ethnicity           2
subjid            736
demographic_id    736
dtype: int64

Diagnosis
race                6
ethnicity           2
subjid            736
demographic_id    736
dtype: int64


### Combine

In [23]:
# Investigator
investigator_df = read_investigator_data()
investigator_df.head()

Unnamed: 0,unnamed:_0,institution,investigator_name
0,0,Harvard Medical School,Christine E. Seidman


In [24]:
# Study
study_df = read_study_data()
study_df.head()

Unnamed: 0,unnamed:_0,attribution,data_access_authority,study_id,study_name,study_version
0,0,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-...,dbGaP,phs001138,Discovery of the Genetic Basis of Structural H...,v1.p2


In [25]:
# Study files
study_files_df = read_study_file_data()
study_files_df.head()

Unnamed: 0,study_file_name
0,3a_dbGaP_SubjectPhenotypes_CardiacAbnormalitie...
1,3a_dbGaP_SubjectPhenotypes_CardiacSurgeriesPro...
2,3a_dbGaP_SubjectPhenotypes_CatheterizationDS.txt
3,3a_dbGaP_SubjectPhenotypes_CopynumberResultsDS...
4,3a_dbGaP_SubjectPhenotypes_DemographicsDS.txt


In [26]:
# Family
family_df = read_family_data()
family_df.head()

Unnamed: 0,famid,subjid,mother,father,is_proband
0,1,210,211,212,True
1,1,211,0,0,False
2,1,212,0,0,False
3,2,272,273,274,True
4,2,273,0,0,False


In [27]:
# Create participant df
# Merge Gender + Demographics
gender_demo_df = pd.merge(gender_df, demographic_df, on='subjid')
# Add Family
df1 = pd.merge(gender_demo_df, family_df, on='subjid')
df1

Unnamed: 0,subjid,sex,race,ethnicity,demographic_id,famid,mother,father,is_proband
0,390,F,White,No,demographic_64,8,391,392,True
1,6688,M,White,No,demographic_602,260,6689,6690,True
2,18844,M,White,No,demographic_591,582,18845,18846,True
3,25597,M,White,No,demographic_657,668,25598,25599,True
4,2561,M,White,No,demographic_617,107,2562,2563,True
5,1295,F,White,No,demographic_406,49,1296,1297,True
6,2575,F,Black or African American,No,demographic_41,108,2576,2577,True
7,4132,F,White,No,demographic_479,164,4133,4134,True
8,4740,M,White,No,demographic_343,191,4741,4742,True
9,5725,M,White,No,demographic_317,224,5726,5727,True


In [28]:
# Merge Diagnosis
df2 = pd.merge(df1, diagnosis_df, on='subjid')
df2.head()

Unnamed: 0,subjid,sex,race,ethnicity,demographic_id,famid,mother,father,is_proband,diagnosis,diagnosis_id
0,390,F,White,No,demographic_64,8,391,392,True,"Pulmonary stenosis, bilateral branch pulmonary...",diagnosis_232
1,390,F,White,No,demographic_64,8,391,392,True,Single ventricle comprised of mostly left vent...,diagnosis_233
2,390,F,White,No,demographic_64,8,391,392,True,L-loop transposition of the great arteries,diagnosis_234
3,390,F,White,No,demographic_64,8,391,392,True,Double outlet left ventricle,diagnosis_235
4,390,F,White,No,demographic_64,8,391,392,True,Pulmonary stenosis,diagnosis_236


In [29]:
# Merge Sample
df3 = pd.merge(df2, subject_sample_df, on='subjid')
df3.head()

Unnamed: 0,subjid,sex,race,ethnicity,demographic_id,famid,mother,father,is_proband,diagnosis,diagnosis_id,sampid,samp_source,source_sampid,sample_use
0,390,F,White,No,demographic_64,8,391,392,True,"Pulmonary stenosis, bilateral branch pulmonary...",diagnosis_232,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome
1,390,F,White,No,demographic_64,8,391,392,True,Single ventricle comprised of mostly left vent...,diagnosis_233,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome
2,390,F,White,No,demographic_64,8,391,392,True,L-loop transposition of the great arteries,diagnosis_234,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome
3,390,F,White,No,demographic_64,8,391,392,True,Double outlet left ventricle,diagnosis_235,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome
4,390,F,White,No,demographic_64,8,391,392,True,Pulmonary stenosis,diagnosis_236,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome


In [30]:
# Merge Aliquot
df4 = pd.merge(df3, shipping_manifest_df, left_on='sampid', right_on='external_id')
df4.head()

Unnamed: 0,subjid,sex,race,ethnicity,demographic_id,famid,mother,father,is_proband,diagnosis,...,sampid,samp_source,source_sampid,sample_use,barcode,external_id,sample_collection_site,sample_role,concentration_ng_per_ul,initial_volume_microliters
0,390,F,White,No,demographic_64,8,391,392,True,"Pulmonary stenosis, bilateral branch pulmonary...",...,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome,1125743958,CG0003-5269,Blood,Affected,50.0,60.1
1,390,F,White,No,demographic_64,8,391,392,True,Single ventricle comprised of mostly left vent...,...,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome,1125743958,CG0003-5269,Blood,Affected,50.0,60.1
2,390,F,White,No,demographic_64,8,391,392,True,L-loop transposition of the great arteries,...,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome,1125743958,CG0003-5269,Blood,Affected,50.0,60.1
3,390,F,White,No,demographic_64,8,391,392,True,Double outlet left ventricle,...,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome,1125743958,CG0003-5269,Blood,Affected,50.0,60.1
4,390,F,White,No,demographic_64,8,391,392,True,Pulmonary stenosis,...,CG0003-5269,CORIELL,CG0003-5269,Seq_DNA_WholeGenome,1125743958,CG0003-5269,Blood,Affected,50.0,60.1


In [31]:
# Merge Sequencing Experiment
full_participant_df = pd.merge(df4, seq_exp_df, left_on='external_id', right_on='sample_name')
full_participant_df.head()

Unnamed: 0,subjid,sex,race,ethnicity,demographic_id,famid,mother,father,is_proband,diagnosis,...,library_strategy,library_source,library_selection,insert_size,instrument,library_layout,max_insert_size,mean_insert_size,mean_read_length,total_reads
0,390,F,White,No,demographic_64,8,391,392,True,"Pulmonary stenosis, bilateral branch pulmonary...",...,WGS,GENOMIC,RANDOM,387,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
1,390,F,White,No,demographic_64,8,391,392,True,Single ventricle comprised of mostly left vent...,...,WGS,GENOMIC,RANDOM,387,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
2,390,F,White,No,demographic_64,8,391,392,True,L-loop transposition of the great arteries,...,WGS,GENOMIC,RANDOM,387,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
3,390,F,White,No,demographic_64,8,391,392,True,Double outlet left ventricle,...,WGS,GENOMIC,RANDOM,387,HiSeq X Ten,Paired-End,428,392.684444,150.0,900
4,390,F,White,No,demographic_64,8,391,392,True,Pulmonary stenosis,...,WGS,GENOMIC,RANDOM,387,HiSeq X Ten,Paired-End,428,392.684444,150.0,900


In [32]:
# Create study
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001138',
    'study_version': 'v1.p2',
    'study_name': 'Discovery of the Genetic Basis of Structural Heart'
    'and Other Birth Defects',
    'attribution': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/'
    'GetAcknowledgementStatement.cgi?study_id=phs001138.v1.p2'
}
study_df = pd.DataFrame([study])
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'))

# Create investigator
invest = {
    'investigator_name': 'Christine E. Seidman',
    'institution': 'Harvard Medical School'
}
inv_df = pd.DataFrame([invest])
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'))

In [33]:
def _add_study_cols(study_df, df):
    # Add study cols to a df
    cols = study_df.columns.tolist()
    row = study_df.iloc[0]
    for col in cols:
        df[col] = row[col]
    return df

In [34]:
# Add study to full participant df
_add_study_cols(study_df, full_participant_df)

# Add study to basic participant df
participant_df = _add_study_cols(study_df, family_df)

# Add study to investigator df
study_investigator_df =_add_study_cols(study_df, investigator_df)

# Add study to study files df
study_study_files_df = _add_study_cols(study_df, study_files_df)

# Phenotype df
phenotype_participant_df = pd.merge(phenotype_df, participant_df,
                                    on='subjid')

In [35]:
from pprint import pprint
n = 22
chunk_size = 10
entity_type ='participant'
entities = ['{}_{}'.format(entity_type, j) for j in range(n)]
for i in range(0, n, chunk_size):
    chunk = entities[i - chunk_size:i]
    if chunk:
        start = i - chunk_size + 1
        print('Adding {}:{} {}s to session'.format(start, i,
                                                   entity_type))
        pprint([e for e in entities[start:i]])
        print('Flushing {} {}\ns'.format(chunk_size, entity_type))

print('Flushing remaining {} {}s to session\n'.format(
    len(entities[i:]) + 1, entity_type))

remaining = entities[0:1] + entities[i:]

pprint([e for e in remaining])

Adding 1:10 participants to session
['participant_1',
 'participant_2',
 'participant_3',
 'participant_4',
 'participant_5',
 'participant_6',
 'participant_7',
 'participant_8',
 'participant_9']
Flushing 10 participant
s
Adding 11:20 participants to session
['participant_11',
 'participant_12',
 'participant_13',
 'participant_14',
 'participant_15',
 'participant_16',
 'participant_17',
 'participant_18',
 'participant_19']
Flushing 10 participant
s
Flushing remaining 3 participants to session

['participant_0', 'participant_20', 'participant_21']


In [36]:
import json
def read_json(filepath):
    with open(filepath, 'r') as json_file:
        return json.load(json_file)


def write_json(data, filepath):
    with open(filepath, 'w') as json_file:
        json.dump(data, json_file, sort_keys=True, indent=4, separators=(',', ':'))

In [37]:
fp = os.path.join(DATA_DIR, 'genomic_file_uuid.json')
file_json = read_json(fp)
write_json(file_json, fp)

In [38]:
# Genomic file info df
def read_genomic_file_info(filepath=None):
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'genomic_file_uuid.json')
        
    def get_ext(fp):
        filename = os.path.basename(fp)
        parts = filename.split('.')
        if len(parts) > 2:
            ext = '.'.join(parts[1:])
        else:
            ext = parts[-1]
        return ext

    with open(filepath, 'r') as json_file:
        uuid_dict = json.load(json_file)

    gf_dicts = []
    for k, v in uuid_dict.items():
        file_info = {
            'uuid': v['did'],
            'md5sum': v['hashes']['md5'],
            'file_url': v['urls'][0],
            'file_size': v['size'],
            'data_type': 'submitted aligned reads',
            'file_format': get_ext(v['urls'][0]),
            'file_name': os.path.basename(v['urls'][0])
        }
        gf_dicts.append(file_info)
        
    return pd.DataFrame(gf_dicts)

def read_sample_gf_data(filepath=None):
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'manifests', 'GMKF_BAMsampleIDs.xlsx')
    df = pd.read_excel(filepath)
    df = df.loc[df['Cohort'] == 'GMKF-Seidman']
    return df

In [39]:
# Read genomic file info
gf_file_info_df = read_genomic_file_info()
# Sample and BAM File df
sample_gf_df = read_sample_gf_data()
# Merge with sequencing experiment df
df1 = pd.merge(sample_gf_df, seq_exp_df, left_on='dbgap_subject_id', right_on='sample_name')
# Merge with genomic file info df
df2 = pd.merge(df1, gf_file_info_df, left_on='BAM sample ID', right_on='file_name')
df2

Unnamed: 0,Cohort,BAM sample ID,Library,HGSC Sample ID,dbgap_subject_id,sample_name,library_name,rg_barcode,run_name,read_length,...,mean_insert_size,mean_read_length,total_reads,data_type,file_format,file_name,file_size,file_url,md5sum,uuid
0,GMKF-Seidman,HMNMFCCXX-1.hgv.bam,IWG_IND-GMKFCS.ST-00000936-1_2pA,ST-00000936,CG0023-9153,CG0023-9153,GMKFCS.ST-00000936-1_2AMP,HMNMFCCXX-1,160423_ST-E00212_0248_AHMNMFCCXX_1,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNMFCCXX-1.hgv.bam,64365517822,s3://kf-seq-data-bcm/seidman/HMNMFCCXX-1.hgv.bam,6f111ed7b35e421fce94571eba8e5be8,e88c2d14-0b7e-43e8-9f7e-7020793b6ebe
1,GMKF-Seidman,HMNMFCCXX-2.hgv.bam,IWG_IND-GMKFCS.ST-00000953-1_2pA,ST-00000953,CG0002-6325,CG0002-6325,GMKFCS.ST-00000953-1_2AMP,HMNMFCCXX-2,160423_ST-E00212_0248_AHMNMFCCXX_2,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNMFCCXX-2.hgv.bam,63435348733,s3://kf-seq-data-bcm/seidman/HMNMFCCXX-2.hgv.bam,fb82871da4cc68c70a971dd55677d933,40fb29d7-71e1-426e-9678-a54f22a4f80e
2,GMKF-Seidman,HMNMFCCXX-3.hgv.bam,IWG_IND-GMKFCS.ST-00000954-1_2pA,ST-00000954,CG0002-6931,CG0002-6931,GMKFCS.ST-00000954-1_2AMP,HMNMFCCXX-3,160423_ST-E00212_0248_AHMNMFCCXX_3,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNMFCCXX-3.hgv.bam,63739965918,s3://kf-seq-data-bcm/seidman/HMNMFCCXX-3.hgv.bam,748932ce3d9edfc9fba57291c6f632a5,d19eeeae-6acd-4f08-b5c8-6f200160ebed
3,GMKF-Seidman,HMNMFCCXX-4.hgv.bam,IWG_IND-GMKFCS.ST-00003538-1_2pA,ST-00003538,CG0002-7519,CG0002-7519,GMKFCS.ST-00003538-1_2AMP,HMNMFCCXX-4,160423_ST-E00212_0248_AHMNMFCCXX_4,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNMFCCXX-4.hgv.bam,62635938381,s3://kf-seq-data-bcm/seidman/HMNMFCCXX-4.hgv.bam,b60d5db835db5544e43fa2113a0cc08a,c5d91600-e286-4ddd-bbc0-e111b079f8bf
4,GMKF-Seidman,HMNMFCCXX-5.hgv.bam,IWG_IND-GMKFCS.ST-00003539-1_2pA,ST-00003539,CG0002-7976,CG0002-7976,GMKFCS.ST-00003539-1_2AMP,HMNMFCCXX-5,160423_ST-E00212_0248_AHMNMFCCXX_5,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNMFCCXX-5.hgv.bam,64626450198,s3://kf-seq-data-bcm/seidman/HMNMFCCXX-5.hgv.bam,068b52d7685a5e1916dbb784da850f3b,6904e887-901c-4df6-8579-0ae55cc11125
5,GMKF-Seidman,HMNMFCCXX-6.hgv.bam,IWG_IND-GMKFCS.ST-00003561-1_2pA,ST-00003561,CG0002-7848,CG0002-7848,GMKFCS.ST-00003561-1_2AMP,HMNMFCCXX-6,160423_ST-E00212_0248_AHMNMFCCXX_6,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNMFCCXX-6.hgv.bam,62979984038,s3://kf-seq-data-bcm/seidman/HMNMFCCXX-6.hgv.bam,e36e9551a23d5065cbfd9eea8069ee73,6b60464f-31cd-411c-91b4-c33d535cb01f
6,GMKF-Seidman,HMNVCCCXX-1.hgv.bam,IWG_IND-GMKFCS.ST-00000768-1_2pA,ST-00000768,CG0000-3840,CG0000-3840,IWG_IND-GMKFCS.ST-00000768-1_2pA,HMNVCCCXX-1,160402_E00380_0096_BHMNVCCCXX_1,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNVCCCXX-1.hgv.bam,68933421158,s3://kf-seq-data-bcm/seidman/HMNVCCCXX-1.hgv.bam,be2c10950413714413e35743187248c7,1ab280b9-e054-4e25-a88a-d7566b02510a
7,GMKF-Seidman,HMNVCCCXX-2.hgv.bam,IWG_IND-GMKFCS.ST-00000580-1_2pA,ST-00000580,CG0000-2758,CG0000-2758,IWG_IND-GMKFCS.ST-00000580-1_2pA,HMNVCCCXX-2,160402_E00380_0096_BHMNVCCCXX_2,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNVCCCXX-2.hgv.bam,70510006515,s3://kf-seq-data-bcm/seidman/HMNVCCCXX-2.hgv.bam,794bb59888d85fd092d45c1e7da9dc2e,a79803fc-372e-40fa-b62d-29bb5b5fbaf0
8,GMKF-Seidman,HMNVCCCXX-3.hgv.bam,IWG_IND-GMKFCS.ST-00000589-1_2pA,ST-00000589,CG0000-0344,CG0000-0344,IWG_IND-GMKFCS.ST-00000589-1_2pA,HMNVCCCXX-3,160402_E00380_0096_BHMNVCCCXX_3,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNVCCCXX-3.hgv.bam,70748511577,s3://kf-seq-data-bcm/seidman/HMNVCCCXX-3.hgv.bam,02f5e1d8fc3cf41a1993ea369a88411e,5cea8a2f-cfd6-45da-9cf0-0aff9bc3185c
9,GMKF-Seidman,HMNVCCCXX-4.hgv.bam,IWG_IND-GMKFCS.ST-00000991-1_2pA,ST-00000991,CG0000-4744,CG0000-4744,IWG_IND-GMKFCS.ST-00000991-1_2pA,HMNVCCCXX-4,160402_E00380_0096_BHMNVCCCXX_4,150,...,392.684444,150.0,900,submitted aligned reads,hgv.bam,HMNVCCCXX-4.hgv.bam,70496953371,s3://kf-seq-data-bcm/seidman/HMNVCCCXX-4.hgv.bam,abcdeb907c26445979b5ef79b327835e,1d6a481c-a86a-4cc2-acc1-45829d0dbb8e


In [40]:
df1.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
BAM sample ID,900,900,HN7NVCCXX-1.hgv.bam,1
Library,900,900,IWG_IND-GMKFCS.ST-00020970-1_2pA,1
HGSC Sample ID,900,900,ST-00019241,1
dbgap_subject_id,900,900,CG0008-8428,1
sample_name,900,900,CG0008-8428,1
library_name,900,900,IWG_IND-GMKFCS.ST-00020970-1_2pA,1
rg_barcode,900,900,HN7GNCCXX-3,1
run_name,900,900,160412_ST-E00238_0206_BHN7VCCCXX_3,1
date,900,18,2016-05-05 00:00:00,206
Cohort,900,1,GMKF-Seidman,900


In [45]:
df = read_genomic_file_info()
print(df['file_size'].max()/1000000000)
print(df['file_size'].min()/1000000000)
print(df['file_size'].mean()/1000000000)

73.775241806
47.377275825
65.6578559692


In [105]:
df = read_phenotype_data()
df['observed'].unique()

array(['negative', 'positive'], dtype=object)

In [106]:
df = pd.read_csv('/Users/singhn4/Desktop/phenotype.csv')

Unnamed: 0,uuid,created_at,modified_at,phenotype,hpo_id,observed,age_at_event_days,participant_id,kf_id
0,f77d6506-70d8-4075-b4ff-e54fff9a452e,2018-02-27 09:35:00.03638,2018-02-27 09:35:00.036388,ABDOMINAL_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_8JWM84CR
1,5cdea204-b001-4469-9934-5dfd8ce64f01,2018-02-27 09:35:00.038478,2018-02-27 09:35:00.038484,AIRWAY_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_4WAKYZ20
2,61c9f931-b47a-4849-bd40-09fc0eb614e6,2018-02-27 09:35:00.039441,2018-02-27 09:35:00.039446,CHEST_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_Z5KN5SN2
3,51cdd902-d508-48a0-a1c1-77a91d5e57b0,2018-02-27 09:35:00.040381,2018-02-27 09:35:00.040386,CLUB_FOOT,,positive,,PT_YJEZ38PS,PH_H17JQKYG
4,a1690ee3-3565-4d2c-8f04-6d903ceda423,2018-02-27 09:35:00.041276,2018-02-27 09:35:00.041281,DERMATOLOGICAL_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_PFVG3BAZ
5,b9f1735e-9de3-41d3-a4f1-3cfc16796ec2,2018-02-27 09:35:00.042194,2018-02-27 09:35:00.042199,DYSMORPHIC_FACIES_PRESENT,,negative,,PT_YJEZ38PS,PH_BY35ZYB5
6,ec15cd29-5631-4c38-8795-0bd68ec905cc,2018-02-27 09:35:00.043137,2018-02-27 09:35:00.043176,EAR_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_V672HJ6C
7,6b241509-9a2a-4bb9-bfbe-0c0b6844579a,2018-02-27 09:35:00.043983,2018-02-27 09:35:00.043989,ENDOCRINOLOGIC_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_JBK8BD5J
8,b7c951dd-989a-4e1a-b721-d55d22024b1d,2018-02-27 09:35:00.044848,2018-02-27 09:35:00.044854,EYE_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_E4YDYAEJ
9,14b79e40-a409-4c4c-a4ee-09da7f67aca7,2018-02-27 09:35:00.045662,2018-02-27 09:35:00.045667,FEET_ABNORMALITIES_PRESENT,,positive,,PT_YJEZ38PS,PH_BCRWHNSK


In [108]:
df = df.where((pd.notnull(df)), None)

Unnamed: 0,uuid,created_at,modified_at,phenotype,hpo_id,observed,age_at_event_days,participant_id,kf_id
0,f77d6506-70d8-4075-b4ff-e54fff9a452e,2018-02-27 09:35:00.03638,2018-02-27 09:35:00.036388,ABDOMINAL_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_8JWM84CR
1,5cdea204-b001-4469-9934-5dfd8ce64f01,2018-02-27 09:35:00.038478,2018-02-27 09:35:00.038484,AIRWAY_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_4WAKYZ20
2,61c9f931-b47a-4849-bd40-09fc0eb614e6,2018-02-27 09:35:00.039441,2018-02-27 09:35:00.039446,CHEST_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_Z5KN5SN2
3,51cdd902-d508-48a0-a1c1-77a91d5e57b0,2018-02-27 09:35:00.040381,2018-02-27 09:35:00.040386,CLUB_FOOT,,positive,,PT_YJEZ38PS,PH_H17JQKYG
4,a1690ee3-3565-4d2c-8f04-6d903ceda423,2018-02-27 09:35:00.041276,2018-02-27 09:35:00.041281,DERMATOLOGICAL_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_PFVG3BAZ
5,b9f1735e-9de3-41d3-a4f1-3cfc16796ec2,2018-02-27 09:35:00.042194,2018-02-27 09:35:00.042199,DYSMORPHIC_FACIES_PRESENT,,negative,,PT_YJEZ38PS,PH_BY35ZYB5
6,ec15cd29-5631-4c38-8795-0bd68ec905cc,2018-02-27 09:35:00.043137,2018-02-27 09:35:00.043176,EAR_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_V672HJ6C
7,6b241509-9a2a-4bb9-bfbe-0c0b6844579a,2018-02-27 09:35:00.043983,2018-02-27 09:35:00.043989,ENDOCRINOLOGIC_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_JBK8BD5J
8,b7c951dd-989a-4e1a-b721-d55d22024b1d,2018-02-27 09:35:00.044848,2018-02-27 09:35:00.044854,EYE_ABNORMALITIES_PRESENT,,negative,,PT_YJEZ38PS,PH_E4YDYAEJ
9,14b79e40-a409-4c4c-a4ee-09da7f67aca7,2018-02-27 09:35:00.045662,2018-02-27 09:35:00.045667,FEET_ABNORMALITIES_PRESENT,,positive,,PT_YJEZ38PS,PH_BCRWHNSK
