In [None]:
import os
import pandas as pd

from dataservice.util.data_import.utils import (
    dropna_rows_cols,
    reformat_column_names,
    read_json, 
    write_json
)

# Root directory of the Seidman 2015 study data. Set the SEIDMAN_DATA_DIR
# environment variable to point the notebook at another location; when it is
# not set, the original hard-coded path is used.
DATA_DIR = os.environ.get(
    'SEIDMAN_DATA_DIR',
    '/Users/singhn4/Projects/kids_first/data/Seidman_2015')
# Directory holding the dbGaP files read by the read_* functions
DBGAP_DIR = os.path.join(DATA_DIR, 'dbgap')
# Directory holding the sample shipping manifests
ALIQUOT_SHIP_DIR = os.path.join(DATA_DIR, 'manifests', 'shipping')

### Extraction - Methods

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Build a one-column table of raw study file names.

    :param filepaths: iterable of file names/paths; when empty or None the
        entries of DBGAP_DIR are listed instead
    :returns: DataFrame with a `study_file_name` value for every entry whose
        name contains the substring 'dbGaP' (as built here, before the
        decorators post-process it)
    """
    candidates = filepaths if filepaths else os.listdir(DBGAP_DIR)

    records = []
    for name in candidates:
        # Only files carrying the 'dbGaP' marker count as study files
        if 'dbGaP' in name:
            records.append({"study_file_name": name})

    return pd.DataFrame(records)

@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Load the study table from a comma-separated file.

    :param filepath: path of the file to read; defaults to `study.txt`
        inside DATA_DIR
    :returns: DataFrame parsed by `pd.read_csv` with its default settings
    """
    source = filepath or os.path.join(DATA_DIR, 'study.txt')
    return pd.read_csv(source)

@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Load the investigator table from a comma-separated file.

    :param filepath: path of the file to read; defaults to
        `investigator.txt` inside DATA_DIR
    :returns: DataFrame parsed by `pd.read_csv` with its default settings
    """
    source = filepath or os.path.join(DATA_DIR, 'investigator.txt')
    return pd.read_csv(source)

@reformat_column_names
@dropna_rows_cols
def read_family_data(filepath=None):
    """
    Read family (pedigree) data for all participants.

    :param filepath: tab-delimited pedigree file; defaults to
        `7a_dbGaP_PedigreeDS.txt` inside DBGAP_DIR. The file must contain the
        columns SEX, MOTHER and FATHER.
    :returns: DataFrame without the SEX column and with an added boolean
        `is_proband` column that is True only when both MOTHER and FATHER
        hold a non-missing, truthy value
    :raises KeyError: if the SEX, MOTHER or FATHER column is absent
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR,
                                '7a_dbGaP_PedigreeDS.txt')
    df = pd.read_csv(filepath,
                     delimiter='\t',
                     dtype={'SUBJID': str})
    # Subset of columns
    df = df.drop(columns=['SEX'])

    def _has_parent(value):
        # NaN is truthy in Python, so a blank parent ID must be ruled out
        # explicitly before falling back to plain truthiness (0 -> False).
        return bool(pd.notnull(value) and value)

    # Add proband column: both parents must be recorded
    has_mother = df['MOTHER'].apply(_has_parent).astype(bool)
    has_father = df['FATHER'].apply(_has_parent).astype(bool)
    df['is_proband'] = has_mother & has_father

    return df

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_phenotype_data(filepath=None):
    """
    Read phenotype data and reshape it to one row per subject/phenotype.

    If `phenotype_hpo_mapping.txt` already exists in DATA_DIR it is treated
    as a cache and returned as-is (regardless of `filepath`). Otherwise the
    raw phenotype file is read, reshaped, passed through
    `HPOMapper.add_hpo_id_col`, and written to that cache file.

    :param filepath: tab-delimited phenotype file; defaults to
        `3a_dbGaP_SubjectPhenotypes_ExtracardiacFindingsDS.txt` in DBGAP_DIR
    :returns: DataFrame with (at least) the columns SUBJID, phenotype,
        orig_observed, observed, LATEST_EXAM_AGE (years * 365) and
        phenotype_id
    """
    # Read in cached phenotypes or create if they don't exist
    hpo_fp = os.path.join(DATA_DIR, 'phenotype_hpo_mapping.txt')
    if os.path.exists(hpo_fp):
        return pd.read_csv(hpo_fp, dtype={'SUBJID': str})

    # Only fall back to the default file when no path was supplied
    if not filepath:
        filepath = os.path.join(
            DBGAP_DIR,
            '3a_dbGaP_SubjectPhenotypes_ExtracardiacFindingsDS.txt')

    # Read csv
    df = pd.read_csv(filepath,
                     delimiter='\t',
                     dtype={'SUBJID': str})

    # Convert age years to days; a missing age stays missing instead of
    # raising inside int()
    df['LATEST_EXAM_AGE'] = df["LATEST_EXAM_AGE"].apply(
        lambda x: x if pd.isnull(x) else int(x) * 365)
    age_at_event_days = df[['LATEST_EXAM_AGE', 'SUBJID']]

    # Select string based phenotypes
    df = df.select_dtypes(include='object')

    # Make all values lower case. SUBJID is left untouched because it is the
    # merge key below, and missing values are kept as missing rather than
    # being turned into the string 'nan'.
    for col in df.columns.tolist():
        if col == 'SUBJID':
            continue
        df[col] = df[col].apply(
            lambda x: x if pd.isnull(x) else str(x).lower())

    # Reshape to build the phenotypes df
    cols = df.columns.tolist()[2:]
    phenotype_cols = [col for col in cols if not col.startswith('OTHER')]
    phenotype_df = pd.melt(df, id_vars='SUBJID', value_vars=phenotype_cols,
                           var_name='phenotype', value_name='orig_observed')

    # Remove unknowns and missing observations
    unknown_values = ['none', 'unknown', 'no/not checked', 'not applicable',
                      'absent']
    observations = phenotype_df['orig_observed']
    is_known = observations.notnull() & ~observations.isin(unknown_values)
    phenotype_df = phenotype_df[is_known].copy()

    # Add HPOs
    from dataservice.util.data_import.etl.hpo import mapper
    hpo_mapper = mapper.HPOMapper(DATA_DIR)
    phenotype_df = hpo_mapper.add_hpo_id_col(phenotype_df)

    # Map to positive/negative: only an explicit 'no' counts as negative
    phenotype_df['observed'] = phenotype_df['orig_observed'].apply(
        lambda value: 'negative' if value == 'no' else 'positive')

    # Merge back in age at event in days
    phenotype_df = pd.merge(phenotype_df, age_at_event_days, on='SUBJID')

    # Add unique col (the merge above yields a fresh 0..n-1 index)
    phenotype_df['phenotype_id'] = ['phenotype_' + str(label)
                                    for label in phenotype_df.index]

    # Write to file
    phenotype_df.to_csv(hpo_fp, index=False)

    return phenotype_df

In [None]:
# Gender
@reformat_column_names
@dropna_rows_cols
def read_gender_data(filepath=None):
    """
    Load the gender table for all subjects.

    :param filepath: tab-delimited file to read; defaults to
        `3a_dbGaP_SubjectPhenotypes_GenderDS.txt` inside DBGAP_DIR
    :returns: DataFrame of the file contents with SUBJID read as a string
    """
    default_name = '3a_dbGaP_SubjectPhenotypes_GenderDS.txt'
    source = filepath or os.path.join(DBGAP_DIR, default_name)
    return pd.read_csv(source, delimiter='\t', dtype={'SUBJID': str})

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_demographic_data(filepaths=None):
    """
    Read demographic data for all subjects (child, mother, father).

    :param filepaths: sequence of three tab-delimited files in the order
        child, mother, father; defaults to the child demographics file and
        the two `-fixed-03-09-2018` parental files inside DBGAP_DIR
    :returns: DataFrame with the columns RACE, ETHNICITY, SUBJID (one row per
        distinct SUBJID, first occurrence wins) plus a unique
        `demographic_id` column
    """
    if not filepaths:
        filenames = ['3a_dbGaP_SubjectPhenotypes_DemographicsDS.txt',
                     '3a_dbGaP_SubjectPhenotypes_MaternalDemographicsDS-fixed-03-09-2018.txt',
                     '3a_dbGaP_SubjectPhenotypes_PaternalDemographicsDS-fixed-03-09-2018.txt']

        filepaths = [os.path.join(DBGAP_DIR, filename)
                     for filename in filenames]

    # Child, mother, father - in that order
    demo_dfs = [pd.read_csv(filepaths[position],
                            delimiter='\t',
                            dtype={'SUBJID': str})
                for position in range(3)]

    # Combine demographics of all subjects
    subject_demo_df = pd.concat(demo_dfs)
    subject_demo_df = subject_demo_df.drop_duplicates('SUBJID')

    # Subset of columns. Each input file brings its own 0..n row index, so
    # the index must be rebuilt before it can serve as a unique identifier.
    subject_demo_df = (subject_demo_df[['RACE', 'ETHNICITY', 'SUBJID']]
                       .reset_index(drop=True))

    subject_demo_df['demographic_id'] = ['demographic_' + str(label)
                                         for label in subject_demo_df.index]

    return subject_demo_df

In [None]:
def fix_demographics(family_df=None, dbgap_dir=None):
    """
    Rewrite the maternal and paternal demographics files so that SUBJID holds
    the parent's ID taken from the family table.

    For each parent, the demographics file is joined to the family table
    (file SUBJID == family `subjid`), the original SUBJID is replaced by the
    family table's `mother` / `father` value, and the result is written next
    to the input file with the suffix `-fixed-03-09-2018.txt`.

    :param family_df: family table with the columns famid, subjid, mother,
        father and is_proband; defaults to `read_family_data()`
    :param dbgap_dir: directory containing the demographics files; defaults
        to DBGAP_DIR
    """
    if family_df is None:
        family_df = read_family_data()
    if dbgap_dir is None:
        dbgap_dir = DBGAP_DIR

    # Maternal demographics
    _fix_parent_demographics(
        family_df, dbgap_dir, 'mother', 'father',
        '3a_dbGaP_SubjectPhenotypes_MaternalDemographicsDS.txt')

    # Paternal demographics
    _fix_parent_demographics(
        family_df, dbgap_dir, 'father', 'mother',
        '3a_dbGaP_SubjectPhenotypes_PaternalDemographicsDS.txt')


def _fix_parent_demographics(family_df, dbgap_dir, parent, other_parent,
                             filename):
    """Re-key one parent's demographics file by the `parent` family column."""
    filepath = os.path.join(dbgap_dir, filename)
    demo_df = pd.read_csv(filepath,
                          delimiter='\t',
                          dtype={'SUBJID': str})
    demo_df = pd.merge(demo_df, family_df,
                       left_on='SUBJID', right_on='subjid')
    demo_df = demo_df.drop(
        columns=['SUBJID', 'subjid', other_parent, 'famid', 'is_proband'])
    demo_df = demo_df.rename(columns={parent: 'SUBJID'})
    demo_df['SUBJID'] = demo_df['SUBJID'].astype('str')

    # splitext only strips the final extension, so a dot elsewhere in the
    # path (e.g. in a directory name) does not truncate the output path
    root, _ = os.path.splitext(filepath)
    demo_df.to_csv(root + '-fixed-03-09-2018.txt', sep='\t', index=False)
# Write the "-fixed-03-09-2018" maternal/paternal demographics files that
# read_demographic_data() reads by default
fix_demographics()

In [None]:
# Load the combined demographics and show its (rows, columns) shape
demo_df = read_demographic_data()
demo_df.shape

In [None]:
# Diagnosis
@reformat_column_names
@dropna_rows_cols
def read_diagnosis_data(filepath=None):
    """
    Load the diagnosis table for all subjects.

    :param filepath: tab-delimited file to read; defaults to
        `3a_dbGaP_SubjectPhenotypes_PatientDiagnosisDS.txt` inside DBGAP_DIR
    :returns: DataFrame of the file contents (SUBJID read as a string) with an
        added `diagnosis_id` column of the form `diagnosis_<row label>`
    """
    default_name = '3a_dbGaP_SubjectPhenotypes_PatientDiagnosisDS.txt'
    source = filepath or os.path.join(DBGAP_DIR, default_name)

    diagnosis_df = pd.read_csv(source, delimiter='\t', dtype={'SUBJID': str})

    # Derive an identifier from each row's index label
    diagnosis_df['diagnosis_id'] = diagnosis_df.apply(
        lambda row: 'diagnosis' + '_' + str(row.name), axis=1)

    return diagnosis_df

In [None]:
# Sample
@reformat_column_names
@dropna_rows_cols
def read_subject_sample_data(filepath=None):
    """
    Load the subject-to-sample mapping, keeping one row per subject.

    :param filepath: tab-delimited file to read; defaults to
        `6a_dbGaP_SubjectSampleMappingDS.txt` inside DBGAP_DIR
    :returns: DataFrame in which only the first row of each SUBJID is kept
    """
    default_name = '6a_dbGaP_SubjectSampleMappingDS.txt'
    source = filepath or os.path.join(DBGAP_DIR, default_name)

    mapping_df = pd.read_csv(source, delimiter='\t', dtype={'SUBJID': str})

    # Later rows of an already-seen subject are discarded
    return mapping_df.drop_duplicates('SUBJID')

In [None]:
# Aliquot
@reformat_column_names
@dropna_rows_cols
def read_sample_shipping_manifest_data(*filepaths):
    """
    Read shipping manifest for samples (from PI/sample source center).

    :param filepaths: Excel manifest paths; when none are given every entry of
        ALIQUOT_SHIP_DIR is considered. Only files whose base name starts
        with "PCGC" are read.
    :returns: DataFrame combining all manifests, restricted to the columns
        barcode, external_id, sample_collection_site, sample_role,
        concentration_ng_per_ul and initial_volume_microliters, without rows
        whose id column(s) are missing
    :raises ValueError: if no "PCGC" manifest is found
    """
    if not filepaths:
        filepaths = [os.path.join(ALIQUOT_SHIP_DIR, filename)
                     for filename in os.listdir(ALIQUOT_SHIP_DIR)]

    manifest_paths = [filepath for filepath in filepaths
                      if os.path.basename(filepath).startswith("PCGC")]
    if not manifest_paths:
        raise ValueError(
            'No shipping manifest starting with "PCGC" was found')

    # Combine all manifest files. No delimiter is passed: it has no meaning
    # for Excel input.
    dfs = [pd.read_excel(filepath,
                         dtype={'*barcode': str},
                         skiprows=[0, 1],
                         header=[6])
           for filepath in manifest_paths]
    df = pd.concat(dfs)

    # Rename columns: lower case, without the leading "*" marker
    df.columns = [str(col).lower().lstrip("*") for col in df.columns]

    # Subset of columns
    df = df[['barcode',
             'external_id',
             'sample_collection_site',
             'sample_role',
             'concentration_ng_per_ul',
             'initial_volume_microliters']]

    # Drop rows where id cols are nan
    id_cols = [col for col in df.columns if "id" in col]
    df = df.dropna(subset=id_cols)

    return df

In [None]:
# Sequencing experiment (from read group metadata)
@reformat_column_names
@dropna_rows_cols
def read_seq_experiment_data(filepath=None):
    """
    Load sequencing experiment metadata from an Excel workbook.

    :param filepath: workbook to read; defaults to `seidman_metadata.xlsx`
        inside DATA_DIR
    :returns: DataFrame limited to the read-group/library columns plus four
        summary columns. Each summary column holds a single value computed
        over the whole table and repeated on every row; `total_reads` is the
        count of non-null `read_length` values.
    """
    source = filepath or os.path.join(DATA_DIR, "seidman_metadata.xlsx")
    df = pd.read_excel(source, dtype={"date": str})

    # Rename some columns
    renames = {
        "library_name (in original BAM header)": "library_name",
        "barcode": "rg_barcode",
    }
    df = df.rename(columns=renames)

    # Keep the integer in front of the first "x" of the read_length string
    df["read_length"] = df["read_length"].apply(
        lambda value: int(value.split("x")[0]))

    # Create new columns (whole-table aggregates broadcast to every row)
    insert_sizes = df['insert_size']
    read_lengths = df['read_length']
    df['max_insert_size'] = insert_sizes.max()
    df['mean_insert_size'] = insert_sizes.mean()
    df['mean_read_length'] = read_lengths.mean()
    df['total_reads'] = read_lengths.count()

    # Subset of columns
    keep = [
        'sample_name', 'library_name', 'rg_barcode', 'run_name',
        'read_length', 'date', 'library_strategy', 'library_source',
        'library_selection', 'insert_size', 'instrument', 'library_layout',
        'max_insert_size', 'mean_insert_size', 'mean_read_length',
        'total_reads',
    ]
    return df[keep]

### Extraction - Execution

In [None]:
# Study files

In [None]:
# List the study files found in DBGAP_DIR and preview the first rows
study_files_df = read_study_file_data()
study_files_df.head()

In [None]:
# Family: load the pedigree table (with is_proband) and preview it
family_df = read_family_data()
family_df.head()

In [None]:
# Phenotypes: read from the phenotype_hpo_mapping.txt cache when it exists,
# otherwise built from the raw file (which also writes the cache)
phenotype_df = read_phenotype_data()
phenotype_df.head()

In [None]:
# Gender: load the gender table and preview it
gender_df = read_gender_data()
gender_df.head()

In [None]:
# Demographic: combined child/mother/father demographics
demographic_df = read_demographic_data()
demographic_df.head()

In [None]:
# Diagnosis data: load the diagnosis table and preview it
diagnosis_df = read_diagnosis_data()
diagnosis_df.head()
# Example of inspecting a single subject:
# diagnosis_df[diagnosis_df['subjid'] == '279']

In [None]:
# Sample data: subject-to-sample mapping, one row per subject
subject_sample_df = read_subject_sample_data()
subject_sample_df.head()

In [None]:
# Aliquot/Sample Shipping data: all "PCGC" manifests in ALIQUOT_SHIP_DIR
shipping_manifest_df = read_sample_shipping_manifest_data()
shipping_manifest_df.head()

In [None]:
# Sequencing experiments: read group metadata from seidman_metadata.xlsx
seq_exp_df = read_seq_experiment_data()
seq_exp_df.head()

### Explore

In [None]:
# Participants: number of distinct values per column, for each table.
# Every section prints the table named in its heading.
print("Family")
print(family_df.nunique())
print("\nDemographics")
print(demographic_df.nunique())
print("\nGender")
print(gender_df.nunique())
print("\nDiagnosis")
print(diagnosis_df.nunique())

### Combine

In [None]:
# Investigator: read DATA_DIR/investigator.txt.
# NOTE(review): that file is written by the "Create investigator" code
# further down in this notebook, so on a fresh data directory this cell
# fails until that code has been run once.
investigator_df = read_investigator_data()
investigator_df.head()

In [None]:
# Study: read DATA_DIR/study.txt.
# NOTE(review): that file is written by the "Create study" code further down
# in this notebook, so on a fresh data directory this cell fails until that
# code has been run once.
study_df = read_study_data()
study_df.head()

In [None]:
# Study files: re-read so the Combine section starts from a fresh frame
# (the frame is modified in place later by _add_study_cols)
study_files_df = read_study_file_data()
study_files_df.head()

In [None]:
# Family: re-read so the Combine section starts from a fresh frame
# (the frame is modified in place later by _add_study_cols)
family_df = read_family_data()
family_df.head()

In [None]:
# Create participant df
# Merge Gender + Demographics (pd.merge defaults to an inner join, so only
# subjects present in both tables are kept)
gender_demo_df = pd.merge(gender_df, demographic_df, on='subjid')
# Add Family
df1 = pd.merge(gender_demo_df, family_df, on='subjid')
df1

In [None]:
# Merge Diagnosis (inner join on subject ID)
df2 = pd.merge(df1, diagnosis_df, on='subjid')
df2.head()

In [None]:
# Merge Sample (inner join on subject ID)
df3 = pd.merge(df2, subject_sample_df, on='subjid')
df3.head()

In [None]:
# Merge Aliquot: the sample's `sampid` is matched against the manifest's
# `external_id`
df4 = pd.merge(df3, shipping_manifest_df, left_on='sampid', right_on='external_id')
df4.head()

In [None]:
# Merge Sequencing Experiment: the manifest's `external_id` is matched
# against the experiment's `sample_name`
full_participant_df = pd.merge(df4, seq_exp_df, left_on='external_id', right_on='sample_name')
full_participant_df.head()

In [None]:
# Create study
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001138',
    'study_version': 'v1.p2',
    # Adjacent string literals are joined without a separator, so the space
    # between "Heart" and "and" must be part of the first literal.
    'study_name': 'Discovery of the Genetic Basis of Structural Heart '
    'and Other Birth Defects',
    'attribution': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/'
    'GetAcknowledgementStatement.cgi?study_id=phs001138.v1.p2'
}
study_df = pd.DataFrame([study])
# index=False keeps the row index out of the file; otherwise reading it back
# with pd.read_csv yields an extra "Unnamed: 0" column.
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'), index=False)

# Create investigator
invest = {
    'investigator_name': 'Christine E. Seidman',
    'institution': 'Harvard Medical School'
}
inv_df = pd.DataFrame([invest])
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'), index=False)

In [None]:
def _add_study_cols(study_df, df):
    """
    Copy the values of the first study row onto every row of `df`.

    One column is added (or overwritten) in `df` for each column of
    `study_df`. `df` is modified in place and also returned.
    """
    first_study = study_df.iloc[0]
    for name in list(study_df.columns):
        # A scalar assignment repeats the value on every row
        df[name] = first_study[name]
    return df

In [None]:
# _add_study_cols modifies the frame it is given and returns that same
# object, so every name assigned below is an alias of its input frame.

# Add study to full participant df
_add_study_cols(study_df, full_participant_df)

# Add study to basic participant df (participant_df is family_df)
participant_df = _add_study_cols(study_df, family_df)

# Add study to investigator df
study_investigator_df =_add_study_cols(study_df, investigator_df)

# Add study to study files df
study_study_files_df = _add_study_cols(study_df, study_files_df)

# Phenotype df: phenotypes joined to their participant (inner join on subjid)
phenotype_participant_df = pd.merge(phenotype_df, participant_df,
                                    on='subjid')

In [None]:
from pprint import pprint

# Dry run of chunked session flushing: walk `n` fake entities in chunks of
# `chunk_size`, then handle whatever is left over. Each entity is handled
# exactly once.
n = 22
chunk_size = 10
entity_type = 'participant'
entities = ['{}_{}'.format(entity_type, j) for j in range(n)]

# Index just past the last complete chunk
full_chunks_end = (n // chunk_size) * chunk_size

flushed = []
for i in range(0, full_chunks_end, chunk_size):
    chunk = entities[i:i + chunk_size]
    print('Adding {}:{} {}s to session'.format(i, i + len(chunk) - 1,
                                               entity_type))
    pprint(chunk)
    print('Flushing {} {}s\n'.format(len(chunk), entity_type))
    flushed.extend(chunk)

# Entities that did not fill a complete chunk
remaining = entities[full_chunks_end:]

print('Flushing remaining {} {}s to session\n'.format(
    len(remaining), entity_type))

pprint(remaining)

In [None]:
import json
def read_json(filepath):
    """Parse the JSON document stored at `filepath` and return the result."""
    with open(filepath, 'r') as handle:
        parsed = json.load(handle)
    return parsed


def write_json(data, filepath):
    """
    Serialize `data` as JSON into `filepath`, replacing any existing content.

    Keys are sorted, nesting is indented by four spaces, and ',' / ':' are
    used as the item and key separators.
    """
    options = dict(sort_keys=True, indent=4, separators=(',', ':'))
    with open(filepath, 'w') as handle:
        json.dump(data, handle, **options)

In [None]:
# Round-trip genomic_file_uuid.json through read_json/write_json: the file is
# overwritten in place with the formatting applied by write_json
fp = os.path.join(DATA_DIR, 'genomic_file_uuid.json')
file_json = read_json(fp)
write_json(file_json, fp)

In [None]:
# Genomic file info df
def read_genomic_file_info(filepath=None):
    """
    Build a table of genomic file attributes from a JSON mapping.

    The JSON document is expected to be an object whose values each provide
    `did`, `hashes.md5`, `urls` (non-empty list; the first entry is used) and
    `size`. The keys of the object are not used.

    :param filepath: JSON file to read; defaults to `genomic_file_uuid.json`
        inside DATA_DIR
    :returns: DataFrame with one row per entry and the columns uuid, md5sum,
        file_url, file_size, data_type, file_format and file_name
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'genomic_file_uuid.json')

    def get_ext(fp):
        # Everything after the first dot of the file name:
        # 'x.bam' -> 'bam', 'x.bam.bai' -> 'bam.bai'. A name without any dot
        # has no extension and yields '' (not the file name itself).
        parts = os.path.basename(fp).split('.')
        return '.'.join(parts[1:])

    with open(filepath, 'r') as json_file:
        uuid_dict = json.load(json_file)

    gf_dicts = []
    for entry in uuid_dict.values():
        file_url = entry['urls'][0]
        gf_dicts.append({
            'uuid': entry['did'],
            'md5sum': entry['hashes']['md5'],
            'file_url': file_url,
            'file_size': entry['size'],
            'data_type': 'submitted aligned reads',
            'file_format': get_ext(file_url),
            'file_name': os.path.basename(file_url)
        })

    return pd.DataFrame(gf_dicts)

def read_sample_gf_data(filepath=None):
    """
    Load the BAM sample ID workbook, keeping only the Seidman cohort.

    :param filepath: Excel file to read; defaults to
        `manifests/GMKF_BAMsampleIDs.xlsx` inside DATA_DIR
    :returns: the rows whose `Cohort` value equals 'GMKF-Seidman'
    """
    source = filepath or os.path.join(DATA_DIR, 'manifests',
                                      'GMKF_BAMsampleIDs.xlsx')
    all_samples = pd.read_excel(source)
    in_cohort = all_samples['Cohort'] == 'GMKF-Seidman'
    return all_samples.loc[in_cohort]

In [None]:
# NOTE: df1 and df2 are re-bound here; after this cell they no longer refer
# to the participant merges built earlier in the notebook.
# Read genomic file info
gf_file_info_df = read_genomic_file_info()
# Sample and BAM File df
sample_gf_df = read_sample_gf_data()
# Merge with sequencing experiment df
df1 = pd.merge(sample_gf_df, seq_exp_df, left_on='dbgap_subject_id', right_on='sample_name')
# Merge with genomic file info df
df2 = pd.merge(df1, gf_file_info_df, left_on='BAM sample ID', right_on='file_name')
df2

In [None]:
# Summary of df1's object-dtype columns, one row per column, ordered by the
# number of unique values (highest first)
df1.describe(include=['O']).T.sort_values('unique', ascending=False)

In [None]:
# Largest, smallest and mean `file_size`, each divided by 1,000,000,000
df = read_genomic_file_info()
file_sizes = df['file_size']
print(file_sizes.max()/1000000000)
print(file_sizes.min()/1000000000)
print(file_sizes.mean()/1000000000)

In [None]:
# Distinct values of the `observed` column in the phenotype table
df = read_phenotype_data()
df['observed'].unique()

In [None]:
# NOTE(review): hard-coded, user-specific path outside DATA_DIR; this cell
# only runs on a machine where that file exists.
df = pd.read_csv('/Users/singhn4/Desktop/phenotype.csv')

In [None]:
# Keep non-null values and replace every null (e.g. NaN) entry with None
df = df.where((pd.notnull(df)), None)