In [None]:
import os
import json
from pprint import pprint
import pandas as pd
# Show full cell contents. None means "no width limit"; passing a negative
# width for this option has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

from dataservice.util.data_import.utils import (
    dropna_rows_cols,
    reformat_column_names,
    cols_to_lower,
    read_json, 
    write_json,
    extract_uncompressed_file_ext
)

# Root directory of the Rios_Wise_2016 study data. Set the
# KF_RIOS_WISE_DATA_DIR environment variable to run the notebook against
# another location; the original path remains the default.
DATA_DIR = os.environ.get(
    'KF_RIOS_WISE_DATA_DIR',
    '/Users/singhn4/Projects/kids_first/data/Rios_Wise_2016')
# Sub-directories joined onto DATA_DIR
DBGAP_DIR = os.path.join(DATA_DIR, 'dbgap')
MANIFESTS_DIR = os.path.join(DATA_DIR, 'manifests')

In [None]:
# Create study: a single-row table written to DATA_DIR/study.txt
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001410',
    'study_version': 'v1.p2',
    'study_name': 'Genomics of Orthopaedic Disease Program',
    'attribution': None
}
study_df = pd.DataFrame([study])
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'), index=False)

# Create investigator: a single-row table written to DATA_DIR/investigator.txt
invest = {
    'investigator_name': 'Jonathan Rios',
    'institution': 'UT Southwestern Medical Center'
}
inv_df = pd.DataFrame([invest])
# index=False, as for the study file: otherwise the row index is written as an
# extra unnamed first column that pd.read_csv loads back as data.
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'), index=False)

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Build a one-column table ('study_file_name') of raw study files.

    filepaths: optional iterable of file names. When it is None or empty,
    the entries of DBGAP_DIR are listed instead. Only names containing
    'dbGaP' are kept.
    """
    candidates = filepaths if filepaths else os.listdir(DBGAP_DIR)

    records = []
    for name in candidates:
        if 'dbGaP' in name:
            records.append({"study_file_name": name})
    return pd.DataFrame(records)

@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Load the study table from CSV.

    filepath: CSV to read; falls back to 'study.txt' in DATA_DIR when not
    given.
    """
    default_path = os.path.join(DATA_DIR, 'study.txt')
    return pd.read_csv(filepath or default_path)

@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Load the investigator table from CSV.

    filepath: CSV to read; falls back to 'investigator.txt' in DATA_DIR
    when not given.
    """
    default_path = os.path.join(DATA_DIR, 'investigator.txt')
    return pd.read_csv(filepath or default_path)

@reformat_column_names
@dropna_rows_cols
def read_subject_data(filepath=None):
    """
    Read the dbGaP subject file and decode its consent codes.

    filepath: tab-delimited subject file; defaults to
    'HL13237501A1_V3_SubjectDS.txt' in DBGAP_DIR.

    Returns a frame with the SUBJECT_ID and CONSENT columns, where CONSENT
    holds the consent description (None for code 0). A consent code other
    than 0, 1 or 2 raises KeyError.
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL13237501A1_V3_SubjectDS.txt')
    # NOTE(review): the dtype override is keyed on 'SUBJID' while the column
    # selected below is 'SUBJECT_ID' — confirm which name the file uses.
    df = pd.read_csv(filepath, delimiter='\t', dtype={'SUBJID': str})
    # Explicit copy so the CONSENT assignment below writes to an independent
    # frame rather than to a slice of the one just read.
    df = df[['SUBJECT_ID', 'CONSENT']].copy()

    # Decode consent ints to consent strings
    consent_labels = {
        0: None,
        1: "Health/Medical/Biomedical (IRB)",
        2: "Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",
    }
    df['CONSENT'] = df['CONSENT'].apply(lambda code: consent_labels[code])

    return df

@reformat_column_names
@dropna_rows_cols
def read_phenotype_data(filepath=None):
    """
    Read the dbGaP subject phenotypes file and decode its coded columns.

    filepath: tab-delimited phenotypes file; defaults to
    'HL132375-01A1_V2_SubjectPhenotypesDS.txt' in DBGAP_DIR.

    The Sex, AFFSTAT and Proband codes are replaced by their labels; a code
    that is not in the corresponding lookup raises KeyError. An 'ethnicity'
    column is derived from Race.
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectPhenotypesDS.txt')
    df = pd.read_csv(filepath, delimiter='\t', dtype={'SUBJID': str})

    def decode(column, labels):
        # Strict lookup on one column: an unexpected code raises KeyError
        return df[column].apply(lambda code: labels[code])

    # Decode sex ints to gender strings
    df['Sex'] = decode('Sex', {1: "male", 2: "female"})
    # Decode affected status ints to strings
    df['AFFSTAT'] = decode('AFFSTAT', {0: 'unknown', 1: "not affected", 2: "affected"})
    # Decode proband ints to booleans
    df['Proband'] = decode('Proband', {1: True, 2: False})

    # Create ethnicity column. Any Race value other than 'Hispanic'
    # (including a missing one) is labelled 'not hispanic or latino'.
    ethnicity_by_race = {'Hispanic': 'hispanic or latino'}
    df['ethnicity'] = df['Race'].apply(
        lambda race: ethnicity_by_race.get(race, 'not hispanic or latino'))

    return df

@reformat_column_names
@dropna_rows_cols
def read_family_data(filepath=None):
    """
    Load the pedigree file without its SEX column.

    filepath: tab-delimited pedigree file; falls back to
    'HL132375-01A1_V2_PedgreeDS.txt' in DBGAP_DIR when not given.
    """
    default_path = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_PedgreeDS.txt')
    pedigree = pd.read_csv(filepath or default_path,
                           delimiter='\t',
                           dtype={'SUBJID': str})
    return pedigree.drop('SEX', axis=1)

@reformat_column_names
@dropna_rows_cols
def create_participant_data():
    """
    Create participant data by joining the subject, phenotype and family
    tables on 'subject_id' (default inner joins).
    """
    subject_df = read_subject_data()
    phenotypes_df = read_phenotype_data()
    family_df = read_family_data()

    # subject + phenotype first, then family
    return (subject_df
            .merge(phenotypes_df, on='subject_id')
            .merge(family_df, on='subject_id'))

def create_diagnosis_df(phenotype_df):
    """
    Create diagnosis df from phenotype df

    phenotype_df: frame with 'subject_id' and 'affstat' columns.

    Returns a new frame with 'subject_id' and 'diagnosis'. 'affected' maps
    to 'adolescent idiopathic scoliosis', 'not affected' maps to None, and
    any other affstat value is passed through unchanged. The input frame is
    not modified.
    """
    diagnosis_by_status = {'affected': 'adolescent idiopathic scoliosis',
                           'not affected': None}
    diagnosis = phenotype_df['affstat'].apply(
        lambda status: diagnosis_by_status.get(status, status))

    # assign() on the column selection builds a new frame, so the caller's
    # phenotype_df does not gain a 'diagnosis' column.
    return phenotype_df[['subject_id']].assign(diagnosis=diagnosis)

def create_phenotype_df(phenotype_df):
    """
    Create phenotype df from original phenotype_df

    phenotype_df: frame with 'subject_id' and 'affstat' columns.

    Returns a new frame holding 'subject_id' and 'affstat' for every row
    whose affstat is not 'unknown', plus:
      - 'observed': 'positive' for 'affected', 'negative' for
        'not affected', any other affstat value passed through unchanged
      - 'hpo_id' and 'phenotype': constant columns
    """
    observed_by_status = {'affected': 'positive',
                          'not affected': 'negative'}

    # Select rows and columns in one step and copy, so the column
    # assignments below act on an independent frame and not on a slice.
    known = phenotype_df.loc[phenotype_df['affstat'] != 'unknown',
                             ['subject_id', 'affstat']].copy()

    known['observed'] = known['affstat'].apply(
        lambda status: observed_by_status.get(status, status))
    known['hpo_id'] = 'HP:0002650'
    known['phenotype'] = 'adolescent idiopathic scoliosis'
    return known
# Quick check: distinct diagnosis values derived from the phenotype file
df = create_diagnosis_df(read_phenotype_data())
df['diagnosis'].unique()

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_sample_attr_data(filepath=None):
    """
    Load the tab-delimited sample attributes file.

    filepath: file to read; falls back to
    'HL132375-01A1_V2_SampleAttributesDS.txt' in DBGAP_DIR when not given.
    """
    default_path = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SampleAttributesDS.txt')
    return pd.read_csv(filepath or default_path, delimiter='\t')

@reformat_column_names
@dropna_rows_cols
def read_subject_sample_data(filepath=None):
    """
    Load the tab-delimited subject-to-sample mapping file.

    filepath: file to read; falls back to
    'HL132375-01A1_V2_SubjectSampleMappingDS.txt' in DBGAP_DIR when not given.
    """
    default_path = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectSampleMappingDS.txt')
    return pd.read_csv(filepath or default_path, delimiter='\t')

# Load the three source tables: sample attributes, subject/sample mapping
# and subjects
sample_attr_df = read_sample_attr_data()
subject_sample_df = read_subject_sample_data()
subject_df = read_subject_data()

# Merge sample attributes w subject sample
df1 = pd.merge(sample_attr_df, subject_sample_df, on='sample_id')
# Merge sample with subject
sample_df = pd.merge(df1, subject_df, on='subject_id')

@reformat_column_names
@dropna_rows_cols
def read_seq_exp_data(filepath=None):
    """
    Read sequencing experiment data

    filepath: CSV manifest to read; defaults to 'manifest_171210.csv' in
    MANIFESTS_DIR.

    'Sample Description' is reduced to the stripped text after its last
    ':' and a 'seq_exp_id' column of the form 'seq_exp_<row label>' is
    added.
    """
    if not filepath:
        filepath = os.path.join(MANIFESTS_DIR, 'manifest_171210.csv')

    df = pd.read_csv(filepath)
    # .str accessor instead of Python str methods: a missing description
    # stays missing rather than raising AttributeError on a float NaN.
    df['Sample Description'] = (
        df['Sample Description'].str.split(':').str[-1].str.strip())

    # Add unique col, derived from each row's index label
    df['seq_exp_id'] = ['seq_exp_{}'.format(label) for label in df.index]

    return df

@reformat_column_names
@dropna_rows_cols
def create_biospecimen_data(participant_df):
    """
    Create biospecimen df

    participant_df: frame providing 'subject_id' and 'sex'.

    The subject/sample mapping is left-joined with the sample attributes on
    'sample_id', then joined with the participant columns on 'subject_id'.
    """
    sample_attr_df = read_sample_attr_data()
    subject_sample_df = read_subject_sample_data()

    # Keep every mapped sample even when it has no attributes row
    samples = subject_sample_df.merge(sample_attr_df, how='left', on='sample_id')

    participant_cols = participant_df[['subject_id', 'sex']]
    return samples.merge(participant_cols, on='subject_id')

In [None]:
def read_genomic_files_info(filepath):
    """
    Read the genomic file info json produced by Gen3 registration and turn
    it into a genomic file table for the dataservice.

    filepath: path of the json file; its values become the table rows.

    Adds 'md5sum', 'file_url', 'file_name', 'file_format' and 'data_type'
    columns and renames 'did' to 'uuid' and 'size' to 'file_size'.
    """
    def classify(file_name):
        # Data type inferred from the (stripped) file name; None if no rule matches
        name = file_name.strip()
        if name.endswith(('cram', 'bam')):
            return 'submitted aligned reads'
        if name.endswith('crai'):
            return 'submitted aligned reads index'
        if 'fastq' in name:
            return 'submitted reads'
        if 'vcf' in name:
            return 'simple nucleotide variation'
        return None

    records = list(read_json(filepath).values())
    df = pd.DataFrame(records)

    # Reformat
    df['md5sum'] = df['hashes'].apply(lambda hashes: hashes['md5'])
    # Only the first url of each record is used
    df['file_url'] = df['urls'].apply(lambda urls: urls[0])
    df['file_name'] = df['file_url'].apply(os.path.basename)
    df['file_format'] = df['file_name'].apply(extract_uncompressed_file_ext)
    df = df.rename(columns={'did': 'uuid', 'size': 'file_size'})

    # Data type
    df['data_type'] = df['file_name'].apply(classify)

    return df


## Explore

In [None]:
# dbGaP files: map each entry of DBGAP_DIR to its full path
files = {name: os.path.join(DBGAP_DIR, name) for name in os.listdir(DBGAP_DIR)}
pprint(list(files))

### Subject 

In [None]:
# Load the V3 subject file and preview its first rows.
# NOTE(review): the dtype override is keyed on 'SUBJID', while a later cell
# merges on 'SUBJECT_ID' — confirm which column name the file actually uses.
df1 = pd.read_csv(files['HL13237501A1_V3_SubjectDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df1.head()

In [None]:
# Summary of the object-dtype columns, one row per column, most distinct values first
df1.describe(include=['O']).T.sort_values('unique', ascending=False)

In [None]:
# Subject file (V2): load and preview the first rows
df2 = pd.read_csv(files['HL132375-01A1_V2_SubjectDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df2.head()

In [None]:
# Summary of the object-dtype columns, one row per column, most distinct values first
df2.describe(include=['O']).T.sort_values('unique', ascending=False)

In [None]:
# Join the two subject files on SUBJECT_ID (pd.merge defaults to an inner
# join), then summarise the object-dtype columns of the result.
df3 = pd.merge(df1, df2, on='SUBJECT_ID')
df3.describe(include=['O']).T.sort_values('unique', ascending=False)

### Family/Pedigree

In [None]:
# Load the pedigree file and preview its first rows
df = pd.read_csv(files['HL132375-01A1_V2_PedgreeDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df.head()

In [None]:
# Summary of the object-dtype columns, one row per column, most distinct values first
df.describe(include=['O']).T.sort_values('unique', ascending=False)

### Phenotypes

In [None]:
# Load the raw subject phenotypes file (codes not yet decoded) and preview it
df = pd.read_csv(files['HL132375-01A1_V2_SubjectPhenotypesDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df.head()

### Samples

In [None]:
# Sample attributes: load the raw tab-delimited file
filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SampleAttributesDS.txt')
df = pd.read_csv(filepath, delimiter='\t')
# NOTE(review): the return value is discarded, so this presumably lower-cases
# the column names of df in place — confirm against cols_to_lower.
cols_to_lower(df)
df.head()

In [None]:
# Distinct values of the histological_type column
df.histological_type.unique()

In [None]:
# Subject sample mapping: load the raw tab-delimited file and preview it
filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectSampleMappingDS.txt')
df = pd.read_csv(filepath, delimiter='\t')
df.head()

## Extract

In [None]:
# Participant + Demographic df
participant_df = create_participant_data()
# Preview only: show the first rows instead of rendering every participant row
participant_df.head()

#### Families and Proband

In [None]:
# Distinct families, and number of rows flagged as proband
print('# of families {}'.format(participant_df['family_id'].nunique()))
print('# of probands {}'.format(
    participant_df.loc[participant_df['proband'] == True, 'proband'].count()))


#### Families without a Proband

In [None]:
# Per-family summary of the proband flag; list the families whose flag does
# not take exactly two distinct values.
# NOTE(review): presumably "two distinct values" stands for one proband plus
# non-probands — confirm this is the intended test for a missing proband.
g = participant_df.groupby(['family_id'])[['subject_id', 'affstat', 'proband', 'family_id']]
p = g.describe()['proband']
p[p['unique'] != 2]

#### Participants Affected but NOT a Proband

In [None]:
# Count distinct subjects that are affected but not flagged as proband
c = participant_df.loc[
    (participant_df['affstat'] == 'affected') & (participant_df['proband'] == False),
    'subject_id'].nunique()
print('# of affected participants that are not probands {} '.format(c))

In [None]:
# Family df
family_df = read_family_data()
# Join each subject to the pedigree rows whose 'mother' value equals that
# subject's subject_id (pd.merge defaults to an inner join).
mothers = pd.merge(family_df[['subject_id', 'family_id']], family_df[['mother', 'father']], left_on='subject_id', right_on='mother')
mothers.shape

In [None]:
# Join each subject to the pedigree rows whose 'father' value equals that
# subject's subject_id (pd.merge defaults to an inner join).
fathers = pd.merge(family_df[['subject_id', 'family_id']], family_df[['mother', 'father']], left_on='subject_id', right_on='father')
fathers.shape

In [None]:
# Phenotype df: decoded phenotypes table, first rows shown
phenotype_df = read_phenotype_data()
phenotype_df.head()

In [None]:
# Diagnosis: derive the subject_id/diagnosis table from the phenotype table
diagnosis_df = create_diagnosis_df(phenotype_df)
diagnosis_df.head()

In [None]:
# Sequencing Experiments: load the manifest, print the number of distinct
# values per column, then preview the first rows
seq_exp_df = read_seq_exp_data()
print(seq_exp_df.nunique())
seq_exp_df.head()

In [None]:
# Biospecimens: build the table from the participant table, print the number
# of distinct values per column, then preview the first rows
biospecimen_df = create_biospecimen_data(participant_df)
print(biospecimen_df.nunique())
biospecimen_df.head()

In [None]:
@reformat_column_names
@dropna_rows_cols
def create_genomic_file_df(seq_exp_df, biospecimen_df):
    """
    Create genomic file df

    seq_exp_df: sequencing experiments, joined to the files on 'library'.
    biospecimen_df: biospecimens, whose 'sample_id' is matched against the
    experiments' 'sample_description'.

    File info is read from 'genomic_files_by_uuid.json' in DATA_DIR.
    """
    def library_of(file_url):
        # Last path component of the directory that holds the file
        return os.path.dirname(file_url).split('/')[-1]

    gf_df = read_genomic_files_info(
        os.path.join(DATA_DIR, 'genomic_files_by_uuid.json'))
    gf_df['library'] = gf_df['file_url'].apply(library_of)

    # Sequencing experiments x genomic files
    files_with_experiments = seq_exp_df.merge(gf_df, on='library')

    # Biospecimens x (experiments + files)
    return biospecimen_df.merge(files_with_experiments,
                                left_on='sample_id',
                                right_on='sample_description')
# Build the genomic file table and summarise its sample, library and file
# name columns
genomic_file_df = create_genomic_file_df(seq_exp_df, biospecimen_df)
# genomic_file_df.describe(include='O')
genomic_file_df[['sample_id', 'library', 'file_name']].describe(include='O')