In [1]:
import os
import json
from pprint import pprint
import pandas as pd
# Show full cell contents when rendering DataFrames. ``None`` is the documented
# "no limit" value; passing ``-1`` has been deprecated since pandas 1.0.
# Older pandas releases only accept an integer for this option, so fall back to
# the legacy value when ``None`` is rejected.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# NOTE(review): absolute, user-specific path. The directories below are derived
# from it, so this is the one line to change when running on another machine.
DATA_DIR = '/Users/singhn4/Projects/kids_first/data/Rios_Wise_2016'
DBGAP_DIR = os.path.join(DATA_DIR, 'dbgap')
MANIFESTS_DIR = os.path.join(DATA_DIR, 'manifests')

In [2]:
# Helper functions
def read_json(filepath):
    """Parse the JSON document stored at ``filepath`` and return the result."""
    with open(filepath, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def write_json(data, filepath):
    """
    Write ``data`` to ``filepath`` as JSON.

    Keys are sorted and the output is indented by four spaces, with ``,`` and
    ``:`` as the item and key separators.
    """
    dump_options = dict(sort_keys=True, indent=4, separators=(',', ':'))
    with open(filepath, 'w') as handle:
        json.dump(data, handle, **dump_options)
        
def cols_to_lower(df):
    """
    Rename the columns of ``df`` in place.

    Spaces in each column name become underscores and the name is lower-cased.
    Nothing is returned.
    """
    df.columns = [label.replace(" ", "_").lower() for label in df.columns]
        
def dropna_rows_cols(df_func):
    """
    Decorator to drop rows and cols w all nan values
    Replace NaN values with None

    The wrapped function's return value is passed through untouched when it is
    an empty DataFrame or an object without an ``empty`` attribute (e.g. None).
    Otherwise a cleaned frame is returned; the frame produced by the wrapped
    function is not modified.

    NOTE(review): the NaN -> None step relies on ``DataFrame.where(..., None)``;
    whether numeric columns end up holding None or NaN may depend on the
    installed pandas version -- confirm if downstream code needs real None.
    """
    # Local import so the decorator does not depend on the notebook's import cell.
    import functools

    # functools.wraps keeps the wrapped reader's __name__ and docstring.
    @functools.wraps(df_func)
    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)

        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df

        # Rows that are entirely NaN (non-inplace: leave the source frame alone)
        df = df.dropna(how="all")
        # Cols that are entirely NaN
        df = df.dropna(how="all", axis=1)
        # Replace NaN values with None
        df = df.where((pd.notnull(df)), None)
        return df

    return wrapper

def reformat_column_names(df_func):
    """
    Decorator to reformat DataFrame column names.

    Replace all column names having whitespace with underscore
    and make lowercase

    The wrapped function's return value is passed through untouched when it is
    an empty DataFrame or an object without an ``empty`` attribute (e.g. None).
    Otherwise its columns are renamed in place via ``cols_to_lower`` and the
    same frame is returned.
    """
    # Local import so the decorator does not depend on the notebook's import cell.
    import functools

    # functools.wraps keeps the wrapped reader's __name__ and docstring.
    @functools.wraps(df_func)
    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)
        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df

        cols_to_lower(df)

        return df

    return wrapper

In [30]:
# Create study: a single-row table describing the dbGaP study, saved as study.txt
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001410',
    'study_version': 'v1.p2',
    'study_name': 'Genomics of Orthopaedic Disease Program',
    'attribution': None
}
study_df = pd.DataFrame([study])
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'), index=False)

# Create investigator: a single-row table, saved as investigator.txt
invest = {
    'investigator_name': 'Jonathan Rios',
    'institution': 'UT Southwestern Medical Center'
}
inv_df = pd.DataFrame([invest])
# index=False, as for study.txt: without it the row index is written as an
# unnamed first column, which shows up as an extra column when the file is read back.
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'), index=False)

In [4]:
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Build a one-column table (``study_file_name``) of study files.

    ``filepaths`` defaults to the directory listing of DBGAP_DIR; only names
    containing the substring ``dbGaP`` are kept.
    """
    candidates = filepaths if filepaths else os.listdir(DBGAP_DIR)

    records = []
    for name in candidates:
        if 'dbGaP' in name:
            records.append({"study_file_name": name})
    return pd.DataFrame(records)

@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Load the study table from CSV.

    ``filepath`` defaults to ``study.txt`` inside DATA_DIR.
    """
    source = filepath or os.path.join(DATA_DIR, 'study.txt')
    return pd.read_csv(source)

@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Load the investigator table from CSV.

    ``filepath`` defaults to ``investigator.txt`` inside DATA_DIR.
    """
    source = filepath or os.path.join(DATA_DIR, 'investigator.txt')
    return pd.read_csv(source)

@reformat_column_names
@dropna_rows_cols
def read_subject_data(filepath=None):
    """
    Read the tab-delimited subject file and decode its consent codes.

    ``filepath`` defaults to ``HL13237501A1_V3_SubjectDS.txt`` inside DBGAP_DIR.
    Only the SUBJECT_ID and CONSENT columns are kept; CONSENT integers are
    replaced by their consent-group strings (code 0 becomes None).

    Raises KeyError when a CONSENT value is not one of the known codes.
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL13237501A1_V3_SubjectDS.txt')
    # NOTE(review): the dtype hint names 'SUBJID', but the column selected below
    # is 'SUBJECT_ID' -- the hint looks like a no-op; confirm the intended column.
    df = pd.read_csv(filepath, delimiter='\t', dtype={'SUBJID': str})
    # Explicit copy so the column assignment below writes to an independent
    # frame instead of a slice of the parsed file (avoids chained assignment).
    df = df[['SUBJECT_ID', 'CONSENT']].copy()

    # Decode consent ints to consent strings
    consent_labels = {
        0: None,
        1: "Health/Medical/Biomedical (IRB)",
        2: "Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",
    }
    # Column-wise map with a strict lookup: unknown codes still raise KeyError,
    # and an empty file yields an empty column rather than failing.
    df['CONSENT'] = df['CONSENT'].map(lambda code: consent_labels[code])

    return df

@reformat_column_names
@dropna_rows_cols
def read_phenotype_data(filepath=None):
    """
    Read the tab-delimited subject phenotype file and decode its coded columns.

    ``filepath`` defaults to ``HL132375-01A1_V2_SubjectPhenotypesDS.txt`` inside
    DBGAP_DIR. The Sex, AFFSTAT and Proband integer codes are replaced by
    labels, and an ``ethnicity`` column is derived from Race.

    Raises KeyError when a coded column holds a value outside its code table.
    """
    if not filepath:
        filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectPhenotypesDS.txt')
    # NOTE(review): the dtype hint names 'SUBJID'; confirm it matches a real
    # column of this file, otherwise it has no effect.
    df = pd.read_csv(filepath,
                     delimiter='\t',
                     dtype={'SUBJID': str})

    def decode(column, labels):
        # Column-wise map with a strict lookup: unknown codes raise KeyError,
        # and an empty file yields an empty column rather than failing.
        df[column] = df[column].map(lambda code: labels[code])

    # Decode sex ints to gender strings
    decode('Sex', {1: "male", 2: "female"})

    # Decode affected status ints to strings
    decode('AFFSTAT', {0: 'unknown', 1: "not affected", 2: "affected"})

    # Decode proband ints to booleans
    decode('Proband', {1: True, 2: False})

    # Create ethnicity column
    # NOTE(review): every Race value other than 'Hispanic' -- including a
    # missing one -- is labelled 'not hispanic or latino'; confirm that is intended.
    _map = {'Hispanic': 'hispanic or latino'}
    df['ethnicity'] = df['Race'].apply(lambda x: _map.get(x, 'not hispanic or latino'))

    return df

@reformat_column_names
@dropna_rows_cols
def read_family_data(filepath=None):
    """
    Load the tab-delimited pedigree file without its SEX column.

    ``filepath`` defaults to ``HL132375-01A1_V2_PedgreeDS.txt`` inside DBGAP_DIR.
    """
    source = filepath or os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_PedgreeDS.txt')
    pedigree = pd.read_csv(source, delimiter='\t', dtype={'SUBJID': str})
    return pedigree.drop('SEX', axis=1)

@reformat_column_names
@dropna_rows_cols
def create_participant_data():
    """
    Create participant data from the subject, phenotype and family files.

    The three tables are read with their default paths and joined on
    ``subject_id`` (pandas' default inner join).
    """
    subjects = read_subject_data()
    phenotypes = read_phenotype_data()
    pedigree = read_family_data()

    # subject + phenotype, then + family
    return (
        subjects
        .merge(phenotypes, on='subject_id')
        .merge(pedigree, on='subject_id')
    )

def create_diagnosis_df(phenotype_df):
    """
    Create diagnosis df from phenotype df

    Maps ``affstat`` to a diagnosis: 'affected' becomes
    'adolescent idiopathic scoliosis', 'not affected' becomes None, and any
    other value (e.g. 'unknown') is carried over unchanged.

    Returns a new frame with columns ``subject_id`` and ``diagnosis``; the
    input ``phenotype_df`` is left unmodified.
    """
    diagnosis_by_status = {'affected': 'adolescent idiopathic scoliosis',
                           'not affected': None}
    diagnosis = phenotype_df['affstat'].map(
        lambda status: diagnosis_by_status.get(status, status))

    # assign() builds a new frame, so no 'diagnosis' column leaks into the
    # caller's phenotype frame.
    return phenotype_df[['subject_id']].assign(diagnosis=diagnosis)

def create_phenotype_df(phenotype_df):
    """
    Create phenotype df from original phenotype_df

    Keeps ``subject_id`` and ``affstat`` for every row whose ``affstat`` is not
    'unknown', then adds:

    - ``observed``: 'positive' for 'affected', 'negative' for 'not affected',
      any other status carried over unchanged
    - ``hpo_id``: the constant 'HP:0002650'
    - ``phenotype``: the constant 'adolescent idiopathic scoliosis'

    Returns a new frame; the input is left unmodified.
    """
    observed_by_status = {'affected': 'positive',
                          'not affected': 'negative'}

    # Extract columns and drop unknowns in one .loc selection
    known = phenotype_df.loc[phenotype_df['affstat'] != 'unknown',
                             ['subject_id', 'affstat']]

    # assign() returns a fresh frame instead of writing into a filtered slice,
    # and the column-wise map also works when no rows remain.
    return known.assign(
        observed=known['affstat'].map(
            lambda status: observed_by_status.get(status, status)),
        hpo_id='HP:0002650',
        phenotype='adolescent idiopathic scoliosis',
    )
# Sanity check: list the distinct diagnosis values derived from the phenotype file.
df = create_diagnosis_df(read_phenotype_data())
df.diagnosis.unique()

array(['adolescent idiopathic scoliosis', None, 'unknown'], dtype=object)

In [5]:
@reformat_column_names
@dropna_rows_cols
def read_sample_attr_data(filepath=None):
    """
    Load the tab-delimited sample attributes file.

    ``filepath`` defaults to ``HL132375-01A1_V2_SampleAttributesDS.txt`` inside
    DBGAP_DIR.
    """
    source = filepath or os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SampleAttributesDS.txt')
    sample_attributes = pd.read_csv(source, delimiter='\t')
    return sample_attributes

@reformat_column_names
@dropna_rows_cols
def read_subject_sample_data(filepath=None):
    """
    Load the tab-delimited subject-to-sample mapping file.

    ``filepath`` defaults to ``HL132375-01A1_V2_SubjectSampleMappingDS.txt``
    inside DBGAP_DIR.
    """
    source = filepath or os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectSampleMappingDS.txt')
    subject_samples = pd.read_csv(source, delimiter='\t')
    return subject_samples

# Build sample_df: sample attributes joined to the subject/sample mapping on
# sample_id, then joined to the subject table on subject_id.

# Sample attributes file
sample_attr_df = read_sample_attr_data()
# Bare expression in the middle of a cell: evaluated but not displayed.
sample_attr_df.shape
# Subject sample file
subject_sample_df = read_subject_sample_data()
# Subject file
subject_df = read_subject_data()

# Merge sample attributes w subject sample
df1 = pd.merge(sample_attr_df, subject_sample_df, on='sample_id')
# Merge sample with subject
sample_df = pd.merge(df1, subject_df, on='subject_id')
# Not the last expression of the cell, so this does not render either.
sample_df

@reformat_column_names
@dropna_rows_cols
def read_seq_exp_data(filepath=None):
    """
    Read sequencing experiment data

    Reads the sequencing manifest CSV (``filepath`` defaults to
    ``manifest_171210.csv`` inside MANIFESTS_DIR), reduces each
    'Sample Description' to the text after its last ':' and joins the manifest
    to the subject/sample mapping file on SAMPLE_ID == 'Sample Description'.
    A ``seq_exp_id`` column of the form ``seq_exp_<row index>`` is added to the
    merged frame.

    NOTE(review): the merge does not de-duplicate; if a sample appears more
    than once in either input, it receives several seq_exp ids -- confirm that
    is intended.
    """
    if not filepath:
        filepath = os.path.join(MANIFESTS_DIR, 'manifest_171210.csv')

    df = pd.read_csv(filepath)
    # Keep only the text after the last ':'. The .str accessor leaves missing
    # descriptions as NaN instead of raising on a non-string value.
    df['Sample Description'] = (
        df['Sample Description'].str.split(':').str[-1].str.strip()
    )

    # Subject sample mapping
    mapping_path = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectSampleMappingDS.txt')
    subject_sample_df = pd.read_csv(mapping_path, delimiter='\t')

    # Merge with subject samples
    df = pd.merge(subject_sample_df, df, left_on='SAMPLE_ID', right_on='Sample Description')

    # Add unique col, built from the merged frame's row labels (also valid when
    # the merge produced no rows).
    df['seq_exp_id'] = ["_".join(['seq_exp', str(label)]) for label in df.index]

    return df
# Load the sequencing experiment table and report its (rows, columns) shape.
df = read_seq_exp_data()
df.shape

(395, 11)

## Explore

In [6]:
# dbGaP files: map each file name in DBGAP_DIR to its full path, then list the names.
files = {f:os.path.join(DBGAP_DIR, f) for f in os.listdir(DBGAP_DIR)}
pprint(list(files.keys()))

['HL132375-01A1_V2_PedgreeDD.txt',
 'HL132375-01A1_V2_PedgreeDS.txt',
 'HL132375-01A1_V2_SampleAttributesDD.txt',
 'HL132375-01A1_V2_SampleAttributesDS.txt',
 'HL132375-01A1_V2_SubjectDD.txt',
 'HL132375-01A1_V2_SubjectDS.txt',
 'HL132375-01A1_V2_SubjectPhenotypesDD.txt',
 'HL132375-01A1_V2_SubjectPhenotypesDS.txt',
 'HL132375-01A1_V2_SubjectSampleMappingDD.txt',
 'HL132375-01A1_V2_SubjectSampleMappingDS.txt',
 'HL13237501A1_V3_SubjectDS.txt',
 'HL13237501A1_V3_SubjectSampleMappingDS.txt']


### Subject 

In [7]:
# Load the V3 subject file and preview its first rows.
# NOTE(review): the dtype hint names 'SUBJID'; the previewed columns use
# 'SUBJECT_ID', so the hint looks like a no-op -- confirm.
df1 = pd.read_csv(files['HL13237501A1_V3_SubjectDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df1.head()

Unnamed: 0,SUBJECT_ID,CONSENT,SUBJECT_SOURCE,SOURCE_SUBJECT_ID,AFFSTAT
0,6355001,2,"Washington University, St. Louis",6355001,2.0
1,6355002,2,"Washington University, St. Louis",6355002,2.0
2,6355004,2,"Washington University, St. Louis",6355004,2.0
3,6355005,2,"Washington University, St. Louis",6355005,2.0
4,6355006,2,"Washington University, St. Louis",6355006,2.0


In [8]:
# Summarise the object-typed columns of the V3 subject file, most distinct values first.
df1.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,351,351,IS1076-3,1
SOURCE_SUBJECT_ID,351,351,IS1076-3,1
SUBJECT_SOURCE,351,2,Texas Scottish Rite Hospital For Children,318


In [9]:
# Subject: load the V2 subject file and preview its first rows.
# NOTE(review): the dtype hint names 'SUBJID'; the previewed columns use
# 'SUBJECT_ID', so the hint looks like a no-op -- confirm.
df2 = pd.read_csv(files['HL132375-01A1_V2_SubjectDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df2.head()

Unnamed: 0,SUBJECT_ID,CONSENT,SUBJECT_SOURCE,SOURCE_SUBJECT_ID,AFFSTAT
0,6355001,2,"Washington University, St. Louis",6355001,2.0
1,6355002,2,"Washington University, St. Louis",6355002,2.0
2,6355004,2,"Washington University, St. Louis",6355004,2.0
3,6355005,2,"Washington University, St. Louis",6355005,2.0
4,6355006,2,"Washington University, St. Louis",6355006,2.0


In [10]:
# Summarise the object-typed columns of the V2 subject file, most distinct values first.
df2.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,351,351,IS1076-3,1
SOURCE_SUBJECT_ID,300,300,IS1076-3,1
SUBJECT_SOURCE,351,2,Texas Scottish Rite Hospital For Children,318


In [11]:
# Join the V3 and V2 subject files on SUBJECT_ID to compare them side by side
# (columns present in both get the _x / _y suffixes), then summarise as before.
df3 = pd.merge(df1, df2, on='SUBJECT_ID')
df3.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,351,351,IS1076-3,1
SOURCE_SUBJECT_ID_x,351,351,IS1076-3,1
SOURCE_SUBJECT_ID_y,300,300,IS1076-3,1
SUBJECT_SOURCE_x,351,2,Texas Scottish Rite Hospital For Children,318
SUBJECT_SOURCE_y,351,2,Texas Scottish Rite Hospital For Children,318


### Family/Pedigree

In [12]:
# Load the pedigree file and preview its first rows.
df = pd.read_csv(files['HL132375-01A1_V2_PedgreeDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,FAMILY_ID,SUBJECT_ID,MOTHER,FATHER,SEX
0,6355,6355001,6355002,6355201,2
1,6355,6355002,6355302,6355301,2
2,6355,6355004,6355302,6355301,2
3,6355,6355005,6355004,6355203,1
4,6355,6355006,6355004,6355203,2


In [13]:
# Summarise the object-typed columns of the pedigree file, most distinct values first.
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
SUBJECT_ID,351,351,IS1076-3,1
FATHER,351,103,0,175
MOTHER,351,100,0,175
FAMILY_ID,351,73,IS0031,16


### Phenotypes

In [14]:
# Load the raw (still integer-coded) subject phenotypes file and preview its first rows.
df = pd.read_csv(files['HL132375-01A1_V2_SubjectPhenotypesDS.txt'], delimiter='\t', dtype={'SUBJID': str})
df.head()

Unnamed: 0,Subject_ID,Sex,AFFSTAT,Proband,Race,Treatment
0,6355001,2,2,1,Caucasian,Surgery
1,6355002,2,2,2,Caucasian,Surgery
2,6355004,2,2,2,Caucasian,Observation
3,6355005,1,2,2,Caucasian,Surgery
4,6355006,2,2,2,Caucasian,Observation


### Samples

In [15]:
# Sample attributes: load the file, normalise the column names in place
# (lower-case, spaces to underscores) and preview the first rows.
filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SampleAttributesDS.txt')
df = pd.read_csv(filepath, delimiter='\t')
cols_to_lower(df)
df.head()

Unnamed: 0,sample_id,body_site,analyte_type,is_tumor,histological_type
0,6355001,Blood,DNA,N,Blood
1,6355002,Blood,DNA,N,Blood
2,6355004,Blood,DNA,N,Blood
3,6355005,Blood,DNA,N,Blood
4,6355006,Blood,DNA,N,Blood


In [16]:
# Distinct histological types present in the sample attributes loaded above.
df.histological_type.unique()

array(['Blood', 'Saliva'], dtype=object)

In [17]:
# Subject sample mapping: load the raw file and display it.
# NOTE(review): the bare `df` renders the whole frame; consider df.head().
filepath = os.path.join(DBGAP_DIR, 'HL132375-01A1_V2_SubjectSampleMappingDS.txt')
df = pd.read_csv(filepath, delimiter='\t')
df

Unnamed: 0,SUBJECT_ID,SAMPLE_ID,SAMPLE_SOURCE,SOURCE_SAMPLE_ID,SAMPLE_USE
0,6355001,6355001,"Washington University, St. Louis",6355001,Seq_DNA_WholeGenome
1,6355002,6355002,"Washington University, St. Louis",6355002,Seq_DNA_WholeGenome
2,6355004,6355004,"Washington University, St. Louis",6355004,Seq_DNA_WholeGenome
3,6355005,6355005,"Washington University, St. Louis",6355005,Seq_DNA_WholeGenome
4,6355006,6355006,"Washington University, St. Louis",6355006,Seq_DNA_WholeGenome
5,6409001,6409001,"Washington University, St. Louis",6409001,Seq_DNA_WholeGenome
6,6409002,6409002,"Washington University, St. Louis",6409002,Seq_DNA_WholeGenome
7,6409003,6409003,"Washington University, St. Louis",6409003,Seq_DNA_WholeGenome
8,6409004,6409004,"Washington University, St. Louis",6409004,Seq_DNA_WholeGenome
9,6446001,6446001,"Washington University, St. Louis",6446001,Seq_DNA_WholeGenome


## Extract

In [18]:
# Participant + Demographic df
participant_df = create_participant_data()
# Not the last expression of the cell, so this preview is evaluated but not shown.
participant_df.head()
# The bare name below is what gets displayed (the whole frame).
participant_df

Unnamed: 0,subject_id,consent,sex,affstat,proband,race,treatment,ethnicity,family_id,mother,father
0,6355001,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",female,affected,True,Caucasian,Surgery,not hispanic or latino,6355,6355002,6355201
1,6355002,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",female,affected,False,Caucasian,Surgery,not hispanic or latino,6355,6355302,6355301
2,6355004,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",female,affected,False,Caucasian,Observation,not hispanic or latino,6355,6355302,6355301
3,6355005,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",male,affected,False,Caucasian,Surgery,not hispanic or latino,6355,6355004,6355203
4,6355006,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",female,affected,False,Caucasian,Observation,not hispanic or latino,6355,6355004,6355203
5,6409001,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",male,affected,True,Caucasian,Observation,not hispanic or latino,6409,6409002,6409003
6,6409002,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",female,affected,False,Caucasian,Observation,not hispanic or latino,6409,0,0
7,6409003,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",male,not affected,False,Caucasian,,not hispanic or latino,6409,0,0
8,6409004,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",male,affected,False,Caucasian,Surgery,not hispanic or latino,6409,6409002,6409003
9,6446001,"Disease-Specific (Musculoskeletal Diseases, IRB)(DS-MUS-SKEL-IRB)",male,affected,True,Caucasian,Surgery,not hispanic or latino,6446,6446002,6446201


In [19]:
# Family df
family_df = read_family_data()
# Pair each subject (with its family_id) with the pedigree rows that name that
# subject as `mother`; the shape gives the number of matched rows.
mothers = pd.merge(family_df[['subject_id', 'family_id']], family_df[['mother', 'father']], left_on='subject_id', right_on='mother')
mothers.shape

(176, 4)

In [20]:
# Same pairing as for mothers, matching subjects against the `father` column.
fathers = pd.merge(family_df[['subject_id', 'family_id']], family_df[['mother', 'father']], left_on='subject_id', right_on='father')
fathers.shape

(176, 4)

In [21]:
# Phenotype df: decoded phenotype table (labels instead of integer codes), first rows.
phenotype_df = read_phenotype_data()
phenotype_df.head()

Unnamed: 0,subject_id,sex,affstat,proband,race,treatment,ethnicity
0,6355001,female,affected,True,Caucasian,Surgery,not hispanic or latino
1,6355002,female,affected,False,Caucasian,Surgery,not hispanic or latino
2,6355004,female,affected,False,Caucasian,Observation,not hispanic or latino
3,6355005,male,affected,False,Caucasian,Surgery,not hispanic or latino
4,6355006,female,affected,False,Caucasian,Observation,not hispanic or latino


In [22]:
# Diagnosis: derive the subject_id / diagnosis table from the decoded phenotypes.
diagnosis_df = create_diagnosis_df(phenotype_df)
diagnosis_df.head()

Unnamed: 0,subject_id,diagnosis
0,6355001,adolescent idiopathic scoliosis
1,6355002,adolescent idiopathic scoliosis
2,6355004,adolescent idiopathic scoliosis
3,6355005,adolescent idiopathic scoliosis
4,6355006,adolescent idiopathic scoliosis


In [26]:
# Genomic files: join sequencing experiments to the genomic file records by
# library id and derive md5sum, file_url, file_format and data_type columns.

# Seq Exp Data
seq_exp_df = read_seq_exp_data()
seq_exp_df = seq_exp_df[['subject_id', 'library', 'sample_id', 'sample_description', 'seq_exp_id']]
# Bare expression in the middle of a cell: evaluated but not displayed.
seq_exp_df

# Genomic file info
uuid_dict= read_json(os.path.join(DATA_DIR,'genomic_files_by_uuid.json'))
gf_df = pd.DataFrame(list(uuid_dict.values()))
# Library id = the part of the file name before its first '.'.
# NOTE(review): file names that do not start with the library id will not
# match in the merge below -- confirm no files are lost this way.
gf_df['library'] = gf_df['file_name'].apply(lambda fn: fn.split('.')[0])

# Merge
df = pd.merge(seq_exp_df, gf_df, on='library')

# Reformat
df['md5sum'] = df['hashes'].apply(lambda x: x['md5'])
# Only the first entry of each `urls` list is kept.
df['file_url'] = df['urls'].apply(lambda x: x[0])
# Everything after the first '.', so multi-part names keep every suffix
# (e.g. 'hard-filtered.vcf').
df['file_format'] = df['file_name'].apply(lambda x: '.'.join(x.split('.')[1:]))
df.rename(columns={'did':'uuid', 'size':'file_size'}, inplace=True)
# NOTE(review): every file whose format is not exactly 'bam' is labelled
# 'variant calling' -- confirm no other file types reach this point.
df['data_type'] = df.apply(lambda row: 'submitted aligned reads' if row['file_format'] == 'bam' else 'variant calling', axis=1)
df

Unnamed: 0,subject_id,library,sample_id,sample_description,seq_exp_id,uuid,file_name,form,hashes,file_size,urls,md5sum,file_url,file_format,data_type
0,6355001,SL248491,6355001,6355001,seq_exp_0,18866395-d3e4-492d-a83c-1cefee4564f0,SL248491.bam,object,{'md5': '8d383e4bd96750933a24aaf80fe5e9cc'},88990671195,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam],8d383e4bd96750933a24aaf80fe5e9cc,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam,bam,submitted aligned reads
1,6355001,SL248491,6355001,6355001,seq_exp_0,429bf3b2-2119-41f1-af50-553954b82954,SL248491.hard-filtered.vcf,object,{'md5': 'e784d364c8bf4387d532bce0b13573d9'},1039857926,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.hard-filtered.vcf],e784d364c8bf4387d532bce0b13573d9,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.hard-filtered.vcf,hard-filtered.vcf,variant calling
2,6355001,SL248491,6355001,6355001,seq_exp_0,a99b0e34-00d1-4f9e-9042-e5a8f2ff8480,SL248491.bam,object,{'md5': '8d383e4bd96750933a24aaf80fe5e9cc'},88990671195,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam],8d383e4bd96750933a24aaf80fe5e9cc,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam,bam,submitted aligned reads
3,6355001,SL248491,6355001,6355001,seq_exp_1,18866395-d3e4-492d-a83c-1cefee4564f0,SL248491.bam,object,{'md5': '8d383e4bd96750933a24aaf80fe5e9cc'},88990671195,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam],8d383e4bd96750933a24aaf80fe5e9cc,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam,bam,submitted aligned reads
4,6355001,SL248491,6355001,6355001,seq_exp_1,429bf3b2-2119-41f1-af50-553954b82954,SL248491.hard-filtered.vcf,object,{'md5': 'e784d364c8bf4387d532bce0b13573d9'},1039857926,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.hard-filtered.vcf],e784d364c8bf4387d532bce0b13573d9,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.hard-filtered.vcf,hard-filtered.vcf,variant calling
5,6355001,SL248491,6355001,6355001,seq_exp_1,a99b0e34-00d1-4f9e-9042-e5a8f2ff8480,SL248491.bam,object,{'md5': '8d383e4bd96750933a24aaf80fe5e9cc'},88990671195,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam],8d383e4bd96750933a24aaf80fe5e9cc,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248491/SL248491.bam,bam,submitted aligned reads
6,6355002,SL248472,6355002,6355002,seq_exp_2,2e30118e-98ff-446d-ab31-4c3e501d9b5e,SL248472.bam,object,{'md5': 'a6741d9591739f74529529da2928c288'},97546026743,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.bam],a6741d9591739f74529529da2928c288,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.bam,bam,submitted aligned reads
7,6355002,SL248472,6355002,6355002,seq_exp_2,310a22cd-cf52-4df8-8a89-5aecdaeb407d,SL248472.hard-filtered.vcf,object,{'md5': '78650fab8221c979eb13aba9997bc4b7'},1038379761,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.hard-filtered.vcf],78650fab8221c979eb13aba9997bc4b7,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.hard-filtered.vcf,hard-filtered.vcf,variant calling
8,6355002,SL248472,6355002,6355002,seq_exp_2,adfdd5d3-41fc-4e6f-a54a-602735d39173,SL248472.bam,object,{'md5': 'a6741d9591739f74529529da2928c288'},97546026743,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.bam],a6741d9591739f74529529da2928c288,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.bam,bam,submitted aligned reads
9,6355002,SL248472,6355002,6355002,seq_exp_3,2e30118e-98ff-446d-ab31-4c3e501d9b5e,SL248472.bam,object,{'md5': 'a6741d9591739f74529529da2928c288'},97546026743,[s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.bam],a6741d9591739f74529529da2928c288,s3://kf-seq-data-hudsonalpha/haib17JR4599/SL248472/SL248472.bam,bam,submitted aligned reads


In [24]:
# Example: take the name of the directory that directly contains the file in an
# s3 URL (the last component of the URL's dirname).
urls = ['s3://kf-seq-data-hudsonalpha/haib17JR4599/SL250571/HJKGVALXX_s1_2_GSLv3-7_82_SL250571.fastq.gz']
dirname = os.path.dirname(urls[0]).split('/')[-1]
dirname

'SL250571'