In [1]:
import os
from pprint import pprint
import pandas as pd
pd.set_option('display.max_colwidth', -1)

DATA_DIR = '/Users/singhn4/Projects/kids_first/data/Schiffman'

In [2]:
# Helper functions
def dropna_rows_cols(df_func):
    """
    Decorator to drop rows and cols w all nan values
    """

    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)

        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df

        # Rows
        df.dropna(how="all", inplace=True)
        # Cols
        df.dropna(how="all", axis=1, inplace=True)
        return df

    return wrapper

def reformat_column_names(df_func):
    """
    Decorator to reformat DataFrame column names.

    Replace all column names having whitespace with underscore
    and make lowercase
    """

    def wrapper(*args, **kwargs):
        df = df_func(*args, **kwargs)
        # None or empty df
        try:
            if df.empty:
                return df
        except AttributeError:
            return df
        df.columns = map((lambda x: x.replace(" ", "_").lower()),
                         df.columns)
        return df

    return wrapper

In [3]:
# Create study
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001194',
    'study_version': 'v1.p2',
    'study_name': 'Expanded Ewing sarcoma cohort for tumor genomics and association with DNA repair deficiences, clinical presentation, and outcome',
    'attribution': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetAcknowledgementStatement.cgi?study_id=phs001194.v1.p2'
}
study_df = pd.DataFrame([study])
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'))

# Create investigator
invest = {
    'investigator_name': 'Joshua Schiffman',
    'institution': 'University of Utah'
}
inv_df = pd.DataFrame([invest])
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'))

In [4]:
# Read study files data
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Read in raw study files
    """
    if not filepaths:
        filepaths = os.listdir(DATA_DIR)

    study_files = [{"study_file_name": f}
                   for f in filepaths]
    return pd.DataFrame(study_files)

In [5]:
@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Read study data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'study.txt')
    df = pd.read_csv(filepath)

    return df

In [6]:
@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Read investigator data
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR,
                                'investigator.txt')
    df = pd.read_csv(filepath)

    return df

In [7]:
def cols_to_lower(df):
    df.columns = map((lambda x: x.replace(" ", "_").lower()), df.columns)

In [8]:
@reformat_column_names
@dropna_rows_cols
def read_participant_data(filepath=None):
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'Schiffman_X01 Sample List.xlsx')
    # Participants
    df = pd.read_excel(filepath)
    cols_to_lower(df)
    # Extract participant columns
    participant_df = df[['individual_name', 'ewing_trio_number', 'relationship_to_proband']]

    # Create is_proband col
    def func(row): return row.relationship_to_proband == 'Self/Case'
    participant_df['relationship_to_proband'] = participant_df.apply(func, axis=1)

    # Create family_id column
    participant_df.rename(columns={'ewing_trio_number': 'family_id'})
    
    return participant_df

In [9]:
# Participants
filepath = os.path.join(DATA_DIR, 'Schiffman_X01 Sample List.xlsx')
all_data_df = pd.read_excel(filepath)
cols_to_lower(all_data_df)

def create_diagnosis_df(df):
    # Create diagnosis df
    df.loc[(df['morphology'] == "9260/3: Ewing's sarcoma\r\nEwing's tumor"),'morphology']= 'Ewing Sarcoma'    
    def func(row): return "_".join(['diagnosis', str(row.name)])
    df['diagnosis_id'] = df.apply(func, axis=1)
    df = df[['individual_name', 'age_at_diagnosis_(days)', 'morphology']]
    return df

In [10]:
# Sequencing experiment
@reformat_column_names
@dropna_rows_cols
def read_genomic_data(filepath=None):
    if not filepath:
        df = pd.read_excel(os.path.join(DATA_DIR, 'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx'))
        del df['instrument_data_ids']
    return df

def create_seq_exp_data(df):
    df = df[['build_id', 'mean_insert_size', 'pf_reads', 'phenotype_sheet_sample_name']]
    return df

In [11]:
# Genomic files
import json

df = read_genomic_data()
df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
c,1112,1112,H_UM-Schiffman-467-SS-470,1
prefix,1112,1112,H_UM-Schiffman-1181,1
coded,1112,1112,SS-522,1
phenotype_sheet_sample_name,1112,1112,SS-522,1
individual_name,1112,1112,Schiffman-905,1
build_id,1112,1112,db2c90e8acdc4974bfe9b2cc44199e17,1
bam_path,1112,1112,/gscmnt/gc13034/prod-builder/build_merged_alignments/merged-alignment-blade17-3-1.gsc.wustl.edu-prod-builder-3431-5059d31e024f4c94a71bf581ac401ea8/5059d31e024f4c94a71bf581ac401ea8.bam,1
topography,454,33,C40.2: Long bones of lower limb and associated joints,86
morphology,452,11,9260/3: Ewing's sarcoma\n\nEwing's tumor,368
notes_from_washu,1107,6,Sequenced,974


In [12]:
# Create genomic file df
def create_genomic_file_df(df):
    df = df[['build_id', 'bam_path']]
    df['file_name'] = df['bam_path'].apply(lambda p: os.path.basename(p))

    def get_ext(fp):
        filename = os.path.basename(fp)
        parts = filename.split('.')
        if len(parts) > 2:
            ext = '.'.join(parts[1:])
        else:
            ext = parts[-1]
        return ext

    with open(os.path.join(DATA_DIR, 'genomic_file_uuid.json'), 'r') as json_file:
        uuid_dict = json.load(json_file)

    gf_dicts = []
    for k, v in uuid_dict.items():
        file_info = {
            'uuid': v['did'],
            'md5sum': v['hashes']['md5'],
            'file_url': v['urls'][0],
            'data_type': 'submitted aligned reads',
            'file_format': get_ext(v['urls'][0]),
            'file_name': os.path.basename(v['urls'][0])
        }
        gf_dicts.append(file_info)

    gf_df = pd.DataFrame(gf_dicts)

    genomic_file_df = pd.merge(df, gf_df, on='file_name')
    
    return genomic_file_df


### Explore

In [13]:
pprint([f for f in os.listdir(DATA_DIR)])

['All_Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx',
 'Copy of Pilot Score Sheet 2017.xlsx',
 'EwingSarcoma_QC.1112samples.csv',
 'genomic_file_uuid.json',
 'investigator.txt',
 'Schiffman_EwingSarcoma_QC_vs_Phenotype.csv',
 'Schiffman_EwingSarcoma_QC_vs_Phenotype.txt',
 'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx',
 'Schiffman_EwingSarcoma_QC_vs_Phenotype_trios_annotated.txt',
 'Schiffman_X01 Sample List.xlsx',
 'study.txt',
 '~$All_Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx']


#### Sample List (Schiffman_X01 Sample List.xlsx)

In [14]:
sample_list_df = pd.read_excel(os.path.join(DATA_DIR, 'Schiffman_X01 Sample List.xlsx'))
cols_to_lower(sample_list_df)
print(sample_list_df.shape)
sample_list_df.head()

(1207, 12)


Unnamed: 0,sample_name,individual_name,ewing_trio_number,relationship_to_proband,gender,age_at_diagnosis_(days),age_at_enrollment_(days),topography,morphology,full_trio/singleton/dropped,sample_type,notes_from_washu
0,SS-1,Schiffman-1,189,Father,Male,,12678.0,,,Full Trio,Saliva,Sequenced
1,SS-2,Schiffman-2,189,Mother,Female,,12919.0,,,Full Trio,Saliva,Sequenced
2,SS-3,Schiffman-3,189,Self/Case,Male,3761.0,4424.0,C40.2: Long bones of lower limb and associated joints,9260/3: Ewing's sarcoma\r\nEwing's tumor,Full Trio,Saliva,Sequenced
3,SS-4,Schiffman-4,684,Mother,Female,,16554.0,,,Full Trio,Saliva,Sequenced
4,SS-5,Schiffman-5,684,Father,Male,,16613.0,,,Full Trio,Saliva,Sequenced


In [15]:
sample_list_df.describe(include=['O']).T.sort_values('unique', ascending=False)

Unnamed: 0,count,unique,top,freq
sample_name,1207,1207,SS-522,1
individual_name,1207,1203,Schiffman-449,2
ewing_trio_number,1207,485,681,6
topography,381,32,C40.2: Long bones of lower limb and associated joints,86
morphology,380,9,9260/3: Ewing's sarcoma\r\nEwing's tumor,371
notes_from_washu,1200,8,Sequenced,974
full_trio/singleton/dropped,1207,6,Full Trio,989
relationship_to_proband,1207,4,Self/Case,487
sample_type,1207,3,Saliva,1153
gender,1207,2,Male,618


In [16]:
# Unique morphologies
print('Morphologies')
pprint(sample_list_df['morphology'].unique().tolist())
print('\nTopographies')
# Unique topographies
pprint(sample_list_df['topography'].unique().tolist())

Morphologies
[nan,
 "9260/3: Ewing's sarcoma\r\nEwing's tumor",
 '170.9 - Ewing Sarcoma & 171.0 - Rhabdomyosarcoma',
 'Ewing sarcoma with EWSR1 (22q12) rearrangement',
 'Most consistent with Ewing (EWS-FLI negative)',
 '9364/3: Neuroectodermal Tumor, NOS',
 '9364/3: Peripheral neuroectodermal tumor\r\nNeuroectodermal tumor, NOS',
 'Ewing Sarcoma/PNET',
 '171.8 - Malignant Neoplasm of other specified sites of connective and other '
 'soft tissue. Undifferentiated sarcoma of the left thigh',
 '170.9: Malignant neoplasm of bon and articular cartilage, Ewing Sarcoma']

Topographies
[nan,
 'C40.2: Long bones of lower limb and associated joints',
 'C41.2: Vertebral column',
 'C41.4: Pelvic bones, sacrum, coccyx and associated joints',
 'C40.0: Long bones of upper limb, scapula and associated joints',
 'C41.3: Rib, sternum, clavicle and associated joints',
 'C76.5: Lower limb, NOS',
 'C40.3: Short bones of lower limb and associated joints',
 'C41.1: Mandible',
 'C76.3: Pelvis, NOS',
 'C40.9: 

#### QC Phenotype (Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx)

In [17]:
qc_phenotype_df = pd.read_excel(os.path.join(DATA_DIR, 'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx'))
del qc_phenotype_df['instrument_data_ids']
cols_to_lower(qc_phenotype_df)
qc_phenotype_df.head()

Unnamed: 0,c,prefix,coded,phenotype_sheet_sample_name,individual_name,ewing_trio_number,relationship_to_proband,relationship_coded,gender,age_at_diagnosis_(days),...,reads_mapped_in_interchromosomal_pairs_percentage,reads_mapped_as_singleton_percentage,pct_10x,pct_20x,discordant_rate,qc_comment,pass/fail,proband_pass,mother_pass,father_pass
0,H_UM-Schiffman-856-SS-859,H_UM-Schiffman-856,SS-859,SS-859,Schiffman-856,1,Self/Case,1,Female,5377.0,...,0.01848,0.17,0.988625,0.924837,2.448235,pass,0,1,0,0
1,H_UM-Schiffman-857-SS-860,H_UM-Schiffman-857,SS-860,SS-860,Schiffman-857,1,Mother,2,Female,,...,0.021641,0.17,0.989257,0.95956,2.708281,pass,0,0,1,0
2,H_UM-Schiffman-858-SS-861,H_UM-Schiffman-858,SS-861,SS-861,Schiffman-858,1,Father,3,Male,,...,0.021698,0.16,0.992999,0.923224,2.803853,pass,0,0,0,1
3,H_UM-Schiffman-827-SS-830,H_UM-Schiffman-827,SS-830,SS-830,Schiffman-827,2,Self/Case,1,Female,5455.0,...,0.018074,0.21,0.988363,0.907885,2.401644,pass,0,1,0,0
4,H_UM-Schiffman-826-SS-829,H_UM-Schiffman-826,SS-829,SS-829,Schiffman-826,2,Mother,2,Female,,...,0.018384,0.25,0.988576,0.927557,2.597264,pass,0,0,1,0


In [18]:
qc_phenotype_df.describe(include=['O']).T.sort_values('unique', ascending=False)
# qc_phenotype_df.dtypes

Unnamed: 0,count,unique,top,freq
c,1112,1112,H_UM-Schiffman-467-SS-470,1
prefix,1112,1112,H_UM-Schiffman-1181,1
coded,1112,1112,SS-522,1
phenotype_sheet_sample_name,1112,1112,SS-522,1
individual_name,1112,1112,Schiffman-905,1
build_id,1112,1112,db2c90e8acdc4974bfe9b2cc44199e17,1
bam_path,1112,1112,/gscmnt/gc13034/prod-builder/build_merged_alignments/merged-alignment-blade17-3-1.gsc.wustl.edu-prod-builder-3431-5059d31e024f4c94a71bf581ac401ea8/5059d31e024f4c94a71bf581ac401ea8.bam,1
topography,454,33,C40.2: Long bones of lower limb and associated joints,86
morphology,452,11,9260/3: Ewing's sarcoma\n\nEwing's tumor,368
notes_from_washu,1107,6,Sequenced,974


#### Trio Summary

In [19]:
trio_df = pd.read_excel(os.path.join(DATA_DIR, 'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx'), sheet_name=1)
# trio_df.head()

### Extract

In [20]:
# Study
study_df = read_study_data()
study_df.head()

Unnamed: 0,unnamed:_0,attribution,data_access_authority,study_id,study_name,study_version
0,0,https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetAcknowledgementStatement.cgi?study_id=phs001194.v1.p2,dbGaP,phs001194,"Expanded Ewing sarcoma cohort for tumor genomics and association with DNA repair deficiences, clinical presentation, and outcome",v1.p2


In [21]:
# Investigator
investigator_df = read_investigator_data()
investigator_df.head()

Unnamed: 0,unnamed:_0,institution,investigator_name
0,0,University of Utah,Joshua Schiffman


In [22]:
# Study files
study_file_df = read_study_file_data()
study_file_df.head()

Unnamed: 0,study_file_name
0,All_Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx
1,Copy of Pilot Score Sheet 2017.xlsx
2,EwingSarcoma_QC.1112samples.csv
3,genomic_file_uuid.json
4,investigator.txt


In [24]:
# Participants
participant_df = read_participant_data()
participant_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,individual_name,ewing_trio_number,relationship_to_proband
0,Schiffman-1,189,False
1,Schiffman-2,189,False
2,Schiffman-3,189,True
3,Schiffman-4,684,False
4,Schiffman-5,684,False


In [25]:
# Diagnosis
diagnosis_df = create_diagnosis_df(all_data_df)
diagnosis_df.head()
diagnosis_df = diagnosis_df.where((pd.notnull(diagnosis_df)), None)
diagnosis_df.head()

Unnamed: 0,individual_name,age_at_diagnosis_(days),morphology
0,Schiffman-1,,
1,Schiffman-2,,
2,Schiffman-3,3761.0,Ewing Sarcoma
3,Schiffman-4,,
4,Schiffman-5,,
