In [None]:
import os
from pprint import pprint
import pandas as pd
# Never truncate cell contents when displaying frames. pandas >= 1.0 spells
# "no limit" as None and newer releases reject the legacy -1; older releases
# only accept integers, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

from dataservice.util.data_import.utils import (
    dropna_rows_cols,
    reformat_column_names,
    cols_to_lower,
    read_json, 
    write_json,
    extract_uncompressed_file_ext
)

# Directory holding the raw Schiffman study files. Set the SCHIFFMAN_DATA_DIR
# environment variable to run the notebook on another machine; the default
# is the path the notebook was originally written against.
DATA_DIR = os.environ.get(
    'SCHIFFMAN_DATA_DIR',
    '/Users/singhn4/Projects/kids_first/data/Schiffman')

In [None]:
# Create study
study = {
    'data_access_authority': 'dbGaP',
    'study_id': 'phs001194',
    'study_version': 'v1.p2',
    'study_name': 'Expanded Ewing sarcoma cohort for tumor genomics and association with DNA repair deficiences, clinical presentation, and outcome',
    'attribution': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetAcknowledgementStatement.cgi?study_id=phs001194.v1.p2'
}
study_df = pd.DataFrame([study])
# index=False keeps the positional index out of the file; otherwise reading
# the file back with pd.read_csv yields a spurious "Unnamed: 0" column.
study_df.to_csv(os.path.join(DATA_DIR, 'study.txt'), index=False)

# Create investigator
invest = {
    'investigator_name': 'Joshua Schiffman',
    'institution': 'University of Utah'
}
inv_df = pd.DataFrame([invest])
inv_df.to_csv(os.path.join(DATA_DIR, 'investigator.txt'), index=False)

In [None]:
# Read study files data
@reformat_column_names
@dropna_rows_cols
def read_study_file_data(filepaths=None):
    """
    Build a one-column table listing the raw study files

    :param filepaths: file names to list; when falsy, the entries returned
        by os.listdir(DATA_DIR) are used instead
    :returns: DataFrame built from one {"study_file_name": name} record per
        entry
    """
    names = filepaths if filepaths else os.listdir(DATA_DIR)

    records = []
    for name in names:
        records.append({"study_file_name": name})
    return pd.DataFrame(records)

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_study_data(filepath=None):
    """
    Load the study table from CSV

    :param filepath: CSV file to read; when falsy, 'study.txt' inside
        DATA_DIR is used
    :returns: DataFrame parsed by pandas.read_csv
    """
    source = filepath or os.path.join(DATA_DIR, 'study.txt')
    return pd.read_csv(source)

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_investigator_data(filepath=None):
    """
    Load the investigator table from CSV

    :param filepath: CSV file to read; when falsy, 'investigator.txt'
        inside DATA_DIR is used
    :returns: DataFrame parsed by pandas.read_csv
    """
    source = filepath or os.path.join(DATA_DIR, 'investigator.txt')
    return pd.read_csv(source)

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_data(filepath=None):
    """
    Load the complete sample list spreadsheet into a dataframe

    :param filepath: Excel file to read; when falsy,
        'Schiffman_X01 Sample List.xlsx' inside DATA_DIR is used
    :returns: DataFrame parsed by pandas.read_excel
    """
    source = filepath or os.path.join(DATA_DIR,
                                      'Schiffman_X01 Sample List.xlsx')
    return pd.read_excel(source)

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_participant_data(filepath=None):
    """
    Build the participant table from the sample list spreadsheet

    :param filepath: Excel file to read; when falsy,
        'Schiffman_X01 Sample List.xlsx' inside DATA_DIR is used
    :returns: DataFrame with the columns individual_name, family_id
        (renamed from ewing_trio_number) and relationship_to_proband
        (True where the source value is 'Self/Case', False otherwise)
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'Schiffman_X01 Sample List.xlsx')
    # Participants
    df = pd.read_excel(filepath)
    # Return value is unused, so this presumably renames columns in place
    # -- TODO confirm against cols_to_lower
    cols_to_lower(df)
    # Extract participant columns. Copy so the assignment below acts on an
    # independent frame instead of a slice of `df` (SettingWithCopyWarning)
    participant_df = df[['individual_name', 'ewing_trio_number',
                         'relationship_to_proband']].copy()

    # Flag probands.
    # NOTE(review): the original comment said "Create is_proband col", yet
    # the flag overwrites relationship_to_proband. Behaviour is kept as is;
    # confirm whether a separate is_proband column was intended.
    participant_df['relationship_to_proband'] = (
        participant_df['relationship_to_proband'] == 'Self/Case')

    # Create family_id column. rename() returns a new frame, so the result
    # has to be assigned or the rename is silently lost
    participant_df = participant_df.rename(
        columns={'ewing_trio_number': 'family_id'})

    return participant_df

In [None]:
@reformat_column_names
@dropna_rows_cols
def create_diagnosis_df(df):
    """
    Create the diagnosis table from the sample list data

    :param df: DataFrame with the columns individual_name,
        age_at_diagnosis_(days) and morphology; it is not modified
    :returns: new DataFrame with those three columns followed by
        diagnosis_id ("diagnosis_<index label>"), with the raw ICD-O
        Ewing's sarcoma morphology text replaced by 'Ewing Sarcoma'
    """
    # Work on a copy so the caller's frame neither gains a diagnosis_id
    # column nor has its morphology values rewritten behind its back
    df = df.copy()

    # Create diagnosis df
    raw_ewing = "9260/3: Ewing's sarcoma\r\nEwing's tumor"
    df.loc[df['morphology'] == raw_ewing, 'morphology'] = 'Ewing Sarcoma'
    df['diagnosis_id'] = ['diagnosis_{}'.format(label) for label in df.index]

    # diagnosis_id is appended last so the positions of the previously
    # returned columns do not change
    return df[['individual_name', 'age_at_diagnosis_(days)', 'morphology',
               'diagnosis_id']]

In [None]:
@reformat_column_names
@dropna_rows_cols
def read_genomic_data(filepath=None):
    """
    Load the genomic (QC vs phenotype) spreadsheet

    :param filepath: Excel file to read; when falsy,
        'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx' inside DATA_DIR is used
    :returns: DataFrame parsed by pandas.read_excel
    """
    source = filepath or os.path.join(
        DATA_DIR, 'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx')
    return pd.read_excel(source)

In [None]:
def read_genomic_files_info(filepath):
    """
    Load the genomic file info json produced by Gen3 registration
    and reshape it into the genomic file table for the dataservice

    :param filepath: path handed to read_json
    :returns: DataFrame with one row per value of the loaded JSON object,
        the derived columns md5sum, file_url, file_name, file_format and
        data_type, and `did` / `size` renamed to `uuid` / `file_size`
    """
    def _data_type_of(file_name):
        """Return the data type label for a file name, None if unknown"""
        name = file_name.strip()
        if name.endswith(('cram', 'bam')):
            return 'submitted aligned reads'
        if name.endswith('crai'):
            return 'submitted aligned reads index'
        if 'fastq' in name:
            return 'submitted reads'
        if 'vcf' in name:
            return 'simple nucleotide variation'
        return None

    records = read_json(filepath)
    df = pd.DataFrame(list(records.values()))

    # Derived columns: checksum, first URL, and what follows from the URL
    df['md5sum'] = df['hashes'].apply(lambda hashes: hashes['md5'])
    df['file_url'] = df['urls'].apply(lambda urls: urls[0])
    df['file_name'] = df['file_url'].apply(os.path.basename)
    df['file_format'] = df['file_name'].apply(extract_uncompressed_file_ext)
    df = df.rename(columns={'did': 'uuid', 'size': 'file_size'})

    # Data type
    df['data_type'] = df['file_name'].apply(_data_type_of)
    return df


In [None]:
# Genomic files
def _create_genomic_file_df(genomic_df, biospecimen_df, filepath=None):
    """
    Join sequencing, Gen3 file registration and biospecimen data into the
    genomic file table

    :param genomic_df: DataFrame with the columns build_id,
        phenotype_sheet_sample_name and bam_path; it is not modified
    :param biospecimen_df: DataFrame with a sample_name column, matched
        against phenotype_sheet_sample_name
    :param filepath: Gen3 genomic file info JSON; when falsy,
        'genomic_file_uuid.json' inside DATA_DIR is used
    :returns: DataFrame restricted to the genomic file columns; both merges
        use the pandas default (inner), so unmatched rows are dropped
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'genomic_file_uuid.json')
    gf_info_df = read_genomic_files_info(filepath)

    # Copy the selection so adding file_name below acts on an independent
    # frame instead of a slice of the caller's data (SettingWithCopyWarning)
    genomic_df = genomic_df[['build_id', 'phenotype_sheet_sample_name',
                             'bam_path']].copy()
    genomic_df['file_name'] = genomic_df['bam_path'].apply(os.path.basename)

    # Merge sequencing experiment data
    df1 = pd.merge(genomic_df, gf_info_df, on='file_name')
    # Merge biospecimen data
    genomic_file_df = pd.merge(df1, biospecimen_df,
                               left_on='phenotype_sheet_sample_name',
                               right_on='sample_name')
    genomic_file_df = genomic_file_df[['build_id',
                                       'sample_name',
                                       'file_name',
                                       'file_format',
                                       'uuid',
                                       'form',
                                       'hashes',
                                       'file_size',
                                       'file_url',
                                       'data_type',
                                       'md5sum']]
    return genomic_file_df

In [None]:
def create_seq_exp_data(df):
    """
    Select the sequencing experiment columns from the genomic data

    :param df: DataFrame containing at least build_id, mean_insert_size,
        pf_reads and phenotype_sheet_sample_name
    :returns: DataFrame holding only those four columns, in that order
    """
    wanted = [
        'build_id',
        'mean_insert_size',
        'pf_reads',
        'phenotype_sheet_sample_name',
    ]
    return df.loc[:, wanted]

### Explore

In [None]:
# List the entries of the data directory
pprint(os.listdir(DATA_DIR))

#### Sample List (Schiffman_X01 Sample List.xlsx)

In [None]:
# Load the raw sample list spreadsheet
sample_list_df = pd.read_excel(os.path.join(DATA_DIR, 'Schiffman_X01 Sample List.xlsx'))
# NOTE(review): the return value is unused, so cols_to_lower presumably
# renames the columns in place -- confirm
cols_to_lower(sample_list_df)
# Print (rows, columns), then preview the first rows
print(sample_list_df.shape)
sample_list_df.head()

In [None]:
# Summary statistics of the sample list, one group per relationship_to_proband value
sample_list_df.groupby('relationship_to_proband').describe()

In [None]:
# Distinct values of the morphology column
print('Morphologies')
pprint(sample_list_df['morphology'].unique().tolist())
print('\nTopographies')
# Distinct values of the topography column
pprint(sample_list_df['topography'].unique().tolist())

#### QC Phenotype (Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx)

In [None]:
# Load the QC-vs-phenotype workbook (read_excel's default sheet)
qc_phenotype_df = pd.read_excel(os.path.join(DATA_DIR, 'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx'))
# Drop the instrument_data_ids column before previewing
del qc_phenotype_df['instrument_data_ids']
# NOTE(review): the return value is unused, so cols_to_lower presumably
# renames the columns in place -- confirm
cols_to_lower(qc_phenotype_df)
qc_phenotype_df.head()

In [None]:
# Describe the object-dtype columns, one row per column, ordered by the
# number of distinct values (highest first)
qc_phenotype_df.describe(include=['O']).T.sort_values('unique', ascending=False)
# qc_phenotype_df.dtypes

#### Trio Summary

In [None]:
# sheet_name=1 selects the second sheet of the workbook (positions are zero-indexed)
trio_df = pd.read_excel(os.path.join(DATA_DIR, 'Schiffman_EwingSarcoma_QC_vs_Phenotype.xlsx'), sheet_name=1)
trio_df.head()

### Extract

In [None]:
# Full sample list; the diagnosis and family relationship cells below read from it
all_data_df = read_data()

In [None]:
# Study: rebinds study_df (first assigned where study.txt is written) to the
# table read back from disk
study_df = read_study_data()
study_df.head()

In [None]:
# Investigator: table read back from investigator.txt
investigator_df = read_investigator_data()
investigator_df.head()

In [None]:
# Study files: one row per entry of DATA_DIR (no explicit file list is passed)
study_file_df = read_study_file_data()
study_file_df.head()

In [None]:
# Participants: extracted from the default sample list spreadsheet
participant_df = read_participant_data()
participant_df.head()

In [None]:
# Diagnosis
diagnosis_df = create_diagnosis_df(all_data_df)
# Replace missing values with None. Cast to object first: on float columns
# recent pandas coerces a None replacement straight back to NaN, so the
# substitution would otherwise be a no-op there.
diagnosis_df = diagnosis_df.astype(object).where(pd.notnull(diagnosis_df), None)
diagnosis_df.head()

In [None]:
# Phenotypes
def create_phenotype_df(df):
    """
    Derive the phenotype table from a diagnosis table

    Adds the constant `phenotype` and `hpo_id` columns and an `observed`
    flag that is True wherever `morphology` is not null.

    :param df: DataFrame with a `morphology` column; it is not modified
    :returns: copy of `df` with the three phenotype columns added
    """
    # Copy first so the caller's frame does not silently gain these columns
    out = df.copy()
    out['phenotype'] = "Ewing's Sarcoma"
    out['hpo_id'] = "HP:0012254"
    out['observed'] = pd.notnull(out['morphology'])
    return out
# Derive the phenotype table from the diagnosis table and preview it
phenotype_df = create_phenotype_df(diagnosis_df)
phenotype_df.head()

In [None]:
# Family relationships
def _build_family_relationships(members_df):
    """
    Build one relationship record per non-proband family member

    Individuals are grouped by ewing_trio_number (compared as strings). In
    each group the first individual whose relationship_to_proband is
    'Self/Case' is the proband; every other individual yields one record
    pointing at that proband.

    :param members_df: DataFrame with the columns individual_name,
        relationship_to_proband and ewing_trio_number; it is not modified
    :returns: DataFrame with the columns participant_id (the family member),
        relative_id (the proband, None when the group has no 'Self/Case'
        row), participant_to_relative_relation and rel_id ("rel_<index>")
    """
    # drop_duplicates returns a new frame; the copy makes the column
    # assignment below independent of the caller's data
    members_df = members_df.drop_duplicates('individual_name').copy()
    members_df['ewing_trio_number'] = (
        members_df['ewing_trio_number'].astype(str))

    records = []
    # For each group
    for _, family in members_df.groupby('ewing_trio_number'):
        is_proband = family['relationship_to_proband'] == 'Self/Case'
        probands = family.loc[is_proband, 'individual_name']
        proband_name = probands.iloc[0] if len(probands) else None
        # For each non-proband individual in family group: emit a separate
        # record so that no family member overwrites another and the result
        # does not depend on the row order inside the group
        for _, member in family[~is_proband].iterrows():
            records.append({
                'participant_id': member['individual_name'],
                'relative_id': proband_name,
                'participant_to_relative_relation':
                    member['relationship_to_proband'],
            })

    relationships = pd.DataFrame(
        records,
        columns=['participant_id', 'relative_id',
                 'participant_to_relative_relation'])
    relationships['rel_id'] = [
        'rel_{}'.format(label) for label in relationships.index]
    return relationships


df = _build_family_relationships(
    all_data_df[['individual_name', 'relationship_to_proband',
                 'ewing_trio_number']])

In [None]:
# Create family relationships: one row per family, one column per
# relationship_to_proband value, holding the matching individual_name
members = all_data_df[['individual_name', 'relationship_to_proband', 'ewing_trio_number']]
family_dict = {}
for _, member in members.iterrows():
    roles = family_dict.setdefault(member['ewing_trio_number'], {})
    roles[member['relationship_to_proband']] = member['individual_name']
df = pd.DataFrame(list(family_dict.values()))
df.head()

In [None]:
# Biospecimens: a fresh read of the default sample list spreadsheet
biospecimen_df = read_data()
biospecimen_df.head()

In [None]:
# Sequencing Experiments
genomic_df = read_genomic_data()
# Not the last expression of the cell, so this preview is not rendered
genomic_df.head()
# Keep only the sequencing experiment columns of the genomic data
seq_exp_df = create_seq_exp_data(genomic_df)
seq_exp_df.head()

In [None]:
# Preview the full genomic data (all columns, not just the sequencing experiment subset)
genomic_df.head()

In [None]:
# Genomic files: join the genomic data with the Gen3 file info and the biospecimens
df = _create_genomic_file_df(genomic_df, biospecimen_df)
df.head()