# Sample Somatic Alterations Widget Generator
2022-11-08 ZD

This notebook reads CHoP Somatic Alterations evidence files and generates Excel outputs formatted to mimic MTP widgets.   
The query can be either a list of targets (across all diseases) or a list of targets restricted to a single disease.

In [None]:
import pandas as pd

In [None]:
# Local folder layout: everything lives under one CHoP release folder
CHOP_VERSION = 'v12pre2-20230406/'
CHOP_FOLDER = f'{CHOP_VERSION}raw/'
OUTPUT_FOLDER = f'{CHOP_VERSION}sampleSAwidgets/'

# File names of the gzipped JSONL evidence files
# (load_chop_data prepends CHOP_FOLDER to these by default)
cnvPath = 'gene-level-cnv-consensus-annotated-mut-freq.jsonl.gz'
snvGenePath = 'gene-level-snv-consensus-annotated-mut-freq.jsonl.gz'
fusionGenePath = 'putative-oncogene-fused-gene-freq.jsonl.gz'
fusionPath = 'putative-oncogene-fusion-freq.jsonl.gz'
snvPath = 'variant-level-snv-consensus-annotated-mut-freq.jsonl.gz'

In [None]:
def load_chop_data(file, folder=CHOP_FOLDER):
    """Load one raw CHoP datasource file into a DataFrame.

    Handles the jsonl and tsv files downloaded from the CHoP S3 bucket.
    The file type is detected from the path text: anything containing
    '.jsonl' is read as JSON Lines, anything containing '.tsv' is read
    as a gzip-compressed, tab-separated table.

    Args:
        file: File name, appended to ``folder`` to form the full path.
        folder: Folder prefix (expected to end with a path separator).
            Defaults to CHOP_FOLDER.

    Returns:
        pandas.DataFrame with the file contents.

    Raises:
        ValueError: If the path contains neither '.jsonl' nor '.tsv'.
    """

    path = folder + file
    if '.jsonl' in path:
        df = pd.read_json(path, lines=True)
    elif '.tsv' in path:
        # na_filter=False keeps empty fields as empty strings instead of NaN
        df = pd.read_csv(path, sep='\t', compression='gzip', na_filter=False, low_memory=False)
    else:
        # Fail here with a clear message; previously execution fell through
        # to the summary prints below and crashed on the unbound `df`.
        raise ValueError(f'Unknown filetype: {path}')

    print('---')
    print('File loaded successfully:', file)
    print('Dataframe rows, columns:', df.shape)
    return df


In [None]:
# Load each JSONL evidence file from the default CHoP folder
cnv = load_chop_data(file=cnvPath)
snvGene = load_chop_data(file=snvGenePath)
fusionGene = load_chop_data(file=fusionGenePath)
fusion = load_chop_data(file=fusionPath)
snv = load_chop_data(file=snvPath)

In [None]:
# Evidence dataframes collected into one list so they can be looped over
dfList = [
    cnv,
    snvGene,
    fusionGene,
    fusion,
    snv,
]

# Gene symbols to generate sample widgets for
targetList = [
    'ALK',
    'BRAF',
    'FLT3',
    'KMT2C',
    'MYCN',
    'PCDHA9',
]

In [None]:
def search_sa(df, target, disease='all'):
    """Filter an evidence dataframe by gene symbol and, optionally, disease.

    Rows are kept when 'Gene_symbol' equals ``target`` exactly. Unless
    ``disease`` is the literal 'all', rows must also match ``disease`` in
    the 'Disease' column, compared case-insensitively. Returns the
    filtered dataframe for exploration.
    """

    keep = df['Gene_symbol'] == target

    # The 'Disease' column is only consulted when a disease was requested
    if disease != 'all':
        keep = keep & (df['Disease'].str.lower() == disease.lower())

    return df[keep]

In [None]:
def output_sample_sa_widget(dfList:list, targetList:list, disease:str='all', output=OUTPUT_FOLDER):
    """Write one Excel workbook per target mimicking the MTP Somatic
    Alterations widget.

    The workbook represents the widget on the target page (if no disease
    is specified) or on the evidence page (if disease is specified). Each
    sheet within an output file represents a tab view within the widget.
    Note that this queries using gene symbol and disease names rather
    than IDs, so some variation is possible.

    Args:
        dfList: Evidence dataframes; each needs 'Gene_symbol',
            'datasourceId' and (when a disease is given) 'Disease'
            columns, and at least one row to name its sheet from.
        targetList: Gene symbols; one workbook is written per symbol.
        disease: Disease name to filter on (case-insensitive), or 'all'
            for no disease filter.
        output: String prefix (normally a folder path ending in a
            separator) prepended to each output file name.
    """

    for target in targetList:
        fileName = output + target + 'SomaticAlterationsDisplay.xlsx'

        # The context manager saves and closes the workbook even if a sheet
        # fails to write; ExcelWriter.save() no longer exists in pandas >= 2.0.
        with pd.ExcelWriter(fileName, engine='xlsxwriter') as writer:
            for df in dfList:
                # Reuse the shared filter so both code paths stay in sync
                df1 = search_sa(df, target, disease)

                # Sheet is named after the source's datasourceId, truncated to
                # 30 characters to stay within Excel's 31-character limit.
                # .iloc[0] takes the first row whatever the index labels are.
                sheetName = df.datasourceId.iloc[0][0:30]
                df1.to_excel(writer, sheet_name=sheetName, index=False)

In [None]:
# Write one workbook per target, with no disease filter
output_sample_sa_widget(dfList=dfList, targetList=targetList)