## Setup

In [27]:
# Importing modules 
import argparse
import os
import sys
import numpy as np 
import pandas as pd
from scipy.stats import iqr

In [28]:
# Destination of the annotated table that this notebook writes out
outfile = "results/EPN_all_data_withsubgroup.tsv"

# Load the combined EPN table from the results folder; the file carries no
# subgroup column, so one is attached here with an empty string in every row
EPN_final = pd.read_csv("results/EPN_all_data.tsv", sep="\t").assign(subgroup="")

## Function Definitions

######  prioritized_fusion function
1. This function checks each row for fusions that are associated with a subgroup.
2. If any of those fusions is present, it returns the subgroup and adds the row's `sample_id` to `sample_list`. Samples in `sample_list` are excluded from further subgrouping.
3. This function is used for RELA and YAP1 fusions, based on evidence for these subgroups from [Pajtler et al., Fig. 6](https://marlin-prod.literatumonline.com/cms/attachment/1593a1fc-bd9f-4476-b2f3-1abf631ccdbd/gr6.jpg)


In [29]:

def prioritized_fusion(row, subgroupname, fusionlist, sample_list):
    """Return the prioritized subgroup for a row that carries a defining fusion.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide "sample_id", "subgroup" and one entry per name in
        `fusionlist`.
    subgroupname : str
        Label returned when at least one of the listed fusions is present.
    fusionlist : iterable of str
        Names of the fusion columns that define `subgroupname`.
    sample_list : list
        Mutated in place: the row's sample_id is added when a fusion is
        found. Each sample_id is stored at most once, so a sample carrying
        several of the listed fusions (or a cell that is re-run) does not
        inflate `len(sample_list)`.

    Returns
    -------
    `subgroupname` if any listed fusion has a value greater than 0,
    otherwise the row's existing "subgroup" value.
    """
    # Evaluate every listed fusion (no short-circuit) so that a missing
    # column still raises, exactly as before
    fusion_found = [row[fusionname] > 0 for fusionname in fusionlist]
    if not any(fusion_found):
        return row["subgroup"]

    # Record the sample once, regardless of how many fusions matched
    sample_id = row["sample_id"]
    if sample_id not in sample_list:
        sample_list.append(sample_id)
    return subgroupname

###### subgroup_func
1. This function takes tuples of the feature/column name along with the threshold value as input
2. If the value in a column is higher than its threshold, the corresponding subgroup is assigned
3. This also checks for samples within `sample_list` and leaves out those samples from subgrouping


In [30]:

def subgroup_func(row, subgroupname, column_values, sample_list):
    """Add `subgroupname` to a row's subgroup when a feature exceeds its threshold.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide "sample_id", "subgroup" and every column named in
        `column_values`.
    subgroupname : str
        Label to assign when a threshold is exceeded.
    column_values : iterable of (column name, threshold) tuples
        A column counts as a hit when its value is strictly greater than
        the paired threshold.
    sample_list : container of sample ids
        Samples found here are left untouched.

    Returns
    -------
    The row's subgroup string: unchanged, set to `subgroupname` if it was
    empty, or extended with "," + `subgroupname` if that label is not
    already part of it.
    """
    current_subgroup = row["subgroup"]
    for columnname, value in column_values:
        if row[columnname] > value and row["sample_id"] not in sample_list:
            if current_subgroup == '':
                current_subgroup = subgroupname
            # Compare against the comma-delimited string as a whole instead of
            # splitting on ",": labels such as "EPN, ST RELA" contain a comma
            # themselves, so splitting would never find them and the same
            # label would be appended repeatedly.
            elif ("," + subgroupname + ",") not in ("," + current_subgroup + ","):
                current_subgroup = current_subgroup + "," + subgroupname
    return current_subgroup

### Processing EPN Samples

In [31]:
# Running record of the samples that already received a priority subgroup;
# these samples must not be given any additional subgroup names
samples_assigned = list()

- RELA gene fusions are prioritized in RELA subgroups as discussed above. This bit of code creates a list for ST_EPN_RELA fusions and uses those as input to call `prioritized_fusion` function.
- Input - fusion list and every row
- Output from function - Value under `subgroup` column for each row
- Number of samples assigned - 28

In [32]:

# Fusion columns whose presence marks a sample as "EPN, ST RELA"
st_epn_rela_fusions = ["C11orf95--RELA", "LTBP3--RELA"]

# Label every row; prioritized_fusion also records the matching sample ids
# in samples_assigned as a side effect
EPN_final["subgroup"] = EPN_final.apply(
    lambda row: prioritized_fusion(
        row, "EPN, ST RELA", st_epn_rela_fusions, samples_assigned
    ),
    axis=1,
)

# Number of samples recorded so far
print(len(samples_assigned))

display(EPN_final.head(5))

28


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup
0,PT_06H29FCG,7316-1944,BS_0W8AWY10,BS_4T0HPZDC,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.066802,-0.524313,0.014816,0.006466,0.593914,-0.240955,-1.320692,-0.849661,
1,PT_0CVRX4SJ,7316-764,,BS_H4NXBD2D,supratentorial,,,,,,...,,0.986055,2.009381,-0.300196,-0.257418,-0.702449,-0.378184,1.065878,-0.128672,"EPN, ST RELA"
2,PT_0NY38X3W,7316-1706,BS_HQFNQHVW,BS_S0175QKX,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.468171,-0.532576,-0.367122,-0.265112,0.105598,-0.168236,0.388599,-0.282618,
3,PT_0WKX8Q5X,7316-88,BS_QSMFVHSB,BS_KXMYBQ6N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.485965,-0.536819,-0.062494,-0.062775,-0.699543,-0.372319,-1.125484,-0.701807,
4,PT_164RNWTT,7316-1078,BS_5D24XV4T,BS_0BXY0F9N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.653142,-0.525095,0.963316,-0.257418,0.634607,-0.180551,-1.123385,0.013901,


- YAP1 gene fusions are prioritized in YAP1 subgroups as discussed above. This bit of code creates a list for ST_EPN_YAP1 fusions and uses those as input to call `prioritized_fusion` function.
- Input - fusion list and every row
- Output from function - Value under `subgroup` column for each row
- Total number of samples assigned till this point - 29 

In [33]:
# Fusion columns whose presence marks a sample as "EPN, ST YAP1"
st_epn_yap1_fusions = ["C11orf95--YAP1", "YAP1--MAMLD1", "YAP1--MAML2", "YAP1--FAM118B"]

# Label every row; prioritized_fusion also records the matching sample ids
# in samples_assigned as a side effect
EPN_final["subgroup"] = EPN_final.apply(
    lambda row: prioritized_fusion(
        row, "EPN, ST YAP1", st_epn_yap1_fusions, samples_assigned
    ),
    axis=1,
)

# Running total of samples recorded so far (RELA and YAP1 together)
print(len(samples_assigned))

# Show the rows labelled "EPN, ST YAP1" (one sample when this was written)
display(EPN_final.loc[EPN_final["subgroup"] == "EPN, ST YAP1"])

29


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup
83,PT_W17NV5YG,7316-2079,BS_FVPMPMRJ,BS_V4W81SFC,undetermined,0.0,0.0,0.0,0.0,0.0,...,0,-0.386376,-0.535423,7.804747,6.365073,-0.746049,-0.382289,-0.723175,-0.497899,"EPN, ST YAP1"


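Adding `SV instability` and `CNV_instability` columns to the final dataframe. Each one is the corresponding `breaks_density-chromosomal_instability_*` column centered on its median and divided by its interquartile range (missing values are ignored when computing the median and IQR).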

In [34]:
display(EPN_final.head(5))  # Dataframe before the instability columns are added

# Breakpoint-density columns that the two instability scores are derived from
sv_breaks = EPN_final["breaks_density-chromosomal_instability_SV"]
cnv_breaks = EPN_final["breaks_density-chromosomal_instability_CNV"]

# Median and interquartile range of each column, computed on non-missing values only
sv_iqr = iqr(np.array(sv_breaks.dropna()))
sv_median = np.median(np.array(sv_breaks.dropna()))
cnv_iqr = iqr(np.array(cnv_breaks.dropna()))
cnv_median = np.median(np.array(cnv_breaks.dropna()))

# Score = (value - median) / IQR, computed on whole columns instead of a
# row-by-row apply; rows with a missing density stay NaN
EPN_final["SV instability"] = (sv_breaks - sv_median) / sv_iqr
EPN_final["CNV_instability"] = (cnv_breaks - cnv_median) / cnv_iqr

# Sort final table
EPN_final = EPN_final.sort_values(by=["Kids_First_Participant_ID", "sample_id"])

display(EPN_final.head(5))  # Dataframe after the columns are added


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup
0,PT_06H29FCG,7316-1944,BS_0W8AWY10,BS_4T0HPZDC,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.066802,-0.524313,0.014816,0.006466,0.593914,-0.240955,-1.320692,-0.849661,
1,PT_0CVRX4SJ,7316-764,,BS_H4NXBD2D,supratentorial,,,,,,...,,0.986055,2.009381,-0.300196,-0.257418,-0.702449,-0.378184,1.065878,-0.128672,"EPN, ST RELA"
2,PT_0NY38X3W,7316-1706,BS_HQFNQHVW,BS_S0175QKX,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.468171,-0.532576,-0.367122,-0.265112,0.105598,-0.168236,0.388599,-0.282618,
3,PT_0WKX8Q5X,7316-88,BS_QSMFVHSB,BS_KXMYBQ6N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.485965,-0.536819,-0.062494,-0.062775,-0.699543,-0.372319,-1.125484,-0.701807,
4,PT_164RNWTT,7316-1078,BS_5D24XV4T,BS_0BXY0F9N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.653142,-0.525095,0.963316,-0.257418,0.634607,-0.180551,-1.123385,0.013901,


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup,SV instability,CNV_instability
0,PT_06H29FCG,7316-1944,BS_0W8AWY10,BS_4T0HPZDC,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.524313,0.014816,0.006466,0.593914,-0.240955,-1.320692,-0.849661,,-0.140351,-0.3
1,PT_0CVRX4SJ,7316-764,,BS_H4NXBD2D,supratentorial,,,,,,...,2.009381,-0.300196,-0.257418,-0.702449,-0.378184,1.065878,-0.128672,"EPN, ST RELA",,
2,PT_0NY38X3W,7316-1706,BS_HQFNQHVW,BS_S0175QKX,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.532576,-0.367122,-0.265112,0.105598,-0.168236,0.388599,-0.282618,,-0.350877,0.2
3,PT_0WKX8Q5X,7316-88,BS_QSMFVHSB,BS_KXMYBQ6N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.536819,-0.062494,-0.062775,-0.699543,-0.372319,-1.125484,-0.701807,,-0.421053,-0.4
4,PT_164RNWTT,7316-1078,BS_5D24XV4T,BS_0BXY0F9N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.525095,0.963316,-0.257418,0.634607,-0.180551,-1.123385,0.013901,,-0.333333,0.5


In [35]:
# Replacing all NaN values with NA so they are not empty when writing to a file
EPN_final = EPN_final.replace(np.nan, 'NA', regex=True)

# Pass the path itself rather than a text handle opened without newline="":
# pandas then opens the file and controls the line endings, which avoids
# doubled line breaks on platforms that translate "\n" on write
EPN_final.to_csv(outfile, sep="\t", header=True, index=False)

## Summary

In [36]:
# Number of rows carrying each assigned subgroup label
rela_samples = int((EPN_final["subgroup"] == "EPN, ST RELA").sum())
yap1_samples = int((EPN_final["subgroup"] == "EPN, ST YAP1").sum())

# Rows whose subgroup is still the empty string were not given any label
no_assigned_samples = int((EPN_final["subgroup"] == "").sum())

total_samples = len(EPN_final)

# sep="" joins the pieces without extra spaces, matching plain concatenation
print("There are a total of ", total_samples, " samples out of which ",
      no_assigned_samples, " samples were not assigned any subgroup", sep="")
print("Number of samples under each subgroup\nEPN, ST RELA : ",
      rela_samples, "\nEPN, ST YAP1 : ", yap1_samples, sep="")

There are a total of 98 samples out of which 69 samples were not assigned any subgroup
Number of samples under each subgroup
EPN, ST RELA : 28
EPN, ST YAP1 : 1
