## Setup

In [1]:
# Importing modules 
import argparse
import os
import sys
import numpy as np 
import pandas as pd
from scipy.stats import iqr

In [2]:
# Load the results table (written without a subgroup column) and give
# every sample an empty subgroup label to start from
EPN_final = pd.read_csv("results/EPN_all_data.tsv", sep="\t").assign(subgroup="")

# Destination of the table once subgroups have been added
outfile = "results/EPN_all_data_withsubgroup.tsv"

## Function Definitions

######  prioritized_fusion function
1. This function checks each row for fusions that are associated with a subgroup.
2. If any of the fusions is present, it returns the subgroup name and adds the row's `sample_id` to `sample_list`; otherwise it returns the row's existing subgroup. Samples in `sample_list` are not considered for further subgrouping by `subgroup_func`.
3. This function is used for RELA and YAP1 fusions, based on the evidence for these subgroups in [Pajtler et al., Fig. 6](https://marlin-prod.literatumonline.com/cms/attachment/1593a1fc-bd9f-4476-b2f3-1abf631ccdbd/gr6.jpg)


In [3]:

def prioritized_fusion(row, subgroupname, fusionlist, sample_list):
    """Return the prioritized subgroup for a row carrying any listed fusion.

    Parameters
    ----------
    row : pandas.Series or mapping
        One sample. Must provide "sample_id", "subgroup" and every column
        named in `fusionlist`.
    subgroupname : str
        Label returned when at least one fusion column is greater than 0.
    fusionlist : iterable of str
        Names of the fusion columns to check.
    sample_list : list
        Running list of prioritized sample ids; mutated in place.

    Returns
    -------
    str
        `subgroupname` if any fusion column is > 0, otherwise the row's
        current "subgroup" value.
    """
    # A missing (NaN) fusion value compares False and counts as "no fusion".
    has_fusion = any(row[fusionname] > 0 for fusionname in fusionlist)
    if not has_fusion:
        return row["subgroup"]

    # Record the sample only once, even when several fusions match or the
    # sample was already recorded by an earlier call; otherwise
    # len(sample_list) over-counts the number of assigned samples.
    if row["sample_id"] not in sample_list:
        sample_list.append(row["sample_id"])
    return subgroupname

###### prioritizing_PT_EPN
1. This function works like the one above, but instead of fusions it assigns a subgroup when certain columns of the input table exceed a threshold value
2. Here, too, the sample_ids are added to a list of samples that are not considered for subgrouping in the future
3. This function is mainly used for PT_EPN subgroups. 
4. Based on [Pajtler et al. fig. 4](https://www.sciencedirect.com/science/article/pii/S153561081500135X?via%3Dihub#fig4), over expression of CXorf67 and TKTL1 along with 1q gain is seen under PT_EPN_A subgroup
5. Similarly GPBP1 and IFT46 shows over expression along with 6p and 6q loss in PT_EPN_B subgroup

In [4]:

def prioritizing_PT_EPN(row, sample_list):
    """Assign the prioritized "EPN, PF A" / "EPN, PF B" label to a row.

    Parameters
    ----------
    row : pandas.Series or mapping
        One sample. Must provide "sample_id", "subgroup", the four
        `*_expr_zscore` columns used below, and the "1q_gain", "6q_loss"
        and "6p_loss" columns.
    sample_list : list
        Running list of prioritized sample ids; mutated in place.

    Returns
    -------
    str
        "EPN, PF A", "EPN, PF B", or the row's current "subgroup" value
        when neither rule matches.
    """
    # PF A: CXorf67 z-score > 3 on its own, or TKTL1 z-score > 3 with 1q gain.
    # The original also listed "CXorf67 > 3 and 1q gain", which is already
    # covered by the first test and so is dropped here.
    # NOTE(review): CXorf67 alone is sufficient while TKTL1 needs 1q gain;
    # confirm this asymmetry is intended.
    if (row["CXorf67_expr_zscore"] > 3
            or (row["TKTL1_expr_zscore"] > 3 and row["1q_gain"] > 0)):
        label = "EPN, PF A"
    # PF B: GPBP1 or IFT46 z-score > 3 together with a 6q or 6p loss.
    elif ((row["GPBP1_expr_zscore"] > 3 or row["IFT46_expr_zscore"] > 3)
          and (row["6q_loss"] > 0 or row["6p_loss"] > 0)):
        label = "EPN, PF B"
    else:
        return row["subgroup"]

    # Record the sample only once so len(sample_list) keeps counting
    # distinct samples, even if an earlier step already recorded it.
    if row["sample_id"] not in sample_list:
        sample_list.append(row["sample_id"])
    return label

###### subgroup_func
1. This function takes tuples of the feature/column name along with the threshold value as input
2. If the column has higher than the threshold value, then the corresponding subgroup is assigned
3. This also checks for samples within `sample_list` and leaves out those samples from subgrouping


In [5]:

def subgroup_func(row, subgroupname, column_values, sample_list):
    """Add `subgroupname` to a row's subgroup when any feature test passes.

    Parameters
    ----------
    row : pandas.Series or mapping
        One sample. Must provide "sample_id", "subgroup" and every column
        named in `column_values`.
    subgroupname : str
        Label to assign. It may itself contain commas (e.g. "EPN, ST RELA").
    column_values : iterable of (str, number)
        (column name, threshold) pairs; a test passes when the row's value
        in that column is strictly greater than the threshold.
    sample_list : list
        Sample ids that must be left untouched by this function.

    Returns
    -------
    str
        The row's subgroup, unchanged when the sample is in `sample_list`,
        when no test passes, or when `subgroupname` is already present.
        Otherwise `subgroupname` is set, or appended after a ",".
    """
    current_subgroup = row["subgroup"]

    # Samples with a prioritized subgroup are never modified here.
    if row["sample_id"] in sample_list:
        return current_subgroup

    # A missing (NaN) value compares False and so never passes a test.
    if not any(row[columnname] > value for columnname, value in column_values):
        return current_subgroup

    if current_subgroup == "":
        return subgroupname

    # Labels are joined with "," but may contain commas themselves, so
    # splitting on "," cannot detect an existing label. Compare against the
    # delimiter-wrapped string instead, so the label is added at most once.
    if "," + subgroupname + "," in "," + current_subgroup + ",":
        return current_subgroup
    return current_subgroup + "," + subgroupname

### Processing EPN Samples

In [6]:
# Samples given a priority subgroup are collected here; samples in this
# list cannot pick up multiple subgroup names afterwards
samples_assigned = list()

- RELA gene fusions are prioritized in RELA subgroups as discussed above. This bit of code creates a list for ST_EPN_RELA fusions and uses those as input to call `prioritized_fusion` function.
- Input - fusion list and every row
- Output from function - Value under `subgroup` column for each row
- Number of samples assigned - 28

In [7]:

# Fusion columns that give a sample the prioritized RELA label
st_epn_rela_fusions = ["C11orf95--RELA", "LTBP3--RELA"]
EPN_final["subgroup"] = EPN_final.apply(
    lambda sample: prioritized_fusion(
        sample,
        subgroupname="EPN, ST RELA",
        fusionlist=st_epn_rela_fusions,
        sample_list=samples_assigned,
    ),
    axis=1,
)
# Size of the prioritized-sample list after this step
print(len(samples_assigned))

display(EPN_final.head(5))

28


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup
0,PT_S4H6KA09,7316-2134,BS_K6A9Z04J,BS_07ANYSYQ,supratentorial,0.0,0.0,0.0,0.0,0.0,...,loss,0.667614,0.48747,0.166855,-0.216807,-0.73355,-0.37914,0.230703,-0.798158,ST_EPN_RELA
1,PT_164RNWTT,7316-1078,BS_5D24XV4T,BS_0BXY0F9N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0,-0.669468,-0.536139,0.943212,-0.253911,0.606873,-0.184479,-1.135558,-0.004348,
2,PT_V3Q78E6F,7316-455,BS_7RQCH5Y7,BS_0QYS36NR,undetermined,0.0,0.0,0.0,0.0,0.0,...,0,-0.441425,-0.547948,0.146395,-0.259969,0.249427,0.4538,-1.603039,-0.851185,
3,PT_Y6Y9JJ9P,7316-425,,BS_0WQJP6ZG,supratentorial,,,,,,...,,-0.145624,-0.303262,0.893197,-0.223623,-0.759494,-0.37914,0.6222,-1.025133,ST_EPN_RELA
4,PT_82A9SDRN,7316-2313,BS_NWYBD9CA,BS_0XEG6SNV,supratentorial,0.0,0.0,0.0,0.0,0.0,...,0,1.30249,3.919198,-0.209387,-0.186519,-0.171437,-0.375096,-0.408285,-0.195971,ST_EPN_RELA


- YAP1 gene fusions are prioritized in YAP1 subgroups as discussed above. This bit of code creates a list for ST_EPN_YAP1 fusions and uses those as input to call `prioritized_fusion` function.
- Input - fusion list and every row
- Output from function - Value under `subgroup` column for each row
- Total number of samples assigned till this point - 29 

In [8]:
# Fusion columns that give a sample the prioritized YAP1 label
st_epn_yap1_fusions = ["C11orf95--YAP1", "YAP1--MAMLD1", "YAP1--FAM118B"]
EPN_final["subgroup"] = EPN_final.apply(
    prioritized_fusion,
    axis=1,
    args=("EPN, ST YAP1", st_epn_yap1_fusions, samples_assigned),
)
print(len(samples_assigned))
# Show the rows that carry the YAP1 label after this step
is_st_yap1 = EPN_final["subgroup"] == "EPN, ST YAP1"
display(EPN_final[is_st_yap1])

29


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup
78,PT_W17NV5YG,7316-2079,BS_FVPMPMRJ,BS_V4W81SFC,undetermined,0.0,0.0,0.0,0.0,0.0,...,0,-0.404841,-0.546348,7.682619,6.264276,-0.762376,-0.383183,-0.721627,-0.510522,ST_EPN_YAP1


- Using `prioritizing_PT_EPN` to assign the PT_EPN_A (`EPN, PF A`) and PT_EPN_B (`EPN, PF B`) subgroup names to samples
- Input - Every row from dataframe 
- Output - New subgroup names under `subgroup` column of the dataframe
- Number of samples assigned at this point - 31

In [9]:

EPN_final["subgroup"] = EPN_final.apply(
    prioritizing_PT_EPN,
    axis=1,
    args=(samples_assigned,),
)
print(len(samples_assigned))

# Preview the rows carrying each of the two PF labels
for pf_label in ("EPN, PF A", "EPN, PF B"):
    display(EPN_final[EPN_final["subgroup"] == pf_label].head())

31


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup
22,PT_ZA95JQEB,7316-384,BS_ZZJF26C4,BS_8RHPJ740,undetermined,0.0,1.0,0.0,0.0,0.0,...,gain,0.364269,-0.547396,-0.117316,-0.236495,0.396441,5.493027,-1.045101,-0.101164,PT_EPN_A
89,PT_ZZRBX5JT,7316-3319,BS_9N3B3HZB,BS_YE1MAQYJ,infratentorial,0.0,1.0,0.0,0.0,0.0,...,0,-0.646265,-0.546845,-0.151416,-0.211507,2.515174,3.836389,-0.322893,0.879046,PT_EPN_A


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup


- This next part calls `subgroup_func` to assign a subgroup name based on the feature provided and the threshold value
- Every feature and threshold value is given as a list of tuples.
- `st_epn_rela_tests` and `st_epn_yap1_tests` have inputs that will not be prioritized because this function will not be adding samples to the `samples_assigned` list. This means if a sample has a `PTEN--TAS2R1` fusion and a `C11orf95--MAML2` fusion, it will have both `ST_EPN_RELA` and `ST_EPN_YAP1` assigned to it 
- Inputs - each row of the dataframe along with the list of tuples
- Output - subgroup value
- Number of samples assigned - 31

In [10]:

# (column, threshold) pairs; a value above the threshold supports the
# "EPN, ST RELA" label
st_epn_rela_tests = [
    ("PTEN--TAS2R1", 0),
    ("9p_loss", 0),
    ("9q_loss", 0),
    ("RELA_expr_zscore", 3),
    ("L1CAM_expr_zscore", 3),
]
# Update the "subgroup" column row by row through subgroup_func
EPN_final["subgroup"] = EPN_final.apply(
    lambda sample: subgroup_func(
        sample,
        subgroupname="EPN, ST RELA",
        column_values=st_epn_rela_tests,
        sample_list=samples_assigned,
    ),
    axis=1,
)

print(len(samples_assigned))


31


In [11]:
# (column, threshold) pairs; a value above the threshold supports the
# "EPN, ST YAP1" label
st_epn_yap1_tests = [
    ("C11orf95--MAML2", 0),
    ("11q_loss", 0),
    ("11q_gain", 0),
    ("ARL4D_expr_zscore", 3),
    ("CLDN1_expr_zscore", 3),
]

EPN_final["subgroup"] = EPN_final.apply(
    subgroup_func,
    axis=1,
    args=("EPN, ST YAP1", st_epn_yap1_tests, samples_assigned),
)
print(len(samples_assigned))


31


Adding `SV instability` and `CNV_instability` columns to the final dataframe

In [12]:
display(EPN_final.head(5))  # Table before the instability columns are added

# Each instability score is the breaks density rescaled as
# (value - median) / IQR, with median and IQR taken over the non-missing
# values only. Rows with a missing breaks density get a missing score.
sv_density = EPN_final["breaks_density-chromosomal_instability_SV"]
cnv_density = EPN_final["breaks_density-chromosomal_instability_CNV"]

sv_iqr = iqr(sv_density.dropna().to_numpy())
sv_median = np.median(sv_density.dropna().to_numpy())
cnv_iqr = iqr(cnv_density.dropna().to_numpy())
cnv_median = np.median(cnv_density.dropna().to_numpy())

# Vectorised column arithmetic replaces the row-wise apply; the columns are
# created in the same order as before (SV first, then CNV).
EPN_final["SV instability"] = (sv_density - sv_median) / sv_iqr
EPN_final["CNV_instability"] = (cnv_density - cnv_median) / cnv_iqr

# Sort final table
EPN_final = EPN_final.sort_values(by=["Kids_First_Participant_ID", "sample_id"])

display(EPN_final.head(5))  # Table after the instability columns are added


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,consensus_focal_CN_CDKN2,RELA_expr_zscore,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup
0,PT_S4H6KA09,7316-2134,BS_K6A9Z04J,BS_07ANYSYQ,supratentorial,0.0,0.0,0.0,0.0,0.0,...,loss,0.667614,0.48747,0.166855,-0.216807,-0.73355,-0.37914,0.230703,-0.798158,ST_EPN_RELA
1,PT_164RNWTT,7316-1078,BS_5D24XV4T,BS_0BXY0F9N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,0,-0.669468,-0.536139,0.943212,-0.253911,0.606873,-0.184479,-1.135558,-0.004348,
2,PT_V3Q78E6F,7316-455,BS_7RQCH5Y7,BS_0QYS36NR,undetermined,0.0,0.0,0.0,0.0,0.0,...,0,-0.441425,-0.547948,0.146395,-0.259969,0.249427,0.4538,-1.603039,-0.851185,
3,PT_Y6Y9JJ9P,7316-425,,BS_0WQJP6ZG,supratentorial,,,,,,...,,-0.145624,-0.303262,0.893197,-0.223623,-0.759494,-0.37914,0.6222,-1.025133,ST_EPN_RELA
4,PT_82A9SDRN,7316-2313,BS_NWYBD9CA,BS_0XEG6SNV,supratentorial,0.0,0.0,0.0,0.0,0.0,...,0,1.30249,3.919198,-0.209387,-0.186519,-0.171437,-0.375096,-0.408285,-0.195971,ST_EPN_RELA


Unnamed: 0,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,disease_group,1q_loss,1q_gain,9p_loss,9q_loss,6p_loss,...,L1CAM_expr_zscore,ARL4D_expr_zscore,CLDN1_expr_zscore,CXorf67_expr_zscore,TKTL1_expr_zscore,GPBP1_expr_zscore,IFT46_expr_zscore,subgroup,SV instability,CNV_instability
13,PT_06H29FCG,7316-1944,BS_0W8AWY10,BS_4T0HPZDC,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.535366,0.008856,0.005816,0.566516,-0.243975,-1.339629,-0.858416,,0.021127,-0.296296
40,PT_0CVRX4SJ,7316-764,,BS_H4NXBD2D,supratentorial,,,,,,...,1.968995,-0.301459,-0.253911,-0.719137,-0.37914,1.128759,-0.145354,ST_EPN_RELA,,
68,PT_0NY38X3W,7316-1706,BS_HQFNQHVW,BS_S0175QKX,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.543534,-0.367387,-0.261484,0.082234,-0.172349,0.428261,-0.297608,,0.267606,0.222222
47,PT_0WKX8Q5X,7316-88,BS_QSMFVHSB,BS_KXMYBQ6N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.547728,-0.067302,-0.062334,-0.716254,-0.373364,-1.137729,-0.712188,,-0.204225,-0.296296
1,PT_164RNWTT,7316-1078,BS_5D24XV4T,BS_0BXY0F9N,infratentorial,0.0,0.0,0.0,0.0,0.0,...,-0.536139,0.943212,-0.253911,0.606873,-0.184479,-1.135558,-0.004348,,-0.556338,0.148148


In [13]:
# Replace every NaN with the string "NA" so missing values are not written
# as empty fields. (`regex=True` was dropped: it has no meaning for a NaN
# target.)
EPN_final = EPN_final.replace(np.nan, "NA")

# Hand the path to pandas directly so it opens the file itself, rather than
# writing through a text-mode handle opened with the platform's default
# encoding and newline translation.
EPN_final.to_csv(outfile, sep="\t", header=True, index=False)

## Summary

In [14]:

# Number of rows per exact subgroup label; labels that never occur count as 0
subgroup_counts = EPN_final["subgroup"].value_counts()

rela_samples = subgroup_counts.get("EPN, ST RELA", 0)
yap1_samples = subgroup_counts.get("EPN, ST YAP1", 0)

epna_samples = subgroup_counts.get("EPN, PF A", 0)
epnb_samples = subgroup_counts.get("EPN, PF B", 0)

# Rows whose subgroup is still the empty string
no_assigned_samples = subgroup_counts.get("", 0)

total_samples = len(EPN_final.index)

print("There are a total of " + str(total_samples) + " samples out of which "
      + str(no_assigned_samples) + " samples were not assigned any subgroup")
print("Number of samples under each subgroup\nEPN, ST RELA : " + str(rela_samples)
      + "\nEPN, ST YAP1 : " + str(yap1_samples)
      + " \nEPN, PF A : " + str(epna_samples)
      + "\nEPN, PF B : " + str(epnb_samples))


There are a total of 91 samples out of which 56 samples were not assigned any subgroup
Number of samples under each subgroup
ST_EPN_RELA : 29
ST_EPN_YAP1 : 4 
PT_EPN_A : 2
PT_EPN_B : 0
