In [1]:

# Importing modules 
import argparse
import os
import sys
import numpy as np 
import pandas as pd
from scipy.stats import iqr

# Load the EPN table from the results folder and add an empty "subgroup"
# column that the steps below fill in.
EPN_final = pd.read_csv("results/EPN_all_data.tsv", sep="\t").assign(subgroup="")

# Handle for the annotated table; it is written and closed at the end of the
# script.
# NOTE(review): opening in "w" mode here truncates any existing output file
# before the subgrouping below has run.
outfile = open("results/EPN_all_data_withsubgroup.tsv", "w")

# Sample ids that have been given a priority subgroup. Samples listed here
# cannot be used to assign multiple subgroup names.
samples_assigned = list()

def prioritized_fusion(row, subgroupname, fusionlist, sample_list):
    """Assign a priority subgroup to a row that carries any listed fusion.

    Args:
        row: Mapping-like record (a DataFrame row when called through
            ``DataFrame.apply(axis=1)``) providing ``"sample_id"``,
            ``"subgroup"`` and one entry per name in ``fusionlist``.
        subgroupname: Subgroup label returned when a fusion is present.
        fusionlist: Names of the fusion columns to check; a value greater
            than 0 counts as the fusion being present.
        sample_list: List of prioritized sample ids. Mutated in place: the
            row's sample id is added when a fusion is present, at most once.

    Returns:
        ``subgroupname`` if any listed fusion value is > 0, otherwise the
        row's current ``"subgroup"`` value unchanged.
    """
    # Look at every listed column (no short-circuit) so that a missing
    # column is reported regardless of which fusion matched first.
    has_fusion = [row[fusionname] > 0 for fusionname in fusionlist]
    if not any(has_fusion):
        return row["subgroup"]

    # Record the sample once, even when several fusions match or the
    # function is applied to the same row again.
    sample_id = row["sample_id"]
    if sample_id not in sample_list:
        sample_list.append(sample_id)
    return subgroupname

def prioritizing_PT_EPN(row, sample_list):
    """Assign the priority subgroups PT_EPN_A / PT_EPN_B to a row.

    PT_EPN_A: ``CXorf67_expr_zscore`` or ``TKTL1_expr_zscore`` above 3
    together with ``1q_gain`` above 0.
    PT_EPN_B: ``GPBP1_expr_zscore`` or ``IFT46_expr_zscore`` above 3
    together with ``6q_loss`` or ``6p_loss`` above 0.
    PT_EPN_A is checked first and wins when both would match.

    Args:
        row: Mapping-like record (a DataFrame row when called through
            ``DataFrame.apply(axis=1)``) providing the columns named above
            plus ``"sample_id"`` and ``"subgroup"``.
        sample_list: List of prioritized sample ids. Mutated in place: the
            row's sample id is added when a PT_EPN subgroup is assigned,
            at most once.

    Returns:
        ``"PT_EPN_A"``, ``"PT_EPN_B"``, or the row's current ``"subgroup"``
        value when neither rule matches.
    """
    if ((row["CXorf67_expr_zscore"] > 3 or row["TKTL1_expr_zscore"] > 3)
            and row["1q_gain"] > 0):
        subgroup = "PT_EPN_A"
    elif ((row["GPBP1_expr_zscore"] > 3 or row["IFT46_expr_zscore"] > 3)
            and (row["6q_loss"] > 0 or row["6p_loss"] > 0)):
        subgroup = "PT_EPN_B"
    else:
        return row["subgroup"]

    # The sample must be recorded BEFORE returning; previously the append sat
    # after the return statement and never executed, so PT_EPN samples were
    # not excluded from later subgroup assignment.
    sample_id = row["sample_id"]
    if sample_id not in sample_list:
        sample_list.append(sample_id)
    return subgroup

# Certain fusions are prioritized in the RELA and YAP1 subgroups.
# The fusions are given as a list and every row is passed to
# prioritized_fusion. If a sample has one of the fusions, the subgroup is
# assigned and its sample_id is added to samples_assigned, so the sample is
# omitted when assessing other categories.
st_epn_rela_fusions = ["C11orf95--RELA", "LTBP3--RELA"]
st_epn_yap1_fusions = ["C11orf95--YAP1", "YAP1--MAMLD1", "YAP1--FAM118B"]

# RELA is applied first, then YAP1, each pass rewriting the "subgroup" column.
EPN_final["subgroup"] = EPN_final.apply(
    prioritized_fusion,
    axis=1,
    args=("ST_EPN_RELA", st_epn_rela_fusions, samples_assigned),
)
EPN_final["subgroup"] = EPN_final.apply(
    prioritized_fusion,
    axis=1,
    args=("ST_EPN_YAP1", st_epn_yap1_fusions, samples_assigned),
)

# Same as the two subgroups above, PT_EPN_A and PT_EPN_B are prioritized for
# certain gene expressions and CNV changes; after these are assigned, samples
# are not considered for further categories.
# (A single pass is enough: the statement was previously duplicated, which
# only repeated the same row-wise computation.)
EPN_final["subgroup"] = EPN_final.apply(prioritizing_PT_EPN, axis=1, sample_list=samples_assigned)

# Things to note at this point of the script - 31 samples are categorized with priority subgrouping
# and will not be considered for further subgrouping



def subgroup_func(row, subgroupname, column_values, sample_list):
    """Add ``subgroupname`` to a row's subgroup when any column test passes.

    Args:
        row: Mapping-like record (a DataFrame row when called through
            ``DataFrame.apply(axis=1)``) providing ``"sample_id"``,
            ``"subgroup"`` and every column named in ``column_values``.
        subgroupname: Label to add.
        column_values: Iterable of ``(column name, threshold)`` pairs; a
            test passes when the row's value is strictly greater than the
            threshold.
        sample_list: Sample ids that already hold a priority subgroup;
            rows whose id is in this list are returned unchanged.

    Returns:
        The row's subgroup string: unchanged when no test passes, when the
        sample is in ``sample_list``, or when ``subgroupname`` is already
        one of its comma-separated entries; otherwise ``subgroupname``
        alone (empty subgroup) or appended after a comma.
    """
    assigned = row["subgroup"]

    # Every test is evaluated; one passing test is enough to add the label.
    passed = [row[column] > cutoff for column, cutoff in column_values]
    if not any(passed) or row["sample_id"] in sample_list:
        return assigned

    if assigned == '':
        return subgroupname
    if subgroupname in assigned.split(","):
        return assigned
    return ",".join((assigned, subgroupname))

#### Looking for ST_EPN_RELA sub-group samples
# Each entry is a (column name, threshold) pair handed to subgroup_func.
st_epn_rela_tests = [
    ("PTEN--TAS2R1", 0),
    ("9p_loss", 0),
    ("9q_loss", 0),
    ("RELA_expr_zscore", 3),
    ("L1CAM_expr_zscore", 3),
]
# Call subgroup_func on every row to set the values of the "subgroup" column
EPN_final["subgroup"] = EPN_final.apply(
    lambda sample_row: subgroup_func(
        sample_row, "ST_EPN_RELA", st_epn_rela_tests, samples_assigned
    ),
    axis=1,
)

#### Looking for ST_EPN_YAP1 sub-group samples
st_epn_yap1_tests = [
    ("C11orf95--MAML2", 0),
    ("11q_loss", 0),
    ("11q_gain", 0),
    ("ARL4D_expr_zscore", 3),
    ("CLDN1_expr_zscore", 3),
]
EPN_final["subgroup"] = EPN_final.apply(
    lambda sample_row: subgroup_func(
        sample_row, "ST_EPN_YAP1", st_epn_yap1_tests, samples_assigned
    ),
    axis=1,
)




## Creating columns for SV and CNV breaks density, each scaled as
## (value - median) / IQR of the column.
# The median and IQR are computed over the non-missing values only; rows with
# a missing breaks density stay NaN in the scaled column.
sv_breaks = EPN_final["breaks_density-chromosomal_instability_SV"]
cnv_breaks = EPN_final["breaks_density-chromosomal_instability_CNV"]

sv_iqr = iqr(sv_breaks.dropna().values)
sv_median = np.median(sv_breaks.dropna().values)
cnv_iqr = iqr(cnv_breaks.dropna().values)
cnv_median = np.median(cnv_breaks.dropna().values)

# Whole-column arithmetic replaces the former row-by-row apply.
# NOTE(review): "SV instability" uses a space while "CNV_instability" uses an
# underscore; both headers are kept as-is because they are written to the
# output table.
EPN_final["SV instability"] = (sv_breaks - sv_median) / sv_iqr
EPN_final["CNV_instability"] = (cnv_breaks - cnv_median) / cnv_iqr

# The context manager closes the output handle even if writing fails.
with outfile:
    EPN_final.to_csv(outfile, sep="\t", header=True, index=False)


