### Import Packages

In [77]:
import pandas as pd
from modules.utils import load_json
import warnings
warnings.filterwarnings("ignore")

### Prepare Data For PCA

In [61]:
df = pd.read_csv('../BENCHMARKING/Selected_Biomarker_Panels.csv') 
df.rename(columns = {
        'randomforest_feature_importance': 'RF-FI',
        'xgb_feature_importance': 'XGB-FI',
        'rf_permutation_feature_importance': 'RF-PFI',
        'xgb_permutation_feature_importance': 'XGB-PFI'
    }, inplace = True) 
df = df[df.OmicsLevel == 'TripleOmics']
df.head()

Unnamed: 0,Feature,Cohort,OmicsLevel,OmicsTypes,MOGONET:Ranker,MORE:Ranker,boruta,elasticnet,geom.mean_rank,geom.mean_weight,...,RF-PFI,ridge,rra_rank,shap,stuart_rank,svm_rfe,t_test,ta_weight,XGB-FI,XGB-PFI
1800,hsa-miR-30a,ROSMAP,TripleOmics,miRNA+mRNA+Meth,294,1,500,3,64,214,...,565,3,29,309,51,135,552,109,399,178
1801,A2ML1,ROSMAP,TripleOmics,miRNA+mRNA+Meth,1,317,1,89,31,125,...,251,138,57,344,39,201,56,37,418,222
1802,hsa-miR-129-5p,ROSMAP,TripleOmics,miRNA+mRNA+Meth,57,192,432,2,2,1,...,569,2,1,1,1,2,3,1,40,3
1803,hsa-miR-132,ROSMAP,TripleOmics,miRNA+mRNA+Meth,44,45,438,19,1,2,...,8,17,2,2,2,1,1,2,13,7
1804,ARRDC2,ROSMAP,TripleOmics,miRNA+mRNA+Meth,409,591,14,253,14,93,...,274,211,15,52,14,214,7,36,1,201


In [62]:
cohorts = ['ROSMAP', 'BRCA', 'MayoRNASeq'] 
rankers = df.columns[4:]
Selection_Sets = {cohort: {ranker: [] for ranker in rankers} for cohort in cohorts}
for cohort in cohorts:
    df_cohort = df[df.Cohort==cohort]
    all_features = df_cohort.Feature.to_list()
    
    for k in range(10,101,10):
        for ranker in rankers:
            feature_set = df_cohort[df_cohort[ranker] <= k].Feature.to_list() 
            for feat in all_features: 
                if feat in feature_set:
                    Selection_Sets[cohort][ranker].append(1)
                else:
                    Selection_Sets[cohort][ranker].append(0)

In [63]:
# Save PCA data
for cohort in cohorts:
    df2 = pd.DataFrame(Selection_Sets[cohort])
    df2 = df2[df2.sum(axis=1)>0]
    df2.to_csv(f'../BENCHMARKING/{cohort}_PCA_data.csv', index = False)

### Prepare Features For Heatmap

In [64]:
# select top features for each Cohort: Features that got ranked amongst top 20 in atleast 1 ranker
top_features = {"ROSMAP": [], "BRCA":[], "MayoRNASeq": []}
for cohort in top_features:
    cohort_df = df[df["Cohort"]==cohort]
    for feat in cohort_df.Feature.to_list():
        if sum(cohort_df[cohort_df.Feature==feat].iloc[0,4:] <= 20) > 5:
            top_features[cohort].append(feat) 

for key, value in top_features.items():
    print(key, len(value))

ROSMAP 24
BRCA 27
MayoRNASeq 24


In [84]:
def find_name(x):
    result = None
    for y in names:
        if x in feature_names[names[y]]:  
            result = y
            break
    return result


# prepare Cohort data
cohort_data_filtered = {}
for cohort in top_features:
    cohort_df = df[df["Cohort"]==cohort][df.Feature.isin(top_features[cohort])].drop(columns = df.columns[1:4])
    cohort_df["rank"] = cohort_df.iloc[:,1:].sum(axis=1)
    cohort_df.sort_values(by="rank", ascending = True)
    cohort_df.drop(columns = "rank", inplace = True) 

    feature_names = load_json(f"../results/{cohort}/featurenames.json") 

    if cohort == "ROSMAP" or cohort=="BRCA":
        names = {"miRNA": "1", "mRNA":"2", "Meth":"3"}
    else:
        names = {"Metab": "1", "Prot":"2", "mRNA":"3"}
    cohort_df.insert(0,"FeatureType", cohort_df['Feature'].apply(find_name))
    cohort_df.to_csv(f"../BENCHMARKING/{cohort}_Ranks_data.csv")

In [85]:
cohort_df

Unnamed: 0,FeatureType,Feature,MOGONET:Ranker,MORE:Ranker,boruta,elasticnet,geom.mean_rank,geom.mean_weight,lasso,lime,...,RF-PFI,ridge,rra_rank,shap,stuart_rank,svm_rfe,t_test,ta_weight,XGB-FI,XGB-PFI
4201,Metab,"1,2-dilinoleoyl-GPC (18:2/18:2)",1,563,1,16,10,20,14,84,...,60,21,9,37,9,16,83,15,413,11
4202,Metab,1-linoleoyl-GPC (18:2),403,587,8,13,3,4,15,151,...,17,39,4,1,3,7,27,5,10,2
4203,Metab,stachydrine,197,393,590,1,1,1,3,1,...,15,13,1,4,1,2,2,1,1,12
4204,Metab,trigonelline (N'-methylnicotinate),204,262,597,2,4,14,5,8,...,335,12,5,27,5,1,1,4,51,389
4205,mRNA,E7EVA0,544,214,171,19,35,33,10,584,...,1,61,41,513,44,509,231,20,278,594
4206,Metab,N-acetyl-3-methylhistidine*,500,89,292,4,2,2,6,2,...,550,1,2,2,2,3,4,2,5,1
4207,Prot,TBCK,41,455,435,3,6,13,1,9,...,233,8,6,274,6,262,7,3,467,258
4212,Metab,trimethylamine N-oxide,205,553,598,9,8,7,2,166,...,16,31,8,6,8,5,10,7,21,590
4213,Metab,2-methylserine,414,302,38,10,11,25,13,582,...,66,2,11,39,12,18,57,11,415,14
4218,Prot,SNTB2,20,117,413,532,18,18,488,17,...,189,379,19,3,19,244,40,10,4,240
