### Import Packages

In [1]:
# pandas does all of the tabular loading, filtering and export in this notebook.
import pandas as pd
# Project-local JSON loader; used further down to read each cohort's featurenames.json.
from modules.utils import load_json
import warnings
# NOTE: with no category or module given, this silences every warning for the
# rest of the session, so warnings raised by later cells are not displayed.
warnings.filterwarnings("ignore")

### Prepare Data For PCA

In [2]:
# Read the selected biomarker panels, give the four feature-importance columns
# their short display labels, and keep only the rows whose OmicsLevel is
# 'TripleOmics'. Built as a single chain so no intermediate frame is mutated.
df = (
    pd.read_csv('../BENCHMARKING/Selected_Biomarker_Panels.csv')
    .rename(columns={
        'randomforest_feature_importance': 'RF-FI',
        'xgb_feature_importance': 'XGB-FI',
        'rf_permutation_feature_importance': 'RF-PFI',
        'xgb_permutation_feature_importance': 'XGB-PFI',
    })
    .loc[lambda panels: panels.OmicsLevel == 'TripleOmics']
)
df.head()

Unnamed: 0,Feature,Cohort,OmicsLevel,OmicsTypes,MOGONET:Ranker,MORE:Ranker,boruta,elasticnet,geom.mean_rank,geom.mean_weight,...,RF-PFI,ridge,rra_rank,shap,stuart_rank,svm_rfe,t_test,ta_weight,XGB-FI,XGB-PFI
1800,hsa-miR-30a,ROSMAP,TripleOmics,miRNA+mRNA+Meth,294,1,500,3,64,214,...,565,3,29,309,51,135,552,109,399,178
1801,A2ML1,ROSMAP,TripleOmics,miRNA+mRNA+Meth,1,317,1,89,31,125,...,251,138,57,344,39,201,56,37,418,222
1802,hsa-miR-129-5p,ROSMAP,TripleOmics,miRNA+mRNA+Meth,57,192,432,2,2,1,...,569,2,1,1,1,2,3,1,40,3
1803,hsa-miR-132,ROSMAP,TripleOmics,miRNA+mRNA+Meth,44,45,438,19,1,2,...,8,17,2,2,2,1,1,2,13,7
1804,ARRDC2,ROSMAP,TripleOmics,miRNA+mRNA+Meth,409,591,14,253,14,93,...,274,211,15,52,14,214,7,36,1,201


In [3]:
# Build, for every cohort and ranker, one flat 0/1 vector: for each cut-off
# k = 10, 20, ..., 100 it holds one entry per feature of the cohort (in frame
# order), 1 when that feature's rank under the ranker is <= k and 0 otherwise.
# The ten per-cut-off segments are concatenated in increasing k.
cohorts = ['ROSMAP', 'BRCA', 'MayoRNASeq']
rankers = df.columns[4:]  # columns from position 4 onward are treated as rankers
Selection_Sets = {cohort: {ranker: [] for ranker in rankers} for cohort in cohorts}
for cohort in cohorts:
    df_cohort = df[df.Cohort == cohort]
    all_features = df_cohort.Feature.to_list()

    for k in range(10, 101, 10):
        for ranker in rankers:
            # A set makes each membership test O(1); the original scanned a
            # list for every feature, which is quadratic in the panel size.
            selected = set(df_cohort[df_cohort[ranker] <= k].Feature.to_list())
            Selection_Sets[cohort][ranker].extend(
                int(feat in selected) for feat in all_features
            )

In [4]:
# Save PCA data
# One CSV per cohort: columns are the rankers, rows are the stacked 0/1
# membership entries from Selection_Sets. Rows that are 0 for every ranker
# are left out before writing.
for cohort in cohorts:
    membership = pd.DataFrame(Selection_Sets[cohort])
    selected_by_any = membership.sum(axis=1) > 0
    membership[selected_by_any].to_csv(
        f'../BENCHMARKING/{cohort}_PCA_data.csv', index=False
    )

### Prepare Features For Heatmap

In [5]:
# Select the top features for each cohort: a feature is kept when MORE THAN
# MIN_RANKER_HITS rankers (i.e. at least 6 with the value below) place it
# within the top TOP_RANK_CUTOFF.
TOP_RANK_CUTOFF = 20
MIN_RANKER_HITS = 5

top_features = {"ROSMAP": [], "BRCA": [], "MayoRNASeq": []}
for cohort in top_features:
    cohort_df = df[df["Cohort"] == cohort]

    # For every row, count the ranker columns (position 4 onward) whose rank is
    # within the cut-off. One vectorised pass instead of re-filtering the frame
    # once per feature.
    hits_per_row = (cohort_df.iloc[:, 4:] <= TOP_RANK_CUTOFF).sum(axis=1)

    # If a feature name occurs on several rows, the first row decides for all
    # of them (this matches the previous `.iloc[0, 4:]` lookup by name).
    first_hits = {}
    for feat, n_hits in zip(cohort_df.Feature, hits_per_row):
        first_hits.setdefault(feat, n_hits)

    top_features[cohort] = [
        feat for feat in cohort_df.Feature if first_hits[feat] > MIN_RANKER_HITS
    ]

for key, value in top_features.items():
    print(key, len(value))

ROSMAP 24
BRCA 27
MayoRNASeq 24


In [6]:
def find_name(x):
    """Return the omics-type label whose feature list contains ``x``.

    Reads two module-level names that the calling cell must set beforehand:
    ``names`` maps a label (e.g. "mRNA") to a key, and ``feature_names`` maps
    that key to a collection of feature identifiers.

    Labels are tried in the iteration order of ``names``; the first label whose
    collection contains ``x`` is returned, or ``None`` when no collection does.
    """
    return next(
        (label for label in names if x in feature_names[names[label]]),
        None,
    )


# prepare Cohort data
# For each cohort: keep only its top features, order them by summed rank over
# all rankers (lowest first), tag every feature with its omics type and write
# the table to ../BENCHMARKING/<cohort>_Ranks_data.csv.
cohort_data_filtered = {}
for cohort in top_features:
    # One combined mask built on df's own index. The previous chained form
    # df[mask1][mask2] applied a full-length mask to an already filtered frame
    # and relied on pandas silently reindexing it.
    in_panel = (df["Cohort"] == cohort) & df.Feature.isin(top_features[cohort])
    # Columns 1..3 are dropped so only Feature and the ranker columns remain.
    cohort_df = df[in_panel].drop(columns=df.columns[1:4])

    # Temporary sort key: the sum of all ranker columns for each feature.
    cohort_df["rank"] = cohort_df.iloc[:, 1:].sum(axis=1)
    # sort_values returns a NEW frame; the result was previously discarded, so
    # the rows were never actually sorted. mergesort is stable, so features
    # with equal sums keep their original relative order.
    cohort_df = cohort_df.sort_values(by="rank", ascending=True, kind="mergesort")
    cohort_df = cohort_df.drop(columns="rank")

    # find_name() reads the module-level `feature_names` and `names`, so both
    # must be set for the current cohort before it is applied below.
    feature_names = load_json(f"../results/{cohort}/featurenames.json")

    if cohort == "ROSMAP" or cohort == "BRCA":
        names = {"miRNA": "1", "mRNA": "2", "Meth": "3"}
    else:
        names = {"Metab": "1", "Prot": "2", "mRNA": "3"}
    cohort_df.insert(0, "FeatureType", cohort_df['Feature'].apply(find_name))
    cohort_df.to_csv(f"../BENCHMARKING/{cohort}_Ranks_data.csv", index=False)

### Prepare Cross-Validation Results for Overleaf

In [31]:
# Load the cross-validation results, keep the rows with featureSelector ==
# 'NONE', and collapse every Mean<metric>/Std<metric> pair into one
# "mean ± std" text column with two decimals.
# NOTE: this rebinds `df`, which until here held the biomarker-panel table;
# the earlier cells cannot be re-run correctly after this one without
# re-running the loading cell first.
df = pd.read_csv('../BENCHMARKING/Cross_Validation_Results.csv')
df = df[df.featureSelector == 'NONE']
# Reassign instead of inplace=True so the filtered frame is never mutated in place.
df = df.drop(columns=['numFeatures', 'featureSelector', 'MeanLR_PLUS', 'StdLR_PLUS', 'MeanLR_MINUS', 'StdLR_MINUS'])

for metric in ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC', 'Specificity', 'NPV']:
    # Walk the two columns directly rather than df.apply(axis=1), which would
    # rebuild a full row Series for every row and every metric.
    df[metric] = [
        f'{mean:.2f} ± {std:.2f}'
        for mean, std in zip(df[f"Mean{metric}"], df[f"Std{metric}"])
    ]

# Only the combined text columns are kept; the raw Mean*/Std* inputs go.
df = df.drop(columns=[column for column in df.columns if ('Mean' in column or 'Std' in column)])

In [35]:
# print() emits the LaTeX with real line breaks and single backslashes, ready
# to paste; as a bare last expression the cell would show the Python string
# repr instead (escaped "\\n" and doubled backslashes).
print(df[df.OmicsLevel=='TripleOmics'].to_latex())

'\\begin{tabular}{llllllllllll}\n\\toprule\n & Cohort & OmicsLevel & OmicsType & modelName & Accuracy & Precision & Recall & F1 & AUC & Specificity & NPV \\\\\n\\midrule\n17886 & ROSMAP & TripleOmics & miRNA+mRNA+Meth & Logistic Regression & 0.73 ± 0.06 & 0.71 ± 0.08 & 0.71 ± 0.06 & 0.71 ± 0.05 & 0.84 ± 0.05 & 0.75 ± 0.10 & 0.76 ± 0.05 \\\\\n17887 & ROSMAP & TripleOmics & miRNA+mRNA+Meth & Random Forest & 0.70 ± 0.04 & 0.67 ± 0.04 & 0.65 ± 0.09 & 0.66 ± 0.06 & 0.78 ± 0.04 & 0.74 ± 0.05 & 0.73 ± 0.05 \\\\\n17888 & ROSMAP & TripleOmics & miRNA+mRNA+Meth & XGBClassifier & 0.71 ± 0.04 & 0.68 ± 0.05 & 0.68 ± 0.08 & 0.68 ± 0.05 & 0.80 ± 0.04 & 0.73 ± 0.06 & 0.74 ± 0.05 \\\\\n17889 & ROSMAP & TripleOmics & miRNA+mRNA+Meth & Decision Tree & 0.61 ± 0.07 & 0.55 ± 0.09 & 0.59 ± 0.16 & 0.56 ± 0.13 & 0.61 ± 0.08 & 0.63 ± 0.03 & 0.66 ± 0.07 \\\\\n17890 & ROSMAP & TripleOmics & miRNA+mRNA+Meth & Gradient Boosting & 0.71 ± 0.05 & 0.67 ± 0.07 & 0.69 ± 0.06 & 0.68 ± 0.04 & 0.79 ± 0.05 & 0.72 ± 0.09 & 0.