### Import Packages

In [1]:
import pandas as pd

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, not just one known-noisy warning; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Short display label for every model name; applied to the `modelName`
# column of the cross-validation results before rows are printed.
model_names_map = {
    "AdaBoost Classifier": "AdaBoost",
    "CatBoosting Classifier": "CatBoost",
    "Decision Tree": "D.Tree",
    "Gradient Boosting": "Gradient Boost",
    "Logistic Regression": "L.Regression",
    "MLPClassifier": "MLP",
    "MOGONET": "MOGONET",
    "MORE": "MORE",
    "Random Forest": "Random Forest",
    "SVC": "SVM",
    "XGBClassifier": "XGBoost",
}

In [3]:
# Load the cross-validation summary and keep only the TripleOmics runs that
# were trained without a feature selector.
df = pd.read_csv('Cross_Validation_Results.csv')
# .copy() makes the filtered rows an independent frame, so the column
# assignments below never write into a slice of the table that was read.
df = df[(df['OmicsLevel'] == 'TripleOmics') & (df['featureSelector'] == 'NONE')].copy()
df = df.drop(columns=['MeanLR_PLUS', 'StdLR_PLUS', 'MeanLR_MINUS', 'StdLR_MINUS'])

# Replace model names with their short display labels. A name missing from
# model_names_map raises KeyError on purpose so it cannot slip through silently.
df['modelName'] = df['modelName'].apply(lambda name: model_names_map[name])

# Collapse each Mean<metric>/Std<metric> pair into one "mean ± std" string.
# Building the column from zipped values (instead of a row-wise DataFrame.apply)
# also works when the filters above leave no rows.
for metric in ['AUC', 'Accuracy', 'Precision', 'Recall', 'F1', 'Specificity', 'NPV']:
    mean_col, std_col = f'Mean{metric}', f'Std{metric}'
    df[metric] = [
        f'{mean_value:.2f} ± {std_value:.2f}'
        for mean_value, std_value in zip(df[mean_col], df[std_col])
    ]

# The raw Mean*/Std* columns are no longer needed once the combined strings exist.
df = df.drop(columns=[column for column in df.columns if ('Mean' in column or 'Std' in column)])

In [74]:
# Print one LaTeX table row per model for each cohort. Cells appear in the
# order: model, AUC, Accuracy, Precision, Recall, F1, Specificity, NPV.
latex_columns = ['modelName', 'AUC', 'Accuracy', 'Precision', 'Recall', 'F1', 'Specificity', 'NPV']
for cohort in ['ROSMAP', 'MayoRNASeq', 'BRCA']:
    df_cohort = df[df.Cohort == cohort].reset_index(drop=True)
    for record in df_cohort[latex_columns].to_dict('records'):
        cells = ' & '.join(str(record[column]) for column in latex_columns)
        # '\\\\' is a literal double backslash: the LaTeX row terminator.
        # (A lone backslash at the end of the row is not a valid row break.)
        print(f"& {cells} \\\\")

& L.Regression & 0.84 ± 0.05 & 0.73 ± 0.06 & 0.71 ± 0.08 & 0.71 ± 0.06 & 0.71 ± 0.05 & 0.75 ± 0.10 & 0.76 ± 0.05 \
& Random Forest & 0.78 ± 0.04 & 0.70 ± 0.04 & 0.67 ± 0.04 & 0.65 ± 0.09 & 0.66 ± 0.06 & 0.74 ± 0.05 & 0.73 ± 0.05 \
& XGBoost & 0.80 ± 0.04 & 0.71 ± 0.04 & 0.68 ± 0.05 & 0.68 ± 0.08 & 0.68 ± 0.05 & 0.73 ± 0.06 & 0.74 ± 0.05 \
& D.Tree & 0.61 ± 0.08 & 0.61 ± 0.07 & 0.55 ± 0.09 & 0.59 ± 0.16 & 0.56 ± 0.13 & 0.63 ± 0.03 & 0.66 ± 0.07 \
& Gradient Boost & 0.79 ± 0.05 & 0.71 ± 0.05 & 0.67 ± 0.07 & 0.69 ± 0.06 & 0.68 ± 0.04 & 0.72 ± 0.09 & 0.74 ± 0.04 \
& CatBoost & 0.81 ± 0.05 & 0.73 ± 0.05 & 0.71 ± 0.06 & 0.68 ± 0.08 & 0.69 ± 0.05 & 0.76 ± 0.09 & 0.75 ± 0.05 \
& AdaBoost & 0.79 ± 0.06 & 0.73 ± 0.06 & 0.71 ± 0.06 & 0.67 ± 0.08 & 0.69 ± 0.07 & 0.77 ± 0.05 & 0.75 ± 0.06 \
& MLP & 0.85 ± 0.05 & 0.74 ± 0.06 & 0.70 ± 0.07 & 0.74 ± 0.05 & 0.72 ± 0.06 & 0.73 ± 0.08 & 0.78 ± 0.05 \
& SVM & 0.81 ± 0.04 & 0.72 ± 0.04 & 0.68 ± 0.02 & 0.71 ± 0.13 & 0.69 ± 0.06 & 0.72 ± 0.06 & 0.77 ± 0.07 \

### Prepare Biomarker Lists for Results

In [75]:
# Abbreviations for the four importance-based ranker columns; every other
# column keeps the name it has in the CSV.
map_long_names = {
    "randomforest_feature_importance": "RF-FI",
    "xgb_feature_importance": "XGB-FI",
    "rf_permutation_feature_importance": "RF-PFI",
    "xgb_permutation_feature_importance": "XGB-PFI",
}

# Read the biomarker rankings and keep only the TripleOmics rows.
df = pd.read_csv('Selected_Biomarker_Panels.csv')
df = df[df['OmicsLevel'] == 'TripleOmics']
df.columns = [map_long_names.get(column, column) for column in df.columns]
df.head()

Unnamed: 0,Feature,Cohort,OmicsLevel,OmicsTypes,MOGONET:Ranker,MORE:Ranker,boruta,elasticnet,geom.mean_rank,geom.mean_weight,...,RF-PFI,ridge,rra_rank,shap,stuart_rank,svm_rfe,t_test,ta_weight,XGB-FI,XGB-PFI
1800,hsa-miR-30a,ROSMAP,TripleOmics,miRNA+mRNA+Meth,294,1,500,3,64,214,...,565,3,29,309,51,135,552,109,399,178
1801,A2ML1,ROSMAP,TripleOmics,miRNA+mRNA+Meth,1,317,1,89,31,125,...,251,138,57,344,39,201,56,37,418,222
1802,hsa-miR-129-5p,ROSMAP,TripleOmics,miRNA+mRNA+Meth,57,192,432,2,2,1,...,569,2,1,1,1,2,3,1,40,3
1803,hsa-miR-132,ROSMAP,TripleOmics,miRNA+mRNA+Meth,44,45,438,19,1,2,...,8,17,2,2,2,1,1,2,13,7
1804,ARRDC2,ROSMAP,TripleOmics,miRNA+mRNA+Meth,409,591,14,253,14,93,...,274,211,15,52,14,214,7,36,1,201


In [76]:
import json

def load_json(path: str):
    """Read the UTF-8 text file at ``path`` and return the decoded JSON value."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [77]:
# Feature-name files, one per cohort. Each is indexed with the keys
# '1', '2' and '3' below.
rosmap_feats = load_json('ROSMAP_featurenames.json')
mayo_feats = load_json('MayoRNASeq_featurenames.json')
brca_feats = load_json('BRCA_featurenames.json')

# Move the entries stored under '1', '2', '3' to omics-type labels. The label
# order differs per cohort, so each cohort carries its own tuple.
for cohort_feats, type_labels in (
    (rosmap_feats, ('miRNA', 'mRNA', 'Meth')),
    (mayo_feats, ('Metab', 'Prot', 'mRNA')),
    (brca_feats, ('miRNA', 'mRNA', 'Meth')),
):
    for position, type_label in enumerate(type_labels, start=1):
        cohort_feats[type_label] = cohort_feats.pop(str(position))

# Cohort name -> {omics-type label -> feature names}.
feat_types = {
    'ROSMAP': rosmap_feats,
    'MayoRNASeq': mayo_feats,
    'BRCA': brca_feats,
}

In [8]:
# For every cohort and every ranking column (all columns from position 4 on),
# take the 30 rows with the smallest rank value and split their feature names
# by omics type.
cohorts = list(df['Cohort'].unique())
selectors = df.columns[4:].to_list()

results = {}
for cohort in cohorts:
    df_cohort = df[df.Cohort == cohort]
    per_selector = {}
    for selector in selectors:
        top_features = df_cohort.sort_values(selector).head(30).Feature.to_list()
        per_selector[selector] = {
            feat_type: [feat for feat in top_features if feat in feat_type_list]
            for feat_type, feat_type_list in feat_types[cohort].items()
            if feat_type not in ['1', '2', '3']
        }
    results[cohort] = per_selector

In [9]:
# Flatten the nested results (cohort -> ranking method -> feature type) into
# one row per combination. An empty feature list is written as '---'.
biomarker_rows = [
    {
        'Cohort': cohort,
        'Ranking Method': selector,
        'FeatureType': feat_type,
        'Selected Biomarkers': '---' if feats == [] else ' '.join(feats),
    }
    for cohort, cohort_data in results.items()
    for selector, selector_data in cohort_data.items()
    for feat_type, feats in selector_data.items()
]

data = pd.DataFrame(
    biomarker_rows,
    columns=['Cohort', 'Ranking Method', 'FeatureType', 'Selected Biomarkers'],
)

In [10]:
# Write the ROSMAP rows to data.csv; the Cohort column is dropped because it
# would hold the same value on every row.
rosmap_rows = data.loc[data['Cohort'] == 'ROSMAP']
rosmap_rows.drop(columns='Cohort').to_csv('data.csv', index=False)

In [12]:

# for cohort in cohorts:
#     df3= data[data.Cohort==cohort].drop('Cohort', axis = 1) 
#     # assuming df3 has columns: Ranking Method, FeatureType, Selected Biomarkers
#     df_wide = df3.pivot_table(
#         index="Ranking Method",
#         columns="FeatureType",
#         values="Selected Biomarkers",
#         aggfunc=lambda x: "; ".join([str(v) for v in x if pd.notna(v)])
#     ).reset_index()

#     ftypes = list(feat_types[cohort].keys())
#     # Reorder columns if you want (mRNA, miRNA, Meth)
#     cols = ["Ranking Method"] + [c for c in ftypes if c in df_wide.columns]
#     df3_wide = df_wide[cols]
#     df3_wide.columns.name = None 

#     print(cohort)
#     for i in df3_wide.index:
#         print(df3_wide.iloc[i,0])
#         print(df3_wide.iloc[i,1])
#         print(df3_wide.iloc[i,2])
#         print(df3_wide.iloc[i,3], '\n')
        

### Prepare Top Biomarkers

In [86]:
# Abbreviations for the four importance-based ranker columns; every other
# column keeps the name it has in the CSV.
map_long_names = {
    "randomforest_feature_importance": "RF-FI",
    "xgb_feature_importance": "XGB-FI",
    "rf_permutation_feature_importance": "RF-PFI",
    "xgb_permutation_feature_importance": "XGB-PFI",
}

# Read the biomarker rankings again, this time keeping every omics level.
df = pd.read_csv('Selected_Biomarker_Panels.csv')
df.columns = [map_long_names.get(column, column) for column in df.columns]
df.head()

Unnamed: 0,Feature,Cohort,OmicsLevel,OmicsTypes,MOGONET:Ranker,MORE:Ranker,boruta,elasticnet,geom.mean_rank,geom.mean_weight,...,RF-PFI,ridge,rra_rank,shap,stuart_rank,svm_rfe,t_test,ta_weight,XGB-FI,XGB-PFI
0,hsa-miR-200a,ROSMAP,SingleOmics,miRNA,175,1,77,134,51,121,...,168,149,73,169,69,31,31,46,123,99
1,ebv-miR-BART2-5p,ROSMAP,SingleOmics,miRNA,1,86,1,51,115,60,...,164,47,92,35,141,186,186,150,89,194
2,hsa-miR-132,ROSMAP,SingleOmics,miRNA,114,138,38,28,1,1,...,1,36,2,1,1,1,1,1,4,1
3,hsa-miR-151-3p,ROSMAP,SingleOmics,miRNA,181,84,55,186,112,164,...,98,183,78,118,98,117,117,106,1,124
4,hsa-miR-885-5p,ROSMAP,SingleOmics,miRNA,61,149,186,1,4,4,...,29,1,6,6,4,9,9,4,25,70


In [124]:
N = 10  # a ranking column "selects" a feature when its value is <= N
K = 20  # minimum total number of such hits for a feature to be reported
for cohort in cohorts:
    cohort_df = df[df.Cohort == cohort]

    # Step 1: summarise every feature that lands in some ranking's top N.
    properties = {
        'Feature': [],
        'Selectors': [],
        'OmicsLevels': [],
        'Selection Frequency': []
    }
    for feat in list(cohort_df.Feature.unique()):
        feat_df = cohort_df[cohort_df.Feature == feat]
        top_n_hits = feat_df.iloc[:, 4:] <= N
        row_has_hit = top_n_hits.sum(axis=1) >= 1
        if not row_has_hit.any():
            continue

        # Keep only the rows in which at least one ranking column hit.
        feat_df = feat_df[row_has_hit]
        top_n_hits = top_n_hits[row_has_hit]
        hits_per_selector = top_n_hits.sum()

        properties['Feature'].append(feat)
        properties['OmicsLevels'].append('+'.join(list(feat_df.OmicsLevel.unique())))
        properties['Selectors'].append(' '.join(top_n_hits.columns[hits_per_selector > 0].to_list()))
        properties['Selection Frequency'].append(hits_per_selector.sum())

    # Step 2: keep frequently selected features, derive display columns, save.
    properties_df = pd.DataFrame(properties)
    properties_df = properties_df[properties_df['Selection Frequency'] >= K].copy()
    properties_df['N(Selectors)'] = properties_df['Selectors'].apply(
        lambda names: len(names.split(' '))
    )
    properties_df['OmicsLevels'] = properties_df['OmicsLevels'].apply(
        lambda levels: levels.replace('Omics', '')
    )
    cohort_feat_types = feat_types[cohort]
    # First feature type whose name list contains the feature, else None.
    properties_df['FeatureType'] = properties_df['Feature'].apply(
        lambda name: next(
            (ftype for ftype in cohort_feat_types if name in cohort_feat_types[ftype]),
            None,
        )
    )
    properties_df = properties_df.sort_values(
        ['FeatureType', 'Selection Frequency', 'N(Selectors)'],
        ascending=[True, False, False],
    )
    table_columns = ['Feature', 'FeatureType', 'OmicsLevels', 'Selection Frequency', 'N(Selectors)']
    properties_df[table_columns].to_csv(f'table_{cohort}.csv', index=False)

    # Step 3: ranking columns that hit at least one reported feature, in order
    # of first appearance.
    all_selectors = list(dict.fromkeys(
        selector
        for query in properties_df.Selectors.to_list()
        for selector in query.split(' ')
    ))
    all_features = list(properties_df.Feature.unique())

    # Smallest value of each ranking column over all rows of each feature.
    rows_by_feature = {
        feature: cohort_df[cohort_df.Feature == feature] for feature in all_features
    }
    plot_data = {
        selector: [rows_by_feature[feature][[selector]].min().min() for feature in all_features]
        for selector in all_selectors
    }
    plot_data_df = pd.DataFrame(plot_data)
    plot_data_df.index = all_features
    plot_data_df.index.name = 'Feature'

    # Step 4: put FeatureType/OmicsLevels next to the values and save.
    in_plot = properties_df.Feature.isin(plot_data_df.index.to_list())
    annotation = properties_df.loc[in_plot, ['FeatureType', 'OmicsLevels']]
    annotation.index = properties_df.Feature
    annotation.index.name = 'Feature'
    pd.concat([annotation, plot_data_df], axis=1).to_csv(f'plot_data_{cohort}.csv')

### Prepare Recommendations

In [32]:
# Mean of MeanF1 for every (omics level, feature count, model, selector)
# combination, taken over all rows of the cross-validation results.
kept_columns = ['Cohort', 'OmicsLevel', 'OmicsType', 'featureSelector', 'modelName', 'numFeatures', 'MeanF1']
group_keys = ["OmicsLevel", "numFeatures", "modelName", "featureSelector"]

df = pd.read_csv('Cross_Validation_Results.csv')
df['modelName'] = df['modelName'].apply(lambda name: model_names_map[name])
df = df[kept_columns]
df = df.groupby(group_keys, as_index=False).agg(MeanF1=("MeanF1", "mean"))
df.head()

Unnamed: 0,OmicsLevel,numFeatures,modelName,featureSelector,MeanF1
0,DualOmics,10,AdaBoost,MOGONET:Ranker,0.80534
1,DualOmics,10,AdaBoost,MORE:Ranker,0.781825
2,DualOmics,10,AdaBoost,RF-FI,0.871917
3,DualOmics,10,AdaBoost,RF-PFI,0.830866
4,DualOmics,10,AdaBoost,XGB-FI,0.863195


In [48]:
# For each omics level and feature count (10, 20, ..., 100), print a LaTeX row
# listing the selectors and models of the five best MeanF1 rows together with
# their MeanF1 range.
for omicslevel in ['SingleOmics', 'DualOmics', 'TripleOmics']:
    print(f'\n\n\n{omicslevel.upper()}')
    level_df = df[df["OmicsLevel"] == omicslevel]
    for k in range(10, 101, 10):
        df2 = (
            level_df[level_df["numFeatures"] == k]
            .sort_values('MeanF1', ascending=False)
            .head(5)
        )
        models = ', '.join(df2.modelName.unique())
        selectors = ', '.join(df2.featureSelector.unique())
        min_f1, max_f1 = df2.MeanF1.min(), df2.MeanF1.max()
        score = f'{min_f1: .3f} -- {max_f1: .3f}'

        # Underscores are escaped so the row can be pasted into LaTeX.
        row = f"  & {k}  & {selectors} & {models} & {score}  \\\\".replace('_', '\\_')
        print(row)





SINGLEOMICS
  & 10  & shap, mean\_weight, ta\_weight, geom.mean\_weight, geom.mean\_rank & CatBoost, SVM, MLP &  0.863 --  0.867  \\
  & 20  & ta\_weight, mean\_weight, shap & MLP, AdaBoost, CatBoost, SVM &  0.874 --  0.876  \\
  & 30  & median\_weight, elasticnet, mean\_weight, ta\_weight & MLP, SVM &  0.880 --  0.887  \\
  & 40  & lasso, mra\_rank, elasticnet, median\_weight, ta\_weight & MLP &  0.884 --  0.888  \\
  & 50  & elasticnet, median\_weight, mra\_rank, ridge, rra\_rank & MLP &  0.879 --  0.890  \\
  & 60  & lasso, elasticnet, ta\_weight, ridge & MLP, L.Regression &  0.882 --  0.888  \\
  & 70  & median\_weight, elasticnet, ridge, lasso, mean\_weight & MLP &  0.882 --  0.890  \\
  & 80  & ridge, geom.mean\_weight, median\_weight, stuart\_rank, geom.mean\_rank & MLP &  0.880 --  0.885  \\
  & 90  & median\_weight, ta\_weight, geom.mean\_rank, mean\_weight & MLP, L.Regression &  0.874 --  0.881  \\
  & 100  & median\_weight, ta\_weight, mra\_rank, ridge, mean\_weight & MLP

In [43]:
# NOTE(review): leftover inspection of the `score` global set by the In [48]
# loop. The saved output ends in ' \\\\', which the current format string does
# not produce, so it appears to come from an earlier version of that cell.
score

' 0.934 --  0.956 \\\\'

In [25]:
# NOTE(review): leftover inspection of `min_f1`, the lowest MeanF1 from the
# last iteration of the In [48] loop; not needed for the printed rows.
min_f1

0.9343765641513314

In [26]:
# NOTE(review): leftover inspection of `max_f1`, the highest MeanF1 from the
# last iteration of the In [48] loop; not needed for the printed rows.
max_f1

0.9563617991720447

In [27]:
# NOTE(review): leftover inspection cell. The saved output is a list, while the
# In [48] loop stores a ', '-joined string in `models`, so the output looks stale.
models

['MLP', 'L.Regression']

In [28]:
# NOTE(review): leftover inspection cell. The saved output is a list, while the
# In [48] loop overwrites `selectors` with a ', '-joined string, so the output
# looks stale.
selectors

['elasticnet',
 'lasso',
 'ta_weight',
 'median_weight',
 'mean_weight',
 'rra_rank',
 'mra_rank']