### Import Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from modules.utils import load_json, save_json
from tqdm import tqdm  
import os 
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

In [4]:
def get_view_column_splits(data: pd.DataFrame, step: int = 200) -> dict:
    """
    Split the columns of ``data`` into consecutive groups of ``step`` columns.

    The first column (index 0, e.g. 'Sample ID') is skipped; grouping starts
    at column index 1. The last group holds whatever columns remain, so with
    the default step frames with 201, 401, or 601 columns yield 1, 2, or 3
    groups respectively.

    Inputs:
      - data (pd.DataFrame): frame whose column labels are split
      - step (int): number of columns per group (default 200)

    Returns:
      dict mapping a 1-based group number to the pandas Index slice holding
      that group's column labels. Empty when there is nothing after the
      first column.

    Raises:
      ValueError: if ``step`` is not a positive integer.
    """
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    # Drop the leading column once, then slice the remainder in fixed strides.
    feature_columns = data.columns[1:]
    chunk_starts = range(0, len(feature_columns), step)

    return {
        group_number: feature_columns[start:start + step]
        for group_number, start in enumerate(chunk_starts, start=1)
    }

def ensure_dir(path):
    """Create ``path`` (and any missing parents) if it does not exist, then log it."""
    directory = Path(path)
    # exist_ok keeps repeated calls from raising when the folder is already there.
    directory.mkdir(parents=True, exist_ok=True)
    print(f"[mkdir] ensured: {path}")

# (metric key in the baseline JSON, suffix of the "Mean<X>"/"Std<X>" columns),
# listed in the column order of the baseline table.
_BASELINE_METRICS = (
    ('accuracy', 'Accuracy'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
    ('f1', 'F1'),
    ('roc_auc', 'AUC'),
    ('specificity', 'Specificity'),
    ('npv', 'NPV'),
    ('lr_plus', 'LR_PLUS'),
    ('lr_minus', 'LR_MINUS'),
)

# Long feature-selector identifiers and the short labels written to the results.
_SHORT_SELECTOR_NAMES = {
    'randomforest_feature_importance': 'RF-FI',
    'xgb_feature_importance': 'XGB-FI',
    'rf_permutation_feature_importance': 'RF-PFI',
    'xgb_permutation_feature_importance': 'XGB-PFI'
}


def _baseline_to_frame(baseline) -> pd.DataFrame:
    """Flatten the baseline JSON mapping into one row per model (empty frame if falsy)."""
    table = {'featureSelector': [], 'modelName': [], 'numFeatures': []}
    for _, suffix in _BASELINE_METRICS:
        table[f'Mean{suffix}'] = []
        table[f'Std{suffix}'] = []

    for modelname, model_data in (baseline or {}).items():
        table['featureSelector'].append('NONE')
        table['modelName'].append(modelname)
        # NOTE(review): the baseline feature count is hard-coded to 600 for every
        # dataset/experiment -- confirm this matches single- and two-view runs.
        table['numFeatures'].append(600)

        if modelname in ('MORE', 'MOGONET'):
            src = model_data  # metrics sit at the top level for these models
        else:
            src = model_data.get('cross_val_report', {})  # nested for other models

        # Missing metrics become None instead of raising.
        for key, suffix in _BASELINE_METRICS:
            stats = src.get(key, {})
            table[f'Mean{suffix}'].append(stats.get('mean'))
            table[f'Std{suffix}'].append(stats.get('std'))

    return pd.DataFrame(table)


def _shorten_selector_names(frame: pd.DataFrame) -> None:
    """Replace long featureSelector names with their short labels, in place."""
    if 'featureSelector' in frame.columns:
        frame['featureSelector'] = frame['featureSelector'].map(
            lambda name: _SHORT_SELECTOR_NAMES.get(name, name)
        )


def _augment_validation(val_results: pd.DataFrame, cv_performance: pd.DataFrame, cv_columns: list) -> pd.DataFrame:
    """Attach the matching CV metric rows to every validation row with an expected cutoff."""
    # Without both key columns no validation row can be matched.
    if not {'featureSelector', 'method_cutoff'}.issubset(val_results.columns):
        return pd.DataFrame()

    expected_cutoffs = range(10, 101, 10)
    chunks = []
    keys = zip(val_results['featureSelector'], val_results['method_cutoff'])
    for position, (selector, num_features) in enumerate(keys):
        # only process expected cutoffs
        if selector is None or num_features not in expected_cutoffs:
            continue

        matches = cv_performance.loc[
            (cv_performance['featureSelector'] == selector)
            & (cv_performance['numFeatures'] == num_features),
            cv_columns,
        ]
        if matches.empty:
            continue

        # "Mean*" columns describe the ML model once merged with validation data.
        matches = matches.rename(columns=lambda col: col.replace("Mean", "Model")).reset_index(drop=True)

        # Repeat the validation row once per matching CV row, then join side by side.
        row = val_results.iloc[position:position + 1, :].reset_index(drop=True)
        repeated_row = pd.concat([row] * matches.shape[0], ignore_index=True)
        chunks.append(pd.concat([repeated_row, matches], axis=1))

    if not chunks:
        return pd.DataFrame()

    # Concatenate once at the end; growing a frame inside the loop is quadratic.
    combined = pd.concat(chunks, axis=0, ignore_index=True, sort=False)
    combined = combined.rename(columns={"method_cutoff": "numFeatures"})
    # Drop columns that are entirely empty (dtypes of the remaining columns are kept).
    return combined.dropna(axis=1, how='all')


def run_pipeline(dataset_name: str, experiment_name: str, view_column_splits: dict = None):
    """
    Pipeline that:
      - Loads baseline JSON results and converts them to a DataFrame
      - Loads the post-feature-selection CV results CSV (empty frame if missing)
      - Concatenates baseline + fs results, shortens featureSelector names, saves the combined CSV
      - Copies the ranks CSV and a name-normalized validation CSV into the results folder
      - Augments validation rows with the matching ML metrics and saves val_results.csv
      - Saves the view -> column-names mapping as featurenames.json

    Inputs:
      - dataset_name (str): folder name under ../BENCHMARKING and ../results
      - experiment_name (str): sub-folder name of the experiment
      - view_column_splits (dict) optional (if not provided an empty dict will be used)

    Returns:
      dict with the DataFrames "cv_performance", "baseline_results_df", "ranks",
      "val_results" and "full_val_results"; frames whose input was missing are empty.
    """
    view_column_splits = view_column_splits or {}
    base_bench = Path(f"../BENCHMARKING/{dataset_name}/{experiment_name}")
    out_dir = Path(f"../results/{dataset_name}/{experiment_name}")
    ensure_dir(out_dir)

    print(f"[run_pipeline] dataset={dataset_name}, experiment={experiment_name}")
    # --- Load Baseline Performance ---
    baseline_path = base_bench / "ML-Baseline.json"
    print(f"[run_pipeline] loading baseline from: {baseline_path}")
    baseline = load_json(baseline_path)

    baseline_results_df = _baseline_to_frame(baseline)
    if baseline:
        print(f"[run_pipeline] baseline -> DataFrame with shape {baseline_results_df.shape}")
    else:
        print("[run_pipeline] baseline empty -> created empty baseline DataFrame")

    # --- Load Performance after feature selection ---
    fs_cv_path = base_bench / "cross-validation-results.csv"
    print(f"[run_pipeline] loading feature-selection CV results from: {fs_cv_path}")
    if fs_cv_path.exists():
        fs_performance = pd.read_csv(fs_cv_path)
        print(f"[run_pipeline] loaded fs_performance shape={fs_performance.shape}")
    else:
        fs_performance = pd.DataFrame()
        print(f"[run_pipeline] WARNING: {fs_cv_path} not found. Using empty DataFrame for fs_performance.")

    # --- combine performance ---
    cv_performance = pd.concat([baseline_results_df, fs_performance], axis=0, ignore_index=True, sort=False)
    print(f"[run_pipeline] combined cv_performance shape={cv_performance.shape}")

    # Shorten long names
    _shorten_selector_names(cv_performance)

    cv_out_path = out_dir / "cross-validation-results.csv"
    cv_performance.to_csv(cv_out_path, index=False)
    print(f"[run_pipeline] saved combined CV performance to: {cv_out_path}")

    # --- copy Ranks ---
    ranks = pd.DataFrame()  # stays empty when the ranks file is missing
    ranks_in = base_bench / "BiomarkerRanks.csv"
    ranks_out = out_dir / "BiomarkerRanks.csv"
    if ranks_in.exists():
        ranks = pd.read_csv(ranks_in)
        ranks.to_csv(ranks_out, index=False)
        print(f"[run_pipeline] copied ranks to: {ranks_out} (shape={ranks.shape})")
    else:
        print(f"[run_pipeline] WARNING: ranks file not found at {ranks_in}.")

    # --- validation results ---
    val_in = base_bench / "Biomarker-validation-results.csv"
    if val_in.exists():
        val_results = pd.read_csv(val_in)
        # normalize names and columns
        if "Method" in val_results.columns:
            val_results = val_results.rename(columns={"Method": "featureSelector"})
        _shorten_selector_names(val_results)
        # Save a cleaned copy to results folder
        val_out = out_dir / "Biomarker-validation-results.csv"
        val_results.to_csv(val_out, index=False)
        print(f"[run_pipeline] saved validation results to: {val_out} (shape={val_results.shape})")
    else:
        val_results = pd.DataFrame()
        print(f"[run_pipeline] WARNING: validation results not found at {val_in}.")

    # --- Add ML Performance to validation results (full_val_results) ---
    full_val_results = pd.DataFrame()
    if not cv_performance.empty and not val_results.empty:
        # columns to merge from cv_performance (drop featureSelector and numFeatures)
        cv_columns = [c for c in cv_performance.columns if c not in ("featureSelector", "numFeatures")]
        print(f"[run_pipeline] preparing to augment validation with cv columns: {cv_columns}")

        # numFeatures may have been read as float; cast to int so it compares
        # cleanly against the validation cutoffs.
        if 'numFeatures' in cv_performance.columns:
            try:
                cv_performance['numFeatures'] = cv_performance['numFeatures'].astype(int)
            except (ValueError, TypeError) as exc:
                # Best effort: keep the original dtype, but say so instead of hiding it.
                print(f"[run_pipeline] WARNING: could not cast numFeatures to int ({exc}); keeping original dtype.")

        full_val_results = _augment_validation(val_results, cv_performance, cv_columns)

        val_results_out = out_dir / "val_results.csv"
        full_val_results.to_csv(val_results_out, index=False)
        print(f"[run_pipeline] saved augmented validation results to: {val_results_out} (shape={full_val_results.shape})")
    else:
        print("[run_pipeline] Skipping ML augmentation: cv_performance or val_results is empty.")

    # --- save view_column_splits ---
    vjson_out = out_dir / "featurenames.json"
    # Column groups may be pandas Index objects; convert to plain lists for JSON.
    save_json(vjson_out, {k: list(v) for k, v in view_column_splits.items()})

    # --- final prints ---
    if 'validationsource' in val_results.columns:
        try:
            counts = val_results['validationsource'].value_counts()
            print("[run_pipeline] validation source counts:")
            print(counts.to_string())
        except Exception as e:
            print(f"[run_pipeline] could not print validationsource counts: {e}")
    else:
        print("[run_pipeline] 'validationsource' column not present in validation results (no counts printed).")

    print("[run_pipeline] finished.")
    # return a dict of produced DataFrames for convenience if the caller wants them
    return {
        "cv_performance": cv_performance,
        "baseline_results_df": baseline_results_df,
        "ranks": ranks,
        "val_results": val_results,
        "full_val_results": full_val_results,
    }


### Load and Prepare Results

In [5]:
# Experiments to post-process, keyed by dataset name. Every list ends with the
# experiment that combines all views; its prepared CSV is the one loaded below.
experimental_designs = {
    "ROSMAP": [
        'miRNA_data',
        'dna_methylation_data',
        'gene_expression_data',
        'miRNA_and_gene_expression_data',
        'miRNA_and_dna_methylation_data',
        'gene_expression_and_dna_methylation_data',
        'miRNA_and_gene_expression_and_dna_methylation_data',
    ],
    'MayoRNASeq': [
        'metabolomics_data',
        'gene_expression_data',
        'proteomics_data',
        'gene_expression_and_proteomics_data',
        'metabolomics_and_gene_expression_data',
        'metabolomics_and_proteomics_data',
        'metabolomics_and_gene_expression_and_proteomics_data',
    ],
    'BRCA': [
        'miRNA_data',
        'dna_methylation_data',
        'gene_expression_data',
        'miRNA_and_gene_expression_data',
        'miRNA_and_dna_methylation_data',
        'gene_expression_and_dna_methylation_data',
        'miRNA_and_gene_expression_and_dna_methylation_data',
    ],
}

for dataset_name, experiment_list in experimental_designs.items():
    # NOTE(review): index_col=0 already moves the first CSV column into the index,
    # and get_view_column_splits skips one more leading column -- confirm that
    # column is a label/ID and not a feature.
    df = pd.read_csv(f'../data/{dataset_name}/prepared/{experiment_list[-1]}.csv', index_col=0)

    # The split depends only on the dataset-level frame, so compute it once per
    # dataset instead of once per experiment.
    # NOTE(review): every experiment therefore receives the view mapping of the
    # last experiment in the list -- verify that is intended for single-view runs.
    view_column_splits = get_view_column_splits(df)

    for experiment_name in experiment_list:
        run_pipeline(dataset_name, experiment_name, view_column_splits)
        print('\n\n')

[mkdir] ensured: ../results/ROSMAP/miRNA_data
[run_pipeline] dataset=ROSMAP, experiment=miRNA_data
[run_pipeline] loading baseline from: ../BENCHMARKING/ROSMAP/miRNA_data/ML-Baseline.json
[run_pipeline] baseline -> DataFrame with shape (11, 21)
[run_pipeline] loading feature-selection CV results from: ../BENCHMARKING/ROSMAP/miRNA_data/cross-validation-results.csv
[run_pipeline] loaded fs_performance shape=(2970, 21)
[run_pipeline] combined cv_performance shape=(2981, 21)
[run_pipeline] saved combined CV performance to: ../results/ROSMAP/miRNA_data/cross-validation-results.csv
[run_pipeline] copied ranks to: ../results/ROSMAP/miRNA_data/BiomarkerRanks.csv (shape=(200, 27))
[run_pipeline] saved validation results to: ../results/ROSMAP/miRNA_data/Biomarker-validation-results.csv (shape=(21594, 13))
[run_pipeline] preparing to augment validation with cv columns: ['modelName', 'MeanAccuracy', 'StdAccuracy', 'MeanPrecision', 'StdPrecision', 'MeanRecall', 'StdRecall', 'MeanF1', 'StdF1', 'Mean