## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_classif
from statsmodels.stats.multitest import multipletests
from modules.utils import save_json, load_json

import warnings
warnings.filterwarnings("ignore")


def mogonet_preprocess(
    X: pd.DataFrame,
    y: pd.Series,
    omics_type: str,                   # "mRNA" | "meth" | "miRNA" | "proteomics" | "metabolomics"
    fdr_alpha: float = 0.05,
    var_thresh_mrna: float = 0.1,      # per MOGONET text
    var_thresh_meth: float = 0.001,    # per MOGONET text
    pc1_max: float = 0.50,             # PC1 must explain < 50%
    min_keep: int = 200,               # paper used 200 for ROSMAP
    max_keep: int = 300,
    hm27_probe_ids: set | None = None, # pass for methylation (ROSMAP) to restrict to HM27
    training_mask: np.ndarray | None = None,  # boolean mask for rows used as "training"
    verbose: bool = True,
):
    """
    Returns
    -------
    X_sel : pd.DataFrame
        Selected & scaled feature matrix (rows=samples, cols=selected features).
    kept_features : list[str]
        Names of kept features, in order.
    """

    # ------------------------------------------------------------
    # Setup: split out training rows (everything else is held-out)
    # ------------------------------------------------------------
    if training_mask is None:
        training_mask = np.ones(len(X), dtype=bool)
    X_train = X.loc[training_mask]
    y_train = y.loc[training_mask]

    n0 = X.shape[1]
    if verbose:
        print(f"[Setup] Samples: {X.shape[0]} (train={X_train.shape[0]}), Features: {n0}")

    # ------------------------------------------------------------
    # STEP 0 — Log10 normalization for proteomics/metabolomics
    #   Clamp non-positive to 0, add a training-derived pseudocount, log10.
    #   This avoids NaNs/infs and mimics left-censor handling for dropouts.
    # ------------------------------------------------------------
    if omics_type in {"proteomics", "metabolomics"}:
        # ensure numeric
        X = X.apply(pd.to_numeric, errors="coerce")
        X_train = X_train.apply(pd.to_numeric, errors="coerce")
    
        # set non-finite to 0 and clamp negatives to 0
        X = X.replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(lower=0.0)
        X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(lower=0.0)
    
        # smallest strictly-positive value in TRAIN; fallbacks to a tiny constant
        with np.errstate(invalid="ignore"):
            min_pos = np.nanmin(X_train.values[X_train.values > 0])
        if not np.isfinite(min_pos):
            min_pos = 1e-3  # fallback if nothing > 0 in training
        pseudocount = max(min_pos / 2.0, 1e-6)
    
        if verbose:
            print(f"[Step 0] log10 transform (clamped ≤0 to 0) with pseudocount={pseudocount:.3g}")
    
        X = np.log10(X + pseudocount)
        X_train = np.log10(X_train + pseudocount)

    # ---------------------------------------------------------------------
    # STEP 1 — HM27 restriction for DNA methylation (for interpretability)
    #   “...only probes corresponding to the Illumina HumanMethylation27 were retained...”
    # ---------------------------------------------------------------------
    if omics_type == "meth" and hm27_probe_ids is not None:
        keep_cols = [c for c in X.columns if c in hm27_probe_ids]
        X = X[keep_cols]
        X_train = X_train[keep_cols]
        if verbose:
            print(f"[Step 1] HM27 restriction: kept {len(keep_cols)} features")

    # ---------------------------------------------------------------------------------
    # STEP 2 — Remove features with no signal (constant on training)
    #   “...filtered out features with no signal (zero mean values)...”
    # ---------------------------------------------------------------------------------
    vt0 = VarianceThreshold(threshold=0.0)
    vt0.fit(X_train.values)
    keep_idx0 = vt0.get_support(indices=True)
    X = X.iloc[:, keep_idx0]
    X_train = X_train.iloc[:, keep_idx0]
    if verbose:
        print(f"[Step 2] Dropped no-signal/constant features: {n0 - len(keep_idx0)} removed; {X.shape[1]} remain")
    n0 = X.shape[1]

    # ---------------------------------------------------------------------------------
    # STEP 3 — Low-variance filtering with *omics-specific* thresholds
    #   “...0.1 for mRNA, 0.001 for methylation; miRNA: drop only zero-variance...”
    # ---------------------------------------------------------------------------------
    if omics_type == "mRNA":
        vt = VarianceThreshold(threshold=var_thresh_mrna)
    elif omics_type == "meth":
        vt = VarianceThreshold(threshold=var_thresh_meth)
    else:  # miRNA / proteomics / metabolomics -> use zero-variance threshold by default
        vt = VarianceThreshold(threshold=0.0)

    vt.fit(X_train.values)
    keep_idx = vt.get_support(indices=True)
    X = X.iloc[:, keep_idx]
    X_train = X_train.iloc[:, keep_idx]
    if verbose:
        print(f"[Step 3] Variance filter ({omics_type}): kept {X.shape[1]} / previous {n0}")

    # -------------------------------------------------------------------------------------------------
    # STEP 4 — Statistical preselection on training data (ANOVA + BH-FDR)
    # -------------------------------------------------------------------------------------------------
    if X_train.shape[1] == 0:
        if verbose:
            print("[Step 4] No features left after variance filtering.")
        return pd.DataFrame(index=X.index), []

    F, pvals = f_classif(X_train.values, y_train.values)
    pvals = np.nan_to_num(pvals, nan=1.0)
    _, qvals, _, _ = multipletests(pvals, alpha=fdr_alpha, method="fdr_bh")

    ranked_idx = np.lexsort((-F, qvals))  # primary: q ascending; secondary: F descending
    if verbose:
        sig_count = int((qvals <= fdr_alpha).sum())
        print(f"[Step 4] FDR (alpha={fdr_alpha}): {sig_count} features significant; proceeding with ranked list")

    # ----------------------------------------------------------------------------------------------------------------
    # STEP 5 — Choose number of features so that PC1 explains < 50% variance (min_keep..max_keep)
    # ----------------------------------------------------------------------------------------------------------------
    def pc1_explained(idx_slice) -> float:
        pca = PCA(n_components=1, svd_solver="full").fit(X_train.iloc[:, idx_slice])
        return float(pca.explained_variance_ratio_[0])

    upper_cap = min(max_keep, X_train.shape[1])
    k = min(min_keep, upper_cap)
    while k <= upper_cap:
        idx_keep = ranked_idx[:k]
        if pc1_explained(idx_keep) < pc1_max:
            break
        k += 1

    if k > upper_cap:
        k = min(min_keep, upper_cap)
        idx_keep = ranked_idx[:k]
        if verbose:
            print(f"[Step 5] PC1≥{pc1_max*100:.0f}% for all K≤{upper_cap}; fallback to K={k}")
    else:
        idx_keep = ranked_idx[:k]
        if verbose:
            print(f"[Step 5] Selected K={k} features with PC1<{pc1_max*100:.0f}%")

    X_sel = X.iloc[:, idx_keep].copy()

    # ---------------------------------------------------------------------------------------------
    # STEP 6 — Scale each omics to [0, 1] (fit on training only; transform all rows)
    # ---------------------------------------------------------------------------------------------
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(X_train.iloc[:, idx_keep].values)       # fit on training subset only
    X_sel.loc[:, :] = scaler.transform(X_sel.values)   # transform all samples

    kept_features = list(X_sel.columns)
    if verbose:
        print(f"[Step 6] Scaled to [0,1]. Final feature count: {len(kept_features)}")

    return X_sel


In [2]:
import GEOparse
import pandas as pd

# gpl = GEOparse.get_GEO(geo="GPL8490")           # downloads & caches
# tbl = gpl.table
# hm27_probe_ids = list(set(tbl["ID"].dropna().astype(str)))
# print(len(hm27_probe_ids))

# # optional: save 
# save_json("../artifacts/hm27_probe_ids.json", hm27_probe_ids)

hm27_probe_ids = load_json("../artifacts/hm27_probe_ids.json")
print("Number of HM27 probes: ", len(hm27_probe_ids))

Number of HM27 probes:  27578


### 1. Preprocess ROSMAP Files

In [3]:
# preprocess microRNA data
df = pd.read_csv("../data/ROSMAP/cleaned/miRNA_data.csv")
df_X = df.iloc[:,6:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="miRNA", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:6], df_X], axis = 1)
df.to_csv("../data/ROSMAP/preprocessed/miRNA_data.csv", index = False)

[Setup] Samples: 378 (train=378), Features: 309
[Step 2] Dropped no-signal/constant features: 0 removed; 309 remain
[Step 3] Variance filter (miRNA): kept 309 / previous 309
[Step 4] FDR (alpha=0.05): 22 features significant; proceeding with ranked list
[Step 5] Selected K=200 features with PC1<50%
[Step 6] Scaled to [0,1]. Final feature count: 200


In [4]:
# preprocess gene expression data
df = pd.read_csv("../data/ROSMAP/cleaned/gene_expression_data.csv")
df_X = df.iloc[:,6:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="mRNA", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:6], df_X], axis = 1)
df.to_csv("../data/ROSMAP/preprocessed/gene_expression_data.csv", index = False)

[Setup] Samples: 380 (train=380), Features: 32833
[Step 2] Dropped no-signal/constant features: 1184 removed; 31649 remain
[Step 3] Variance filter (mRNA): kept 15440 / previous 31649
[Step 4] FDR (alpha=0.05): 5381 features significant; proceeding with ranked list
[Step 5] PC1≥50% for all K≤300; fallback to K=200
[Step 6] Scaled to [0,1]. Final feature count: 200


In [5]:
# preprocess dna methylation data
df = pd.read_csv("../data/ROSMAP/cleaned/dna_methylation_data.csv")
df_X = df.iloc[:,6:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="meth", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:6], df_X], axis = 1)
df.to_csv("../data/ROSMAP/preprocessed/dna_methylation_data.csv", index = False)

[Setup] Samples: 375 (train=375), Features: 420132
[Step 1] HM27 restriction: kept 23788 features
[Step 2] Dropped no-signal/constant features: 396344 removed; 23788 remain
[Step 3] Variance filter (meth): kept 4159 / previous 23788
[Step 4] FDR (alpha=0.05): 570 features significant; proceeding with ranked list
[Step 5] Selected K=200 features with PC1<50%
[Step 6] Scaled to [0,1]. Final feature count: 200


### 2. Preprocess MayoRNASeq Files

In [6]:
# preprocess gene expression data
df = pd.read_csv("../data/MayoRNASeq/cleaned/gene_expression_data.csv")
df_X = df.iloc[:,5:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="mRNA", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:5], df_X], axis = 1)
df.to_csv("../data/MayoRNASeq/preprocessed/gene_expression_data.csv", index = False)

[Setup] Samples: 162 (train=162), Features: 35489
[Step 2] Dropped no-signal/constant features: 683 removed; 34806 remain
[Step 3] Variance filter (mRNA): kept 16941 / previous 34806
[Step 4] FDR (alpha=0.05): 7134 features significant; proceeding with ranked list
[Step 5] PC1≥50% for all K≤300; fallback to K=200
[Step 6] Scaled to [0,1]. Final feature count: 200


In [7]:
# preprocess metabolomics data
df = pd.read_csv("../data/MayoRNASeq/cleaned/metabolomics_data.csv")
df_X = df.iloc[:,5:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="metabolomics", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:5], df_X], axis = 1)
df.to_csv("../data/MayoRNASeq/preprocessed/metabolomics_data.csv", index = False)

[Setup] Samples: 98 (train=98), Features: 670
[Step 0] log10 transform (clamped ≤0 to 0) with pseudocount=4.75e+03
[Step 2] Dropped no-signal/constant features: 0 removed; 670 remain
[Step 3] Variance filter (metabolomics): kept 670 / previous 670
[Step 4] FDR (alpha=0.05): 49 features significant; proceeding with ranked list
[Step 5] Selected K=200 features with PC1<50%
[Step 6] Scaled to [0,1]. Final feature count: 200


In [8]:
# preprocess proteomics data
df = pd.read_csv("../data/MayoRNASeq/cleaned/proteomics_data.csv")
df_X = df.iloc[:,5:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="proteomics", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:5], df_X], axis = 1)
df.to_csv("../data/MayoRNASeq/preprocessed/proteomics_data.csv", index = False)

[Setup] Samples: 112 (train=112), Features: 3607
[Step 0] log10 transform (clamped ≤0 to 0) with pseudocount=1.27e+05
[Step 2] Dropped no-signal/constant features: 0 removed; 3607 remain
[Step 3] Variance filter (proteomics): kept 3607 / previous 3607
[Step 4] FDR (alpha=0.05): 119 features significant; proceeding with ranked list
[Step 5] Selected K=200 features with PC1<50%
[Step 6] Scaled to [0,1]. Final feature count: 200


### 3. Preprocess BRCA Files

In [3]:
# preprocess microRNA data
df = pd.read_csv("../data/BRCA/cleaned/miRNA_data.csv")
df_X = df.iloc[:,5:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="miRNA", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:5], df_X], axis = 1)
df.to_csv("../data/BRCA/preprocessed/miRNA_expression_data.csv", index = False)

[Setup] Samples: 819 (train=819), Features: 564
[Step 2] Dropped no-signal/constant features: 0 removed; 564 remain
[Step 3] Variance filter (miRNA): kept 564 / previous 564
[Step 4] FDR (alpha=0.05): 424 features significant; proceeding with ranked list
[Step 5] Selected K=200 features with PC1<50%
[Step 6] Scaled to [0,1]. Final feature count: 200


In [4]:
# preprocess gene expression data
df = pd.read_csv("../data/BRCA/cleaned/gene_expression_data.csv")
df_X = df.iloc[:,5:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="mRNA", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:5], df_X], axis = 1)
df.to_csv("../data/BRCA/preprocessed/gene_expression_data.csv", index = False)

[Setup] Samples: 1202 (train=1202), Features: 20530
[Step 2] Dropped no-signal/constant features: 280 removed; 20250 remain
[Step 3] Variance filter (mRNA): kept 19075 / previous 20250
[Step 4] FDR (alpha=0.05): 15108 features significant; proceeding with ranked list
[Step 5] PC1≥50% for all K≤300; fallback to K=200
[Step 6] Scaled to [0,1]. Final feature count: 200


In [11]:
# preprocess dna methylation data 
df = pd.read_csv(
    "../data/BRCA/cleaned/dna_methylation_data.csv", 
    skiprows=lambda i: i > 0 and np.random.rand() > 50 / 1000,  # works if file is large
)

df_X = df.iloc[:,5:]
df_y = df["Diagnosis"]

df_X = mogonet_preprocess(df_X, df_y, omics_type="meth", verbose=True, hm27_probe_ids=hm27_probe_ids)
df  = pd.concat([df.iloc[:,:5], df_X], axis = 1) 

selected_columns =  df.columns
df = pd.read_csv("../data/BRCA/cleaned/dna_methylation_data.csv", usecols=selected_columns) 

df.to_csv("../data/BRCA/preprocessed/dna_methylation_data.csv", index = False)

[Setup] Samples: 42 (train=42), Features: 363870
[Step 1] HM27 restriction: kept 22128 features
[Step 2] Dropped no-signal/constant features: 341742 removed; 22128 remain
[Step 3] Variance filter (meth): kept 14067 / previous 22128
[Step 4] FDR (alpha=0.05): 285 features significant; proceeding with ranked list
[Step 5] PC1≥50% for all K≤300; fallback to K=200
[Step 6] Scaled to [0,1]. Final feature count: 200
