##### For connecting to Google Drive

In [None]:
# Colab-only step: mount the user's Google Drive at /content/drive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime — confirm
# before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### For running GSD section

In [None]:

# File names of the inferred-network CSVs to refine; the driver loop reads each one
# from f"{network_csv_path}{folder}/{name}".
network_list = [
    "Normi.csv",
    "SINCERITIES.csv",
    "PPCOR.csv",
    "GENIE3.csv",
    "Bixgboost.csv",
    "AGRN.csv",
    "GreyNet.csv",
    "LEAP.csv",
    "SCNS.csv",
    "SINGE.csv",
    "GRNBOOST2.csv",
    "SCODE.csv",
    "Dyngenie3.csv",
    "SCRIBE.csv",
    "GRNVBEM.csv",
    "GRISLI.csv",
    "PIDC.csv"
]


# Dataset sub-folders, appended directly to the two base paths below.
folder_list = ["GSD/GSD-2000-1"] # Using one dataset as a sample
# folder_name = "" # GSD, HSC, VSC, mCAD
# NOTE(review): expression_file_name is not referenced by the driver loop visible in this
# notebook, which hard-codes "ExpressionData.csv" — confirm whether it is still needed.
expression_file_name ="ExpressionData.csv"

# PLACEHOLDERS: the right-hand sides below were left blank, so this cell is a syntax error
# until real paths are filled in. The driver loop concatenates these with the folder name
# without adding a separator, so each value should end with "/".
network_csv_path = # base directory of the NETWORK csv files produced by the algorithms
expression_csv_path =  # base directory (LOCATION) of the expression files
print("Network CSV:", network_csv_path)
print("Expression CSV:", expression_csv_path)


Network CSV: /content/drive/MyDrive/Colab Notebooks/Temp data folder/
Expression CSV: /content/drive/MyDrive/Colab Notebooks/paper3/inputs/Curated/


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

# -------------------- I/O helpers -------------------- #

def load_and_zscore_expression(expression_path: str) -> pd.DataFrame:
    """Read an expression CSV and standardise every row to zero mean / unit variance.

    The first CSV column is used as the row index (one row per gene, one
    column per cell). Infinite and missing entries are replaced by 0.0
    before the statistics are computed. Rows whose standard deviation is
    below 1e-6 are only mean-centred (their divisor is set to 1.0).

    The returned frame keeps the row index of the input; its columns are the
    default integer range, not the original column labels.
    """
    raw = pd.read_csv(expression_path, index_col=0)
    cleaned = raw.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    matrix = cleaned.to_numpy().astype(np.float64)
    row_mean = matrix.mean(axis=1, keepdims=True)
    row_std = matrix.std(axis=1, keepdims=True)
    # Guard against division by (near) zero for flat rows.
    row_std[row_std < 1e-6] = 1.0

    return pd.DataFrame((matrix - row_mean) / row_std, index=cleaned.index)

def load_network_filtered(network_path: str, genes_present: set[str]) -> pd.DataFrame:
    """Read an edge-list CSV and keep only edges whose two endpoints are known genes.

    The CSV must provide the columns Gene1, Gene2 and EdgeWeight. Rows are
    kept when both Gene1 and Gene2 are members of ``genes_present``; the
    surviving rows keep their original index labels and EdgeWeight is cast
    to float.
    """
    edges = pd.read_csv(network_path)
    src_known = edges["Gene1"].isin(genes_present)
    dst_known = edges["Gene2"].isin(genes_present)

    kept = edges.loc[src_known & dst_known].copy()
    kept["EdgeWeight"] = kept["EdgeWeight"].astype(float)
    return kept

# -------------------- scoring utils -------------------- #

def normalize_minmax(x, eps=1e-12):
    """Rescale ``x`` so its minimum maps to 0 and its maximum to (almost) 1.

    ``x`` is converted to a float64 array. ``eps`` is added to the range in
    the denominator, so a constant input yields all zeros instead of a
    division by zero. An empty input is returned as an empty float64 array.
    """
    arr = np.asarray(x, dtype=np.float64)
    if arr.size == 0:
        return arr
    lowest = arr.min()
    span = arr.max() - lowest
    return (arr - lowest) / (span + eps)

def keep_top_percent_df(df: pd.DataFrame, score_col="EdgeWeight", keep_top_percent=100.0) -> pd.DataFrame:
    """Keep the rows whose score is within the top ``keep_top_percent`` percent.

    The number of rows targeted is ``int(len(df) * keep_top_percent / 100)``,
    clamped to the range ``[1, len(df)]``; percentages above 100 therefore
    keep every row instead of raising, and percentages at or below 0 keep
    the single best score. Rows are selected with ``score >= threshold``, so
    ties at the threshold are all kept and the result may contain more rows
    than the target. Row order and index labels are preserved.

    An empty ``df`` is returned unchanged.
    """
    n_rows = len(df)
    if n_rows == 0:
        return df
    scores = df[score_col].to_numpy(dtype=np.float64)
    k = int(n_rows * float(keep_top_percent) / 100.0)
    # np.partition rejects a kth index outside the array, so bound k on both sides.
    k = min(max(k, 1), n_rows)
    thr = np.partition(scores, -k)[-k]  # k-th largest score
    return df[df[score_col] >= thr].copy()

def edge_mi_scores(net_df: pd.DataFrame, expr_df_z: pd.DataFrame, mi_n_neighbors=10):
    """Estimate mutual information between the two endpoint genes of every edge.

    Each gene name in ``net_df`` (columns Gene1 / Gene2) is looked up in the
    index of ``expr_df_z`` and the corresponding rows are passed to
    ``mutual_info_regression`` with ``n_neighbors=mi_n_neighbors`` and a
    fixed ``random_state=0``. A NaN estimate is stored as 0.0.

    Returns a tuple ``(raw, normalized)`` where ``normalized`` is
    ``normalize_minmax(raw)``; both arrays have one entry per edge, in the
    row order of ``net_df``.
    """
    row_of = {gene: pos for pos, gene in enumerate(expr_df_z.index.to_list())}
    matrix = expr_df_z.values  # genes x cells

    src_rows = net_df["Gene1"].map(row_of).to_numpy()
    dst_rows = net_df["Gene2"].map(row_of).to_numpy()

    def _pair_mi(s, t):
        # Single-feature call: the source gene is the (n_cells, 1) feature matrix.
        estimate = mutual_info_regression(
            matrix[s, :].reshape(-1, 1),
            matrix[t, :],
            n_neighbors=mi_n_neighbors,
            random_state=0,
        )[0]
        return 0.0 if np.isnan(estimate) else float(estimate)

    raw = np.fromiter(
        (_pair_mi(s, t) for s, t in zip(src_rows, dst_rows)),
        dtype=np.float64,
        count=len(net_df),
    )
    return raw, normalize_minmax(raw)

# -------------------- v11_ecdf_symmetric core -------------------- #

def clr_v11_ecdf_symmetric(net_df: pd.DataFrame, base_values: np.ndarray) -> np.ndarray:
    """
    Context score "v11_ecdf_symmetric" for every edge.

      - context  = ecdf: each edge value is replaced by its percentile rank
        (ties averaged) among all edge values attached to the same gene
      - symmetry = undirected_symmetric: a gene's pool contains every edge it
        appears in, whether as Gene1 or as Gene2
      - combine  = l2: the two endpoint percentiles are merged with the
        Euclidean norm, then min-max normalized

    ``base_values`` must hold one value per row of ``net_df``; the result has
    the same length and row order.
    """
    edges = net_df[["Gene1", "Gene2"]].assign(_v=np.asarray(base_values, dtype=np.float64))
    n_edges = len(edges)

    # Stack both endpoints: positions [0, n) are the Gene1 side, [n, 2n) the Gene2 side.
    endpoint_gene = pd.concat([edges["Gene1"], edges["Gene2"]], ignore_index=True)
    endpoint_value = pd.concat([edges["_v"], edges["_v"]], ignore_index=True)

    pct = (
        endpoint_value.groupby(endpoint_gene)
        .rank(pct=True, method="average")
        .to_numpy(dtype=np.float64)
    )
    p_src = pct[:n_edges]
    p_dst = pct[n_edges:]

    combined = np.sqrt(p_src * p_src + p_dst * p_dst)  # l2 combine
    return normalize_minmax(combined)

def blend_linear(a, b, alpha: float):
    """Return the convex mix ``alpha * a + (1 - alpha) * b``.

    ``a`` and ``b`` are expected to be already normalized to [0, 1]; ``alpha``
    is coerced to float and is the weight given to ``a``.
    """
    weight_a = float(alpha)
    weight_b = 1.0 - weight_a
    return weight_a * a + weight_b * b

def refine_network_v11_ecdf_symmetric(
    net_df: pd.DataFrame,
    expr_df_z: pd.DataFrame,
    *,
    mi_n_neighbors=10,
    alpha=0.8,
    keep_top_percent=100.0,
) -> pd.DataFrame:
    """
    Re-score the edges of ``net_df`` and return Gene1 / Gene2 / EdgeWeight.

    Steps:
      1) min-max normalize the original EdgeWeight column
      2) compute the min-max normalized MI of every edge from ``expr_df_z``
      3) turn the MI values into the v11_ecdf_symmetric context score
      4) blend: alpha * context score + (1 - alpha) * normalized EdgeWeight
      5) keep the top ``keep_top_percent`` percent of edges by the new weight
    """
    original_weights = net_df["EdgeWeight"].to_numpy(dtype=np.float64)
    w_norm = normalize_minmax(original_weights)

    # Only the normalized MI (second element) feeds the context score.
    mi_norm = edge_mi_scores(net_df, expr_df_z, mi_n_neighbors=mi_n_neighbors)[1]
    context_score = clr_v11_ecdf_symmetric(net_df, mi_norm)

    refined = net_df[["Gene1", "Gene2"]].copy()
    refined["EdgeWeight"] = blend_linear(context_score, w_norm, alpha=alpha)
    return keep_top_percent_df(refined, "EdgeWeight", keep_top_percent)

# -------------------- run on folders / networks -------------------- #

# Driver: relies on names defined in an earlier cell of this notebook —
# folder_list, expression_csv_path, network_csv_path, network_list.

keep_top_percent = 100.0  # forwarded unchanged to refine_network_v11_ecdf_symmetric

# Hyper-parameter grid: every (alpha, n_neighbors) pair is run and written to its own CSV.
alpha_list =[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] # full grid: 0.1 .. 0.9 in steps of 0.1
neigh_list =[5,6,7,8,9,10,11,12,13,14,15] # full grid: 5 .. 15
for alpha_val in alpha_list:
    for neigh_val in neigh_list:
        for l in folder_list:
            # PLACEHOLDER: the output directory expression was left blank, so this line is a
            # syntax error until a real path is supplied.
            output_csv_path = #Output Location
            os.makedirs(output_csv_path, exist_ok=True)

            # NOTE(review): the expression data depends only on `l`, yet it is re-read and
            # re-standardised for every (alpha_val, neigh_val) pair.
            expr_path = f"{expression_csv_path}{l}/ExpressionData.csv"
            expr_df_z = load_and_zscore_expression(expr_path)
            genes_present = set(expr_df_z.index)

            for n in network_list:
                # The base path and folder are concatenated without a separator in between.
                net_path = f"{network_csv_path}{l}/{n}"
                net_df = load_network_filtered(net_path, genes_present)

                # NOTE(review): alpha_val is only used in the final blend inside the refine
                # function, but the per-edge MI is recomputed for every alpha value.
                refined_df = refine_network_v11_ecdf_symmetric(
                    net_df,
                    expr_df_z,
                    mi_n_neighbors=neigh_val,
                    alpha=alpha_val,
                    keep_top_percent=keep_top_percent,
                )

                # Output name encodes the source network, alpha and n_neighbors.
                base = n.replace(".csv", "")
                out_file = os.path.join(output_csv_path, f"refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_{base}_{alpha_val}_{neigh_val}.csv")
                print(out_file)  # printed before writing; the "Saved:" line below follows the write
                refined_df.to_csv(out_file, index=False)
                print("Saved:", out_file)



/content/drive/MyDrive/Post_processing_paper_ideas//GSD/GSD-2000-1/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_Normi_0.1_10.csv
Saved: /content/drive/MyDrive/Post_processing_paper_ideas//GSD/GSD-2000-1/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_Normi_0.1_10.csv


KeyboardInterrupt: 

##### For running HIV section

In [None]:

# File names of the inferred-network CSVs to refine; the driver loop reads each one
# from f"{network_csv_path}{folder}/{name}".
network_list = [
    "1_AGRN_ETR.csv",
]


# Dataset sub-folders to process. Only "HL_L" is active; the other entries are commented out.
# NOTE(review): "HL_L" has no trailing comma — re-enabling the next entry without adding one
# would make Python concatenate the two adjacent string literals into a single folder name.
folder_list = [
    "HL_L"
    # "HVL_VL",
    # "H_H",
    # "N_H",
    # "N_L",
    # "N_VL"
] # Sample dataset with HIV-Leishmaniasis
# NOTE(review): folder_name and expression_file_name are not referenced by the driver loop
# visible in this notebook — confirm whether they are still needed.
folder_name = "" # GSD, HSC, VSC, mCAD
expression_file_name ="ExpressionData.csv"


# PLACEHOLDERS: the right-hand sides below were left blank, so this cell is a syntax error
# until real paths are filled in. The driver loop concatenates these with the folder name
# without adding a separator, so each value should end with "/".
network_csv_path = # base directory of the NETWORK csv files from algorithms run on the HIV data
expression_csv_path =   # base directory (LOCATION) of the expression files

print("Network CSV:", network_csv_path)
print("Expression CSV:", expression_csv_path)


Network CSV: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV/
Expression CSV: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV/


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

# -------------------- I/O helpers -------------------- #

def load_and_zscore_expression(expression_path: str) -> pd.DataFrame:
    """Read an expression CSV and standardise every row to zero mean / unit variance.

    The first CSV column is used as the row index (one row per gene, one
    column per cell). Infinite and missing entries are replaced by 0.0
    before the statistics are computed. Rows whose standard deviation is
    below 1e-6 are only mean-centred (their divisor is set to 1.0).

    The returned frame keeps the row index of the input; its columns are the
    default integer range, not the original column labels.
    """
    raw = pd.read_csv(expression_path, index_col=0)
    cleaned = raw.replace([np.inf, -np.inf], np.nan).fillna(0.0)

    matrix = cleaned.to_numpy().astype(np.float64)
    row_mean = matrix.mean(axis=1, keepdims=True)
    row_std = matrix.std(axis=1, keepdims=True)
    # Guard against division by (near) zero for flat rows.
    row_std[row_std < 1e-6] = 1.0

    return pd.DataFrame((matrix - row_mean) / row_std, index=cleaned.index)

def load_network_filtered(network_path: str, genes_present: set[str]) -> pd.DataFrame:
    """Read an edge-list CSV and keep only edges whose two endpoints are known genes.

    The CSV must provide the columns Gene1, Gene2 and EdgeWeight. Rows are
    kept when both Gene1 and Gene2 are members of ``genes_present``; the
    surviving rows keep their original index labels and EdgeWeight is cast
    to float.
    """
    edges = pd.read_csv(network_path)
    src_known = edges["Gene1"].isin(genes_present)
    dst_known = edges["Gene2"].isin(genes_present)

    kept = edges.loc[src_known & dst_known].copy()
    kept["EdgeWeight"] = kept["EdgeWeight"].astype(float)
    return kept

# -------------------- scoring utils -------------------- #

def normalize_minmax(x, eps=1e-12):
    """Rescale ``x`` so its minimum maps to 0 and its maximum to (almost) 1.

    ``x`` is converted to a float64 array. ``eps`` is added to the range in
    the denominator, so a constant input yields all zeros instead of a
    division by zero. An empty input is returned as an empty float64 array.
    """
    arr = np.asarray(x, dtype=np.float64)
    if arr.size == 0:
        return arr
    lowest = arr.min()
    span = arr.max() - lowest
    return (arr - lowest) / (span + eps)

def keep_top_percent_df(df: pd.DataFrame, score_col="EdgeWeight", keep_top_percent=100.0) -> pd.DataFrame:
    """Keep the rows whose score is within the top ``keep_top_percent`` percent.

    The number of rows targeted is ``int(len(df) * keep_top_percent / 100)``,
    clamped to the range ``[1, len(df)]``; percentages above 100 therefore
    keep every row instead of raising, and percentages at or below 0 keep
    the single best score. Rows are selected with ``score >= threshold``, so
    ties at the threshold are all kept and the result may contain more rows
    than the target. Row order and index labels are preserved.

    An empty ``df`` is returned unchanged.
    """
    n_rows = len(df)
    if n_rows == 0:
        return df
    scores = df[score_col].to_numpy(dtype=np.float64)
    k = int(n_rows * float(keep_top_percent) / 100.0)
    # np.partition rejects a kth index outside the array, so bound k on both sides.
    k = min(max(k, 1), n_rows)
    thr = np.partition(scores, -k)[-k]  # k-th largest score
    return df[df[score_col] >= thr].copy()

def edge_mi_scores(net_df: pd.DataFrame, expr_df_z: pd.DataFrame, mi_n_neighbors=10):
    """Estimate mutual information between the two endpoint genes of every edge.

    Each gene name in ``net_df`` (columns Gene1 / Gene2) is looked up in the
    index of ``expr_df_z`` and the corresponding rows are passed to
    ``mutual_info_regression`` with ``n_neighbors=mi_n_neighbors`` and a
    fixed ``random_state=0``. A NaN estimate is stored as 0.0.

    Returns a tuple ``(raw, normalized)`` where ``normalized`` is
    ``normalize_minmax(raw)``; both arrays have one entry per edge, in the
    row order of ``net_df``.
    """
    row_of = {gene: pos for pos, gene in enumerate(expr_df_z.index.to_list())}
    matrix = expr_df_z.values  # genes x cells

    src_rows = net_df["Gene1"].map(row_of).to_numpy()
    dst_rows = net_df["Gene2"].map(row_of).to_numpy()

    def _pair_mi(s, t):
        # Single-feature call: the source gene is the (n_cells, 1) feature matrix.
        estimate = mutual_info_regression(
            matrix[s, :].reshape(-1, 1),
            matrix[t, :],
            n_neighbors=mi_n_neighbors,
            random_state=0,
        )[0]
        return 0.0 if np.isnan(estimate) else float(estimate)

    raw = np.fromiter(
        (_pair_mi(s, t) for s, t in zip(src_rows, dst_rows)),
        dtype=np.float64,
        count=len(net_df),
    )
    return raw, normalize_minmax(raw)

# -------------------- v11_ecdf_symmetric core -------------------- #

def clr_v11_ecdf_symmetric(net_df: pd.DataFrame, base_values: np.ndarray) -> np.ndarray:
    """
    Context score "v11_ecdf_symmetric" for every edge.

      - context  = ecdf: each edge value is replaced by its percentile rank
        (ties averaged) among all edge values attached to the same gene
      - symmetry = undirected_symmetric: a gene's pool contains every edge it
        appears in, whether as Gene1 or as Gene2
      - combine  = l2: the two endpoint percentiles are merged with the
        Euclidean norm, then min-max normalized

    ``base_values`` must hold one value per row of ``net_df``; the result has
    the same length and row order.
    """
    edges = net_df[["Gene1", "Gene2"]].assign(_v=np.asarray(base_values, dtype=np.float64))
    n_edges = len(edges)

    # Stack both endpoints: positions [0, n) are the Gene1 side, [n, 2n) the Gene2 side.
    endpoint_gene = pd.concat([edges["Gene1"], edges["Gene2"]], ignore_index=True)
    endpoint_value = pd.concat([edges["_v"], edges["_v"]], ignore_index=True)

    pct = (
        endpoint_value.groupby(endpoint_gene)
        .rank(pct=True, method="average")
        .to_numpy(dtype=np.float64)
    )
    p_src = pct[:n_edges]
    p_dst = pct[n_edges:]

    combined = np.sqrt(p_src * p_src + p_dst * p_dst)  # l2 combine
    return normalize_minmax(combined)

def blend_linear(a, b, alpha: float):
    """Return the convex mix ``alpha * a + (1 - alpha) * b``.

    ``a`` and ``b`` are expected to be already normalized to [0, 1]; ``alpha``
    is coerced to float and is the weight given to ``a``.
    """
    weight_a = float(alpha)
    weight_b = 1.0 - weight_a
    return weight_a * a + weight_b * b

def refine_network_v11_ecdf_symmetric(
    net_df: pd.DataFrame,
    expr_df_z: pd.DataFrame,
    *,
    mi_n_neighbors=10,
    alpha=0.8,
    keep_top_percent=100.0,
) -> pd.DataFrame:
    """
    Re-score the edges of ``net_df`` and return Gene1 / Gene2 / EdgeWeight.

    Steps:
      1) min-max normalize the original EdgeWeight column
      2) compute the min-max normalized MI of every edge from ``expr_df_z``
      3) turn the MI values into the v11_ecdf_symmetric context score
      4) blend: alpha * context score + (1 - alpha) * normalized EdgeWeight
      5) keep the top ``keep_top_percent`` percent of edges by the new weight
    """
    original_weights = net_df["EdgeWeight"].to_numpy(dtype=np.float64)
    w_norm = normalize_minmax(original_weights)

    # Only the normalized MI (second element) feeds the context score.
    mi_norm = edge_mi_scores(net_df, expr_df_z, mi_n_neighbors=mi_n_neighbors)[1]
    context_score = clr_v11_ecdf_symmetric(net_df, mi_norm)

    refined = net_df[["Gene1", "Gene2"]].copy()
    refined["EdgeWeight"] = blend_linear(context_score, w_norm, alpha=alpha)
    return keep_top_percent_df(refined, "EdgeWeight", keep_top_percent)

# -------------------- run on folders / networks -------------------- #

# Driver: relies on names defined in an earlier cell of this notebook —
# folder_list, expression_csv_path, network_csv_path, network_list.

keep_top_percent = 100.0  # forwarded unchanged to refine_network_v11_ecdf_symmetric

# Single hyper-parameter setting for this run; the trailing comments list other candidate values.
alpha_list =[0.1] # candidates: 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
neigh_list =[11] # candidates: 5,6,7,8,9,10,11,12,13,14,15
for alpha_val in alpha_list:
    for neigh_val in neigh_list:
        for l in folder_list:
            # PLACEHOLDER: the output directory expression was left blank, so this line is a
            # syntax error until a real path is supplied.
            output_csv_path = #Output Location
            os.makedirs(output_csv_path, exist_ok=True)

            # NOTE(review): the expression data depends only on `l`; if the grids above are
            # widened it will be re-read for every (alpha_val, neigh_val) pair.
            expr_path = f"{expression_csv_path}{l}/ExpressionData.csv"
            expr_df_z = load_and_zscore_expression(expr_path)
            genes_present = set(expr_df_z.index)

            for n in network_list:
                # The base path and folder are concatenated without a separator in between.
                net_path = f"{network_csv_path}{l}/{n}"
                net_df = load_network_filtered(net_path, genes_present)

                refined_df = refine_network_v11_ecdf_symmetric(
                    net_df,
                    expr_df_z,
                    mi_n_neighbors=neigh_val,
                    alpha=alpha_val,
                    keep_top_percent=keep_top_percent,
                )

                # Output name encodes the source network, alpha and n_neighbors.
                base = n.replace(".csv", "")
                out_file = os.path.join(output_csv_path, f"refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_{base}_{alpha_val}_{neigh_val}.csv")
                refined_df.to_csv(out_file, index=False)
                print("Saved:", out_file)



Saved: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV//HL_L/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_1_AGRN_ETR_0.1_11.csv
Saved: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV//HVL_VL/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_1_AGRN_ETR_0.1_11.csv
Saved: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV//H_H/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_1_AGRN_ETR_0.1_11.csv
Saved: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV//N_H/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_1_AGRN_ETR_0.1_11.csv
Saved: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV//N_L/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_1_AGRN_ETR_0.1_11.csv
Saved: /content/drive/MyDrive/Colab Notebooks/Paper_4_Preprocessing/HIV//N_VL/refined_network_CLR_ASSOC__mi__v11_ecdf_symmetric_1_AGRN_ETR_0.1_11.csv
