In [1]:
# Core libraries
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from scipy.linalg import eigh

# ===========================
# Load and preprocess dataset
# ===========================

from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [2]:
import pandas as pd
import numpy as np

def load_schaefer_graphs(
    keep_fraction=0.3,
    outcome_col='Imp20PercentBPRS'
):
    """
    Load one connectivity matrix per subject from the Schaefer-atlas workbook
    and pair it with the subject's outcome and demographics.

    The graph workbook is expected to hold one sheet per subject, named by SID;
    only subjects that appear in the outcome sheet AND have a matching sheet
    name are loaded.

    Parameters
    ----------
    keep_fraction : float
        Fraction of the strongest (strictly positive) upper-triangle edges to
        keep when thresholding each matrix.
    outcome_col : str
        Column of the outcome sheet to use as the label.

    Returns
    -------
    dict
        Maps SID -> dict with keys:
        'abs'     thresholded absolute-valued matrix,
        'rel'     thresholded signed matrix (thresholding only considers
                  positive entries, so negative edges are dropped),
        'demo'    dict of demographic columns (or None if the SID is missing),
        'outcome' value of `outcome_col` for the subject,
        'nodes'   row labels of the subject's sheet.
        Subjects whose sheet fails to load are reported and skipped.
    """
    # ===========================
    # Hardcoded file names
    # ===========================
    graph_file = "/content/drive/Shared drives/GNN/SchaeferAtlas_Rest_Results_Freq_008to09_wholebrain.xlsx"
    outcome_file = "/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx"

    demo_cols = ['Age', 'handedness', 'sex', 'PrimaryEthnicity',
                 'PrimaryRace', 'Education', 'Parental Education']

    # ===========================
    # Load outcome + demographics
    # ===========================
    df_labels = pd.read_excel(outcome_file, sheet_name='outcomeanddemographics', skiprows=1)
    demo_dict = df_labels.set_index('SID')[demo_cols].to_dict(orient='index')

    # Helper: threshold adjacency
    def threshold_graph(A, keep_fraction):
        """Zero out every entry below the (1 - keep_fraction) percentile of positive edges."""
        triu_indices = np.triu_indices_from(A, k=1)
        edge_values = A[triu_indices]
        edge_values = edge_values[edge_values > 0]  # ignore zeros (and negatives)
        if len(edge_values) == 0:
            return np.zeros_like(A)
        threshold = np.percentile(edge_values, 100 * (1 - keep_fraction))
        mask = A >= threshold
        A_thresh = A * mask
        A_thresh = np.maximum(A_thresh, A_thresh.T)  # enforce symmetry
        return A_thresh

    graphs_by_sid = {}

    # ===========================
    # Load only needed sheets
    # ===========================
    # Open the workbook once and parse individual sheets from the open handle;
    # calling pd.read_excel(graph_file, ...) per subject would re-open and
    # re-parse the whole file on every iteration. The context manager also
    # guarantees the file handle is released.
    with pd.ExcelFile(graph_file) as xl:
        available_sheets = set(xl.sheet_names)

        # Get SIDs that actually have outcomes
        sids_with_outcome = df_labels['SID'].dropna().unique()
        sids_to_load = [sid for sid in sids_with_outcome if sid in available_sheets]

        for sid in sids_to_load:
            try:
                # Load matrix for this patient
                mat_df = xl.parse(sheet_name=sid, index_col=0)
                mat = mat_df.values.astype(float)

                # Replace NaNs and infs with 0
                mat = np.nan_to_num(mat, nan=0.0, posinf=0.0, neginf=0.0)
                rel = mat  # signed matrix; np.abs below returns a new array, so `rel` is untouched
                # Take absolute value (since matrices can have negatives)
                mat = np.abs(mat)

                # Get outcome + demographics
                row = df_labels.loc[df_labels['SID'] == sid]
                outcome = row[outcome_col].values[0]
                demos = demo_dict.get(sid, None)

                # Threshold adjacency
                mat_thresh = threshold_graph(mat, keep_fraction)
                rel_thresh = threshold_graph(rel, keep_fraction)

                graphs_by_sid[sid] = {
                    'abs': mat_thresh,      # absolute-valued adjacency
                    'rel': rel_thresh,
                    'demo': demos,
                    'outcome': outcome,
                    'nodes': list(mat_df.index)  # preserve brain region labels
                }
            except Exception as e:
                # Best-effort loading: report the subject and keep going.
                print(f"Skipping SID {sid} due to error: {e}")
                continue

    return graphs_by_sid


In [3]:
# Combines the previous data-loading steps into a single function.

import pandas as pd
import numpy as np



from itertools import product
import numpy as np


def load_and_preprocess_graphs(
    file_path,
    abs_keep_fraction=0.3,
    rel_keep_fraction=0.3,
    demo_cols=None,
    outcome_col='Imp20PercentBPRS'
):
    """
    Build per-subject adjacency matrices from a wide edge table.

    The first sheet of `file_path` is read as one row per subject with an
    'SID' column and one column per edge named '<node1>-<node2>'; columns whose
    name contains 'ABS_' feed the 'abs' matrix, all others feed the 'rel'
    matrix. Outcomes and demographics come from the 'outcomeanddemographics'
    sheet of the same file.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook.
    abs_keep_fraction, rel_keep_fraction : float
        Fraction of the strongest upper-triangle edges to keep in the 'abs'
        and 'rel' matrices respectively.
    demo_cols : list of str, optional
        Demographic columns to attach; a default list is used when None.
    outcome_col : str
        Column of the outcome sheet used as the label.

    Returns
    -------
    dict
        Maps SID -> {'abs', 'rel', 'demo', 'outcome', 'nodes'}; subjects whose
        SID is absent from the outcome sheet are dropped.
    """
    if demo_cols is None:
        demo_cols = ['Age', 'handedness', 'sex', 'PrimaryEthnicity', 'PrimaryRace', 'Education', 'Parental Education']

    # --- read both sheets ---
    df = pd.read_excel(file_path)
    df_labels = pd.read_excel(file_path, sheet_name='outcomeanddemographics', skiprows=1)

    edge_columns = df.columns.drop('SID')

    def _endpoints(column_name):
        """Split an edge column name into its two node names, stripping 'ABS_'."""
        left, right = column_name.split('-')
        return left.replace('ABS_', ''), right.replace('ABS_', '')

    # --- node vocabulary (sorted so indices are stable) ---
    nodes = sorted({name for col in edge_columns for name in _endpoints(col)})
    n_nodes = len(nodes)
    node_to_idx = {name: position for position, name in enumerate(nodes)}

    # --- route every edge column to the abs or rel family ---
    abs_edge_cols, rel_edge_cols = [], []
    abs_edge_to_idx, rel_edge_to_idx = [], []
    for col in edge_columns:
        first, second = _endpoints(col)
        index_pair = (node_to_idx[first], node_to_idx[second])
        if 'ABS_' in col:
            abs_edge_cols.append(col)
            abs_edge_to_idx.append(index_pair)
        else:
            rel_edge_cols.append(col)
            rel_edge_to_idx.append(index_pair)

    def _fill_symmetric(subject_row, columns, index_pairs):
        """Write one subject's edge values into a symmetric n_nodes x n_nodes matrix."""
        adjacency = np.zeros((n_nodes, n_nodes))
        for column, (a, b) in zip(columns, index_pairs):
            value = subject_row[column]
            adjacency[a, b] = value
            adjacency[b, a] = value
        return adjacency

    # --- one abs and one rel matrix per subject row ---
    abs_adj_matrices, rel_adj_matrices = [], []
    for _, subject_row in df.iterrows():
        abs_adj_matrices.append(_fill_symmetric(subject_row, abs_edge_cols, abs_edge_to_idx))
        rel_adj_matrices.append(_fill_symmetric(subject_row, rel_edge_cols, rel_edge_to_idx))

    def threshold_graph(A, keep_fraction):
        """Keep entries at or above the (1 - keep_fraction) percentile of the upper triangle."""
        upper_values = A[np.triu_indices_from(A, k=1)]
        cutoff = np.percentile(upper_values, 100 * (1 - keep_fraction))
        kept = A * (A >= cutoff)
        return np.maximum(kept, kept.T)

    # --- assemble the per-subject records ---
    demo_dict = df_labels.set_index('SID')[demo_cols].to_dict(orient='index')
    labelled_sids = df_labels['SID'].values

    graphs_by_sid = {}
    for sid, abs_mat, rel_mat in zip(df['SID'], abs_adj_matrices, rel_adj_matrices):
        if sid in labelled_sids:
            outcome = df_labels.loc[df_labels['SID'] == sid, outcome_col].values[0]
        else:
            outcome = None

        graphs_by_sid[sid] = {
            'abs': threshold_graph(abs_mat, abs_keep_fraction),
            'rel': threshold_graph(rel_mat, rel_keep_fraction),
            'demo': demo_dict.get(sid, None),
            'outcome': outcome,
            'nodes': nodes
        }

    # Only subjects with a matching outcome row are returned.
    return {sid: record for sid, record in graphs_by_sid.items() if record['outcome'] is not None}


In [4]:
def inter_mixup(X_train, y_train, n_samples, alpha=0.2, hard_labels=False):
    """
    Generate `n_samples` synthetic examples with mixup across all training rows.

    Each synthetic row is a convex combination of two distinct training rows,
    with the mixing weight drawn from Beta(alpha, alpha); the label is mixed
    with the same weight.

    Parameters
    ----------
    X_train : array-like, indexable by integer row
        Training features.
    y_train : array-like, indexable by integer row
        Training labels (numeric).
    n_samples : int
        Number of synthetic rows to create. Zero (or a negative value) yields
        empty arrays instead of an error.
    alpha : float
        Parameter of the symmetric Beta distribution.
    hard_labels : bool
        If True, round each mixed label to the nearest integer.

    Returns
    -------
    (X_new, y_new) : tuple of np.ndarray
        Stacked synthetic features and their labels.

    Raises
    ------
    ValueError
        If samples are requested but fewer than two training rows exist.
    """
    if n_samples <= 0:
        # np.vstack([]) raises, so build correctly-shaped empty outputs that
        # callers can still vstack/concatenate with the real training data.
        X_arr = np.asarray(X_train)
        n_cols = X_arr.shape[-1] if X_arr.ndim >= 2 else 1
        return np.empty((0, n_cols)), np.empty((0,))

    if len(X_train) < 2:
        raise ValueError("inter_mixup needs at least two training samples to mix.")

    X_new = []
    y_new = []

    for _ in range(n_samples):
        # Two distinct rows, then one mixing weight (order of RNG calls matters
        # for reproducibility under a fixed seed).
        i, j = np.random.choice(len(X_train), size=2, replace=False)
        lam = np.random.beta(alpha, alpha)
        x_mix = lam * X_train[i] + (1 - lam) * X_train[j]
        y_mix = lam * y_train[i] + (1 - lam) * y_train[j]

        if hard_labels:
            y_mix = int(round(y_mix))  # convert to 0 or 1

        X_new.append(x_mix)
        y_new.append(y_mix)

    return np.vstack(X_new), np.array(y_new)


def intra_mixup(X_train, y_train, n_samples_per_class, alpha=0.2):
    """
    Generate synthetic samples by mixing features within the same class.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Training features.
    y_train : array-like, shape (n_samples,)
        Training labels; lists are accepted as well as arrays.
    n_samples_per_class : int
        Number of synthetic samples to generate per class. Zero (or a negative
        value) yields empty arrays instead of an error.
    alpha : float
        Beta distribution parameter for the mixing weight.

    Returns
    -------
    (X_new, y_new) : tuple of np.ndarray
        Synthetic features and their (hard) class labels, grouped by class in
        the sorted order returned by np.unique.

    Raises
    ------
    ValueError
        If a class has fewer than two members, so no distinct pair exists.
    """
    # Convert once so the element-wise comparison below also works for lists
    # (a plain `list == scalar` evaluates to a single False).
    y_arr = np.asarray(y_train)
    classes = np.unique(y_arr)

    if n_samples_per_class <= 0 or len(classes) == 0:
        # np.vstack([]) raises, so return correctly-shaped empty outputs.
        X_arr = np.asarray(X_train)
        n_cols = X_arr.shape[-1] if X_arr.ndim >= 2 else 1
        return np.empty((0, n_cols)), np.empty((0,), dtype=classes.dtype)

    X_new, y_new = [], []

    for c in classes:
        idx_class = np.where(y_arr == c)[0]
        if len(idx_class) < 2:
            raise ValueError(
                f"intra_mixup needs at least two samples of class {c!r} to mix."
            )
        for _ in range(n_samples_per_class):
            i, j = np.random.choice(idx_class, size=2, replace=False)
            lam = np.random.beta(alpha, alpha)
            x_mix = lam * X_train[i] + (1 - lam) * X_train[j]
            X_new.append(x_mix)
            y_new.append(c)  # hard label

    return np.vstack(X_new), np.array(y_new)


import networkx as nx
import numpy as np

def compute_more_graph_features(adj_matrix):
    """
    Compute a fixed-length vector of node- and graph-level features for a
    weighted adjacency matrix.

    Layout of the returned vector (15 values, always in this order):
      0-1   node strength            (mean, std)
      2-3   weighted clustering      (mean, std)
      4-5   betweenness centrality   (mean, std)
      6-7   eigenvector centrality   (mean, std)
      8-9   PageRank                 (mean, std)
      10    global efficiency        (called without a weight argument)
      11    average weighted clustering
      12    transitivity             (called without a weight argument)
      13    degree assortativity
      14    density

    Any metric (from betweenness onward) that raises is recorded as NaN so the
    vector keeps its length; strength and clustering errors propagate.

    Parameters
    ----------
    adj_matrix : np.ndarray
        Square adjacency matrix accepted by `nx.from_numpy_array`.

    Returns
    -------
    np.ndarray
        1D feature vector.
    """
    G = nx.from_numpy_array(adj_matrix)
    features = []

    def _append_mean_std(compute_node_values):
        """Append mean/std of a per-node metric dict, or two NaNs if it fails."""
        try:
            values = np.array(list(compute_node_values().values()))
            features.extend([values.mean(), values.std()])
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # interrupt a long-running feature extraction.
            features.extend([np.nan, np.nan])

    def _append_scalar(compute_value):
        """Append a scalar graph metric, or NaN if it fails."""
        try:
            features.append(compute_value())
        except Exception:
            features.append(np.nan)

    # ===========================
    # Node-level metrics (mean + std)
    # ===========================
    # Node strength (weighted degree)
    strength = np.array([s for n, s in G.degree(weight='weight')])
    features.extend([strength.mean(), strength.std()])

    # Clustering coefficient
    clustering = np.array(list(nx.clustering(G, weight='weight').values()))
    features.extend([clustering.mean(), clustering.std()])

    # Betweenness centrality
    _append_mean_std(lambda: nx.betweenness_centrality(G, weight='weight'))

    # Eigenvector centrality (power iteration may fail to converge)
    _append_mean_std(lambda: nx.eigenvector_centrality(G, weight='weight', max_iter=500))

    # PageRank
    _append_mean_std(lambda: nx.pagerank(G, weight='weight'))

    # ===========================
    # Global graph metrics
    # ===========================
    # Global efficiency
    _append_scalar(lambda: nx.global_efficiency(G))

    # Average clustering
    _append_scalar(lambda: nx.average_clustering(G, weight='weight'))

    # Transitivity
    _append_scalar(lambda: nx.transitivity(G))

    # Assortativity (degree)
    _append_scalar(lambda: nx.degree_assortativity_coefficient(G, weight='weight'))

    # Density
    _append_scalar(lambda: nx.density(G))

    return np.array(features)


def add_noise(X, sigma=0.01):
    """Return a copy of X perturbed by zero-mean Gaussian noise with standard deviation sigma."""
    jitter = np.random.normal(loc=0, scale=sigma, size=X.shape)
    return X + jitter

# ===========================
# 1. Graph feature computation
# ===========================
def compute_graph_features(adj_matrix, feature_flags):
    """
    Compute the selected node- and graph-level summary features for a weighted
    adjacency matrix.

    Parameters
    ----------
    adj_matrix : np.ndarray
        Square adjacency matrix accepted by `nx.from_numpy_array`.
    feature_flags : dict
        Maps a feature key to a bool. Recognised keys: "strength_mean",
        "strength_std", "clustering_mean", "clustering_std",
        "betweenness_centrality", "eigenvector_centrality", "avg_clustering".
        A missing key counts as enabled.

    Returns
    -------
    (features, feature_names) : (np.ndarray, list of str)
        1D feature vector and the matching names. The centrality flags each
        contribute two entries (mean and std). A metric that raises is recorded
        as NaN so the vector and the name list always stay aligned.
    """
    G = nx.from_numpy_array(adj_matrix)
    features = []
    feature_names = []

    # Node-level metrics
    strength = np.array([s for n, s in G.degree(weight='weight')])
    clustering_dict = nx.clustering(G, weight='weight')
    clustering = np.array(list(clustering_dict.values()))

    if feature_flags.get("strength_mean", True):
        features.append(strength.mean())
        feature_names.append("strength_mean")
    if feature_flags.get("strength_std", True):
        features.append(strength.std())
        feature_names.append("strength_std")
    if feature_flags.get("clustering_mean", True):
        features.append(clustering.mean())
        feature_names.append("clustering_mean")
    if feature_flags.get("clustering_std", True):
        features.append(clustering.std())
        feature_names.append("clustering_std")

    if feature_flags.get("betweenness_centrality", True):
        try:
            bc = np.array(list(nx.betweenness_centrality(G, weight='weight').values()))
            bc_stats = [bc.mean(), bc.std()]
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            bc_stats = [np.nan, np.nan]
        features.extend(bc_stats)
        feature_names.extend(["betweenness_centrality_mean", "betweenness_centrality_std"])

    if feature_flags.get("eigenvector_centrality", True):
        try:
            ec = np.array(list(nx.eigenvector_centrality(G, weight='weight', max_iter=500).values()))
            ec_stats = [ec.mean(), ec.std()]
        except Exception:
            # Power iteration can fail to converge within max_iter.
            ec_stats = [np.nan, np.nan]
        features.extend(ec_stats)
        feature_names.extend(["eigenvector_centrality_mean", "eigenvector_centrality_std"])

    if feature_flags.get("avg_clustering", True):
        try:
            avg_clustering = nx.average_clustering(G, weight='weight')
        except Exception:
            avg_clustering = np.nan
        features.append(avg_clustering)
        feature_names.append("avg_clustering")

    return np.array(features), feature_names

def compute_node_strength(adj_matrix):
    """Return each node's strength: the sum of its row in the adjacency matrix."""
    return np.sum(adj_matrix, axis=1)

def compute_node_clustering(adj_matrix):
    """Return the weighted clustering coefficient of every node, in node order."""
    graph = nx.from_numpy_array(adj_matrix)
    per_node = nx.clustering(graph, weight='weight')
    return np.array([per_node[node] for node in per_node])

def mean_ci(data, confidence=0.95):
    """
    Mean of `data` with a Student-t confidence interval for that mean.

    Parameters
    ----------
    data : sequence of numbers
        Observations (expected to be one-dimensional).
    confidence : float
        Confidence level of the interval, e.g. 0.95.

    Returns
    -------
    (mean, ci_low, ci_high) : tuple of float
        With fewer than two observations the interval is undefined and both
        bounds are NaN. When every observation is identical the standard error
        is zero and the interval collapses to (mean, mean).
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already run `from scipy import stats`.
    from scipy import stats

    arr = np.asarray(data, dtype=float)
    if arr.size == 0:
        return np.nan, np.nan, np.nan

    mean = np.mean(arr)
    if arr.ndim == 1 and arr.size < 2:
        return mean, np.nan, np.nan

    sem = stats.sem(arr)
    if np.ndim(sem) == 0 and sem == 0:
        # t.interval with scale=0 yields NaN bounds; the degenerate interval
        # is the mean itself.
        return mean, mean, mean

    ci_low, ci_high = stats.t.interval(confidence, len(arr) - 1, loc=mean, scale=sem)
    return mean, ci_low, ci_high

def compute_node_local_efficiency(adj_matrix):
    """
    Compute the local efficiency of each node in a weighted graph.

    For every node, the subgraph induced by its neighbors is formed and the
    mean inverse weighted shortest-path length over all neighbor pairs is
    taken. Unreachable pairs contribute zero. Nodes with fewer than two
    neighbors get an efficiency of 0.

    Parameters
    ----------
    adj_matrix : np.ndarray, shape (n_nodes, n_nodes)
        Adjacency matrix accepted by `nx.from_numpy_array`.

    Returns
    -------
    np.ndarray, shape (n_nodes,)
        Local efficiency per node.
    """
    # NOTE(review): shortest paths use the edge 'weight' as a distance, so a
    # stronger connection counts as a LONGER path. Confirm this is intended for
    # connectivity matrices (a common alternative is distance = 1 / weight).
    G = nx.from_numpy_array(adj_matrix)  # weighted graph
    local_eff = np.zeros(len(G))  # nodes with < 2 neighbors keep this 0.0

    for node in G.nodes():
        neighbors = list(G.neighbors(node))
        if len(neighbors) < 2:
            # Not enough neighbors to form a pair
            continue

        # Induced subgraph of neighbors
        subgraph = G.subgraph(neighbors)
        sub_nodes = list(subgraph.nodes())  # materialise once, not per iteration

        efficiency_sum = 0.0
        for i, u in enumerate(sub_nodes[:-1]):
            # One Dijkstra run from u covers every pair (u, v); only reachable
            # nodes appear in the returned mapping.
            dist_from_u = nx.single_source_dijkstra_path_length(subgraph, u, weight='weight')
            for v in sub_nodes[i + 1:]:
                d = dist_from_u.get(v)
                # None -> no path (contributes 0); 0 -> skip to avoid a
                # division by zero.
                if d:
                    efficiency_sum += 1.0 / d

        # Every unordered pair counts in the denominator, reachable or not.
        n_pairs = len(sub_nodes) * (len(sub_nodes) - 1) // 2
        local_eff[node] = efficiency_sum / n_pairs if n_pairs > 0 else 0.0

    return local_eff


In [9]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import time


from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve
)



# ===========================
# 2. Load data and build feature matrix
# ===========================
file_path = '/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx'
graphs_by_sid = load_and_preprocess_graphs(file_path, abs_keep_fraction=1.0, rel_keep_fraction=1.0)

# ---------------------------
# Feature flags (toggle True/False)
# ---------------------------
feature_flags = {
    "strength_mean": True,
    "strength_std": True,
    "clustering_mean": True,
    "clustering_std": True,
    "betweenness_centrality": True, # Added betweenness centrality flag
    "eigenvector_centrality": True, # Added eigenvector centrality flag
    "avg_clustering": True
}

# Set to True to append the per-node local-efficiency vector to the global
# features. With False (the configuration this cell previously ended up using,
# because the concatenated vector was immediately overwritten) the expensive
# per-node computation is skipped entirely instead of being computed and
# thrown away for every subject.
USE_LOCAL_EFFICIENCY = False

feature_list = []
all_feature_names = None
for sid in graphs_by_sid.keys():
    adj = graphs_by_sid[sid]['abs']  # use 'abs' or 'rel'

    features, current_feature_names = compute_graph_features(adj, feature_flags)

    if USE_LOCAL_EFFICIENCY:
        local_efficiency_vector = compute_node_local_efficiency(adj)
        combined_features = np.concatenate([features, local_efficiency_vector])
        # Keep names aligned with the widened feature vector so the NaN-column
        # filter below drops the right names.
        current_feature_names = current_feature_names + [
            f"local_efficiency_{node}" for node in graphs_by_sid[sid]['nodes']
        ]
    else:
        combined_features = features

    feature_list.append(combined_features)
    if all_feature_names is None:
        all_feature_names = current_feature_names

X = np.array(feature_list)
y = np.array([graphs_by_sid[sid]['outcome'] for sid in graphs_by_sid.keys()])

# Remove NaN columns and corresponding feature names
nan_cols = np.isnan(X).any(axis=0)
X = X[:, ~nan_cols]
feature_names = [name for name, has_nan in zip(all_feature_names, nan_cols) if not has_nan]

print("X shape after NaN removal:", X.shape)
print("feature_flags:", feature_flags)
print("feature_names:", feature_names)


print((y == 1).sum())  # number of elements equal to 1
print((y == 0).sum())  # number of elements equal to 0




X shape after NaN removal: (56, 9)
feature_flags: {'strength_mean': True, 'strength_std': True, 'clustering_mean': True, 'clustering_std': True, 'betweenness_centrality': True, 'eigenvector_centrality': True, 'avg_clustering': True}
feature_names: ['strength_mean', 'strength_std', 'clustering_mean', 'clustering_std', 'betweenness_centrality_mean', 'betweenness_centrality_std', 'eigenvector_centrality_mean', 'eigenvector_centrality_std', 'avg_clustering']
29
27


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPRegressor  # or MLPClassifier
import time
from scipy import stats
import os


# Row i of X / y corresponds to sid_list[i] (same iteration order as when X was built).
sid_list = list(graphs_by_sid.keys())

# --- models ---
# clf is trained on mixup-augmented folds, Bclf ("baseline") on the raw folds.
# NOTE(review): features are fed to the MLP unscaled, whereas the SGDRegressor
# cell wraps its model in a StandardScaler pipeline — consider doing the same here.
clf = MLPRegressor(hidden_layer_sizes=(100,100), activation='relu', solver='adam', max_iter=500)
Bclf = MLPRegressor(hidden_layer_sizes=(100,100), activation='relu', solver='adam', max_iter=500)

n_runs = 100
k_folds = 10

# store all results
all_accs, all_precisions, all_recalls, all_f1s, all_aucs = [], [], [], [], []
Ball_accs, Ball_precisions, Ball_recalls, Ball_f1s, Ball_aucs = [], [], [], [], []

predictions = []  # for saving individual predictions
all_accs_improvers, all_accs_nonimprovers = [], []
Ball_accs_improvers, Ball_accs_nonimprovers = [], []

# Fixed seed so the whole experiment is reproducible. Seeding NumPy's global
# RNG also fixes the mixup draws and the estimators' weight initialisation /
# shuffling, since both estimators are created without an explicit random_state.
random_seed = 42
np.random.seed(random_seed)

for run in range(n_runs):
    print(f"=== Run {run+1}/{n_runs} ===")
    # A different (but deterministic) fold assignment for every run.
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=random_seed + run)

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_fold_test, y_fold_test = X[test_idx], y[test_idx]

        # --- Mixup augmentation (optional) ---
        # Soft labels are kept, which is why regressors are used as scorers.
        X_aug, y_aug = inter_mixup(X_train, y_train, n_samples=70, alpha=0.7, hard_labels=False)
        X_train_aug = np.vstack([X_train, X_aug])
        y_train_aug = np.concatenate([y_train, y_aug])

        # --- fit ---
        clf.fit(X_train_aug, y_train_aug)
        Bclf.fit(X_train, y_train)

        # --- predictions ---
        # Regression outputs are clipped to [0, 1] and used as scores;
        # 0.5 is the decision threshold.
        y_prob = np.clip(clf.predict(X_fold_test), 0, 1)
        y_pred = (y_prob >= 0.5).astype(int)

        By_prob = np.clip(Bclf.predict(X_fold_test), 0, 1)
        By_pred = (By_prob >= 0.5).astype(int)

        # --- metrics ---
        all_accs.append(accuracy_score(y_fold_test, y_pred))
        all_precisions.append(precision_score(y_fold_test, y_pred, zero_division=0))
        all_recalls.append(recall_score(y_fold_test, y_pred, zero_division=0))
        all_f1s.append(f1_score(y_fold_test, y_pred, zero_division=0))
        all_aucs.append(roc_auc_score(y_fold_test, y_prob))

        Ball_accs.append(accuracy_score(y_fold_test, By_pred))
        Ball_precisions.append(precision_score(y_fold_test, By_pred, zero_division=0))
        Ball_recalls.append(recall_score(y_fold_test, By_pred, zero_division=0))
        Ball_f1s.append(f1_score(y_fold_test, By_pred, zero_division=0))
        Ball_aucs.append(roc_auc_score(y_fold_test, By_prob))

        # Per-class accuracy (label 1 = improver, label 0 = non-improver).
        mask_improvers = (y_fold_test == 1)
        mask_nonimprovers = (y_fold_test == 0)

        # MixUp model
        if mask_improvers.any():
            all_accs_improvers.append(
                accuracy_score(y_fold_test[mask_improvers], y_pred[mask_improvers])
            )
        if mask_nonimprovers.any():
            all_accs_nonimprovers.append(
                accuracy_score(y_fold_test[mask_nonimprovers], y_pred[mask_nonimprovers])
            )

        # Baseline model
        if mask_improvers.any():
            Ball_accs_improvers.append(
                accuracy_score(y_fold_test[mask_improvers], By_pred[mask_improvers])
            )
        if mask_nonimprovers.any():
            Ball_accs_nonimprovers.append(
                accuracy_score(y_fold_test[mask_nonimprovers], By_pred[mask_nonimprovers])
            )

        # --- store individual predictions ---
        for idx, true, p_mix, pred_mix, p_base, pred_base in zip(
            test_idx, y_fold_test, y_prob, y_pred, By_prob, By_pred
        ):
            predictions.append({
                "run": run,
                "fold": fold,
                "sid": sid_list[idx],
                "true_label": true,
                "mixup_prob": p_mix,
                "mixup_pred": pred_mix,
                "baseline_prob": p_base,
                "baseline_pred": pred_base
            })

# ===========================
# 4. Report overall metrics + 95% CI
# ===========================
# NOTE(review): the interval is computed over all n_runs * k_folds fold scores
# as if they were independent; folds from repeated CV on the same subjects are
# correlated, so these intervals are probably too narrow — confirm before
# reporting them.

metrics = {
    "Accuracy": (Ball_accs, all_accs),
    "Precision": (Ball_precisions, all_precisions),
    "Recall": (Ball_recalls, all_recalls),
    "F1": (Ball_f1s, all_f1s),
    "AUC": (Ball_aucs, all_aucs)
}

metrics_extra = {
    "Acc_Improvers": (Ball_accs_improvers, all_accs_improvers),
    "Acc_NonImprovers": (Ball_accs_nonimprovers, all_accs_nonimprovers)
}


print(f"\n=== MLPRegressor Overall {n_runs}-run {k_folds}-fold CV (Baseline first, then MixUp) ===")
for name, (Bvals, Mvals) in {**metrics, **metrics_extra}.items():
    Bmean, Blow, Bhigh = mean_ci(Bvals)
    Mmean, Mlow, Mhigh = mean_ci(Mvals)
    print(f"{name:<14}: Baseline {Bmean:.3f} [{Blow:.3f}, {Bhigh:.3f}] | MixUp {Mmean:.3f} [{Mlow:.3f}, {Mhigh:.3f}]")

# ===========================
# 5. Save predictions
# ===========================
df_preds = pd.DataFrame(predictions)

save_dir = "/content/drive/Shared drives/GNN/Results"
os.makedirs(save_dir, exist_ok=True)  # make sure folder exists

df_preds.to_csv(os.path.join(save_dir, "MLPRegressor_cv_predictions.csv"), index=False)
print("\nPredictions saved to MLPRegressor_cv_predictions.csv")


=== Run 1/100 ===




=== Run 2/100 ===




=== Run 3/100 ===




=== Run 4/100 ===




=== Run 5/100 ===




=== Run 6/100 ===




=== Run 7/100 ===




=== Run 8/100 ===




=== Run 9/100 ===




=== Run 10/100 ===




=== Run 11/100 ===




=== Run 12/100 ===




=== Run 13/100 ===
=== Run 14/100 ===




=== Run 15/100 ===




=== Run 16/100 ===




=== Run 17/100 ===




=== Run 18/100 ===




=== Run 19/100 ===




=== Run 20/100 ===




=== Run 21/100 ===




=== Run 22/100 ===




=== Run 23/100 ===




=== Run 24/100 ===
=== Run 25/100 ===




=== Run 26/100 ===




=== Run 27/100 ===




=== Run 28/100 ===




=== Run 29/100 ===




=== Run 30/100 ===




=== Run 31/100 ===




=== Run 32/100 ===




=== Run 33/100 ===




=== Run 34/100 ===




=== Run 35/100 ===




=== Run 36/100 ===




=== Run 37/100 ===




=== Run 38/100 ===




=== Run 39/100 ===




=== Run 40/100 ===




=== Run 41/100 ===




=== Run 42/100 ===




=== Run 43/100 ===




=== Run 44/100 ===




=== Run 45/100 ===




=== Run 46/100 ===




=== Run 47/100 ===




=== Run 48/100 ===




=== Run 49/100 ===




=== Run 50/100 ===




=== Run 51/100 ===




=== Run 52/100 ===




=== Run 53/100 ===




=== Run 54/100 ===




=== Run 55/100 ===




=== Run 56/100 ===




=== Run 57/100 ===




=== Run 58/100 ===




=== Run 59/100 ===




=== Run 60/100 ===




=== Run 61/100 ===




=== Run 62/100 ===




=== Run 63/100 ===




=== Run 64/100 ===




=== Run 65/100 ===




=== Run 66/100 ===




=== Run 67/100 ===




=== Run 68/100 ===




=== Run 69/100 ===




=== Run 70/100 ===




=== Run 71/100 ===




=== Run 72/100 ===




=== Run 73/100 ===




=== Run 74/100 ===




=== Run 75/100 ===




=== Run 76/100 ===




=== Run 77/100 ===




=== Run 78/100 ===




=== Run 79/100 ===




=== Run 80/100 ===




=== Run 81/100 ===




=== Run 82/100 ===




=== Run 83/100 ===




=== Run 84/100 ===




=== Run 85/100 ===




=== Run 86/100 ===




=== Run 87/100 ===




=== Run 88/100 ===




=== Run 89/100 ===




=== Run 90/100 ===




=== Run 91/100 ===




=== Run 92/100 ===




=== Run 93/100 ===




=== Run 94/100 ===
=== Run 95/100 ===




=== Run 96/100 ===




=== Run 97/100 ===




=== Run 98/100 ===




=== Run 99/100 ===




=== Run 100/100 ===





=== MLPRegressor Overall 100-run 10-fold CV (Baseline first, then MixUp) ===
Accuracy      : Baseline 0.506 [0.493, 0.518] | MixUp 0.495 [0.482, 0.508]
Precision     : Baseline 0.501 [0.482, 0.519] | MixUp 0.493 [0.474, 0.511]
Recall        : Baseline 0.490 [0.470, 0.509] | MixUp 0.487 [0.467, 0.506]
F1            : Baseline 0.470 [0.453, 0.486] | MixUp 0.465 [0.449, 0.482]
AUC           : Baseline 0.511 [0.496, 0.527] | MixUp 0.497 [0.481, 0.513]
Acc_Improvers : Baseline 0.490 [0.470, 0.509] | MixUp 0.487 [0.467, 0.506]
Acc_NonImprovers: Baseline 0.523 [0.503, 0.543] | MixUp 0.503 [0.483, 0.524]

Predictions saved to MLPRegressor_cv_predictions.csv


In [10]:
#SGD REGRESSOR
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPRegressor
import time
from scipy import stats


# Row i of X / y corresponds to sid_list[i] (same iteration order as when X was built).
sid_list = list(graphs_by_sid.keys())

# --- models ---
# clf is trained on mixup-augmented folds, Bclf ("baseline") on the raw folds.
# The scaler lives inside the pipeline, so it is re-fit on each training fold only.
clf = make_pipeline(StandardScaler(), SGDRegressor(max_iter=500, tol=1e-3))
Bclf = make_pipeline(StandardScaler(), SGDRegressor(max_iter=500, tol=1e-3))

n_runs = 100
k_folds = 10

# store all results
all_accs, all_precisions, all_recalls, all_f1s, all_aucs = [], [], [], [], []
Ball_accs, Ball_precisions, Ball_recalls, Ball_f1s, Ball_aucs = [], [], [], [], []

predictions = []  # for saving individual predictions
all_accs_improvers, all_accs_nonimprovers = [], []
Ball_accs_improvers, Ball_accs_nonimprovers = [], []

# Fixed seed so the whole experiment is reproducible. Seeding NumPy's global
# RNG also fixes the mixup draws and SGD's sample shuffling, since the
# estimators are created without an explicit random_state.
random_seed = 42
np.random.seed(random_seed)

for run in range(n_runs):
    print(f"=== Run {run+1}/{n_runs} ===")
    # A different (but deterministic) fold assignment for every run.
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=random_seed + run)

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_fold_test, y_fold_test = X[test_idx], y[test_idx]

        # --- Mixup augmentation (optional) ---
        # Soft labels are kept, which is why regressors are used as scorers.
        X_aug, y_aug = inter_mixup(X_train, y_train, n_samples=70, alpha=0.7, hard_labels=False)
        X_train_aug = np.vstack([X_train, X_aug])
        y_train_aug = np.concatenate([y_train, y_aug])

        # --- fit ---
        clf.fit(X_train_aug, y_train_aug)
        Bclf.fit(X_train, y_train)

        # --- predictions ---
        # Regression outputs are clipped to [0, 1] and used as scores;
        # 0.5 is the decision threshold.
        y_prob = np.clip(clf.predict(X_fold_test), 0, 1)
        y_pred = (y_prob >= 0.5).astype(int)

        By_prob = np.clip(Bclf.predict(X_fold_test), 0, 1)
        By_pred = (By_prob >= 0.5).astype(int)

        # --- metrics ---
        all_accs.append(accuracy_score(y_fold_test, y_pred))
        all_precisions.append(precision_score(y_fold_test, y_pred, zero_division=0))
        all_recalls.append(recall_score(y_fold_test, y_pred, zero_division=0))
        all_f1s.append(f1_score(y_fold_test, y_pred, zero_division=0))
        all_aucs.append(roc_auc_score(y_fold_test, y_prob))

        Ball_accs.append(accuracy_score(y_fold_test, By_pred))
        Ball_precisions.append(precision_score(y_fold_test, By_pred, zero_division=0))
        Ball_recalls.append(recall_score(y_fold_test, By_pred, zero_division=0))
        Ball_f1s.append(f1_score(y_fold_test, By_pred, zero_division=0))
        Ball_aucs.append(roc_auc_score(y_fold_test, By_prob))

        # Per-class accuracy (label 1 = improver, label 0 = non-improver).
        mask_improvers = (y_fold_test == 1)
        mask_nonimprovers = (y_fold_test == 0)

        # MixUp model
        if mask_improvers.any():
            all_accs_improvers.append(
                accuracy_score(y_fold_test[mask_improvers], y_pred[mask_improvers])
            )
        if mask_nonimprovers.any():
            all_accs_nonimprovers.append(
                accuracy_score(y_fold_test[mask_nonimprovers], y_pred[mask_nonimprovers])
            )

        # Baseline model
        if mask_improvers.any():
            Ball_accs_improvers.append(
                accuracy_score(y_fold_test[mask_improvers], By_pred[mask_improvers])
            )
        if mask_nonimprovers.any():
            Ball_accs_nonimprovers.append(
                accuracy_score(y_fold_test[mask_nonimprovers], By_pred[mask_nonimprovers])
            )

        # --- store individual predictions ---
        for idx, true, p_mix, pred_mix, p_base, pred_base in zip(
            test_idx, y_fold_test, y_prob, y_pred, By_prob, By_pred
        ):
            predictions.append({
                "run": run,
                "fold": fold,
                "sid": sid_list[idx],
                "true_label": true,
                "mixup_prob": p_mix,
                "mixup_pred": pred_mix,
                "baseline_prob": p_base,
                "baseline_pred": pred_base
            })

# ===========================
# 4. Report overall metrics + 95% CI
# ===========================
# NOTE(review): the interval is computed over all n_runs * k_folds fold scores
# as if they were independent; folds from repeated CV on the same subjects are
# correlated, so these intervals are probably too narrow — confirm before
# reporting them.

metrics = {
    "Accuracy": (Ball_accs, all_accs),
    "Precision": (Ball_precisions, all_precisions),
    "Recall": (Ball_recalls, all_recalls),
    "F1": (Ball_f1s, all_f1s),
    "AUC": (Ball_aucs, all_aucs)
}

metrics_extra = {
    "Acc_Improvers": (Ball_accs_improvers, all_accs_improvers),
    "Acc_NonImprovers": (Ball_accs_nonimprovers, all_accs_nonimprovers)
}


print(f"\n=== SGDRegressor Overall {n_runs}-run {k_folds}-fold CV (Baseline first, then MixUp) ===")
for name, (Bvals, Mvals) in {**metrics, **metrics_extra}.items():
    Bmean, Blow, Bhigh = mean_ci(Bvals)
    Mmean, Mlow, Mhigh = mean_ci(Mvals)
    print(f"{name:<14}: Baseline {Bmean:.3f} [{Blow:.3f}, {Bhigh:.3f}] | MixUp {Mmean:.3f} [{Mlow:.3f}, {Mhigh:.3f}]")

# ===========================
# 5. Save predictions
# ===========================
df_preds = pd.DataFrame(predictions)

save_dir = "/content/drive/Shared drives/GNN/Results"
os.makedirs(save_dir, exist_ok=True)  # make sure folder exists

df_preds.to_csv(os.path.join(save_dir, "SGDRegressor_cv_predictions.csv"), index=False)
print("\nPredictions saved to SGDRegressor_cv_predictions.csv")


'''
Baseline Results (only global features)
=== SGDRegressor Overall 100-run 10-fold CV (Baseline first, then MixUp) ===
Accuracy      : Baseline 0.611 [0.599, 0.623] | MixUp 0.620 [0.608, 0.631]
Precision     : Baseline 0.637 [0.619, 0.655] | MixUp 0.644 [0.628, 0.661]
Recall        : Baseline 0.571 [0.553, 0.589] | MixUp 0.620 [0.602, 0.637]
F1            : Baseline 0.574 [0.559, 0.590] | MixUp 0.605 [0.590, 0.619]
AUC           : Baseline 0.697 [0.682, 0.712] | MixUp 0.682 [0.667, 0.697]
Acc_Improvers : Baseline 0.571 [0.553, 0.589] | MixUp 0.620 [0.602, 0.637]
Acc_NonImprovers: Baseline 0.654 [0.637, 0.672] | MixUp 0.618 [0.600, 0.636]

local_efficiency_vector + global features
=== SGDRegressor Overall 100-run 10-fold CV (Baseline first, then MixUp) ===
Accuracy      : Baseline 0.644 [0.632, 0.657] | MixUp 0.638 [0.626, 0.650]
Precision     : Baseline 0.688 [0.669, 0.707] | MixUp 0.668 [0.651, 0.685]
Recall        : Baseline 0.570 [0.551, 0.588] | MixUp 0.624 [0.605, 0.642]
F1            : Baseline 0.592 [0.575, 0.608] | MixUp 0.614 [0.599, 0.629]
AUC           : Baseline 0.738 [0.724, 0.751] | MixUp 0.728 [0.714, 0.741]
Acc_Improvers : Baseline 0.570 [0.551, 0.588] | MixUp 0.624 [0.605, 0.642]
Acc_NonImprovers: Baseline 0.731 [0.714, 0.748] | MixUp 0.658 [0.640, 0.676]
'''

=== Run 1/100 ===
=== Run 2/100 ===
=== Run 3/100 ===
=== Run 4/100 ===
=== Run 5/100 ===
=== Run 6/100 ===
=== Run 7/100 ===
=== Run 8/100 ===
=== Run 9/100 ===
=== Run 10/100 ===
=== Run 11/100 ===
=== Run 12/100 ===
=== Run 13/100 ===
=== Run 14/100 ===
=== Run 15/100 ===
=== Run 16/100 ===
=== Run 17/100 ===
=== Run 18/100 ===
=== Run 19/100 ===
=== Run 20/100 ===
=== Run 21/100 ===
=== Run 22/100 ===
=== Run 23/100 ===
=== Run 24/100 ===
=== Run 25/100 ===
=== Run 26/100 ===
=== Run 27/100 ===
=== Run 28/100 ===
=== Run 29/100 ===
=== Run 30/100 ===
=== Run 31/100 ===
=== Run 32/100 ===
=== Run 33/100 ===
=== Run 34/100 ===
=== Run 35/100 ===
=== Run 36/100 ===
=== Run 37/100 ===
=== Run 38/100 ===
=== Run 39/100 ===
=== Run 40/100 ===
=== Run 41/100 ===
=== Run 42/100 ===
=== Run 43/100 ===
=== Run 44/100 ===
=== Run 45/100 ===
=== Run 46/100 ===
=== Run 47/100 ===
=== Run 48/100 ===
=== Run 49/100 ===
=== Run 50/100 ===
=== Run 51/100 ===
=== Run 52/100 ===
=== Run 53/100 ===
==

'\n=== SGDRegressor Overall 100-run 10-fold CV (Baseline first, then MixUp) ===\nAccuracy      : Baseline 0.644 [0.632, 0.657] | MixUp 0.638 [0.626, 0.650]\nPrecision     : Baseline 0.688 [0.669, 0.707] | MixUp 0.668 [0.651, 0.685]\nRecall        : Baseline 0.570 [0.551, 0.588] | MixUp 0.624 [0.605, 0.642]\nF1            : Baseline 0.592 [0.575, 0.608] | MixUp 0.614 [0.599, 0.629]\nAUC           : Baseline 0.738 [0.724, 0.751] | MixUp 0.728 [0.714, 0.741]\nAcc_Improvers : Baseline 0.570 [0.551, 0.588] | MixUp 0.624 [0.605, 0.642]\nAcc_NonImprovers: Baseline 0.731 [0.714, 0.748] | MixUp 0.658 [0.640, 0.676]\n'

In [None]:
#SVR

import os
import time

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR


# Subject IDs in the iteration order of graphs_by_sid; sid_list[i] is used
# further down to label the saved prediction for row i.
# NOTE(review): this assumes the rows of X / y were built in the same order
# as graphs_by_sid — confirm against the cell that constructs X and y.
sid_list = list(graphs_by_sid.keys())


# --- models ---
# clf is fitted on the MixUp-augmented training folds, Bclf ("B" = baseline)
# on the un-augmented folds; both use the same RBF-kernel SVR configuration.
clf = SVR(kernel='rbf')
Bclf = SVR(kernel='rbf')

n_runs = 100
k_folds = 10

# store all results (one entry per fold per run; "Ball_*" = baseline model)
all_accs, all_precisions, all_recalls, all_f1s, all_aucs = [], [], [], [], []
Ball_accs, Ball_precisions, Ball_recalls, Ball_f1s, Ball_aucs = [], [], [], [], []

predictions = []  # for saving individual predictions
all_accs_improvers, all_accs_nonimprovers = [], []
Ball_accs_improvers, Ball_accs_nonimprovers = [], []

# The base seed is derived from the wall clock, so it differs on every
# execution. Print it so a given set of results can be reproduced later by
# hard-coding the reported value here.
random_seed = int(time.time()) # or np.random.randint(0, 10000)
print(f"Base random seed: {random_seed}")

# Repeated stratified k-fold CV. In every fold a MixUp-augmented SVR (clf)
# and a baseline SVR (Bclf) are refitted and scored on the same held-out rows.
for run in range(n_runs):
    print(f"=== Run {run+1}/{n_runs} ===")
    # Each run reshuffles with its own seed, offset from the base seed.
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=random_seed + run)

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        # --- Mixup augmentation (optional) ---
        # inter_mixup only ever receives the training fold, so held-out rows
        # are not passed to the augmentation step.
        X_aug, y_aug = inter_mixup(X_train, y_train, n_samples=70, alpha=0.7, hard_labels=False)
        X_train_aug = np.vstack([X_train, X_aug])
        y_train_aug = np.concatenate([y_train, y_aug])

        # --- fit ---
        clf.fit(X_train_aug, y_train_aug)
        Bclf.fit(X_train, y_train)

        # --- predictions ---
        # Regression outputs are clipped to [0, 1] and thresholded at 0.5 to
        # obtain hard class labels.
        y_prob = np.clip(clf.predict(X_test), 0, 1)
        y_pred = (y_prob >= 0.5).astype(int)

        By_prob = np.clip(Bclf.predict(X_test), 0, 1)
        By_pred = (By_prob >= 0.5).astype(int)

        # --- metrics ---
        # Same five metrics for both models: MixUp lists first, then baseline.
        # NOTE(review): AUC is computed on the clipped scores, so every
        # prediction outside [0, 1] is tied at the boundary — confirm this is
        # intended rather than ranking on the raw regression output.
        for y_hat, y_score, (accs, precs, recs, f1s, aucs) in (
            (y_pred, y_prob, (all_accs, all_precisions, all_recalls, all_f1s, all_aucs)),
            (By_pred, By_prob, (Ball_accs, Ball_precisions, Ball_recalls, Ball_f1s, Ball_aucs)),
        ):
            accs.append(accuracy_score(y_test, y_hat))
            precs.append(precision_score(y_test, y_hat, zero_division=0))
            recs.append(recall_score(y_test, y_hat, zero_division=0))
            f1s.append(f1_score(y_test, y_hat, zero_division=0))
            aucs.append(roc_auc_score(y_test, y_score))

        # Per-class accuracy, skipped for a class that is absent from the fold.
        mask_improvers = (y_test == 1)
        mask_nonimprovers = (y_test == 0)

        if mask_improvers.any():
            true_improvers = y_test[mask_improvers]
            all_accs_improvers.append(accuracy_score(true_improvers, y_pred[mask_improvers]))
            Ball_accs_improvers.append(accuracy_score(true_improvers, By_pred[mask_improvers]))

        if mask_nonimprovers.any():
            true_nonimprovers = y_test[mask_nonimprovers]
            all_accs_nonimprovers.append(accuracy_score(true_nonimprovers, y_pred[mask_nonimprovers]))
            Ball_accs_nonimprovers.append(accuracy_score(true_nonimprovers, By_pred[mask_nonimprovers]))

        # --- store individual predictions ---
        # One record per held-out sample, carrying both models' outputs.
        predictions.extend(
            {
                "run": run,
                "fold": fold,
                "sid": sid_list[idx],
                "true_label": true,
                "mixup_prob": p_mix,
                "mixup_pred": pred_mix,
                "baseline_prob": p_base,
                "baseline_pred": pred_base
            }
            for idx, true, p_mix, pred_mix, p_base, pred_base in zip(
                test_idx, y_test, y_prob, y_pred, By_prob, By_pred
            )
        )

# ===========================
# 4. Report overall metrics + 95% CI
# ===========================



# Each entry maps a metric label to (baseline per-fold values, MixUp per-fold
# values); the tuple order matches the "Baseline first, then MixUp" header.
metrics = {
    "Accuracy": (Ball_accs, all_accs),
    "Precision": (Ball_precisions, all_precisions),
    "Recall": (Ball_recalls, all_recalls),
    "F1": (Ball_f1s, all_f1s),
    "AUC": (Ball_aucs, all_aucs)
}

# Per-class accuracies; these lists can be shorter than the ones above because
# a fold contributes nothing for a class it does not contain.
metrics_extra = {
    "Acc_Improvers": (Ball_accs_improvers, all_accs_improvers),
    "Acc_NonImprovers": (Ball_accs_nonimprovers, all_accs_nonimprovers)
}


print(f"\n=== SVR Overall {n_runs}-run {k_folds}-fold CV (Baseline first, then MixUp) ===")
all_metrics = {**metrics, **metrics_extra}
# Pad labels to the longest one so the columns line up; a fixed width of 14
# is too narrow for "Acc_NonImprovers" and pushes that row out of alignment.
name_width = max(len(name) for name in all_metrics)
for name, (Bvals, Mvals) in all_metrics.items():
    # mean_ci is defined elsewhere in the notebook; its result is unpacked
    # here as (mean, lower bound, upper bound).
    Bmean, Blow, Bhigh = mean_ci(Bvals)
    Mmean, Mlow, Mhigh = mean_ci(Mvals)
    print(f"{name:<{name_width}}: Baseline {Bmean:.3f} [{Blow:.3f}, {Bhigh:.3f}] | MixUp {Mmean:.3f} [{Mlow:.3f}, {Mhigh:.3f}]")

# ===========================
# 5. Save predictions
# ===========================
# One row per record accumulated in `predictions` during cross-validation.
# Built once; the frame was previously constructed twice from the same list.
df_preds = pd.DataFrame(predictions)

save_dir = "/content/drive/Shared drives/GNN/Results"
os.makedirs(save_dir, exist_ok=True)  # make sure folder exists

# Compute the destination once so the file that is written and the path that
# is reported cannot drift apart.
save_path = os.path.join(save_dir, "SVR_cv_predictions.csv")
df_preds.to_csv(save_path, index=False)
print(f"\nPredictions saved to {save_path}")

=== Run 1/100 ===
=== Run 2/100 ===
=== Run 3/100 ===
=== Run 4/100 ===
=== Run 5/100 ===
=== Run 6/100 ===
=== Run 7/100 ===
=== Run 8/100 ===
=== Run 9/100 ===
=== Run 10/100 ===
=== Run 11/100 ===
=== Run 12/100 ===
=== Run 13/100 ===
=== Run 14/100 ===
=== Run 15/100 ===
=== Run 16/100 ===
=== Run 17/100 ===
=== Run 18/100 ===
=== Run 19/100 ===
=== Run 20/100 ===
=== Run 21/100 ===
=== Run 22/100 ===
=== Run 23/100 ===
=== Run 24/100 ===
=== Run 25/100 ===
=== Run 26/100 ===
=== Run 27/100 ===
=== Run 28/100 ===
=== Run 29/100 ===
=== Run 30/100 ===
=== Run 31/100 ===
=== Run 32/100 ===
=== Run 33/100 ===
=== Run 34/100 ===
=== Run 35/100 ===
=== Run 36/100 ===
=== Run 37/100 ===
=== Run 38/100 ===
=== Run 39/100 ===
=== Run 40/100 ===
=== Run 41/100 ===
=== Run 42/100 ===
=== Run 43/100 ===
=== Run 44/100 ===
=== Run 45/100 ===
=== Run 46/100 ===
=== Run 47/100 ===
=== Run 48/100 ===
=== Run 49/100 ===
=== Run 50/100 ===
=== Run 51/100 ===
=== Run 52/100 ===
=== Run 53/100 ===
==