In [12]:
# Core libraries
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from scipy.linalg import eigh

# ===========================
# Load and preprocess dataset
# ===========================

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
import numpy as np

def load_schaefer_graphs(
    keep_fraction=0.3,
    outcome_col='Imp20PercentBPRS'
):
    # ===========================
    # Hardcoded file names
    # ===========================
    graph_file = "/content/drive/Shared drives/GNN/SchaeferAtlas_Rest_Results_Freq_008to09_wholebrain.xlsx"
    outcome_file = "/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx"

    demo_cols = ['Age', 'handedness', 'sex', 'PrimaryEthnicity',
                 'PrimaryRace', 'Education', 'Parental Education']

    # ===========================
    # Load outcome + demographics
    # ===========================
    df_labels = pd.read_excel(outcome_file, sheet_name='outcomeanddemographics', skiprows=1)
    demo_dict = df_labels.set_index('SID')[demo_cols].to_dict(orient='index')

    # Helper: threshold adjacency
    def threshold_graph(A, keep_fraction):
        triu_indices = np.triu_indices_from(A, k=1)
        edge_values = A[triu_indices]
        edge_values = edge_values[edge_values > 0]  # ignore zeros
        if len(edge_values) == 0:
            return np.zeros_like(A)
        threshold = np.percentile(edge_values, 100 * (1 - keep_fraction))
        mask = A >= threshold
        A_thresh = A * mask
        A_thresh = np.maximum(A_thresh, A_thresh.T)  # enforce symmetry
        return A_thresh

    # ===========================
    # Load only needed sheets
    # ===========================
    xl = pd.ExcelFile(graph_file)
    available_sheets = set(xl.sheet_names)

    # Get SIDs that actually have outcomes
    sids_with_outcome = df_labels['SID'].dropna().unique()
    sids_to_load = [sid for sid in sids_with_outcome if sid in available_sheets]

    graphs_by_sid = {}

    for sid in sids_to_load:
        try:
            # Load matrix for this patient
            mat_df = pd.read_excel(graph_file, sheet_name=sid, index_col=0)
            mat = mat_df.values.astype(float)

            mat = mat_df.values.astype(float)

            # Replace NaNs and infs with 0
            mat = np.nan_to_num(mat, nan=0.0, posinf=0.0, neginf=0.0)
            rel = mat
            # Take absolute value (since matrices can have negatives)
            mat = np.abs(mat)

            # Take absolute values (since matrices aren't absolute in this dataset)


            # Get outcome + demographics
            row = df_labels.loc[df_labels['SID'] == sid]
            outcome = row[outcome_col].values[0]
            demos = demo_dict.get(sid, None)

            # Threshold adjacency
            mat_thresh = threshold_graph(mat, keep_fraction)
            rel_thresh = threshold_graph(rel, keep_fraction)

            graphs_by_sid[sid] = {
                'abs': mat_thresh,      # absolute-valued adjacency
                'rel': rel_thresh,
                'demo': demos,
                'outcome': outcome,
                'nodes': list(mat_df.index)  # preserve brain region labels
            }
        except Exception as e:
            print(f"Skipping SID {sid} due to error: {e}")
            continue

    return graphs_by_sid



In [3]:
#combines all the previous dataloading

import pandas as pd
import numpy as np



from itertools import product
import numpy as np

def mean_between_group_corr(responders_idx, nonresponders_idx):
    corr_values = []
    # Compare every responder with every non-responder
    for i, j in product(responders_idx, nonresponders_idx):
        vec_i = demo_matrix[i]
        vec_j = demo_matrix[j]
        # Skip if either vector has zero variance
        if np.std(vec_i) == 0 or np.std(vec_j) == 0:
            continue
        corr = np.corrcoef(vec_i, vec_j)[0,1]
        if not np.isnan(corr):
            corr_values.append(corr)
    if len(corr_values) == 0:
        return np.nan
    return np.mean(corr_values)

def load_and_preprocess_graphs(
    file_path,
    abs_keep_fraction=0.3,
    rel_keep_fraction=0.3,
    demo_cols=None,
    outcome_col='Imp20PercentBPRS'
):
    if demo_cols is None:
        demo_cols = ['Age', 'handedness', 'sex', 'PrimaryEthnicity', 'PrimaryRace', 'Education', 'Parental Education']

    # ===========================
    # Load dataset
    # ===========================
    df = pd.read_excel(file_path)
    df_labels = pd.read_excel(file_path, sheet_name='outcomeanddemographics', skiprows=1)

    # ===========================
    # Extract nodes and edges
    # ===========================
    edge_columns = df.columns.drop('SID')

    # Get all unique nodes
    nodes_set = set()
    for col in edge_columns:
        raw_node1, raw_node2 = col.split('-')
        node1 = raw_node1.replace('ABS_', '')
        node2 = raw_node2.replace('ABS_', '')
        nodes_set.add(node1)
        nodes_set.add(node2)

    nodes = sorted(list(nodes_set))
    n_nodes = len(nodes)
    node_to_idx = {node: i for i, node in enumerate(nodes)}

    abs_edge_cols, rel_edge_cols = [], []
    abs_edge_to_idx, rel_edge_to_idx = [], []

    for col in edge_columns:
        raw_node1, raw_node2 = col.split('-')
        node1 = raw_node1.replace('ABS_', '')
        node2 = raw_node2.replace('ABS_', '')
        idx1, idx2 = node_to_idx[node1], node_to_idx[node2]

        if 'ABS_' in col:
            abs_edge_cols.append(col)
            abs_edge_to_idx.append((idx1, idx2))
        else:
            rel_edge_cols.append(col)
            rel_edge_to_idx.append((idx1, idx2))

    # ===========================
    # Build adjacency matrices
    # ===========================
    abs_adj_matrices, rel_adj_matrices = [], []

    for _, row in df.iterrows():
        abs_adj = np.zeros((n_nodes, n_nodes))
        rel_adj = np.zeros((n_nodes, n_nodes))

        for col_idx, (i1, i2) in enumerate(abs_edge_to_idx):
            val = row[abs_edge_cols[col_idx]]
            abs_adj[i1, i2] = val
            abs_adj[i2, i1] = val

        for col_idx, (i1, i2) in enumerate(rel_edge_to_idx):
            val = row[rel_edge_cols[col_idx]]
            rel_adj[i1, i2] = val
            rel_adj[i2, i1] = val

        abs_adj_matrices.append(abs_adj)
        rel_adj_matrices.append(rel_adj)

    # ===========================
    # Thresholding function
    # ===========================
    def threshold_graph(A, keep_fraction):
        triu_indices = np.triu_indices_from(A, k=1)
        edge_values = A[triu_indices]
        threshold = np.percentile(edge_values, 100 * (1 - keep_fraction))
        mask = A >= threshold
        A_thresh = A * mask
        A_thresh = np.maximum(A_thresh, A_thresh.T)
        return A_thresh

    # ===========================
    # Build final dictionary
    # ===========================
    demo_dict = df_labels.set_index('SID')[demo_cols].to_dict(orient='index')

    graphs_by_sid = {}
    for sid, abs_mat, rel_mat in zip(df['SID'], abs_adj_matrices, rel_adj_matrices):
        demos = demo_dict.get(sid, None)
        abs_mat_thresh = threshold_graph(abs_mat, abs_keep_fraction)
        rel_mat_thresh = threshold_graph(rel_mat, rel_keep_fraction)
        outcome = df_labels.loc[df_labels['SID'] == sid, outcome_col].values[0] \
                  if sid in df_labels['SID'].values else None

        graphs_by_sid[sid] = {
            'abs': abs_mat_thresh,
            'rel': rel_mat_thresh,
            'demo': demos,
            'outcome': outcome
        }

    # Filter out subjects without outcome
    graphs_by_sid = {sid: g for sid, g in graphs_by_sid.items() if g['outcome'] is not None}

    return graphs_by_sid


In [None]:
import numpy as np
import pandas as pd
from itertools import combinations

# ===========================
# Load labels and demographics
# ===========================
file_path = '/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx'
graphs_by_sid = load_and_preprocess_graphs(file_path, abs_keep_fraction=1.0, rel_keep_fraction=1.0)

#graphs_by_sid = load_schaefer_graphs(keep_fraction=0.3)

# ===========================
# Load labels and demographics
# ===========================
df_labels = pd.read_excel(file_path, sheet_name='outcomeanddemographics', skiprows=1)

# Impute missing Parental Education with mean
parent_edu = df_labels['Parental Education'].values.astype(float)
mean_val = np.nanmean(parent_edu)
parent_edu[np.isnan(parent_edu)] = mean_val
df_labels['Parental Education'] = parent_edu

# ===========================
# Build demo matrix
# ===========================
sids = list(graphs_by_sid.keys())
demo_list = []
outcomes = []

for sid in sids:
    demo_dict = graphs_by_sid[sid]['demo']
    demo_vector = [
        demo_dict['Age'],
        demo_dict['handedness'],
        demo_dict['sex'],
        demo_dict['PrimaryEthnicity'],
        demo_dict['PrimaryRace'],
        demo_dict['Education'],
        demo_dict['Parental Education']
    ]
    demo_list.append(demo_vector)
    outcomes.append(graphs_by_sid[sid]['outcome'])

demo_matrix = np.array(demo_list, dtype=float)
outcomes = np.array(outcomes)

# ===========================
# Remove zero-variance columns
# ===========================
stds = demo_matrix.std(axis=0)
nonzero_var_cols = stds > 0
demo_matrix = demo_matrix[:, nonzero_var_cols]

# ===========================
# Split indices by outcome
# ===========================
responders_idx = np.where(outcomes == 1)[0]  # adjust to your encoding
nonresponders_idx = np.where(outcomes == 0)[0]

# ===========================
# Mean pairwise correlation
# ===========================
def mean_pairwise_corr(indices):
    corr_values = []
    for i, j in combinations(indices, 2):
        vec_i = demo_matrix[i]
        vec_j = demo_matrix[j]
        if np.std(vec_i) == 0 or np.std(vec_j) == 0:
            continue
        corr = np.corrcoef(vec_i, vec_j)[0,1]
        if not np.isnan(corr):
            corr_values.append(corr)
    if len(corr_values) == 0:
        return np.nan
    return np.mean(corr_values)

mean_corr_responders = mean_pairwise_corr(responders_idx)
mean_corr_nonresponders = mean_pairwise_corr(nonresponders_idx)

print("Mean correlation - responders:", mean_corr_responders)
print("Mean correlation - non-responders:", mean_corr_nonresponders)

# ===========================
# Average of each demographic feature
# ===========================
feature_names = ['Age','handedness','sex','PrimaryEthnicity','PrimaryRace','Education','Parental Education']

mean_responders = np.mean(demo_matrix[responders_idx], axis=0)
mean_nonresponders = np.mean(demo_matrix[nonresponders_idx], axis=0)

print("\nMean demographics - responders:")
for name, val in zip(feature_names, mean_responders):
    print(f"{name}: {val:.2f}")

print("\nMean demographics - non-responders:")
for name, val in zip(feature_names, mean_nonresponders):
    print(f"{name}: {val:.2f}")


mean_corr_between = mean_between_group_corr(responders_idx, nonresponders_idx)
print("Mean correlation between responders and non-responders:", mean_corr_between)

Mean correlation - responders: 0.9868648579820746
Mean correlation - non-responders: 0.9873388791828345

Mean demographics - responders:
Age: 19.69
handedness: 1.10
sex: 1.21
PrimaryEthnicity: 0.28
PrimaryRace: 3.24
Education: 11.83

Mean demographics - non-responders:
Age: 18.93
handedness: 1.04
sex: 1.56
PrimaryEthnicity: 0.33
PrimaryRace: 3.89
Education: 11.78
Mean correlation between responders and non-responders: 0.9864991697868103


In [None]:
def flatten_upper_tri(mat):
    # Take only upper triangle (excluding diagonal)
    triu_indices = np.triu_indices_from(mat, k=1)
    return mat[triu_indices]
sids = list(graphs_by_sid.keys())
outcomes = []
graph_vectors = []

for sid in sids:
    adj = graphs_by_sid[sid]['abs']  # or 'rel' depending on which you want
    vec = flatten_upper_tri(adj)
    graph_vectors.append(vec)
    outcomes.append(graphs_by_sid[sid]['outcome'])

graph_matrix = np.array(graph_vectors)  # shape: (n_patients, n_edges)
outcomes = np.array(outcomes)

from itertools import combinations

def mean_pairwise_corr(matrix, indices):
    corr_values = []
    for i, j in combinations(indices, 2):
        vec_i = matrix[i]
        vec_j = matrix[j]
        if np.std(vec_i) == 0 or np.std(vec_j) == 0:
            continue
        corr = np.corrcoef(vec_i, vec_j)[0,1]
        if not np.isnan(corr):
            corr_values.append(corr)
    if len(corr_values) == 0:
        return np.nan
    return np.mean(corr_values)

# Split indices
responders_idx = np.where(outcomes == 1)[0]
nonresponders_idx = np.where(outcomes == 0)[0]

mean_corr_responders = mean_pairwise_corr(graph_matrix, responders_idx)
mean_corr_nonresponders = mean_pairwise_corr(graph_matrix, nonresponders_idx)

print("Mean correlation - responders:", mean_corr_responders)
print("Mean correlation - non-responders:", mean_corr_nonresponders)

from itertools import product

corr_values = []
responders_idx = np.where(outcomes == 1)[0]
nonresponders_idx = np.where(outcomes == 0)[0]

responders_matrix = graph_matrix[responders_idx]
nonresponders_matrix = graph_matrix[nonresponders_idx]

# Compare every responder with every non-responder
for i, j in product(range(responders_matrix.shape[0]), range(nonresponders_matrix.shape[0])):
    vec_i = responders_matrix[i]
    vec_j = nonresponders_matrix[j]

    if np.std(vec_i) == 0 or np.std(vec_j) == 0:
        continue  # skip if vector has zero variance

    corr = np.corrcoef(vec_i, vec_j)[0,1]
    if not np.isnan(corr):
        corr_values.append(corr)

# Mean between-group correlation
mean_corr_between = np.mean(corr_values)
print("Mean correlation between responders and non-responders:", mean_corr_between)

#Pearson correlation coefficient

Mean correlation - responders: 0.3815547586741385
Mean correlation - non-responders: 0.3961451769994697
Mean correlation between responders and non-responders: 0.3876776424107272


In [None]:
def flatten_upper_tri(mat):
    # Take only upper triangle (excluding diagonal)
    triu_indices = np.triu_indices_from(mat, k=1)
    return mat[triu_indices]
sids = list(graphs_by_sid.keys())
outcomes = []
graph_vectors = []

for sid in sids:
    adj = graphs_by_sid[sid]['rel']  # or 'rel' depending on which you want
    vec = flatten_upper_tri(adj)
    graph_vectors.append(vec)
    outcomes.append(graphs_by_sid[sid]['outcome'])

graph_matrix = np.array(graph_vectors)  # shape: (n_patients, n_edges)
outcomes = np.array(outcomes)

from itertools import combinations

def mean_pairwise_corr(matrix, indices):
    corr_values = []
    for i, j in combinations(indices, 2):
        vec_i = matrix[i]
        vec_j = matrix[j]
        if np.std(vec_i) == 0 or np.std(vec_j) == 0:
            continue
        corr = np.corrcoef(vec_i, vec_j)[0,1]
        if not np.isnan(corr):
            corr_values.append(corr)
    if len(corr_values) == 0:
        return np.nan
    return np.mean(corr_values)

# Split indices
responders_idx = np.where(outcomes == 1)[0]
nonresponders_idx = np.where(outcomes == 0)[0]

mean_corr_responders = mean_pairwise_corr(graph_matrix, responders_idx)
mean_corr_nonresponders = mean_pairwise_corr(graph_matrix, nonresponders_idx)

print("rel Mean correlation - responders:", mean_corr_responders)
print("rel Mean correlation - non-responders:", mean_corr_nonresponders)

from itertools import product

corr_values = []
responders_idx = np.where(outcomes == 1)[0]
nonresponders_idx = np.where(outcomes == 0)[0]

responders_matrix = graph_matrix[responders_idx]
nonresponders_matrix = graph_matrix[nonresponders_idx]

# Compare every responder with every non-responder
for i, j in product(range(responders_matrix.shape[0]), range(nonresponders_matrix.shape[0])):
    vec_i = responders_matrix[i]
    vec_j = nonresponders_matrix[j]

    if np.std(vec_i) == 0 or np.std(vec_j) == 0:
        continue  # skip if vector has zero variance

    corr = np.corrcoef(vec_i, vec_j)[0,1]
    if not np.isnan(corr):
        corr_values.append(corr)

# Mean between-group correlation
mean_corr_between = np.mean(corr_values)
print("rel Mean correlation between responders and non-responders:", mean_corr_between)

#Pearson correlation coefficient

rel Mean correlation - responders: 0.5222061973246399
rel Mean correlation - non-responders: 0.4945713722099749
rel Mean correlation between responders and non-responders: 0.5052354151151565


In [None]:
import numpy as np
from itertools import combinations, product
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ===========================
# 1. Mean pairwise correlations
# ===========================
def mean_pairwise_corr(matrix, indices):
    corr_values = []
    for i, j in combinations(indices, 2):
        vec_i, vec_j = matrix[i], matrix[j]
        if np.std(vec_i) == 0 or np.std(vec_j) == 0:
            continue
        corr = np.corrcoef(vec_i, vec_j)[0,1]
        if not np.isnan(corr):
            corr_values.append(corr)
    return np.mean(corr_values) if corr_values else np.nan

def mean_between_group_corr(matrix, idx1, idx2):
    corr_values = []
    for i, j in product(idx1, idx2):
        vec_i, vec_j = matrix[i], matrix[j]
        if np.std(vec_i) == 0 or np.std(vec_j) == 0:
            continue
        corr = np.corrcoef(vec_i, vec_j)[0,1]
        if not np.isnan(corr):
            corr_values.append(corr)
    return np.mean(corr_values) if corr_values else np.nan

responders_idx = np.where(outcomes == 1)[0]
nonresponders_idx = np.where(outcomes == 0)[0]

mean_corr_resp = mean_pairwise_corr(graph_matrix, responders_idx)
mean_corr_nonresp = mean_pairwise_corr(graph_matrix, nonresponders_idx)
mean_corr_between = mean_between_group_corr(graph_matrix, responders_idx, nonresponders_idx)

print("Mean correlation - responders:", mean_corr_resp)
print("Mean correlation - non-responders:", mean_corr_nonresp)
print("Mean correlation between groups:", mean_corr_between)

# ===========================
# 2. Permutation test for correlations
# ===========================
def perm_test_corr(matrix, outcomes, n_permutations=1000):
    observed_within = (
        mean_pairwise_corr(matrix, np.where(outcomes==1)[0]) -
        mean_pairwise_corr(matrix, np.where(outcomes==0)[0])
    )
    perm_diffs = []
    for _ in range(n_permutations):
        perm_labels = np.random.permutation(outcomes)
        perm_diff = (
            mean_pairwise_corr(matrix, np.where(perm_labels==1)[0]) -
            mean_pairwise_corr(matrix, np.where(perm_labels==0)[0])
        )
        perm_diffs.append(perm_diff)
    perm_diffs = np.array(perm_diffs)
    p_val = np.mean(np.abs(perm_diffs) >= np.abs(observed_within))
    return observed_within, p_val

obs_diff, p_val_corr = perm_test_corr(graph_matrix, outcomes)
print("Observed within-group difference:", obs_diff)
print("Permutation p-value (within-group correlation difference):", p_val_corr)

# ===========================
# 3. Edge-wise correlation with outcome
# ===========================
edge_corrs = np.array([np.corrcoef(graph_matrix[:,i], outcomes)[0,1]
                       for i in range(graph_matrix.shape[1])])
top_edges = np.argsort(np.abs(edge_corrs))[::-1][:20]  # top 20 edges
print("Top 20 edges by absolute correlation:", top_edges)
print("Corresponding correlations:", edge_corrs[top_edges])

# ===========================
# 4. Supervised classification with cross-validation
# ===========================
X = graph_matrix
y = outcomes
clf = LogisticRegression(max_iter=1000)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
for train_idx, test_idx in skf.split(X, y):
    clf.fit(X[train_idx], y[train_idx])
    preds = clf.predict(X[test_idx])
    accs.append(accuracy_score(y[test_idx], preds))
mean_acc = np.mean(accs)
print("5-fold CV accuracy:", mean_acc)

# ===========================
# 5. Permutation test for classifier
# ===========================
n_perm = 1000
perm_accs = []
for _ in range(n_perm):
    y_perm = np.random.permutation(y)
    perm_fold_accs = []
    for train_idx, test_idx in skf.split(X, y_perm):
        clf.fit(X[train_idx], y_perm[train_idx])
        perm_preds = clf.predict(X[test_idx])
        perm_fold_accs.append(accuracy_score(y_perm[test_idx], perm_preds))
    perm_accs.append(np.mean(perm_fold_accs))

perm_accs = np.array(perm_accs)
p_val_class = np.mean(perm_accs >= mean_acc)
print("Permutation p-value for classifier:", p_val_class)


Mean correlation - responders: 0.5222061973246399
Mean correlation - non-responders: 0.4945713722099749
Mean correlation between groups: 0.5052354151151565


KeyboardInterrupt: 

In [7]:
def inter_mixup(X_train, y_train, n_samples, alpha=0.2, hard_labels=False):
    """
    Generates n_samples synthetic examples using mixup.
    """
    X_new = []
    y_new = []

    for _ in range(n_samples):
        i, j = np.random.choice(len(X_train), size=2, replace=False)
        lam = np.random.beta(alpha, alpha)
        x_mix = lam * X_train[i] + (1 - lam) * X_train[j]
        y_mix = lam * y_train[i] + (1 - lam) * y_train[j]

        if hard_labels:
            y_mix = int(round(y_mix))  # convert to 0 or 1

        X_new.append(x_mix)
        y_new.append(y_mix)

    return np.vstack(X_new), np.array(y_new)


def intra_mixup(X_train, y_train, n_samples_per_class, alpha=0.2):
    """
    Generates synthetic samples by mixing features within the same class.
    X_train: (n_samples, n_features)
    y_train: (n_samples,)
    n_samples_per_class: number of synthetic samples to generate per class
    alpha: Beta distribution parameter
    """
    X_new, y_new = [], []
    classes = np.unique(y_train)

    for c in classes:
        idx_class = np.where(y_train == c)[0]
        for _ in range(n_samples_per_class):
            i, j = np.random.choice(idx_class, size=2, replace=False)
            lam = np.random.beta(alpha, alpha)
            x_mix = lam * X_train[i] + (1 - lam) * X_train[j]
            X_new.append(x_mix)
            y_new.append(c)  # hard label

    return np.vstack(X_new), np.array(y_new)


import networkx as nx
import numpy as np

def compute_more_graph_features(adj_matrix):
    """
    Computes a variety of node- and graph-level features for a weighted adjacency matrix.
    Returns a 1D feature vector.
    """
    G = nx.from_numpy_array(adj_matrix)
    features = []

    # ===========================
    # Node-level metrics (mean + std)
    # ===========================
    # Node strength (weighted degree)
    strength = np.array([s for n, s in G.degree(weight='weight')])
    features.extend([strength.mean(), strength.std()])

    # Clustering coefficient
    clustering = np.array(list(nx.clustering(G, weight='weight').values()))
    features.extend([clustering.mean(), clustering.std()])

    # Betweenness centrality
    try:
        bc = np.array(list(nx.betweenness_centrality(G, weight='weight').values()))
        features.extend([bc.mean(), bc.std()])
    except:
        features.extend([np.nan, np.nan])

    # Eigenvector centrality
    try:
        ec = np.array(list(nx.eigenvector_centrality(G, weight='weight', max_iter=500).values()))
        features.extend([ec.mean(), ec.std()])
    except:
        features.extend([np.nan, np.nan])

    # PageRank
    try:
        pr = np.array(list(nx.pagerank(G, weight='weight').values()))
        features.extend([pr.mean(), pr.std()])
    except:
        features.extend([np.nan, np.nan])

    # ===========================
    # Global graph metrics
    # ===========================
    # Global efficiency
    try:
        features.append(nx.global_efficiency(G))
    except:
        features.append(np.nan)

    # Average clustering
    try:
        features.append(nx.average_clustering(G, weight='weight'))
    except:
        features.append(np.nan)

    # Transitivity
    try:
        features.append(nx.transitivity(G))
    except:
        features.append(np.nan)

    # Assortativity (degree)
    try:
        features.append(nx.degree_assortativity_coefficient(G, weight='weight'))
    except:
        features.append(np.nan)

    # Density
    try:
        features.append(nx.density(G))
    except:
        features.append(np.nan)

    return np.array(features)

file_path = '/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx'
graphs_by_sid = load_and_preprocess_graphs(file_path, abs_keep_fraction=1.0, rel_keep_fraction=1.0)
print(graphs_by_sid['epp270']['abs'])
print(graphs_by_sid['epp270']['rel'])

[[0.         0.17338755 0.12403889 0.58831806 0.23387851 0.04568071
  0.33025175 0.0296529  0.10872398 0.07329236 0.10732256 0.17219024
  0.09179144 0.09451418 0.21203314]
 [0.17338755 0.         0.63607592 0.0721911  0.51286114 0.39463516
  0.31410516 0.23677453 0.12476689 0.01937445 0.13113582 0.09955679
  0.28756996 0.09870434 0.02389843]
 [0.12403889 0.63607592 0.         0.09948099 0.31710779 0.58452413
  0.06856401 0.12770892 0.27110495 0.27026323 0.14050177 0.39097136
  0.15021865 0.30674262 0.06529333]
 [0.58831806 0.0721911  0.09948099 0.         0.20853883 0.24963951
  0.29828674 0.48810218 0.10053241 0.21720226 0.20801852 0.19070761
  0.12809837 0.17672336 0.50478525]
 [0.23387851 0.51286114 0.31710779 0.20853883 0.         0.00130209
  0.26577205 0.26334115 0.14032486 0.23866158 0.16743091 0.34795873
  0.23701888 0.11188243 0.04141316]
 [0.04568071 0.39463516 0.58452413 0.24963951 0.00130209 0.
  0.00597454 0.08512837 0.35144027 0.32152661 0.27190813 0.048595
  0.01981576 0

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx


from sklearn.neural_network import MLPRegressor



from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve
)

def add_noise(X, sigma=0.01):
    return X + np.random.normal(0, sigma, X.shape)

# ===========================
# 1. Graph feature computation
# ===========================
def compute_graph_features(adj_matrix, feature_flags):
    """
    Computes node- and graph-level features for a weighted adjacency matrix.
    Returns a 1D feature vector.
    """
    G = nx.from_numpy_array(adj_matrix)
    features = []

    # Node-level metrics
    strength = np.array([s for n, s in G.degree(weight='weight')])
    clustering_dict = nx.clustering(G, weight='weight')
    clustering = np.array(list(clustering_dict.values()))

    if feature_flags.get("strength_mean", True):
        features.append(strength.mean())
    if feature_flags.get("strength_std", True):
        features.append(strength.std())
    if feature_flags.get("clustering_mean", True):
        features.append(clustering.mean())
    if feature_flags.get("clustering_std", True):
        features.append(clustering.std())
    if feature_flags.get("avg_clustering", True):
        try:
            features.append(nx.average_clustering(G, weight='weight'))
        except:
            features.append(np.nan)

    return np.array(features)

def threshold_graph(A, keep_fraction):
    triu_indices = np.triu_indices_from(A, k=1)
    edge_values = A[triu_indices]
    edge_values = edge_values[edge_values > 0]  # ignore zeros
    if len(edge_values) == 0:
        return np.zeros_like(A)
    threshold = np.percentile(edge_values, 100 * (1 - keep_fraction))
    mask = A >= threshold
    A_thresh = A * mask
    A_thresh = np.maximum(A_thresh, A_thresh.T)  # enforce symmetry
    return A_thresh

# ===========================
# 2. Load data and build feature matrix
# ===========================
file_path = '/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx'
graphs_by_sid = load_and_preprocess_graphs(file_path, abs_keep_fraction=1.0, rel_keep_fraction=1.0)



# ---------------------------
# Feature flags (toggle True/False)
# ---------------------------
feature_flags = {
    "strength_mean": True,
    "strength_std": True,
    "clustering_mean": True,
    "clustering_std": True,
    "avg_clustering": True
}

feature_list = []
for sid in graphs_by_sid.keys():
    adj = graphs_by_sid[sid]['abs']  # use 'abs' or 'rel'
    #features = compute_more_graph_features(adj)
    #adj = threshold_graph(adj, 1.0)
    features = compute_graph_features(adj, feature_flags)
    feature_list.append(features)

X = np.array(feature_list)
y = np.array([graphs_by_sid[sid]['outcome'] for sid in graphs_by_sid.keys()])

# Remove NaN columns
nan_cols = np.isnan(X).any(axis=0)
X = X[:, ~nan_cols]
print("X shape after NaN removal:", X.shape)
print("feature_flags:", feature_flags)

print((y == 1).sum())  # number of elements equal to 1
print((y == 0).sum())  # number of elements equal to 0

# Dynamically generate feature names based on flags and non-NaN
feature_names = [name for name, include in feature_flags.items() if include]

# ===========================
# 3. 20-run 5-fold CV
# ===========================
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

n_runs = 20  # number of repeated CV runs

all_accs, all_precisions, all_recalls, all_f1s, all_aucs = [], [], [], [], []
all_coefs = []

for run in range(n_runs):
    print(f"=== Run {run+1}/{n_runs} ===")

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + run)

    accs, precisions, recalls, f1s, aucs, coefs = [], [], [], [], [], []

    for train_idx, test_idx in skf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]

        # ---------------------------
        # Mixup (optional)
        # ---------------------------
        X_aug, y_aug = intra_mixup(X_train, y_train, n_samples_per_class=40) #same class mixup
        X_train_aug = np.vstack([X_train, X_aug])
        y_train_aug = np.concatenate([y_train, y_aug])

        X_train_aug, y_train_aug = X_train, y_train  # no mixup
        X_train_aug = add_noise(X_train_aug)

        clf.fit(X_train_aug, y_train_aug)

        y_pred = clf.predict(X[test_idx])
        y_prob = clf.predict_proba(X[test_idx])[:,1]

        accs.append(accuracy_score(y[test_idx], y_pred))
        precisions.append(precision_score(y[test_idx], y_pred))
        recalls.append(recall_score(y[test_idx], y_pred))
        f1s.append(f1_score(y[test_idx], y_pred))
        aucs.append(roc_auc_score(y[test_idx], y_prob))

        log_reg = clf.named_steps['logisticregression']
        coefs.append(log_reg.coef_[0])

    # Append this run's results to overall
    all_accs.extend(accs)
    all_precisions.extend(precisions)
    all_recalls.extend(recalls)
    all_f1s.extend(f1s)
    all_aucs.extend(aucs)
    all_coefs.append(np.mean(coefs, axis=0))

# ===========================
# 4. Report overall metrics
# ===========================
print("\n=== Overall 20-run 5-fold CV metrics ===")
print(f"Accuracy  : {np.mean(all_accs):.3f}")
print(f"Precision : {np.mean(all_precisions):.3f}")
print(f"Recall    : {np.mean(all_recalls):.3f}")
print(f"F1-score  : {np.mean(all_f1s):.3f}")
print(f"AUC       : {np.mean(all_aucs):.3f}")

# ===========================
# 5. Feature importance
# ===========================
mean_coef = np.mean(all_coefs, axis=0)
importance = np.abs(mean_coef)

print("\nFeature importance (avg |coef| across runs):")
for name, imp in zip(feature_names, importance):
    print(f"{name:<20}: {imp:.3f}")


X shape after NaN removal: (56, 5)
feature_flags: {'strength_mean': True, 'strength_std': True, 'clustering_mean': True, 'clustering_std': True, 'avg_clustering': True}
29
27
=== Run 1/20 ===
=== Run 2/20 ===
=== Run 3/20 ===
=== Run 4/20 ===
=== Run 5/20 ===
=== Run 6/20 ===
=== Run 7/20 ===
=== Run 8/20 ===
=== Run 9/20 ===
=== Run 10/20 ===
=== Run 11/20 ===
=== Run 12/20 ===
=== Run 13/20 ===
=== Run 14/20 ===
=== Run 15/20 ===
=== Run 16/20 ===
=== Run 17/20 ===
=== Run 18/20 ===
=== Run 19/20 ===
=== Run 20/20 ===

=== Overall 20-run 5-fold CV metrics ===
Accuracy  : 0.635
Precision : 0.654
Recall    : 0.658
F1-score  : 0.639
AUC       : 0.713

Feature importance (avg |coef| across runs):
strength_mean       : 0.368
strength_std        : 0.448
clustering_mean     : 0.181
clustering_std      : 0.175
avg_clustering      : 0.308


In [None]:
#inter class mixup

from sklearn.linear_model import SGDRegressor

all_accs, all_precisions, all_recalls, all_f1s, all_aucs = [], [], [], [], []

for run in range(n_runs):
    print(f"=== Run {run+1}/{n_runs} ===")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + run)

    accs, precisions, recalls, f1s, aucs = [], [], [], [], []

    for train_idx, test_idx in skf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]

        # ---------------------------
        # Mixup: produce soft labels
        # ---------------------------
        X_aug, y_aug = inter_mixup(X_train, y_train, n_samples=40)
        X_train_aug = np.vstack([X_train, X_aug])
        y_train_aug = np.concatenate([y_train, y_aug])

        # Add small noise
        X_train_aug = add_noise(X_train_aug)

        # ---------------------------
        # Fit regression model
        # ---------------------------
        clf = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
        clf.fit(X_train_aug, y_train_aug)

        # Predict probabilities on test set
        y_prob = clf.predict(X[test_idx])
        y_prob = np.clip(y_prob, 0, 1)  # ensure in [0,1]

        # Threshold for discrete labels
        y_pred = (y_prob >= 0.5).astype(int)

        # Compute metrics
        accs.append(accuracy_score(y[test_idx], y_pred))
        precisions.append(precision_score(y[test_idx], y_pred))
        recalls.append(recall_score(y[test_idx], y_pred))
        f1s.append(f1_score(y[test_idx], y_pred))
        aucs.append(roc_auc_score(y[test_idx], y_prob))

    # Append this run’s results
    all_accs.extend(accs)
    all_precisions.extend(precisions)
    all_recalls.extend(recalls)
    all_f1s.extend(f1s)
    all_aucs.extend(aucs)

# ---------------------------
# Report overall metrics
# ---------------------------
print("\n=== Overall 20-run 5-fold CV metrics ===")
print(f"Accuracy  : {np.mean(all_accs):.3f}")
print(f"Precision : {np.mean(all_precisions):.3f}")
print(f"Recall    : {np.mean(all_recalls):.3f}")
print(f"F1-score  : {np.mean(all_f1s):.3f}")
print(f"AUC       : {np.mean(all_aucs):.3f}")


=== Run 1/20 ===
=== Run 2/20 ===
=== Run 3/20 ===
=== Run 4/20 ===
=== Run 5/20 ===
=== Run 6/20 ===
=== Run 7/20 ===
=== Run 8/20 ===
=== Run 9/20 ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


=== Run 10/20 ===
=== Run 11/20 ===
=== Run 12/20 ===
=== Run 13/20 ===
=== Run 14/20 ===
=== Run 15/20 ===
=== Run 16/20 ===
=== Run 17/20 ===
=== Run 18/20 ===
=== Run 19/20 ===
=== Run 20/20 ===

=== Overall 20-run 5-fold CV metrics ===
Accuracy  : 0.639
Precision : 0.669
Recall    : 0.595
F1-score  : 0.610
AUC       : 0.734


In [None]:
#random forest

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score
)

# ===========================
# 1. Graph feature computation
# ===========================
def compute_graph_features(adj_matrix):
    """
    Computes node- and graph-level features for a weighted adjacency matrix.
    Returns a 1D feature vector.
    """
    G = nx.from_numpy_array(adj_matrix)
    features = []

    strength = np.array([s for n, s in G.degree(weight='weight')])
    clustering_dict = nx.clustering(G, weight='weight')
    clustering = np.array(list(clustering_dict.values()))

    # append features one by one
    features.append(strength.mean())
    features.append(strength.std())
    features.append(clustering.mean())
    features.append(clustering.std())
    features.append(nx.average_clustering(G, weight='weight'))

    return np.array(features)

# ===========================
# 2. Load data and build feature matrix
# ===========================
file_path = '/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx'
graphs_by_sid = load_and_preprocess_graphs(file_path, abs_keep_fraction=1.0, rel_keep_fraction=1.0)

feature_list = []
for sid in graphs_by_sid.keys():
    adj = graphs_by_sid[sid]['abs']
    features = compute_graph_features(adj)
    feature_list.append(features)

X = np.array(feature_list)
y = np.array([graphs_by_sid[sid]['outcome'] for sid in graphs_by_sid.keys()])

# Remove NaN columns
nan_cols = np.isnan(X).any(axis=0)
X = X[:, ~nan_cols]

# Set feature names for included columns
feature_flags = {
    "strength_mean": True,
    "strength_std": True,
    "clustering_mean": True,
    "clustering_std": True,
    "avg_clustering": True
}
feature_names = [name for name, include in feature_flags.items() if include]

# ===========================
# 3. Cross-validated RF classification
# ===========================
n_runs = 20
all_accs, all_precisions, all_recalls, all_f1s, all_aucs = [], [], [], [], []
all_importances = []

for run in range(n_runs):
    print(f"=== Run {run+1}/{n_runs} ===")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42 + run)

    # Temporary lists for this run
    accs, precisions, recalls, f1s, aucs, importances = [], [], [], [], [], []

    for train_idx, test_idx in skf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        # Random Forest classifier
        clf = RandomForestClassifier(n_estimators=200, random_state=42)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)[:, 1]

        accs.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred))
        aucs.append(roc_auc_score(y_test, y_prob))
        importances.append(clf.feature_importances_)

    all_accs.extend(accs)
    all_precisions.extend(precisions)
    all_recalls.extend(recalls)
    all_f1s.extend(f1s)
    all_aucs.extend(aucs)
    all_importances.append(np.mean(importances, axis=0))

# ===========================
# 4. Overall metrics and feature importances
# ===========================
print("\n=== Overall 20-run 5-fold CV metrics ===")
print(f"Accuracy  : {np.mean(all_accs):.3f} ± {np.std(all_accs):.3f}")
print(f"Precision : {np.mean(all_precisions):.3f} ± {np.std(all_precisions):.3f}")
print(f"Recall    : {np.mean(all_recalls):.3f} ± {np.std(all_recalls):.3f}")
print(f"F1-score  : {np.mean(all_f1s):.3f} ± {np.std(all_f1s):.3f}")
print(f"AUC       : {np.mean(all_aucs):.3f} ± {np.std(all_aucs):.3f}")

# Mean feature importance across runs
mean_importance = np.mean(all_importances, axis=0)
print("\nFeature importance (avg across runs):")
for name, imp in sorted(zip(feature_names, mean_importance), key=lambda x: -x[1]):
    print(f"{name:<15}: {imp:.3f}")


=== Run 1/20 ===
=== Run 2/20 ===
=== Run 3/20 ===
=== Run 4/20 ===
=== Run 5/20 ===
=== Run 6/20 ===
=== Run 7/20 ===
=== Run 8/20 ===
=== Run 9/20 ===
=== Run 10/20 ===
=== Run 11/20 ===
=== Run 12/20 ===
=== Run 13/20 ===
=== Run 14/20 ===
=== Run 15/20 ===
=== Run 16/20 ===
=== Run 17/20 ===
=== Run 18/20 ===
=== Run 19/20 ===
=== Run 20/20 ===

=== Overall 20-run 5-fold CV metrics ===
Accuracy  : 0.637 ± 0.123
Precision : 0.666 ± 0.173
Recall    : 0.608 ± 0.180
F1-score  : 0.624 ± 0.154
AUC       : 0.653 ± 0.142

Feature importance (avg across runs):
strength_std   : 0.229
strength_mean  : 0.227
clustering_std : 0.206
avg_clustering : 0.184
clustering_mean: 0.155


In [None]:
import numpy as np
import networkx as nx
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score
)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# ===========================
# 1. Graph feature computation
# ===========================
def compute_graph_features(adj_matrix):
    G = nx.from_numpy_array(adj_matrix)
    features = []

    strength = np.array([s for n, s in G.degree(weight='weight')])
    clustering_dict = nx.clustering(G, weight='weight')
    clustering = np.array(list(clustering_dict.values()))

    features.append(strength.mean())
    features.append(clustering.mean())
    features.append(clustering.std())

    return np.array(features)

# ===========================
# 2. Load data and build feature matrix
# ===========================
file_path = '/content/drive/Shared drives/GNN/RestingStateDataforAlex_3Networks.xlsx'
graphs_by_sid = load_and_preprocess_graphs(file_path, abs_keep_fraction=1.0, rel_keep_fraction=1.0)

feature_list = []
for sid in graphs_by_sid.keys():
    adj = graphs_by_sid[sid]['abs']
    features = compute_graph_features(adj)
    feature_list.append(features)

X = np.array(feature_list)
y = np.array([graphs_by_sid[sid]['outcome'] for sid in graphs_by_sid.keys()])

# Remove NaNs
nan_cols = np.isnan(X).any(axis=0)
X = X[:, ~nan_cols]

feature_names = ["strength_mean", "clustering_mean", "clustering_std"]
X = StandardScaler().fit_transform(X)

# ===========================
# 3. Neural network definition
# ===========================
class SmallNN(nn.Module):
    def __init__(self, input_dim):
        super(SmallNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 8)
        self.fc2 = nn.Linear(8, 4)
        self.out = nn.Linear(4, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.out(x))
        return x

# ===========================
# 4. Mixup function
# ===========================
def mixup_data(X, y, alpha=0.2):
    lam = np.random.beta(alpha, alpha)
    batch_size = X.shape[0]
    index = np.random.permutation(batch_size)
    X_mix = lam * X + (1 - lam) * X[index]
    y_mix = lam * y + (1 - lam) * y[index]
    return X_mix, y_mix

# ===========================
# 5. 20-run 5-fold CV
# ===========================
n_runs = 20
all_accs, all_precisions, all_recalls, all_f1s, all_aucs = [], [], [], [], []

for run in range(n_runs):
    print(f"=== Run {run+1}/{n_runs} ===")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42+run)

    for train_idx, test_idx in skf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]



        # Convert to torch tensors
        X_train_t = torch.tensor(X_train, dtype=torch.float32)
        y_train_t = torch.tensor(y_train.reshape(-1,1), dtype=torch.float32)
        X_test_t = torch.tensor(X_test, dtype=torch.float32)
        y_test_t = torch.tensor(y_test.reshape(-1,1), dtype=torch.float32)

        # Apply mixup
        X_train_t, y_train_t = mixup_data(X_train_t.numpy(), y_train_t.numpy())
        X_train_t = torch.tensor(X_train_t, dtype=torch.float32)
        y_train_t = torch.tensor(y_train_t, dtype=torch.float32)

        # Dataset and loader
        train_dataset = TensorDataset(X_train_t, y_train_t)
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

        # Model
        model = SmallNN(input_dim=X.shape[1])
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        criterion = nn.BCELoss()

        # Train
        model.train()
        for epoch in range(100):
            for xb, yb in train_loader:
                optimizer.zero_grad()
                y_pred = model(xb)
                loss = criterion(y_pred, yb)
                loss.backward()
                optimizer.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            y_pred_prob = model(X_test_t).numpy().flatten()
            y_pred_label = (y_pred_prob >= 0.5).astype(int)

        all_accs.append(accuracy_score(y_test, y_pred_label))
        all_precisions.append(precision_score(y_test, y_pred_label))
        all_recalls.append(recall_score(y_test, y_pred_label))
        all_f1s.append(f1_score(y_test, y_pred_label))
        all_aucs.append(roc_auc_score(y_test, y_pred_prob))

# ===========================
# 6. Overall metrics
# ===========================
print("\n=== Overall 20-run 5-fold CV metrics (NN) ===")
print(f"Accuracy  : {np.mean(all_accs):.3f} ± {np.std(all_accs):.3f}")
print(f"Precision : {np.mean(all_precisions):.3f} ± {np.std(all_precisions):.3f}")
print(f"Recall    : {np.mean(all_recalls):.3f} ± {np.std(all_recalls):.3f}")
print(f"F1-score  : {np.mean(all_f1s):.3f} ± {np.std(all_f1s):.3f}")
print(f"AUC       : {np.mean(all_aucs):.3f} ± {np.std(all_aucs):.3f}")


=== Run 1/20 ===
=== Run 2/20 ===
=== Run 3/20 ===
=== Run 4/20 ===
=== Run 5/20 ===
=== Run 6/20 ===
=== Run 7/20 ===
=== Run 8/20 ===
=== Run 9/20 ===
=== Run 10/20 ===
=== Run 11/20 ===
=== Run 12/20 ===
=== Run 13/20 ===
=== Run 14/20 ===
=== Run 15/20 ===
=== Run 16/20 ===
=== Run 17/20 ===
=== Run 18/20 ===
=== Run 19/20 ===
=== Run 20/20 ===

=== Overall 20-run 5-fold CV metrics (NN) ===
Accuracy  : 0.596 ± 0.132
Precision : 0.622 ± 0.153
Recall    : 0.601 ± 0.208
F1-score  : 0.593 ± 0.153
AUC       : 0.656 ± 0.149


In [None]:
# Logistic regression intra mixup  (same class mixup)
# === Overall 20-run 5-fold CV metrics ===
# Accuracy  : 0.620
# Precision : 0.649
# Recall    : 0.605
# F1-score  : 0.610
# AUC       : 0.727

# Linear regression inter mixup (different class mixup)
# === Overall 20-run 5-fold CV metrics ===
# Accuracy  : 0.639
# Precision : 0.669
# Recall    : 0.595
# F1-score  : 0.610
# AUC       : 0.734

# Logistic regression no mixup
# === Overall 20-run 5-fold CV metrics ===
# Accuracy  : 0.634
# Precision : 0.657
# Recall    : 0.647
# F1-score  : 0.636
# AUC       : 0.740