## Run this to install dgl

In [None]:
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/cu121/repo.html

### Path to the dataset

In [None]:
'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/SLN FL/GNN/GNN-research/SLN FL/data/w_removal_%s'

# Automated

In [None]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import pandas as pd
from dgl import graph
from dgl.nn import SAGEConv, GINConv
import dgl.function as fn
import math
from sklearn.metrics import roc_auc_score
import os
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import networkx as nx



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

# def compute_loss(pos_score, neg_score):
#     scores = torch.cat([pos_score, neg_score]).to(device)
#     labels = torch.cat([torch.ones(pos_score.shape[0], device=scores.device), torch.zeros(neg_score.shape[0], device=scores.device)])
#     return F.binary_cross_entropy_with_logits(scores, labels)

# def compute_loss(pos_score, neg_score):
#     # Concatenate scores and labels
#     scores = torch.cat([pos_score, neg_score]).to(device)
#     labels = torch.cat([
#         torch.ones(pos_score.shape[0], device=scores.device),
#         torch.zeros(neg_score.shape[0], device=scores.device)
#     ])

#     # Calculate positive class weight
#     num_pos = pos_score.shape[0]
#     num_neg = neg_score.shape[0]
#     pos_weight = torch.tensor([num_neg / num_pos], device=device)

#     # Compute weighted loss
#     loss = F.binary_cross_entropy_with_logits(
#         scores, labels, pos_weight=pos_weight
#     )
#     return loss

def compute_loss(pos_score, neg_score, max_weight=20):
    scores = torch.cat([pos_score, neg_score]).to(device)
    labels = torch.cat([torch.ones(pos_score.shape[0], device=scores.device), torch.zeros(neg_score.shape[0], device=scores.device)])

    # Calculate positive class weight
    num_pos = pos_score.shape[0]
    num_neg = neg_score.shape[0]
    pos_weight = torch.tensor([num_neg / num_pos], device=device)

    # Clamp the pos_weight to avoid extremely high values
    pos_weight = torch.clamp(pos_weight, max=max_weight)

    # Compute the weighted loss
    loss = F.binary_cross_entropy_with_logits(scores, labels, pos_weight=pos_weight)
    return loss



def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).cpu().numpy()
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).cpu().numpy()
    return roc_auc_score(labels, scores)

def compute_acc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).cpu().numpy()
    scores = [1 if i >= 0.5 else 0 for i in scores]
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).cpu().numpy()
    correct_pred = 0
    for i in range(labels.shape[0]):
        if labels[i] == scores[i]:
            correct_pred += 1
    return correct_pred / labels.shape[0]


def process_edges(edges_chunk, edges_in_graph):
    pos_u = []
    pos_v = []
    neg_u = []
    neg_v = []
    for u, v in edges_chunk:
        if (u, v) in edges_in_graph:
            pos_u.append(u)
            pos_v.append(v)
        else:
            neg_u.append(u)
            neg_v.append(v)
    return pos_u, pos_v, neg_u, neg_v


# def mean_algebraic_connectivity(dgl_graph):
#     dgl_graph = dgl_graph.cpu()  # Move the graph to the CPU if necessary

#     # Convert DGL graph to a NetworkX graph (this will create a directed graph)
#     nx_graph = dgl.to_networkx(dgl_graph)

#     # Convert the NetworkX graph to an undirected graph
#     nx_graph = nx_graph.to_undirected()

#     # Get all connected components in the undirected graph
#     connected_components = [nx_graph.subgraph(c).copy() for c in nx.connected_components(nx_graph)]

#     # Initialize list to store algebraic connectivity values
#     connectivity_values = []

#     # Loop through each connected component and calculate the algebraic connectivity
#     for component in connected_components:
#         if len(component) >= 5:  # Algebraic connectivity is only defined for graphs with more than 1 node
#             try:
#                 connectivity = nx.algebraic_connectivity(component)
#                 connectivity_values.append(connectivity)
#             except nx.NetworkXError as e:
#                 print(f"Error computing algebraic connectivity for component: {e}")
#         else:
#             pass

#     # Calculate and return the mean algebraic connectivity
#     if connectivity_values:
#         mean_connectivity = sum(connectivity_values) / len(connectivity_values)
#         return mean_connectivity
#     else:
#         return 0.0  # Return 0 if there are no valid connected components

def compute_algebraic_connectivity(component):
    if len(component) >= 5:  # Algebraic connectivity is only defined for graphs with more than 1 node
        try:
            connectivity = nx.algebraic_connectivity(component)
            return connectivity
        except nx.NetworkXError as e:
            print(f"Error computing algebraic connectivity for component: {e}")
            return None
    else:
        return None

def mean_algebraic_connectivity(dgl_graph, n_jobs=-1):
    dgl_graph = dgl_graph.cpu()  # Move the graph to the CPU if necessary

    # Convert DGL graph to a NetworkX graph and then to an undirected graph
    nx_graph = dgl.to_networkx(dgl_graph).to_undirected()

    # Get all connected components in the graph
    connected_components = [nx_graph.subgraph(c).copy() for c in nx.connected_components(nx_graph)]

    # Use joblib to parallelize the calculation of algebraic connectivity for each component
    connectivity_values = Parallel(n_jobs=n_jobs)(
        delayed(compute_algebraic_connectivity)(component) for component in connected_components
    )

    # Filter out any None values (in case there was an error in computing connectivity)
    connectivity_values = [c for c in connectivity_values if c is not None]

    # Calculate and return the mean algebraic connectivity
    if connectivity_values:
        mean_connectivity = sum(connectivity_values) / len(connectivity_values)
        return mean_connectivity
    else:
        return 0.0  # Return 0 if there are no valid connected components




# , 'ml', 'virtualshakespeare' , 0.05, 0.10, 0.15, 0.20, 0.25, 0.50, 0.75, 'comp', 'ml', 'virtualshakespeare'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
percent = [0.05, 0.10, 0.15, 0.20, 0.25, 0.50, 0.75]
seed = 42  # Fixed random seed for reproducibility

for ds in datasets:
    data_total = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/SLN FL/data/w_removal_%s' % ds, sep=" ", header=None)
    print('Analyzing %s Dataset' % ds)
    data_total.columns = ["n1", "n2", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "l"]
    data_total.dropna(subset=['n1', 'n2'], inplace=True)

    # Taking unique nodes and saving indices
    nodes = np.unique(data_total[['n1', 'n2']].values)
    node_index = {node: idx for idx, node in enumerate(nodes)}

    # Create adjacency matrix
    adj_matrix = np.zeros((len(nodes), len(nodes)), dtype=float)
    for _, row in data_total.iterrows():
        i, j = node_index[row['n1']], node_index[row['n2']]
        adj_matrix[i, j] = row['l']
        adj_matrix[j, i] = row['l']

    adj_df = pd.DataFrame(adj_matrix, index=nodes, columns=nodes)

    labels = data_total["l"]
    ones_label = np.where(labels == 1)

    # Create graph using node indices
    src = data_total["n1"].map(node_index).astype(int)
    dst = data_total["n2"].map(node_index).astype(int)

    src = torch.tensor(src.values[ones_label], dtype=torch.int64)
    dst = torch.tensor(dst.values[ones_label], dtype=torch.int64)

    # Create the graph
    g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src])), num_nodes=len(nodes))
    #inputs = torch.eye(g.number_of_nodes()).to(device)
    inputs = g.adj().to_dense().to(device)

    # Generate all possible edges
    upper_tri_idx = np.triu_indices_from(adj_df, k=1)
    possible_edges = [(i, j) for i, j in zip(*upper_tri_idx)]
    possible_edges += [(j, i) for i, j in possible_edges]  # Add (j, i) for undirected graph

    total_edges = len(possible_edges)

    # Shuffle the possible edges using the fixed seed
    np.random.seed(seed)
    np.random.shuffle(possible_edges)

    # Initialize test_edges_set
    test_edges_set = set()

    for p in percent:
      res =[]
      total_test_size = int(total_edges * p)
      new_test_size = total_test_size - len(test_edges_set)

      if new_test_size > 0:
          # Get the remaining edges not in test_edges_set
          remaining_edges = [edge for edge in possible_edges if edge not in test_edges_set]

          # Randomly select new_test_size edges from remaining_edges
          new_test_edge_indices = np.random.choice(len(remaining_edges), size=new_test_size, replace=False)
          new_test_edges = [remaining_edges[i] for i in new_test_edge_indices]

      duplicate_edges = [edge for edge in new_test_edges if edge in test_edges_set]

      if duplicate_edges:
          print(f"Duplicate edges found: {duplicate_edges}")
          break
      else:
          print(f"No duplicate edges found. New edges are unique.")

          # Add new edges to test_edges_set
          test_edges_set.update(new_test_edges)

      # Now test_edges is test_edges_set
      test_edges = list(test_edges_set)

      # The training edges are the remaining edges not in test_edges_set
      train_edges = [edge for edge in possible_edges if edge not in test_edges_set]

      # Create a set of all edges for fast lookup
      edges_in_graph = set()
      us, vs = g.edges()
      us = us.tolist()
      vs = vs.tolist()
      for u, v in zip(us, vs):
          edges_in_graph.add((u, v))
          edges_in_graph.add((v, u))  # Ensure undirected edge representation

      # Process test edges
      num_cores = os.cpu_count()
      test_edges_chunks = np.array_split(test_edges, num_cores)

      test_results = Parallel(n_jobs=num_cores)(
          delayed(process_edges)(chunk, edges_in_graph) for chunk in test_edges_chunks
      )

      test_pos_u, test_pos_v, test_neg_u, test_neg_v = [], [], [], []
      for pos_u, pos_v, neg_u, neg_v in test_results:
          test_pos_u.extend(pos_u)
          test_pos_v.extend(pos_v)
          test_neg_u.extend(neg_u)
          test_neg_v.extend(neg_v)

      # Update edges_in_graph for training by excluding test edges
      test_edges_set_set = set(test_edges)
      edges_in_graph_train = edges_in_graph - test_edges_set_set

      # Process training edges
      train_edges_chunks = np.array_split(train_edges, num_cores)
      train_results = Parallel(n_jobs=num_cores)(
          delayed(process_edges)(chunk, edges_in_graph_train) for chunk in train_edges_chunks
      )

      train_pos_u, train_pos_v, train_neg_u, train_neg_v = [], [], [], []
      for pos_u, pos_v, neg_u, neg_v in train_results:
          train_pos_u.extend(pos_u)
          train_pos_v.extend(pos_v)
          train_neg_u.extend(neg_u)
          train_neg_v.extend(neg_v)

      # Create subgraphs for training and testing
      train_pos_u = torch.tensor(train_pos_u)
      train_pos_v = torch.tensor(train_pos_v)
      train_neg_u = torch.tensor(train_neg_u)
      train_neg_v = torch.tensor(train_neg_v)

      test_pos_u = torch.tensor(test_pos_u)
      test_pos_v = torch.tensor(test_pos_v)
      test_neg_u = torch.tensor(test_neg_u)
      test_neg_v = torch.tensor(test_neg_v)


      train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
      train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
      test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
      test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

      # Create a mapping from edge tuples to edge IDs, including both directions
      edge_tuple_to_eid = {}
      u_edges, v_edges = g.edges()
      for eid, (u, v) in enumerate(zip(u_edges.tolist(), v_edges.tolist())):
          edge_tuple_to_eid[(u, v)] = eid
          edge_tuple_to_eid[(v, u)] = eid  # Include reverse direction

      # Get the edge IDs for test positive edges
      test_pos_eids = []
      for u, v in zip(test_pos_u.tolist(), test_pos_v.tolist()):
          eid = edge_tuple_to_eid.get((u, v))
          if eid is None:
              eid = edge_tuple_to_eid.get((v, u))
          if eid is not None:
              test_pos_eids.append(eid)

      # Remove the test positive edges from g to create train_g
      train_g = dgl.remove_edges(g, test_pos_eids).to(device)

      # Verify the edge counts
      print(f'train_g edges: {train_g.num_edges()}, original g edges: {g.num_edges()}')


      # Verify the splits
      print(f"Dataset: {ds}")
      print(f"Percentage: {p*100}%")
      print(f"Number of test edges: {len(test_edges)}")
      print(f"Number of training edges: {len(train_edges)}")

      train_pos_g = train_pos_g.to(device)
      train_neg_g = train_neg_g.to(device)
      test_pos_g = test_pos_g.to(device)
      test_neg_g = test_neg_g.to(device)

      model = GraphSAGE(g.number_of_nodes(), 16).to(device)
      pred = DotPredictor().to(device)

      optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.0005)

      for e in range(2001):
          h = model(train_g, train_g.adj().to_dense().to(device))
          pos_score = pred(train_pos_g, h)
          neg_score = pred(train_neg_g, h)
          loss = compute_loss(pos_score, neg_score)


          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

          if e % 100 == 0:
              print('In epoch {}, loss: {}'.format(e, loss.item()))
      model.eval()
      with torch.no_grad():
          h = model(train_g, train_g.adj().to_dense().to(device))
          pos_score = pred(test_pos_g, h)
          neg_score = pred(test_neg_g, h)

          auc = compute_auc(pos_score, neg_score)
          acc = compute_acc(pos_score, neg_score)
          print('AUC', auc)
          print('ACC', acc)

      train_g_toNX = train_g.cpu()
      nx_g = dgl.to_networkx(train_g_toNX)
      nx_g = nx_g.to_undirected()
      if(nx.is_connected(nx_g)):
        res.append({'Algebraic Connectivity': nx.algebraic_connectivity(nx_g)})
        print(f'using nx: {nx.algebraic_connectivity(nx_g)}')
      else:
        mean_conn = mean_algebraic_connectivity(train_g, n_jobs=-1)
        res.append({'Algebraic Connectivity': mean_conn})
        print(f'using manual: {mean_conn}')


      res.append({
        'Dataset': ds,
        'Test_Size': p,
        'AUC': auc,
        'ACC': acc
      })

      # save model
      torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': e
      }, f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/models/model_{ds}_{p}p.pth' )


      dgl.save_graphs(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/graphs/graphs_{ds}_{p}p.bin' , [train_pos_g, train_neg_g, test_pos_g, test_neg_g, g])


      res_df = pd.DataFrame(res)
      res_df.to_csv(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/csv/results_{ds}_{p}p.csv' , index=False)


      # Function to calculate Balanced Error Rate (BER)
      def balanced_error_rate(cm):
          tn, fp, fn, tp = cm.ravel()
          print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')
          # Calculate the False Positive Rate and False Negative Rate
          fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
          fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
          #integral tpr and fpr for auc
          # Calculate the Balanced Error Rate
          ber = 0.5 * (fpr + fnr)

          return ber

      def compute_auc_manual(y_true, y_scores):
        # Sort by predicted scores
        desc_score_indices = np.argsort(y_scores)[::-1]  # Sort scores in descending order
        y_scores = y_scores[desc_score_indices]
        y_true = y_true[desc_score_indices]

        # Total number of positives and negatives
        num_positives = np.sum(y_true)
        num_negatives = len(y_true) - num_positives

        if num_positives == 0 or num_negatives == 0:
            raise ValueError("AUC is not defined when there are no positive or negative samples.")

        # Initialize variables
        tpr = []  # True Positive Rate
        fpr = []  # False Positive Rate

        # Initialize counters for TP and FP
        tp = 0
        fp = 0

        # Loop through the sorted true labels and scores
        for i in range(len(y_true)):
            if y_true[i] == 1:
                tp += 1
            else:
                fp += 1

            # Calculate TPR and FPR at each threshold
            tpr.append(tp / num_positives)  # True Positive Rate
            fpr.append(fp / num_negatives)  # False Positive Rate

        # Convert lists to numpy arrays for numerical integration
        tpr = np.array(tpr)
        fpr = np.array(fpr)

        # Calculate AUC using the trapezoidal rule to integrate the ROC curve
        auc = np.trapz(tpr, fpr)  # Integrating TPR vs. FPR

        return auc

      # Your existing code
      # Generate true labels
      y_true_pos = [1] * len(pos_score)  # True labels for positive edges (1)
      y_true_neg = [0] * len(neg_score)  # True labels for negative edges (0)

      y_true = y_true_pos + y_true_neg  # Combine true labels for the test set

      # Generate predicted labels by thresholding the scores
      y_pred_pos = (pos_score >= 0.5).int().tolist()  # Predicted labels for positive edges
      y_pred_neg = (neg_score >= 0.5).int().tolist()  # Predicted labels for negative edges

      y_pred = y_pred_pos + y_pred_neg  # Combine predicted labels for the test set

      # Compute confusion matrix
      cm = confusion_matrix(y_true, y_pred)

      # Calculate and print BER
      # 1/2 (FPR + FNR)
      # FPR = FP / FP + TN
      # FNR = FN / FN + TP
      ber = balanced_error_rate(cm)
      print(f"Balanced Error Rate for {ds} for {p}p: {ber}")

      # Combine positive and negative scores (these are your y_scores)
      y_scores = torch.cat([pos_score, neg_score]).cpu().numpy()

      # Combine positive and negative labels (these are your y_true)
      y_true = np.concatenate([np.ones(pos_score.shape[0]), np.zeros(neg_score.shape[0])])

      # Now, pass these to your AUC function (either your custom one or sklearn's roc_auc_score)
      auc = compute_auc_manual(y_true, y_scores)
      print(f"Manual AUC for {ds} for {p}p: {auc}")

      # Visualize the confusion matrix using seaborn with integer formatting
      plt.figure(figsize=(6, 6))
      sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                  xticklabels=['Predicted 0', 'Predicted 1'],
                  yticklabels=['Actual 0', 'Actual 1'])
      plt.xlabel('Predicted')
      plt.ylabel('Actual')
      plt.title(f'Confusion Matrix for {ds} for {p}p')
      plt.savefig(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/cm_png/confusion_matrix_{ds}_{p}p.png' , dpi=200, bbox_inches='tight')
      plt.show()

      np.save(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/cm_npy/cm_{ds}_{p}p.npy', cm)



# Combining Graphs

In [None]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import pandas as pd
from dgl.nn import SAGEConv
import dgl.function as fn
from sklearn.metrics import roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message="You are using `torch.load` with `weights_only=False`")

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define datasets and combinations
datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
percentages = [0.05, 0.10, 0.15, 0.20, 0.25, 0.50, 0.75]
combinations = [
    ['algo004', 'comp'],
    ['algo004', 'ml'],
    ['algo004', 'virtualshakespeare'],
    ['comp', 'ml'],
    ['comp', 'virtualshakespeare'],
    ['ml', 'virtualshakespeare'],
    ['algo004', 'comp', 'ml'],
    ['algo004', 'comp', 'virtualshakespeare'],
    ['algo004', 'ml', 'virtualshakespeare'],
    ['comp', 'ml', 'virtualshakespeare'],
    ['algo004', 'comp', 'ml', 'virtualshakespeare']
]

# Define GraphSAGE and predictors
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

# Loss and metric functions
def compute_loss(pos_score, neg_score, max_weight=20):
    scores = torch.cat([pos_score, neg_score]).to(device)
    labels = torch.cat([
        torch.ones(pos_score.shape[0], device=device),
        torch.zeros(neg_score.shape[0], device=device)
    ])
    num_pos = pos_score.shape[0]
    num_neg = neg_score.shape[0]
    pos_weight = torch.tensor([num_neg / num_pos], device=device)
    pos_weight = torch.clamp(pos_weight, max=max_weight)
    loss = F.binary_cross_entropy_with_logits(scores, labels, pos_weight=pos_weight)
    return loss

def compute_auc(pos_score, neg_score):
    scores = np.concatenate([pos_score, neg_score])
    labels = np.concatenate([np.ones(len(pos_score)), np.zeros(len(neg_score))])
    return roc_auc_score(labels, scores)

def compute_acc(pos_score, neg_score):
    scores = np.concatenate([pos_score, neg_score])
    labels = np.concatenate([np.ones(len(pos_score)), np.zeros(len(neg_score))])
    predictions = (scores >= 0.5).astype(int)
    return np.mean(predictions == labels)

# Function to combine graphs and keep track of node ID ranges
def combine_graph_list(graph_list):
    num_nodes_list = [g.num_nodes() for g in graph_list]
    total_nodes = sum(num_nodes_list)
    offsets = np.cumsum([0] + num_nodes_list[:-1])

    def process_graph(idx):
        g = graph_list[idx]
        offset = offsets[idx]
        src, dst = g.edges()
        src = src + offset
        dst = dst + offset
        node_id_range = (offset, offset + g.num_nodes())
        return src, dst, node_id_range

    results = Parallel(n_jobs=-1)(
        delayed(process_graph)(idx) for idx in range(len(graph_list))
    )

    src_nodes_list = [res[0] for res in results]
    dst_nodes_list = [res[1] for res in results]
    node_id_ranges = [res[2] for res in results]

    # Concatenate edges
    combined_src = torch.cat(src_nodes_list)
    combined_dst = torch.cat(dst_nodes_list)
    combined_g = dgl.graph((combined_src, combined_dst), num_nodes=total_nodes)
    return combined_g, node_id_ranges

# Balanced Error Rate
def balanced_error_rate(cm):
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    ber = 0.5 * (fpr + fnr)
    return ber

# Main training and evaluation loop
for percentage in percentages:
    for combo in combinations:
        print(f"Processing combination {combo} at {percentage}%")
        # Initialize lists to store graphs and node ID ranges
        train_pos_g_list = []
        train_neg_g_list = []
        test_pos_g_list = []
        test_neg_g_list = []
        full_g_list = []

        # Load graphs and combine them
        for ds in combo:
            # Load graphs
            graphs, _ = dgl.load_graphs(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/graphs/graphs_{ds}_{percentage}p.bin')
            train_pos_g, train_neg_g, test_pos_g, test_neg_g, g = graphs
            train_pos_g_list.append(train_pos_g)
            train_neg_g_list.append(train_neg_g)
            test_pos_g_list.append(test_pos_g)
            test_neg_g_list.append(test_neg_g)
            full_g_list.append(g)

        # Combine graphs
        train_pos_g_combined, _ = combine_graph_list(train_pos_g_list)
        train_neg_g_combined, _ = combine_graph_list(train_neg_g_list)
        test_pos_g_combined, _ = combine_graph_list(test_pos_g_list)
        test_neg_g_combined, _ = combine_graph_list(test_neg_g_list)
        g_combined, node_id_ranges = combine_graph_list(full_g_list)

        # Prepare inputs (adjacency matrix as features)
        inputs = g_combined.adj().to_dense().to(device)

        # Move graphs to device
        train_pos_g_combined = train_pos_g_combined.to(device)
        train_neg_g_combined = train_neg_g_combined.to(device)
        test_pos_g_combined = test_pos_g_combined.to(device)
        test_neg_g_combined = test_neg_g_combined.to(device)
        g_combined = g_combined.to(device)

        # Initialize model and optimizer
        model = GraphSAGE(g_combined.number_of_nodes(), 16).to(device)
        pred = DotPredictor().to(device)
        optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.0005)

        # Training loop
        for e in range(2001):
            h = model(g_combined, inputs)
            pos_score = pred(train_pos_g_combined, h)
            neg_score = pred(train_neg_g_combined, h)
            loss = compute_loss(pos_score, neg_score)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if e % 100 == 0:
                print(f'In epoch {e}, loss: {loss.item()}')

        # Evaluation
        model.eval()
        with torch.no_grad():
            h = model(g_combined, inputs)
            pos_score = pred(test_pos_g_combined, h)
            neg_score = pred(test_neg_g_combined, h)

        # Extract scores per dataset
        total_nodes = g_combined.num_nodes()
        node_id_to_dataset = np.zeros(total_nodes, dtype=int) - 1  # Initialize to -1
        for idx, (start, end) in enumerate(node_id_ranges):
            node_id_to_dataset[start:end] = idx

        # Positive edges
        pos_src = test_pos_g_combined.edges()[0].cpu().numpy()
        pos_dst = test_pos_g_combined.edges()[1].cpu().numpy()
        pos_scores = pos_score.cpu().numpy()

        src_dataset_indices = node_id_to_dataset[pos_src]
        dst_dataset_indices = node_id_to_dataset[pos_dst]

        same_dataset_mask = (src_dataset_indices == dst_dataset_indices) & (src_dataset_indices >= 0)
        pos_edge_dataset_indices = src_dataset_indices[same_dataset_mask]
        pos_scores = pos_scores[same_dataset_mask]

        # Negative edges
        neg_src = test_neg_g_combined.edges()[0].cpu().numpy()
        neg_dst = test_neg_g_combined.edges()[1].cpu().numpy()
        neg_scores = neg_score.cpu().numpy()

        src_dataset_indices_neg = node_id_to_dataset[neg_src]
        dst_dataset_indices_neg = node_id_to_dataset[neg_dst]

        same_dataset_mask_neg = (src_dataset_indices_neg == dst_dataset_indices_neg) & (src_dataset_indices_neg >= 0)
        neg_edge_dataset_indices = src_dataset_indices_neg[same_dataset_mask_neg]
        neg_scores = neg_scores[same_dataset_mask_neg]

        # Group scores by dataset
        num_datasets = len(combo)
        pos_scores_datasets = [[] for _ in range(num_datasets)]
        neg_scores_datasets = [[] for _ in range(num_datasets)]

        for idx in range(num_datasets):
            pos_scores_ds = pos_scores[pos_edge_dataset_indices == idx]
            neg_scores_ds = neg_scores[neg_edge_dataset_indices == idx]
            pos_scores_datasets[idx].extend(pos_scores_ds.tolist())
            neg_scores_datasets[idx].extend(neg_scores_ds.tolist())

        # Compute AUC, ACC, and confusion matrices per dataset
        res = []
        for idx, ds in enumerate(combo):
            pos_scores = np.array(pos_scores_datasets[idx])
            neg_scores = np.array(neg_scores_datasets[idx])
            if len(pos_scores) > 0 and len(neg_scores) > 0:
                auc = compute_auc(pos_scores, neg_scores)
                acc = compute_acc(pos_scores, neg_scores)
                print(f'AUC for {ds}: {auc}')
                print(f'ACC for {ds}: {acc}')

                # Generate confusion matrix
                y_true = np.concatenate([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])
                y_scores = np.concatenate([pos_scores, neg_scores])
                y_pred = (y_scores >= 0.5).astype(int)
                cm = confusion_matrix(y_true, y_pred)
                ber = balanced_error_rate(cm)
                print(f'Balanced Error Rate for {ds}: {ber}')
                # Save confusion matrix as image
                plt.figure(figsize=(6,6))
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                            xticklabels=['Predicted 0', 'Predicted 1'],
                            yticklabels=['Actual 0', 'Actual 1'])
                plt.xlabel('Predicted')
                plt.ylabel('Actual')
                plt.title(f'Confusion Matrix for {ds} in {"-".join(combo)} at {percentage}%')
                plt.savefig(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/cm_png/png_{percentage}/cm_{ds}_{"_".join(combo)}_{percentage}p.png', dpi=200, bbox_inches='tight')
                plt.close()

                # Save confusion matrix as numpy array
                np.save(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/cm_npy/npy_{percentage}/cm_{ds}_{"_".join(combo)}_{percentage}p.npy', cm)


                # Append results
                res.append({
                    'Dataset': ds,
                    'Combination': '-'.join(combo),
                    'Percentage': percentage,
                    'AUC': auc,
                    'ACC': acc,
                    'BER': ber
                })
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'epoch': e
                }, f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/Models/models_{percentage}/model_{combo}_{percentage}p.pth')
            else:
                print(f'No data for {ds}')

        # Save results to CSV
        results_df = pd.DataFrame(res)
        results_df.to_csv(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/csv/csv_{percentage}/Combined_{"_".join(combo)}_{percentage}p.csv', index=False)



Processing combination ['algo004', 'comp'] at 0.05%
In epoch 0, loss: 3.0597212314605713
In epoch 100, loss: 0.7805683612823486
In epoch 200, loss: 0.752528965473175
In epoch 300, loss: 0.7434179782867432
In epoch 400, loss: 0.7352843880653381
In epoch 500, loss: 0.727710485458374
In epoch 600, loss: 0.7216676473617554
In epoch 700, loss: 0.7161529660224915
In epoch 800, loss: 0.7107337713241577
In epoch 900, loss: 0.7063560485839844
In epoch 1000, loss: 0.7027537822723389
In epoch 1100, loss: 0.6998125910758972
In epoch 1200, loss: 0.6973937153816223
In epoch 1300, loss: 0.6953670978546143
In epoch 1400, loss: 0.6937201619148254
In epoch 1500, loss: 0.692266583442688
In epoch 1600, loss: 0.6909770965576172
In epoch 1700, loss: 0.6898539662361145
In epoch 1800, loss: 0.6887195706367493
In epoch 1900, loss: 0.6877244114875793
In epoch 2000, loss: 0.6865882277488708
AUC for algo004: 0.9519530214539778
ACC for algo004: 0.940179638069112
Balanced Error Rate for algo004: 0.114374223607865
A

In [None]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import pandas as pd
from dgl.nn import SAGEConv
import dgl.function as fn
from sklearn.metrics import roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os
from joblib import Parallel, delayed

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
percent = [0.05, 0.10, 0.15, 0.20, 0.25, 0.50, 0.75]
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]  # List of seeds

# Define GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

# Define DotPredictor
class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

# Loss and metric functions
def compute_loss(pos_score, neg_score, max_weight=20):
    scores = torch.cat([pos_score, neg_score]).to(device)
    labels = torch.cat([
        torch.ones(pos_score.shape[0], device=device),
        torch.zeros(neg_score.shape[0], device=device)
    ])
    num_pos = pos_score.shape[0]
    num_neg = neg_score.shape[0]
    pos_weight = torch.tensor([num_neg / num_pos], device=device)
    pos_weight = torch.clamp(pos_weight, max=max_weight)
    loss = F.binary_cross_entropy_with_logits(scores, labels, pos_weight=pos_weight)
    return loss

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).cpu().detach().numpy()
    labels = np.concatenate([
        np.ones(pos_score.shape[0]),
        np.zeros(neg_score.shape[0])
    ])
    return roc_auc_score(labels, scores)

def compute_acc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).cpu().detach().numpy()
    labels = np.concatenate([
        np.ones(pos_score.shape[0]),
        np.zeros(neg_score.shape[0])
    ])
    predictions = (scores >= 0.5).astype(int)
    return np.mean(predictions == labels)

# Balanced Error Rate
def balanced_error_rate(cm):
    tn, fp, fn, tp = cm.ravel()
    print(f'tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}')
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    ber = 0.5 * (fpr + fnr)
    return ber

# Function to process edges
def process_edges(edge_chunk, edges_in_graph):
    pos_u, pos_v, neg_u, neg_v = [], [], [], []
    for u, v in edge_chunk:
        if (u, v) in edges_in_graph:
            pos_u.append(u)
            pos_v.append(v)
        else:
            neg_u.append(u)
            neg_v.append(v)
    return pos_u, pos_v, neg_u, neg_v

for seed in seeds:
    print(f'Processing seed {seed}')
    for ds in datasets:
        data_total = pd.read_csv(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/SLN FL/data/w_removal_{ds}', sep=" ", header=None)
        print(f'Analyzing {ds} Dataset with seed {seed}')
        data_total.columns = ["n1", "n2", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "l"]
        data_total.dropna(subset=['n1', 'n2'], inplace=True)

        # Taking unique nodes and saving indices
        nodes = np.unique(data_total[['n1', 'n2']].values)
        node_index = {node: idx for idx, node in enumerate(nodes)}

        # Create adjacency matrix
        adj_matrix = np.zeros((len(nodes), len(nodes)), dtype=float)
        for _, row in data_total.iterrows():
            i, j = node_index[row['n1']], node_index[row['n2']]
            adj_matrix[i, j] = row['l']
            adj_matrix[j, i] = row['l']

        adj_df = pd.DataFrame(adj_matrix, index=nodes, columns=nodes)

        labels = data_total["l"]
        ones_label = np.where(labels == 1)

        # Create graph using node indices
        src = data_total["n1"].map(node_index).astype(int)
        dst = data_total["n2"].map(node_index).astype(int)

        src = torch.tensor(src.values[ones_label], dtype=torch.int64)
        dst = torch.tensor(dst.values[ones_label], dtype=torch.int64)

        # Create the graph
        g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src])), num_nodes=len(nodes))
        inputs = g.adj().to_dense().to(device)

        # Generate all possible edges
        upper_tri_idx = np.triu_indices_from(adj_df, k=1)
        possible_edges = [(i, j) for i, j in zip(*upper_tri_idx)]
        possible_edges += [(j, i) for i, j in possible_edges]  # Add (j, i) for undirected graph

        total_edges = len(possible_edges)

        # Shuffle the possible edges using the seed
        np.random.seed(seed)
        np.random.shuffle(possible_edges)

        # Initialize test_edges_set per seed
        test_edges_set = set()

        for p in percent:
            res = []
            total_test_size = int(total_edges * p)
            new_test_size = total_test_size - len(test_edges_set)

            if new_test_size > 0:
                # Get the remaining edges not in test_edges_set
                remaining_edges = [edge for edge in possible_edges if edge not in test_edges_set]

                # Randomly select new_test_size edges from remaining_edges
                np.random.seed(seed + int(p*100))  # Slightly modify seed for different percentages
                new_test_edge_indices = np.random.choice(len(remaining_edges), size=new_test_size, replace=False)
                new_test_edges = [remaining_edges[i] for i in new_test_edge_indices]

                duplicate_edges = [edge for edge in new_test_edges if edge in test_edges_set]

                if duplicate_edges:
                    print(f"Duplicate edges found: {duplicate_edges}")
                    break
                else:
                    print(f"No duplicate edges found. New edges are unique.")

                    # Add new edges to test_edges_set
                    test_edges_set.update(new_test_edges)

            else:
                print("No new test edges needed.")

            # Now test_edges is test_edges_set
            test_edges = list(test_edges_set)

            # The training edges are the remaining edges not in test_edges_set
            train_edges = [edge for edge in possible_edges if edge not in test_edges_set]

            # Create a set of all edges for fast lookup
            edges_in_graph = set()
            us, vs = g.edges()
            us = us.tolist()
            vs = vs.tolist()
            for u, v in zip(us, vs):
                edges_in_graph.add((u, v))
                edges_in_graph.add((v, u))  # Ensure undirected edge representation

            # Process test edges
            num_cores = os.cpu_count()
            test_edges_chunks = np.array_split(test_edges, num_cores)

            test_results = Parallel(n_jobs=num_cores)(
                delayed(process_edges)(chunk, edges_in_graph) for chunk in test_edges_chunks
            )

            test_pos_u, test_pos_v, test_neg_u, test_neg_v = [], [], [], []
            for pos_u, pos_v, neg_u, neg_v in test_results:
                test_pos_u.extend(pos_u)
                test_pos_v.extend(pos_v)
                test_neg_u.extend(neg_u)
                test_neg_v.extend(neg_v)

            # Update edges_in_graph for training by excluding test edges
            test_edges_set_set = set(test_edges)
            edges_in_graph_train = edges_in_graph - test_edges_set_set

            # Process training edges
            train_edges_chunks = np.array_split(train_edges, num_cores)
            train_results = Parallel(n_jobs=num_cores)(
                delayed(process_edges)(chunk, edges_in_graph_train) for chunk in train_edges_chunks
            )

            train_pos_u, train_pos_v, train_neg_u, train_neg_v = [], [], [], []
            for pos_u, pos_v, neg_u, neg_v in train_results:
                train_pos_u.extend(pos_u)
                train_pos_v.extend(pos_v)
                train_neg_u.extend(neg_u)
                train_neg_v.extend(neg_v)

            # Create subgraphs for training and testing
            train_pos_u = torch.tensor(train_pos_u)
            train_pos_v = torch.tensor(train_pos_v)
            train_neg_u = torch.tensor(train_neg_u)
            train_neg_v = torch.tensor(train_neg_v)

            test_pos_u = torch.tensor(test_pos_u)
            test_pos_v = torch.tensor(test_pos_v)
            test_neg_u = torch.tensor(test_neg_u)
            test_neg_v = torch.tensor(test_neg_v)

            train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
            train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())
            test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
            test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

            # Create a mapping from edge tuples to edge IDs, including both directions
            edge_tuple_to_eid = {}
            u_edges, v_edges = g.edges()
            for eid, (u, v) in enumerate(zip(u_edges.tolist(), v_edges.tolist())):
                edge_tuple_to_eid[(u, v)] = eid
                edge_tuple_to_eid[(v, u)] = eid  # Include reverse direction

            # Get the edge IDs for test positive edges
            test_pos_eids = []
            for u, v in zip(test_pos_u.tolist(), test_pos_v.tolist()):
                eid = edge_tuple_to_eid.get((u, v))
                if eid is None:
                    eid = edge_tuple_to_eid.get((v, u))
                if eid is not None:
                    test_pos_eids.append(eid)

            # Remove the test positive edges from g to create train_g
            train_g = dgl.remove_edges(g, test_pos_eids).to(device)

            # Verify the edge counts
            print(f'train_g edges: {train_g.num_edges()}, original g edges: {g.num_edges()}')

            # Verify the splits
            print(f"Dataset: {ds}")
            print(f"Percentage: {p*100}%")
            print(f"Number of test edges: {len(test_edges)}")
            print(f"Number of training edges: {len(train_edges)}")

            train_pos_g = train_pos_g.to(device)
            train_neg_g = train_neg_g.to(device)
            test_pos_g = test_pos_g.to(device)
            test_neg_g = test_neg_g.to(device)

            # Initialize model and optimizer
            model = GraphSAGE(g.number_of_nodes(), 16).to(device)
            pred = DotPredictor().to(device)
            optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.0005)

            # Training loop
            for e in range(1501):
                h = model(train_g, train_g.adj().to_dense().to(device))
                pos_score = pred(train_pos_g, h)
                neg_score = pred(train_neg_g, h)
                loss = compute_loss(pos_score, neg_score)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if e % 100 == 0:
                    print('In epoch {}, loss: {}'.format(e, loss.item()))

            model.eval()
            with torch.no_grad():
                h = model(train_g, train_g.adj().to_dense().to(device))
                pos_score = pred(test_pos_g, h)
                neg_score = pred(test_neg_g, h)

                auc = compute_auc(pos_score, neg_score)
                acc = compute_acc(pos_score, neg_score)
                print('AUC', auc)
                print('ACC', acc)

            # Compute confusion matrix and BER
            y_true_pos = [1] * len(pos_score)  # True labels for positive edges (1)
            y_true_neg = [0] * len(neg_score)  # True labels for negative edges (0)
            y_true = y_true_pos + y_true_neg
            y_pred_pos = (pos_score >= 0.5).int().tolist()
            y_pred_neg = (neg_score >= 0.5).int().tolist()
            y_pred = y_pred_pos + y_pred_neg
            cm = confusion_matrix(y_true, y_pred)
            ber = balanced_error_rate(cm)
            print(f"Balanced Error Rate for {ds} at {p}% with seed {seed}: {ber}")

            # Save confusion matrix as image
            plt.figure(figsize=(6, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                        xticklabels=['Predicted 0', 'Predicted 1'],
                        yticklabels=['Actual 0', 'Actual 1'])
            plt.xlabel('Predicted')
            plt.ylabel('Actual')
            plt.title(f'Confusion Matrix for {ds} at {p}% (Seed {seed})')
            plt.savefig(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/cm_png/{seed}/confusion_matrix_{ds}_{p}p_seed{seed}.png', dpi=200, bbox_inches='tight')
            plt.close()

            # Save confusion matrix as numpy array
            np.save(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/cm_npy/{seed}/cm_{ds}_{p}p_seed{seed}.npy', cm)

            # Append results
            res.append({
                'Dataset': ds,
                'Seed': seed,
                'Test_Size': p,
                'AUC': auc,
                'ACC': acc,
                'BER': ber
            })

            # Convert results to DataFrame and save
            res_df = pd.DataFrame(res)
            res_df.to_csv(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/csv/{seed}/results_{ds}_{p}p_seed{seed}.csv', index=False)

            # Save model
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': e
            }, f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/models/{seed}/model_{ds}_{p}p_seed{seed}.pth')

            # Save graphs
            dgl.save_graphs(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/graphs/{seed}/graphs_{ds}_{p}p_seed{seed}.bin', [train_pos_g, train_neg_g, test_pos_g, test_neg_g, g])



DGL backend not selected or invalid.  Assuming PyTorch for now.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Balanced Error Rate for algo004 at 0.25% with seed 29: 0.12864915556649154
No duplicate edges found. New edges are unique.
train_g edges: 8534, original g edges: 13546
Dataset: algo004
Percentage: 50.0%
Number of test edges: 678030
Number of training edges: 678030
In epoch 0, loss: 1.2123637199401855
In epoch 100, loss: 0.7607712149620056
In epoch 200, loss: 0.7445869445800781
In epoch 300, loss: 0.7343974709510803
In epoch 400, loss: 0.7236365079879761
In epoch 500, loss: 0.71242356300354
In epoch 600, loss: 0.7033018469810486
In epoch 700, loss: 0.6958670020103455
In epoch 800, loss: 0.6894582509994507
In epoch 900, loss: 0.6831847429275513
In epoch 1000, loss: 0.6767660975456238
In epoch 1100, loss: 0.6704502701759338
In epoch 1200, loss: 0.6646671891212463
In epoch 1300, loss: 0.6599022150039673
In epoch 1400, loss: 0.6559640765190125
In epoch 1500, loss: 0.6523853540420532
AUC 0.9451781485222174
ACC 0.930423432591478

In [None]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import pandas as pd
from dgl.nn import SAGEConv
import dgl.function as fn
from sklearn.metrics import roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message="You are using `torch.load` with `weights_only=False`")

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define datasets, percentages, combinations, and seeds
datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
percentages = [0.05, 0.10, 0.15, 0.20, 0.25, 0.50, 0.75]
seeds = [21, 78, 99]
combinations = [
    ['algo004', 'comp'],
    ['algo004', 'ml'],
    ['algo004', 'virtualshakespeare'],
    ['comp', 'ml'],
    ['comp', 'virtualshakespeare'],
    ['ml', 'virtualshakespeare'],
    ['algo004', 'comp', 'ml'],
    ['algo004', 'comp', 'virtualshakespeare'],
    ['algo004', 'ml', 'virtualshakespeare'],
    ['comp', 'ml', 'virtualshakespeare'],
    ['algo004', 'comp', 'ml', 'virtualshakespeare']
]

# Define GraphSAGE and predictors
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

# Loss and metric functions
def compute_loss(pos_score, neg_score, max_weight=20):
    scores = torch.cat([pos_score, neg_score]).to(device)
    labels = torch.cat([
        torch.ones(pos_score.shape[0], device=device),
        torch.zeros(neg_score.shape[0], device=device)
    ])
    num_pos = pos_score.shape[0]
    num_neg = neg_score.shape[0]
    pos_weight = torch.tensor([num_neg / num_pos], device=device)
    pos_weight = torch.clamp(pos_weight, max=max_weight)
    loss = F.binary_cross_entropy_with_logits(scores, labels, pos_weight=pos_weight)
    return loss

def compute_auc(pos_score, neg_score):
    scores = np.concatenate([pos_score, neg_score])
    labels = np.concatenate([np.ones(len(pos_score)), np.zeros(len(neg_score))])
    return roc_auc_score(labels, scores)

def compute_acc(pos_score, neg_score):
    scores = np.concatenate([pos_score, neg_score])
    labels = np.concatenate([np.ones(len(pos_score)), np.zeros(len(neg_score))])
    predictions = (scores >= 0.5).astype(int)
    return np.mean(predictions == labels)

# Function to combine graphs and keep track of node ID ranges
def combine_graph_list(graph_list):
    num_nodes_list = [g.num_nodes() for g in graph_list]
    total_nodes = sum(num_nodes_list)
    offsets = np.cumsum([0] + num_nodes_list[:-1])

    def process_graph(idx):
        g = graph_list[idx]
        offset = offsets[idx]
        src, dst = g.edges()
        src = src + offset
        dst = dst + offset
        node_id_range = (offset, offset + g.num_nodes())
        return src, dst, node_id_range

    results = Parallel(n_jobs=-1)(
        delayed(process_graph)(idx) for idx in range(len(graph_list))
    )

    src_nodes_list = [res[0] for res in results]
    dst_nodes_list = [res[1] for res in results]
    node_id_ranges = [res[2] for res in results]

    # Concatenate edges
    combined_src = torch.cat(src_nodes_list)
    combined_dst = torch.cat(dst_nodes_list)
    combined_g = dgl.graph((combined_src, combined_dst), num_nodes=total_nodes)
    return combined_g, node_id_ranges

# Balanced Error Rate
def balanced_error_rate(cm):
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    ber = 0.5 * (fpr + fnr)
    return ber

# Main training and evaluation loop
for seed in seeds:
    print(f"Processing seed {seed}")
    for percentage in percentages:
        percentage_int = int(percentage * 100)
        for combo in combinations:
            print(f"Processing combination {combo} at {percentage_int}% with seed {seed}")
            # Initialize lists to store graphs and node ID ranges
            train_pos_g_list = []
            train_neg_g_list = []
            test_pos_g_list = []
            test_neg_g_list = []
            full_g_list = []

            # Load graphs and combine them
            for ds in combo:
                # Load graphs with seed included in the path
                graphs_path = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/graphs/{seed}/graphs_{ds}_{percentage}p_seed{seed}.bin'
                graphs, _ = dgl.load_graphs(graphs_path)
                train_pos_g, train_neg_g, test_pos_g, test_neg_g, g = graphs
                train_pos_g_list.append(train_pos_g)
                train_neg_g_list.append(train_neg_g)
                test_pos_g_list.append(test_pos_g)
                test_neg_g_list.append(test_neg_g)
                full_g_list.append(g)

            # Combine graphs
            train_pos_g_combined, _ = combine_graph_list(train_pos_g_list)
            train_neg_g_combined, _ = combine_graph_list(train_neg_g_list)
            test_pos_g_combined, node_id_ranges = combine_graph_list(test_pos_g_list)
            test_neg_g_combined, _ = combine_graph_list(test_neg_g_list)
            g_combined, _ = combine_graph_list(full_g_list)

            # Prepare inputs (adjacency matrix as features)
            inputs = g_combined.adj().to_dense().to(device)

            # Move graphs to device
            train_pos_g_combined = train_pos_g_combined.to(device)
            train_neg_g_combined = train_neg_g_combined.to(device)
            test_pos_g_combined = test_pos_g_combined.to(device)
            test_neg_g_combined = test_neg_g_combined.to(device)
            g_combined = g_combined.to(device)

            # Initialize model and optimizer
            model = GraphSAGE(g_combined.number_of_nodes(), 16).to(device)
            pred = DotPredictor().to(device)
            optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.0005)

            # Training loop
            model.train()  # Ensure model is in training mode
            for e in range(1501):
                h = model(g_combined, inputs)
                pos_score = pred(train_pos_g_combined, h)
                neg_score = pred(train_neg_g_combined, h)
                loss = compute_loss(pos_score, neg_score)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if e % 100 == 0:
                    print(f'In epoch {e}, loss: {loss.item()}')

            # Evaluation
            model.eval()  # Set model to evaluation mode
            with torch.no_grad():
                h = model(g_combined, inputs)
                pos_score = pred(test_pos_g_combined, h)
                neg_score = pred(test_neg_g_combined, h)

            # Extract scores per dataset
            total_nodes = g_combined.num_nodes()
            node_id_to_dataset = np.zeros(total_nodes, dtype=int) - 1  # Initialize to -1
            for idx, (start, end) in enumerate(node_id_ranges):
                node_id_to_dataset[start:end] = idx

            # Positive edges
            pos_src = test_pos_g_combined.edges()[0].cpu().numpy()
            pos_dst = test_pos_g_combined.edges()[1].cpu().numpy()
            pos_scores = pos_score.cpu().numpy()

            src_dataset_indices = node_id_to_dataset[pos_src]
            dst_dataset_indices = node_id_to_dataset[pos_dst]

            same_dataset_mask = (src_dataset_indices == dst_dataset_indices) & (src_dataset_indices >= 0)
            pos_edge_dataset_indices = src_dataset_indices[same_dataset_mask]
            pos_scores = pos_scores[same_dataset_mask]

            # Negative edges
            neg_src = test_neg_g_combined.edges()[0].cpu().numpy()
            neg_dst = test_neg_g_combined.edges()[1].cpu().numpy()
            neg_scores = neg_score.cpu().numpy()

            src_dataset_indices_neg = node_id_to_dataset[neg_src]
            dst_dataset_indices_neg = node_id_to_dataset[neg_dst]

            same_dataset_mask_neg = (src_dataset_indices_neg == dst_dataset_indices_neg) & (src_dataset_indices_neg >= 0)
            neg_edge_dataset_indices = src_dataset_indices_neg[same_dataset_mask_neg]
            neg_scores = neg_scores[same_dataset_mask_neg]

            # Group scores by dataset
            num_datasets = len(combo)
            pos_scores_datasets = [[] for _ in range(num_datasets)]
            neg_scores_datasets = [[] for _ in range(num_datasets)]

            for idx in range(num_datasets):
                pos_scores_ds = pos_scores[pos_edge_dataset_indices == idx]
                neg_scores_ds = neg_scores[neg_edge_dataset_indices == idx]
                pos_scores_datasets[idx].extend(pos_scores_ds.tolist())
                neg_scores_datasets[idx].extend(neg_scores_ds.tolist())

            # Compute AUC, ACC, and confusion matrices per dataset
            res = []
            for idx, ds in enumerate(combo):
                pos_scores = np.array(pos_scores_datasets[idx])
                neg_scores = np.array(neg_scores_datasets[idx])
                if len(pos_scores) > 0 and len(neg_scores) > 0:
                    auc = compute_auc(pos_scores, neg_scores)
                    acc = compute_acc(pos_scores, neg_scores)
                    print(f'AUC for {ds} on seed {seed}: {auc}')
                    print(f'ACC for {ds} on seed {seed}: {acc}')

                    # Generate confusion matrix
                    y_true = np.concatenate([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])
                    y_scores = np.concatenate([pos_scores, neg_scores])
                    y_pred = (y_scores >= 0.5).astype(int)
                    cm = confusion_matrix(y_true, y_pred)
                    ber = balanced_error_rate(cm)
                    print(f'Balanced Error Rate for {ds}: {ber}')

                    # set directories
                    cm_png_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/cm_png/png_{percentage}/{seed}'
                    cm_npy_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/cm_npy/npy_{percentage}/{seed}'
                    model_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/Models/models_{percentage}/{seed}'
                    csv_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/csv/csv_{percentage}/{seed}'

                    # Save confusion matrix as image
                    plt.figure(figsize=(6,6))
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                                xticklabels=['Predicted 0', 'Predicted 1'],
                                yticklabels=['Actual 0', 'Actual 1'])
                    plt.xlabel('Predicted')
                    plt.ylabel('Actual')
                    plt.title(f'Confusion Matrix for {ds} in {"-".join(combo)} at {percentage_int}% (Seed {seed})')
                    plt.savefig(f'{cm_png_dir}/cm_{ds}_{"_".join(combo)}_{percentage}p_seed{seed}.png', dpi=200, bbox_inches='tight')
                    plt.close()

                    # Save confusion matrix as numpy array
                    np.save(f'{cm_npy_dir}/cm_{ds}_{"_".join(combo)}_{percentage}p_seed{seed}.npy', cm)

                    # Append results
                    res.append({
                        'Dataset': ds,
                        'Combination': '-'.join(combo),
                        'Percentage': percentage,
                        'Seed': seed,
                        'AUC': auc,
                        'ACC': acc,
                        'BER': ber
                    })
                else:
                    print(f'No data for {ds}')

            # Save model
            model_save_path = f'{model_dir}/model_{"_".join(combo)}_{percentage}p_seed{seed}.pth'
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': e
            }, model_save_path)

            # Save results to CSV
            results_df = pd.DataFrame(res)
            csv_save_path = f'{csv_dir}/Combined_{"_".join(combo)}_{percentage}p_seed{seed}.csv'
            results_df.to_csv(csv_save_path, index=False)


DGL backend not selected or invalid.  Assuming PyTorch for now.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
In epoch 1100, loss: 0.7290806770324707
In epoch 1200, loss: 0.7278087139129639
In epoch 1300, loss: 0.7266542315483093
In epoch 1400, loss: 0.7256326675415039
In epoch 1500, loss: 0.724814772605896
AUC for algo004 on seed 21: 0.95469639279876
ACC for algo004 on seed 21: 0.9313747179328349
Balanced Error Rate for algo004: 0.10856160281141847
AUC for comp on seed 21: 0.9041365418707072
ACC for comp on seed 21: 0.9476043340337
Balanced Error Rate for comp: 0.17175514797559102
AUC for ml on seed 21: 0.8694267061365663
ACC for ml on seed 21: 0.9647142757687196
Balanced Error Rate for ml: 0.19571030007089213
Processing combination ['algo004', 'comp', 'virtualshakespeare'] at 15% with seed 21
In epoch 0, loss: 2.143583059310913
In epoch 100, loss: 0.7738823294639587
In epoch 200, loss: 0.7587386965751648
In epoch 300, loss: 0.7474077939987183
In epoch 400, loss: 0.7386520504951477
In epoch 500, loss: 0.7306910753250122
In epoch

In [None]:
import os

# Define the main folders and base path in Google Drive
base_path = '/content/drive/MyDrive/Colab Notebooks/Fed Learning Research'
main_folders = ['algo004_cnn', 'virtualshakespeare_cnn', 'comp_cnn', 'ml_cnn']
subfolders = ['models', 'graphs', 'csv', 'cm_npy', 'cm_png']
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]  # Seeds for subdirectories

# Function to create the folder structure
def create_folders(base_path, main_folders, subfolders, seeds):
    for main_folder in main_folders:
        for subfolder in subfolders:
            # Path to each subfolder within the main folder
            subfolder_path = os.path.join(base_path, main_folder, subfolder)
            # Create the subfolder if it doesn’t exist
            if not os.path.exists(subfolder_path):
                os.makedirs(subfolder_path)
            # Create seed folders within each subfolder
            for seed in seeds:
                seed_folder_path = os.path.join(subfolder_path, str(seed))
                if not os.path.exists(seed_folder_path):
                    os.makedirs(seed_folder_path)

# Run the folder creation script
create_folders(base_path, main_folders, subfolders, seeds)

print("Folders created successfully in Google Drive.")


Folders created successfully in Google Drive.


In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Suppress scientific notation
np.set_printoptions(suppress=True)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]  # List of seeds

# Define MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size=128):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size).to(device)
        self.bn1 = nn.BatchNorm1d(hidden_size).to(device)
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2).to(device)
        self.bn2 = nn.BatchNorm1d(hidden_size // 2).to(device)
        self.fc3 = nn.Linear(hidden_size // 2, 1).to(device)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = torch.sigmoid(self.fc3(x))
        return x

# Loss function
def compute_loss(pred, labels):
    loss = F.binary_cross_entropy(pred, labels)
    return loss

# Balanced Error Rate
def balanced_error_rate(cm):
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    ber = 0.5 * (fpr + fnr)
    return ber

# Function to calculate features
def calculate_features(G_nx, edge_list):
    # Precompute resource allocation index
    rai_gen = nx.resource_allocation_index(G_nx, edge_list)
    rai = {(u, v): p for u, v, p in rai_gen}

    # Precompute Jaccard coefficient
    jc_gen = nx.jaccard_coefficient(G_nx, edge_list)
    jc = {(u, v): p for u, v, p in jc_gen}

    # Precompute Adamic-Adar index
    aa_gen = nx.adamic_adar_index(G_nx, edge_list)
    aa = {(u, v): p for u, v, p in aa_gen}

    # Precompute Preferential Attachment
    pa_gen = nx.preferential_attachment(G_nx, edge_list)
    pa = {(u, v): p for u, v, p in pa_gen}

    # Precompute shortest path lengths up to cutoff
    shortest_paths = dict(nx.all_pairs_shortest_path_length(G_nx, cutoff=5))

    features = []
    for u, v in edge_list:
        # Jaccard Coefficient
        jc_score = jc.get((u, v), 0.0)
        # Adamic-Adar Index
        aa_score = aa.get((u, v), 0.0)
        # Resource Allocation Index
        rai_score = rai.get((u, v), 0.0)
        # Preferential Attachment
        pa_score = pa.get((u, v), 0.0)
        # Shortest Path Length
        spl = shortest_paths.get(u, {}).get(v, 6)  # Use 6 if path length > 5 or no path
        # Number of Shortest Paths
        num_paths = 1  # For simplicity, set to 1
        features.append([jc_score, aa_score, rai_score, pa_score, spl, num_paths])
    return np.array(features)

# Main loop
for seed in seeds:
    print(f'Processing seed {seed}')
    for ds in datasets:
        print(f'Analyzing {ds} Dataset with seed {seed}')
        data_total = pd.read_csv(f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/SLN FL/data/w_removal_{ds}', sep=" ", header=None)
        data_total.columns = ["n1", "n2", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "l"]
        data_total.dropna(subset=['n1', 'n2'], inplace=True)

        # Taking unique nodes and saving indices
        nodes = np.unique(data_total[['n1', 'n2']].values)
        node_index = {node: idx for idx, node in enumerate(nodes)}
        num_nodes = len(nodes)

        # Map node IDs to indices
        edges = data_total[data_total['l'] == 1][['n1', 'n2']].values
        edges_mapped = np.array([[node_index[u], node_index[v]] for u, v in edges])

        # Create NetworkX graph
        G = nx.Graph()
        G.add_nodes_from(range(num_nodes))
        G.add_edges_from(edges_mapped)

        # Generate all possible edges (excluding self-loops)
        possible_edges = [(i, j) for i in range(num_nodes) for j in range(i+1, num_nodes)]
        total_edges = len(possible_edges)

        # Shuffle edges
        np.random.seed(seed)
        np.random.shuffle(possible_edges)

        # Split edges into test and train sets (50% test)
        test_size = int(0.5 * total_edges)
        test_edges = possible_edges[:test_size]
        train_edges = possible_edges[test_size:]

        # Create training graph by removing test edges
        G_train = G.copy()
        G_train.remove_edges_from(test_edges)

        # Prepare training data
        print("Calculating training features...")
        train_features = calculate_features(G_train, train_edges)
        train_labels = np.array([1 if G.has_edge(u, v) else 0 for u, v in train_edges])

        # Prepare test data
        print("Calculating test features...")
        test_features = calculate_features(G_train, test_edges)
        test_labels = np.array([1 if G.has_edge(u, v) else 0 for u, v in test_edges])

        # Standardize features
        scaler = StandardScaler()
        train_features = scaler.fit_transform(train_features)
        test_features = scaler.transform(test_features)

        # Convert to tensors and move to GPU
        X_train = torch.tensor(train_features, dtype=torch.float32).to(device)
        y_train = torch.tensor(train_labels, dtype=torch.float32).unsqueeze(1).to(device)
        X_test = torch.tensor(test_features, dtype=torch.float32).to(device)
        y_test = torch.tensor(test_labels, dtype=torch.float32).unsqueeze(1).to(device)

        # Initialize model, optimizer
        input_size = X_train.shape[1]
        model = MLP(input_size).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Training loop
        num_epochs = 100
        batch_size = 8192  # Adjust batch size based on your GPU memory
        num_batches = int(np.ceil(X_train.shape[0] / batch_size))
        model.train()
        for epoch in range(num_epochs):
            permutation = torch.randperm(X_train.size()[0]).to(device)
            epoch_loss = 0.0
            for i in range(num_batches):
                indices = permutation[i*batch_size:(i+1)*batch_size]
                batch_x, batch_y = X_train[indices], y_train[indices]

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = compute_loss(outputs, batch_y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            if epoch % 10 == 0:
                print(f'Epoch {epoch}, Loss: {epoch_loss/num_batches}')

        # Evaluation
        model.eval()
        with torch.no_grad():
            outputs = model(X_test)
            preds = (outputs >= 0.5).float()
            y_true = y_test.cpu().numpy()
            y_scores = outputs.cpu().numpy()
            y_pred = preds.cpu().numpy()

            # Compute metrics
            auc = roc_auc_score(y_true, y_scores)
            acc = np.mean(y_pred == y_true)
            cm = confusion_matrix(y_true, y_pred)
            ber = balanced_error_rate(cm)
            print(f'AUC: {auc}')
            print(f'ACC: {acc}')
            print(f'BER: {ber}')

        # Define directories (no need to create them)
        base_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_cnn'
        cm_png_dir = os.path.join(base_dir, 'cm_png', str(seed))
        cm_npy_dir = os.path.join(base_dir, 'cm_npy', str(seed))
        model_dir = os.path.join(base_dir, 'models', str(seed))
        csv_dir = os.path.join(base_dir, 'csv', str(seed))
        graphs_dir = os.path.join(base_dir, 'graphs', str(seed))

        # Save confusion matrix as image
        plt.figure(figsize=(6, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=['Predicted 0', 'Predicted 1'],
                    yticklabels=['Actual 0', 'Actual 1'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix for {ds} (Seed {seed})')
        plt.savefig(f'{cm_png_dir}/confusion_matrix_{ds}_seed{seed}.png', dpi=200, bbox_inches='tight')
        plt.close()

        # Save confusion matrix as numpy array
        np.save(f'{cm_npy_dir}/cm_{ds}_seed{seed}.npy', cm)

        # Save model
        model_save_path = f'{model_dir}/model_{ds}_seed{seed}.pth'
        torch.save(model.state_dict(), model_save_path)

        # Save results to CSV
        res = [{
            'Dataset': ds,
            'Seed': seed,
            'AUC': auc,
            'ACC': acc,
            'BER': ber
        }]
        results_df = pd.DataFrame(res)
        csv_save_path = f'{csv_dir}/results_{ds}_seed{seed}.csv'
        results_df.to_csv(csv_save_path, index=False)




Using device: cuda
Processing seed 18
Analyzing algo004 Dataset with seed 18
Calculating training features...
Calculating test features...
Epoch 0, Loss: 0.351457617467358
Epoch 10, Loss: 0.005677654563138883
Epoch 20, Loss: 0.0017169860407843121
Epoch 30, Loss: 0.0004926402734348639
Epoch 40, Loss: 0.0005600910271390429
Epoch 50, Loss: 0.00015769555587515546
Epoch 60, Loss: 7.77787004370198e-05
Epoch 70, Loss: 6.172961842648441e-05
Epoch 80, Loss: 3.8392464678812153e-05
Epoch 90, Loss: 0.00017428504456585783
AUC: 0.8633658744136963
ACC: 0.9913484654071354
BER: 0.4245801968731905
Analyzing comp Dataset with seed 18
Calculating training features...
Calculating test features...
Epoch 0, Loss: 0.46975901603698733
Epoch 10, Loss: 0.02070616565644741
Epoch 20, Loss: 0.005643592402338982
Epoch 30, Loss: 0.00202217108104378
Epoch 40, Loss: 0.0018191658006981015
Epoch 50, Loss: 0.000975847882218659
Epoch 60, Loss: 0.000885573944542557
Epoch 70, Loss: 0.0015362016437575222
Epoch 80, Loss: 0.000