In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from littleballoffur import RandomWalkWithRestartSampler
from littleballoffur import ForestFireSampler
from torch_geometric.nn import GCNConv, global_mean_pool, GAE
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import json
import random
import numpy as np
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx
from torch_geometric.loader import ClusterData
from torch_geometric.loader import ClusterLoader
from sklearn.decomposition import NMF
from numpy.linalg import svd
import pickle
from scipy.sparse import coo_matrix


# class GCNEncoder(torch.nn.Module):
#         def __init__(self, in_channels, out_channels):
#             super(GCNEncoder, self).__init__()
#             self.conv1 = GCNConv(in_channels, 2 * out_channels)
#             self.conv2 = GCNConv(2 * out_channels, out_channels)
    
#         def forward(self, x, edge_index):
#             x = torch.relu(self.conv1(x, edge_index))
#             x = torch.dropout(x, p=0.2, train=self.training)
#             x = self.conv2(x, edge_index)
#             return x

# class GCNEncoder(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels):
#         super(GCNEncoder, self).__init__()
#         self.conv1 = GCNConv(in_channels, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, out_channels)

#     def forward(self, x, edge_index):
#         x = F.relu(self.conv1(x, edge_index))
#         x = torch.dropout(x, p=0.2, train=self.training)
#         x = F.relu(self.conv2(x, edge_index))
#         return x


import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, BatchNorm
import scipy.sparse as sp

class GCNEncoder(torch.nn.Module):
    """Stack of GCNConv layers used as the encoder of the graph auto-encoder.

    Every convolution except the last is followed by BatchNorm, ReLU and
    dropout (p=0.2); the last convolution is followed by ReLU only.

    Parameters:
    - in_channels: size of the input node feature vectors.
    - hidden_channels: width of every intermediate layer.
    - out_channels: size of the returned node embeddings.
    - num_layers: requested depth. At least two convolutions (input and
      output) are always built, so any value <= 2 yields exactly two.

    NOTE(review): because the output passes through ReLU the embeddings are
    non-negative; confirm this is intended when they feed an inner-product
    decoder.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super().__init__()
        self.layers = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        # Input layer plus (num_layers - 2) hidden layers, each paired with a
        # BatchNorm; there is always at least the input layer.
        normalised_blocks = max(num_layers - 1, 1)
        widths = [in_channels] + [hidden_channels] * normalised_blocks
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            self.layers.append(GCNConv(fan_in, fan_out))
            self.batch_norms.append(BatchNorm(fan_out))

        # Output layer (no BatchNorm attached)
        self.layers.append(GCNConv(hidden_channels, out_channels))

    def forward(self, x, edge_index):
        """Return the node embeddings for features `x` on graph `edge_index`."""
        *hidden_convs, output_conv = self.layers
        for conv, norm in zip(hidden_convs, self.batch_norms):
            x = conv(x, edge_index)
            x = norm(x)
            x = F.relu(x)
            x = F.dropout(x, p=0.2, training=self.training)

        return F.relu(output_conv(x, edge_index))

    


class GAEPipeline:
    def __init__(self, in_channels, out_channels, sampling_method='method1', preprocessing=True):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.sampling_method_name = sampling_method
        self.sampling_method = self._get_sampling_method(sampling_method)
        self.preprocessing = preprocessing
        self.encoder = GCNEncoder(in_channels, 64, out_channels, 1).to(self.device)
        self.model = GAE(self.encoder).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)

    def recon_loss(self, predicted_adj, true_adj):
        loss = F.binary_cross_entropy(predicted_adj, true_adj)
        return loss

    def load_graph_from_pickle(self, file_path):
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def _get_sampling_method(self, method_name):
        """Dynamically selects the sampling method."""
        if method_name == 'random_walk':
            return self.random_walk
        elif method_name == 'forest_fire':
            return self.forest_fire
        elif method_name == 'clusterGCN':
            return self.cluster_GCN
        else:
            raise ValueError("Unknown sampling method")

    def convert_node_labels_to_integers(self, graph):
        """
        Relabel the nodes of a NetworkX graph as 0, 1, 2, ...

        Parameters:
        - graph: NetworkX graph with any type of node labels.

        Returns:
        - The graph produced by nx.relabel_nodes, in which each node is
          named by its position in the iteration order of graph.nodes().
        """
        original_labels = list(graph.nodes())
        # old label -> position in iteration order
        relabelling = dict(zip(original_labels, range(len(original_labels))))
        return nx.relabel_nodes(graph, relabelling)

    def preprocess_features(self, features):
        # Convert features to a numeric format, handle non-numeric cases
        processed_features = []
        for feature in features:
            try:
                # convert the features to float
                processed_features.append(float(feature))
            except ValueError:
                # Handle non-numeric feature (could implement encoding here)
                processed_features.append(0.0)  # Using 0.0 as a placeholder
        return processed_features

    def normalize_features(self, features):
        features = np.array(features)
        mean = features.mean(axis=0, keepdims=True)
        std = features.std(axis=0, keepdims=True)
        # Avoid division by zero
        std[std == 0] = 1
        normalized_features = (features - mean) / std
        return normalized_features.tolist()

    def from_networkx_to_torch_geometric(self, G):
        """Convert a NetworkX graph into a torch_geometric Data object.

        Parameters:
        - G: NetworkX graph whose nodes carry attribute dictionaries.

        Returns:
        - Data(x, edge_index) moved to self.device. `x` holds the
          standardised node attributes (one row per node, in G.nodes()
          order); `edge_index` has shape [2, num_edges] with one column per
          entry of G.edges().

        Notes:
        - The feature columns are the attribute keys of the FIRST node only;
          keys that appear only on other nodes are ignored and missing keys
          are filled with 0.
        - NOTE(review): each edge from G.edges() is emitted once (u -> v);
          no reverse edge is added for undirected graphs. Confirm this is
          what the downstream GCN layers expect.
        """
        # Convert node indices to a continuous range
        mapping = {k: i for i, k in enumerate(G.nodes())}
        edge_pairs = [[mapping[node] for node in edge] for edge in G.edges()]
        # reshape(-1, 2) keeps the tensor two-dimensional when there are no
        # edges, so edge_index is [2, 0] instead of a 1-D empty tensor.
        edges = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()

        if G.nodes():
            # Extract a sample node to get feature keys (at least one node exists here)
            sample_features = next(iter(G.nodes(data=True)))[1]
            feature_keys = list(sample_features.keys())

            # Extract and preprocess features for all nodes
            features = []
            for _, node_features in G.nodes(data=True):
                node_feature_values = [node_features.get(key, 0) for key in feature_keys]
                processed_features = self.preprocess_features(node_feature_values)
                features.append(processed_features)

            # Normalize features
            features = self.normalize_features(features)
        else:
            # Default to a single feature of 0 if the graph has no nodes
            features = [[0]]

        # Convert features to a tensor
        x = torch.tensor(features, dtype=torch.float)

        # Create the Data object
        data = Data(x=x, edge_index=edges)
        return data.to(self.device)




    
    def random_walk(self, graph, number_of_nodes=20000):
        """Sample a subgraph with RandomWalkWithRestartSampler.

        Parameters:
        - graph: NetworkX graph; its nodes are relabelled to integers first.
        - number_of_nodes: desired size of the sample (default 20000, the
          previously hard-coded value).

        Returns:
        - The graph returned by the sampler's sample() call.
        """
        graph = self.convert_node_labels_to_integers(graph)
        # Never ask for more nodes than the graph contains.
        # NOTE(review): the sampler is expected to reject an oversized
        # request; confirm against the installed littleballoffur version.
        sample_size = min(number_of_nodes, graph.number_of_nodes())
        model = RandomWalkWithRestartSampler(sample_size)
        new_graph = model.sample(graph)
        return new_graph

    def forest_fire(self, graph, number_of_nodes=10000):
        """Sample a subgraph with ForestFireSampler.

        Parameters:
        - graph: NetworkX graph; its nodes are relabelled to integers first.
        - number_of_nodes: desired size of the sample (default 10000, the
          previously hard-coded value).

        Returns:
        - The graph returned by the sampler's sample() call.
        """
        graph = self.convert_node_labels_to_integers(graph)
        # Never ask for more nodes than the graph contains.
        # NOTE(review): the sampler is expected to reject an oversized
        # request; confirm against the installed littleballoffur version.
        sample_size = min(number_of_nodes, graph.number_of_nodes())
        model = ForestFireSampler(sample_size)
        new_graph = model.sample(graph)
        return new_graph

    def cluster_GCN(self, data, num_parts=8, batch_size=16):
        """Partition `data` into clusters and return a ClusterLoader over them.

        Parameters:
        - data: torch_geometric Data object to partition.
        - num_parts: number of partitions passed to ClusterData (default 8,
          the previously hard-coded value).
        - batch_size: batch size passed to ClusterLoader (default 16, the
          previously hard-coded value).
          NOTE(review): with the defaults batch_size exceeds num_parts;
          confirm whether a single batch per epoch is intended.

        Returns:
        - The shuffling ClusterLoader. It is iterated once here purely to
          print per-batch statistics.

        Side effects:
        - Reseeds torch's global RNG with 12345 and prints a summary.
        """
        torch.manual_seed(12345)
        # Prepare cluster data
        cluster_data = ClusterData(data, num_parts=num_parts)
        # Create a loader to iterate over clusters
        loader = ClusterLoader(cluster_data, batch_size=batch_size, shuffle=True)

        # Dry run over the loader to report how many nodes each batch holds
        print()
        total_num_nodes = 0
        for step, sub_data in enumerate(loader):
            print(f'Step {step + 1}:')
            print('=======')
            print(f'Number of nodes in the current batch: {sub_data.num_nodes}')
            print(sub_data)
            print()
            total_num_nodes += sub_data.num_nodes

        print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')

        return loader

    

    def preprocess_graph(self, graph):
        data = self.from_networkx_to_torch_geometric(graph)
        return data

    def train(self, graph, epochs=100):
        # Preprocessing

        
        # Graph Sampling
        sampled_subgraph = self.sampling_method(graph)

        if self.preprocessing:
            data = self.preprocess_graph(graph).to(self.device)
            
        # data.adj = (data.adj > 0).float()
        # data = Data(edge_index=sampled_subgraph.edge_index, x=sampled_subgraph.x)
        
        # Model Training
        self.model.train()
        for epoch in range(epochs):
            self.optimizer.zero_grad()
            z = self.model.encode(data.x, data.edge_index)
            # adj_recon = self.model.decoder.forward_all(z) 
            # loss = self.recon_loss(adj_recon, data.adj)
            # recon = self.model.decoder(z, data.edge_index, sigmoid=True)
            loss = self.model.recon_loss(z, data.edge_index)
            loss.backward()
            self.optimizer.step()
            print(f'Epoch {epoch+1}, Loss: {loss.item()}')
            # print(f'Epoch {epoch+1}, adj_recon: {adj_recon}')
            
        torch.save(z, f'{self.sampling_method_name}_embedding.pt')


    def train_clusterGCN(self, graph, epochs=500):
        # Preprocessing
        data = self.preprocess_graph(graph).to(self.device)
        # data.adj = (data.adj > 0).float()
        
        # Graph Sampling
        loader = self.cluster_GCN(data)


        # optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        criterion = torch.nn.BCELoss() 
        final_embeddings = []
        self.model.train()
        for epoch in range(epochs):  
            total_loss = 0
            epoch_embeddings = [] 
            for batch_idx, batch_data in enumerate(loader):
                batch_data = batch_data.to(self.device)
                self.optimizer.zero_grad()
                z = self.model.encode(batch_data.x, batch_data.edge_index)
                loss = self.model.recon_loss(z, batch_data.edge_index)
                # recon = self.model.decoder.forward(z, cluster.edge_index, sigmoid=True)
                # adj_recon = self.model.decoder(z, cluster.edge_index)
                # loss = F.binary_cross_entropy_with_logits(z, adj_recon)
                loss.backward()
                self.optimizer.step()
                epoch_embeddings.append(z.detach().cpu().numpy())  # Collect embeddings
        
                total_loss += loss.item()
            avg_loss = total_loss / len(loader)
            print(f"Epoch {epoch+1}, Average Loss: {loss.item()}")
            if epoch == epochs - 1:
                final_embeddings = np.concatenate(epoch_embeddings, axis=0)
            
                
        torch.save(torch.from_numpy(final_embeddings), f'{self.sampling_method_name}_embedding.pt')


    def non_negative_matrix_factorization(self, graph):
        """Factorise the adjacency matrix of a sampled subgraph with NMF.

        The configured sampling method is applied to `graph`, the result is
        turned into a dense adjacency matrix (its shape is printed) and
        decomposed with a 2-component NMF (random init, random_state=0).

        Returns:
        - (W, H): the fit_transform output and the fitted components_.
        """
        adjacency = nx.to_numpy_array(self.sampling_method(graph))
        print(adjacency.shape)

        factoriser = NMF(n_components=2, init='random', random_state=0)
        # fit_transform yields the per-node factors; components_ the basis rows
        node_factors = factoriser.fit_transform(adjacency)
        return node_factors, factoriser.components_








In [2]:
# Random-walk experiment: build a pipeline with in_channels=11 and
# out_channels=30, load the pickled graph and train on it.
# train() writes the resulting embeddings to 'random_walk_embedding.pt'.
# graph = nx.read_graphml("aggregated_proteins_v30_subgraph.graphml")       
pipeline = GAEPipeline(in_channels=11, out_channels=30, sampling_method='random_walk')
# NOTE: load_graph_from_pickle uses pickle.load; only point it at trusted files.
graph = pipeline.load_graph_from_pickle('combined_graph.pkl')
pipeline.train(graph)

KeyboardInterrupt: 

In [None]:
# ClusterGCN experiment: same pickled graph, out_channels=32, trained on
# cluster mini-batches. train_clusterGCN() writes 'clusterGCN_embedding.pt',
# which the reconstruction cell further down loads.
# NOTE: this rebinds `pipeline` and `graph` from the random-walk cell.
# graph = nx.read_graphml("aggregated_proteins_v70_subgraph.graphml")
pipeline = GAEPipeline(in_channels=11, out_channels=32, sampling_method='clusterGCN')
# NOTE: load_graph_from_pickle uses pickle.load; only point it at trusted files.
graph = pipeline.load_graph_from_pickle('combined_graph.pkl')
pipeline.train_clusterGCN(graph)


In [1]:
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    Parameters:
    - x: scalar or NumPy array.

    Returns:
    - Values in [0, 1] with the same shape as `x`.

    The naive formula overflows in exp(-x) for large negative inputs; this
    form evaluates log(1 + exp(-x)) with np.logaddexp, which does not.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0, -x))
    
# Load the node embeddings saved by the clusterGCN training run.
# map_location='cpu' lets a file written on a GPU machine load on CPU-only hosts.
Z = torch.load('clusterGCN_embedding.pt', map_location='cpu').detach().numpy()

# Scale every embedding to unit length. The norm is floored at the smallest
# positive float so an all-zero embedding stays all-zero instead of becoming
# NaN (0 / 0); non-zero rows are unaffected.
Z_normalized = Z / np.maximum(np.linalg.norm(Z, axis=1, keepdims=True), np.finfo(Z.dtype).tiny)
# Compute dot product (cosine similarity, since rows have unit length).
# NOTE: this is a dense num_nodes x num_nodes matrix; the recorded run printed
# (50164, 50164), so it needs several GB of memory.
similarity_matrix = np.dot(Z_normalized, Z_normalized.T)
# print(similarity_matrix)

# Apply sigmoid function to squash the scores into (0, 1).
# NOTE(review): cosine similarity lies in [-1, 1], so these values are confined
# to roughly [0.27, 0.73]; confirm that is the intended edge-probability scale.
adjacency_matrix_reconstructed = sigmoid(similarity_matrix)
print(adjacency_matrix_reconstructed.shape)

(50164, 50164)
