In [4]:
class GCNEncoder(torch.nn.Module):
    """Two-layer GCNConv encoder used as the encoder of a graph auto-encoder.

    The hidden layer has ``2 * out_channels`` units and the output layer has
    ``out_channels`` units, so ``out_channels`` is the embedding size.

    Parameters:
    - in_channels: number of input features per node.
    - out_channels: size of the node embedding that is produced.
    - dropout: probability of zeroing a hidden activation while training
      (defaults to 0.2, the value that was previously hard-coded).
    """

    def __init__(self, in_channels, out_channels, dropout=0.2):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        """Return node embeddings for features ``x`` on the graph ``edge_index``."""
        x = torch.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        x = torch.nn.functional.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)


In [16]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from littleballoffur import RandomWalkWithRestartSampler
from littleballoffur import ForestFireSampler
from torch_geometric.nn import GCNConv, GAE
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import json
import random
import numpy as np
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx
from torch_geometric.loader import ClusterData
from torch_geometric.loader import ClusterLoader
from sklearn.decomposition import NMF
from numpy.linalg import svd


class GAEPipeline:
    """Sample a graph, convert it to PyTorch Geometric data and train a GAE on it.

    Parameters:
    - in_channels: number of input features per node fed to the encoder.
    - out_channels: embedding size produced by the encoder.
    - sampling_method: one of 'random_walk', 'forest_fire' or 'clusterGCN'.
      NOTE: the default value 'method1' is not a known sampler, so the
      argument effectively has to be passed explicitly.
    - preprocessing: when True, `train` converts the sampled NetworkX graph
      into a `Data` object before training.

    Raises:
    - ValueError: if `sampling_method` is not a known sampler name.
    """

    def __init__(self, in_channels, out_channels, sampling_method='method1', preprocessing=True):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.sampling_method_name = sampling_method
        self.sampling_method = self._get_sampling_method(sampling_method)
        self.preprocessing = preprocessing
        # Build the encoder from the constructor arguments (these used to be
        # ignored in favour of the hard-coded sizes 11 and 32).
        self.encoder = GCNEncoder(in_channels=in_channels, out_channels=out_channels)
        self.model = GAE(self.encoder)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)

    def _get_sampling_method(self, method_name):
        """Return the bound sampling method registered under `method_name`.

        Raises:
        - ValueError: if `method_name` is not a known sampler name.
        """
        samplers = {
            'random_walk': self.random_walk,
            'forest_fire': self.forest_fire,
            'clusterGCN': self.cluster_GCN,
        }
        try:
            return samplers[method_name]
        except (KeyError, TypeError):
            # TypeError covers unhashable names, which are unknown as well.
            raise ValueError("Unknown sampling method") from None

    def convert_node_labels_to_integers(self, graph):
        """
        Convert the node labels of a NetworkX graph to integers.

        Parameters:
        - graph: NetworkX graph with any type of node labels.

        Returns:
        - A new NetworkX graph with node labels converted to integers
          (0..n-1 in the iteration order of `graph.nodes()`).
        """
        mapping = {node: i for i, node in enumerate(graph.nodes())}
        return nx.relabel_nodes(graph, mapping)

    def preprocess_features(self, features):
        """Convert raw feature values to floats.

        Values that cannot be converted (non-numeric strings, None, containers)
        are replaced by the placeholder 0.0.

        Parameters:
        - features: iterable of raw feature values of one node.

        Returns:
        - list of floats with the same length as `features`.
        """
        processed_features = []
        for feature in features:
            try:
                processed_features.append(float(feature))
            except (ValueError, TypeError):
                # ValueError: non-numeric string; TypeError: None / list / dict.
                processed_features.append(0.0)
        return processed_features

    def normalize_features(self, features):
        """Standardise every feature column to zero mean and unit variance.

        Columns with zero standard deviation are only centred, which avoids a
        division by zero.

        Parameters:
        - features: 2-D list (nodes x features) of numeric values.

        Returns:
        - nested list with the same shape holding the normalised values.
        """
        features = np.array(features, dtype=float)
        mean = features.mean(axis=0, keepdims=True)
        std = features.std(axis=0, keepdims=True)
        # Avoid division by zero for constant columns.
        std[std == 0] = 1
        return ((features - mean) / std).tolist()

    def from_networkx_to_torch_geometric(self, G):
        """Convert a NetworkX graph into a PyTorch Geometric `Data` object.

        Node attributes become node features. The attribute keys of the first
        node define the feature columns; keys missing on other nodes are
        filled with 0 and non-numeric values become 0.0. Features are
        standardised per column.

        Parameters:
        - G: NetworkX graph.

        Returns:
        - `Data` with `x` (float features) and `edge_index` (long, shape [2, E]).
          For undirected graphs every edge is stored in both directions.
        """
        # Map node labels to a contiguous 0..n-1 range.
        mapping = {node: idx for idx, node in enumerate(G.nodes())}
        edge_list = [(mapping[u], mapping[v]) for u, v in G.edges()]
        if edge_list:
            edges = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
        else:
            # Keep the [2, E] layout even when the graph has no edges.
            edges = torch.empty((2, 0), dtype=torch.long)

        if not G.is_directed():
            # NetworkX lists an undirected edge once, but edge_index is read as
            # directed, so add the reverse direction (self-loops excluded so
            # they are not duplicated).
            non_loop = edges[0] != edges[1]
            edges = torch.cat([edges, edges[:, non_loop].flip(0)], dim=1)

        if G.number_of_nodes() > 0:
            # The first node defines which attribute keys are used as features.
            sample_features = next(iter(G.nodes(data=True)))[1]
            feature_keys = list(sample_features.keys())

            features = []
            for _, node_features in G.nodes(data=True):
                node_feature_values = [node_features.get(key, 0) for key in feature_keys]
                features.append(self.preprocess_features(node_feature_values))

            features = self.normalize_features(features)
        else:
            # Default to a single feature of 0 if there are no nodes.
            features = [[0]]

        x = torch.tensor(features, dtype=torch.float)
        return Data(x=x, edge_index=edges)

    def random_walk(self, graph, number_of_nodes=10000):
        """Sample a subgraph with `RandomWalkWithRestartSampler`.

        Parameters:
        - graph: NetworkX graph; its nodes are relabelled to integers first.
        - number_of_nodes: requested sample size, clamped to the size of
          `graph` so that small graphs can be sampled as well.

        Returns:
        - the sampled NetworkX subgraph.
        """
        graph = self.convert_node_labels_to_integers(graph)
        sample_size = min(number_of_nodes, graph.number_of_nodes())
        sampler = RandomWalkWithRestartSampler(sample_size)
        return sampler.sample(graph)

    def forest_fire(self, graph, number_of_nodes=1000):
        """Sample a subgraph with `ForestFireSampler`.

        Parameters:
        - graph: NetworkX graph; its nodes are relabelled to integers first.
        - number_of_nodes: requested sample size, clamped to the size of
          `graph` so that small graphs can be sampled as well.

        Returns:
        - the sampled NetworkX subgraph.
        """
        graph = self.convert_node_labels_to_integers(graph)
        sample_size = min(number_of_nodes, graph.number_of_nodes())
        sampler = ForestFireSampler(sample_size)
        return sampler.sample(graph)

    def cluster_GCN(self, data):
        """Partition `data` into 8 clusters and return a loader over them.

        Unlike the other samplers this one takes a `Data` object (not a
        NetworkX graph) and returns a `ClusterLoader` that yields one cluster
        per batch in shuffled order. One pass over the loader is made to print
        a summary of every batch.

        Side effects:
        - reseeds the global torch RNG with 12345.
        """
        torch.manual_seed(12345)
        cluster_data = ClusterData(data, num_parts=8, recursive=True)
        loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True)

        print()
        total_num_nodes = 0
        for step, sub_data in enumerate(loader):
            print(f'Step {step + 1}:')
            print('=======')
            print(f'Number of nodes in the current batch: {sub_data.num_nodes}')
            print(sub_data)
            print()
            total_num_nodes += sub_data.num_nodes

        print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')

        return loader

    def preprocess_graph(self, graph):
        """Convert a NetworkX graph into the `Data` object used for training."""
        return self.from_networkx_to_torch_geometric(graph)

    def _final_embedding(self, data):
        """Encode `data` in eval mode without gradients, restoring the model mode."""
        was_training = self.model.training
        self.model.eval()
        with torch.no_grad():
            z = self.model.encode(data.x, data.edge_index)
        self.model.train(was_training)
        return z

    def train(self, graph, epochs=500):
        """Train the GAE on a sampled subgraph and save the node embeddings.

        The embeddings are written to '<sampling_method_name>_embedding.pt'.
        They are computed after the last optimisation step with dropout
        disabled.

        Parameters:
        - graph: input handed to the configured sampling method.
        - epochs: number of full-batch optimisation steps.
        """
        sampled_subgraph = self.sampling_method(graph)

        if self.preprocessing:
            data = self.preprocess_graph(sampled_subgraph)
        else:
            # Without preprocessing the sampler output is used as-is.
            # NOTE(review): it must then already expose `x` and `edge_index`.
            data = sampled_subgraph

        self.model.train()
        for epoch in range(epochs):
            self.optimizer.zero_grad()
            z = self.model.encode(data.x, data.edge_index)
            loss = self.model.recon_loss(z, data.edge_index)
            loss.backward()
            self.optimizer.step()
            print(f'Epoch {epoch+1}, Loss: {loss.item()}')

        # Re-encode after training so that the saved embedding reflects the
        # final weights; this also works when `epochs` is 0.
        z = self._final_embedding(data)
        torch.save(z, f'{self.sampling_method_name}_embedding.pt')

    def train_clusterGCN(self, graph, epochs=500):
        """Train the GAE cluster by cluster and save embeddings of all nodes.

        The graph is converted to `Data`, partitioned by `cluster_GCN`, and the
        model takes one optimisation step per cluster. After training, the
        complete graph is encoded once and written to
        '<sampling_method_name>_embedding.pt'.

        Parameters:
        - graph: NetworkX graph.
        - epochs: number of passes over all clusters.
        """
        data = self.preprocess_graph(graph)
        loader = self.cluster_GCN(data)

        self.model.train()
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for cluster in loader:
                self.optimizer.zero_grad()
                z = self.model.encode(cluster.x, cluster.edge_index)
                loss = self.model.recon_loss(z, cluster.edge_index)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()
                num_batches += 1

            # Report the mean over the clusters of this epoch.
            avg_loss = total_loss / max(num_batches, 1)
            print(f"Epoch {epoch+1}, Average Loss: {avg_loss}")

        # Encode the whole graph: the `z` of the last batch only covers one
        # randomly chosen cluster.
        z = self._final_embedding(data)
        torch.save(z, f'{self.sampling_method_name}_embedding.pt')

    def non_negative_matrix_factorization(self, graph):
        """Factorise the adjacency matrix of a sampled subgraph with rank-2 NMF.

        NOTE(review): the sampler output is passed to `nx.to_numpy_array`, so
        this presumably only works with the NetworkX-based samplers.

        Returns:
        - (W, H): transformed data and the fitted components.
        """
        subgraph = self.sampling_method(graph)

        A = nx.to_numpy_array(subgraph)
        print(A.shape)
        model = NMF(n_components=2, init='random', random_state=0)

        W = model.fit_transform(A)  # Basis matrix (features)
        H = model.components_  # Coefficient matrix (components)
        return W, H

    def svd(self, graph):
        """Compute the full SVD of the adjacency matrix of a sampled subgraph.

        NOTE(review): the sampler output is passed to `nx.to_numpy_array`, so
        this presumably only works with the NetworkX-based samplers.

        Returns:
        - (U, S, Vt) as returned by `numpy.linalg.svd`.
        """
        subgraph = self.sampling_method(graph)
        A = nx.to_numpy_array(subgraph)
        # `svd` resolves to the module-level numpy.linalg.svd, not this method.
        U, S, Vt = svd(A, full_matrices=True)
        return U, S, Vt






In [None]:
# Train the graph auto-encoder on a forest-fire sample of the protein graph.
graph = nx.read_graphml("aggregated_proteins_v30.graphml")
pipeline = GAEPipeline(
    in_channels=11,
    out_channels=32,
    sampling_method='forest_fire',
)
pipeline.train(graph)

In [None]:
# Train the graph auto-encoder cluster by cluster on the protein graph.
graph = nx.read_graphml("aggregated_proteins_v30.graphml")
pipeline = GAEPipeline(
    in_channels=11,
    out_channels=32,
    sampling_method='clusterGCN',
)
pipeline.train_clusterGCN(graph)


In [17]:
# SVD of the adjacency matrix of a random-walk sample of the protein graph.
# The (U, S, Vt) tuple is the last expression, so it is shown as cell output.
graph = nx.read_graphml("aggregated_proteins_v30.graphml")
pipeline = GAEPipeline(
    in_channels=11,
    out_channels=32,
    sampling_method='random_walk',
)
pipeline.svd(graph)

(array([[-1.07126714e-01,  1.03663346e-01, -1.74306127e-17, ...,
          8.02817288e-14, -1.47232496e-13,  6.22626503e-14],
        [-6.96964805e-03, -7.20250236e-03,  1.37538882e-17, ...,
          5.09435538e-01, -2.25257230e-02, -4.59647577e-02],
        [-6.96964805e-03, -7.20250236e-03,  4.47387511e-17, ...,
         -7.15199696e-03, -1.04293884e-03,  8.17146610e-04],
        ...,
        [-6.96964805e-03, -7.20250236e-03,  7.99728626e-17, ...,
          5.66758230e-02, -4.30608635e-02, -8.94499976e-03],
        [-6.96964805e-03, -7.20250236e-03,  7.98561014e-17, ...,
          5.66758230e-02, -4.30608635e-02, -8.94499976e-03],
        [-6.96964805e-03, -7.20250236e-03,  7.96159510e-17, ...,
          5.66758230e-02, -4.30608635e-02, -8.94499976e-03]]),
 array([6.91670815e+02, 6.47670815e+02, 1.00000000e+00, ...,
        2.31160422e-15, 1.07535966e-15, 9.53098613e-16]),
 array([[-1.07126714e-01, -6.96964805e-03, -6.96964805e-03, ...,
         -6.96964805e-03, -6.96964805e-03, -6

# Matrix Factorization

In [41]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
import numpy as np


# NOTE(review): `A` is not defined in any visible cell; presumably it is an
# adjacency matrix left over from an earlier kernel state. Confirm before a
# Restart & Run All.
A_sparse = csr_matrix(A)

# Rank-2 NMF fitted on the sparse matrix with the multiplicative-update solver.
model = NMF(
    n_components=2,
    init='random',
    random_state=42,
    solver='mu',
)

W = model.fit_transform(A_sparse)  # Basis matrix (features)
H = model.components_  # Coefficient matrix (components)
print(H)

[[5.51355222e-06 4.20314926e-02 4.20314947e-02 ... 4.20315017e-02
  4.20315060e-02 4.20314950e-02]
 [1.60202193e+00 4.41530137e-05 4.40754106e-05 ... 4.38195937e-05
  4.36605349e-05 4.40637424e-05]]
