In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from neo4j import GraphDatabase
import json
from py2neo import Graph
import random


In [14]:
bio_df = pd.read_excel('data/expression_data.xlsx', engine='openpyxl')



In [15]:
bio_df = bio_df.drop(bio_df.columns[0], axis=1)
proteins = bio_df.columns.tolist()


In [16]:
uniprot_to_reactome = pd.read_csv('data/MMU_Uniprot2Reactome.txt', sep='\t')

existing_proteins = uniprot_to_reactome[uniprot_to_reactome['V1'].isin(proteins)]['V1'].unique()

existing_proteins_list = existing_proteins.tolist()
len(existing_proteins_list)

1631

In [None]:
existing_proteins_list


In [None]:



graph = Graph("bolt://localhost:7688", auth=("neo4j", "123456789"))


query = """
WITH "MATCH (r:Reaction)-[rel]-(connectedNodes)
WHERE r.schemaClass = 'Reaction' AND r.speciesName = 'Mus musculus'
RETURN connectedNodes,rel" AS query
CALL apoc.export.graphml.query(query, "mouse_python.graphml", {format:"gephi", useTypes:true, readLabels:True})
YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data;

"""
graph.run(query).data()




In [18]:
UNWIND $proteinNames AS proteinName
MATCH (p)-[rel1]-()-[rel2]-(connectedNodes)
WHERE ('EntityWithAccessionedSequence' IN labels(p) OR 'GenomeEncodedEntity' IN labels(p)) 
  AND connectedNodes.speciesName = 'Mus musculus' 
  AND ANY(name IN p.name WHERE name = proteinName)
RETURN p AS protein, collect(DISTINCT rel1) + collect(DISTINCT rel2) AS relationships, collect(DISTINCT connectedNodes) AS nodes

UNWIND $proteinNames AS proteinName
MATCH path = (p)-[*2]-(connectedNodes)
WHERE ('EntityWithAccessionedSequence' IN labels(p) OR 'GenomeEncodedEntity' IN labels(p)) 
  AND connectedNodes.speciesName = 'Mus musculus' 
  AND ANY(name IN p.name WHERE name = proteinName)
RETURN p AS protein, [r in relationships(path) | r] AS relationships, collect(DISTINCT connectedNodes) AS nodes
LIMIT 2



UNWIND $proteinNames AS proteinName
MATCH (p)
WHERE ('EntityWithAccessionedSequence' IN labels(p) OR 'GenomeEncodedEntity' IN labels(p))
  AND ANY(name IN p.name WHERE name = proteinName)
CALL apoc.path.subgraphAll(p, {
  maxLevel: 2,
  relationshipFilter: ">", // Adjust this based on your relationship types
  labelFilter: "/Mus musculus", // Adjust this to filter the end nodes, use + for include, - for exclude
  endNodeFilter: "Mus musculus" // This is not a native option, demonstrating intent
})
YIELD nodes, relationships
RETURN p AS protein, nodes, relationships



        UNWIND $proteinNames AS proteinName
        MATCH (p)-[rel]-(connectedNodes)
        WHERE ('EntityWithAccessionedSequence' IN labels(p) OR 'GenomeEncodedEntity' IN labels(p)) AND connectedNodes.speciesName = 'Mus musculus' AND ANY(name IN p.name WHERE name = proteinName)
        RETURN p AS protein, collect(rel) AS relationships, collect(connectedNodes) AS nodes

SyntaxError: invalid syntax (1274561687.py, line 1)

In [21]:

def fetch_subgraphs(driver, protein_names):
    with driver.session() as session:
        result = session.run("""
UNWIND $proteinNames AS proteinName
MATCH (p)
WHERE ('EntityWithAccessionedSequence' IN labels(p) OR 'GenomeEncodedEntity' IN labels(p))
  AND ANY(name IN p.name WHERE name = proteinName)
CALL apoc.path.subgraphAll(p, {
  maxLevel: 2,
  minLevel:1,
  relationshipFilter: "<" 
})
YIELD nodes, relationships
WITH p, 
     [node in nodes WHERE 'Mus musculus' IN labels(node) OR node.speciesName = 'Mus musculus'] AS filteredNodes,
     relationships
RETURN p AS protein, filteredNodes AS nodes, relationships
        """, proteinNames=protein_names)
        return list(result)

def create_subgraph(record):
    subgraph = nx.MultiGraph()
    protein_node = record['protein']
    relationships = record['relationships']
    connected_nodes = record['nodes']
    
    # Add the protein node with its properties
    subgraph.add_node(protein_node.id, **protein_node._properties)
    
    # Add connected nodes and relationships
    for rel, node in zip(relationships, connected_nodes):
        subgraph.add_node(node.id, **node._properties)
        subgraph.add_edge(protein_node.id, node.id, **rel._properties)
    
    return subgraph





def combine_subgraphs(results):

    subgraphs = [create_subgraph(record) for record in results]
    

    aggregated_graph = nx.compose_all(subgraphs)
    
    return aggregated_graph

def convert_attributes_to_strings(G):
    for node, data in G.nodes(data=True):
        for key, value in data.items():
            if isinstance(value, (list, dict)):  
                G.nodes[node][key] = json.dumps(value)
    
    for u, v, data in G.edges(data=True):
        for key, value in data.items():
            if isinstance(value, (list, dict)): 
                G.edges[u, v][key] = json.dumps(value)


driver = GraphDatabase.driver("bolt://localhost:7688", auth=("neo4j", "123456789"))


subgraphs = fetch_subgraphs(driver, existing_proteins_list[:400])
aggregated_graph = combine_subgraphs(subgraphs)
convert_attributes_to_strings(aggregated_graph)
nx.write_graphml(aggregated_graph, "aggregated_proteins_v2.graphml")

driver.close()


  subgraph.add_node(protein_node.id, **protein_node._properties)
  subgraph.add_node(node.id, **node._properties)
  subgraph.add_edge(protein_node.id, node.id, **rel._properties)


In [23]:
G = nx.read_graphml("aggregated_proteins_v2.graphml")

num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")


# Find connected components
connected_components = list(nx.connected_components(G))

# Print connected components
for i, component in enumerate(connected_components, 1):
    print(f"Component {i}: {component}")


source_node = "2085497"
hop_counts = nx.shortest_path_length(G, source=source_node)

# hop_counts is a dictionary where keys are target nodes and values are the number of hops
for target, hops in hop_counts.items():
    print(f"Number of hops from node {source_node} to node {target}: {hops}")


Number of nodes: 4419
Number of edges: 5303
Component 1: {'215292', '216675', '215300'}
Component 2: {'391394', '391395', '391403', '391409'}
Component 3: {'865942', '865939', '865945', '865954'}
Component 4: {'2260915', '1067745', '2260680', '1068025', '2260413', '1068140', '2260907', '1066835', '1067649', '1068065'}
Component 5: {'884910', '739712', '2234479', '884927', '2231871', '884822', '735749', '2234490', '884918', '2240046', '884901', '2231891', '2232179', '2234514', '2234277', '2240014', '2245443', '2247298', '1942348', '884923', '2231880', '1943034', '1943086', '1943120', '2240004', '2235550', '884817', '2241212', '1943096'}
Component 6: {'744817', '679957', '80354', '162269', '980918', '1961364', '201270', '1923926', '744656', '754742', '493638', '493468', '1961243', '158435', '227323', '899030', '1772865', '137981', '813054', '655601', '848186', '897895', '645977', '745303', '654463', '1348583', '241827', '838534', '744755', '288225', '156387', '141888', '592388', '685821'

# Random Walk with a restart using networkx pagerank function

In [20]:
G = nx.read_graphml("aggregated_proteins_v2.graphml")
# restart_nodes = [1]  # For example, restarting from node 1

# personalization = {node: 0.1 for node in G.nodes()}  # Starting value for all nodes
# for node in restart_nodes:
#     personalization[node] = 0.8  # Higher value for restart nodes

# pagerank_scores = nx.pagerank(G, personalization=personalization)

# print(pagerank_scores)

pagerank_scores = nx.pagerank(G)

# Number of nodes to sample
num_nodes_to_sample = 1000

# Sample nodes based on top PageRank scores
# Convert pagerank_scores to a list of tuples, sort by score, and select the top N
sampled_nodes_by_score = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)[:num_nodes_to_sample]
sampled_nodes_by_score = [node for node, score in sampled_nodes_by_score]

# Alternatively, sample nodes randomly
# Convert pagerank_scores keys (node names) to a list and sample randomly
all_nodes = list(pagerank_scores.keys())
sampled_nodes_randomly = random.sample(all_nodes, num_nodes_to_sample)

print("Sampled nodes based on top PageRank scores:", sampled_nodes_by_score)
# print("Sampled nodes randomly:", sampled_nodes_randomly)

Sampled nodes based on top PageRank scores: ['83121', '146816', '422023', '21115', '190730', '1043669', '1519350', '12882', '1647857', '158963', '210526', '1000990', '396416', '94133', '53994', '149034', '589972', '191202', '399008', '517845', '714233', '288595', '763063', '1416632', '1306993', '1042358', '1333810', '77491', '809915', '400438', '348749', '1010908', '492475', '119765', '400445', '1079333', '72134', '912298', '1325581', '658017', '304601', '218570', '201270', '111839', '1525812', '234495', '540493', '810530', '810578', '1601742', '140294', '926088', '935053', '1516419', '448264', '492165', '1962207', '1298883', '1305597', '72282', '74012', '560355', '454912', '829359', '163700', '125775', '1231702', '133575', '1900648', '1900658', '1901571', '1901822', '1408057', '185062', '642297', '219462', '1350211', '415544', '1332859', '939167', '248509', '277261', '810610', '88495', '1306984', '138095', '1333803', '135665', '141888', '428586', '343060', '261981', '443137', '40121',

# Random Walk with restart

In [21]:
import networkx as nx
import random

def random_walk_with_restart(G, start_node, restart_prob=0.1, walk_length=10):
    """
    Performs a random walk with restart on a graph.

    Parameters:
    G: NetworkX graph
    start_node: Node to start the random walk from
    restart_prob: Probability of restarting the walk at the start node
    walk_length: The number of steps to take in the walk

    Returns:
    A list of nodes visited during the walk.
    """
    if start_node not in G:
        raise ValueError("Start node not in graph")

    walk = [start_node]
    current_node = start_node
    for _ in range(walk_length):
        if random.random() < restart_prob:
            current_node = start_node
        else:
            neighbors = list(G.neighbors(current_node))
            if neighbors:
                current_node = random.choice(neighbors)
            else:
                # No neighbors to move to
                break
        walk.append(current_node)
    return walk



G = nx.read_graphml("aggregated_proteins_v2.graphml")
# Perform a random walk with restart from node 0
walk = random_walk_with_restart(G, start_node='2235550', restart_prob=0.2, walk_length=100)
print(walk)


['2235550', '2234277', '2235550', '2235550', '1943034', '1943086', '1942348', '2231871', '1942348', '2235550', '1942348', '2231871', '1942348', '2235550', '2234277', '2234479', '884817', '884927', '884901', '884927', '2235550', '1942348', '2231871', '1942348', '1942348', '2235550', '1942348', '2231880', '1942348', '2235550', '1942348', '2231871', '1942348', '2235550', '2234277', '2234490', '884817', '2240046', '884817', '2235550', '2234277', '2234277', '2234277', '2234277', '2234479', '884817', '2240014', '884817', '884822', '884817', '2234479', '884817', '2240004', '884817', '2240014', '884817', '2240014', '884817', '2235550', '1942348', '2231891', '2235550', '1943034', '1943120', '1942348', '2231871', '1942348', '2231880', '1942348', '2235550', '2235550', '1943034', '1943034', '1943096', '1943034', '2235550', '1942348', '1942348', '2235550', '2234277', '2241212', '2234277', '2235550', '1943034', '1943096', '1943034', '2235550', '2235550', '2234277', '2247298', '735749', '2235550', '1

# Forest Fire Sampling

In [22]:
import queue

def forest_fire_sampling(graph_path, start_node=None, burn_prob=0.5, max_steps=500):
    """
    Performs Forest Fire Sampling on a graph read from a GraphML file.

    :param graph_path: Path to the GraphML file.
    :param start_node: Node from which to start the fire. If None, a random node is chosen.
    :param burn_prob: Probability of a neighbor node catching fire.
    :param max_steps: Maximum number of steps to simulate the fire spreading.
    :return: A set of nodes that were 'burned' during the sampling.
    """
    # Read the graph from the GraphML file
    G = nx.read_graphml(graph_path)
    
    # If no start node is specified, choose one randomly
    if start_node is None:
        start_node = random.choice(list(G.nodes))
    
    burned_nodes = set([start_node])
    frontier = queue.Queue()
    frontier.put(start_node)
    
    while not frontier.empty() and len(burned_nodes) < max_steps:
        current_node = frontier.get()
        neighbors = list(G.neighbors(current_node))
        for neighbor in neighbors:
            if neighbor not in burned_nodes and random.random() < burn_prob:
                burned_nodes.add(neighbor)
                frontier.put(neighbor)
    
    return burned_nodes

In [24]:
sampled_nodes = forest_fire_sampling("aggregated_proteins_v2.graphml",burn_prob=0.8)
print("Sampled Nodes:", sampled_nodes)

Sampled Nodes: {'1924169', '1894549', '1333781', '1306815', '345915', '1324716', '344348', '1293489', '1896526', '1298642', '2289043', '1889325', '1298345', '1895701', '1321844', '1316157', '1344916', '1304673', '1332003', '1924770', '1317501', '1267462', '1323777', '1316341', '589522', '1305747', '1302775', '1298385', '1305018', '1267404', '1315548', '1306247', '1326661', '1924150', '1301956', '1298551', '1979060', '1321812', '1331211', '1322919', '1303002', '1122391', '1334801', '2290736', '1317023', '2287438', '691447', '1323140', '1306377', '1324138', '1301038', '344434', '1332391', '1903245', '344010', '1333573', '1316345', '1324961', '1301429', '1952762', '1336561', '1332099', '1325050', '1267357', '1338833', '1323825', '347095', '1323903', '1303030', '1305751', '2287422', '1318784', '1302861', '2289179', '1298327', '1924833', '1303169', '1316694', '1317715', '589972', '1924817', '1298537', '1340817', '1295809', '1338162', '1296300', '1319519', '1325572', '415544', '1402518', '12

# Cluster-GCN

In [1]:
import networkx as nx
from torch_geometric.utils import from_networkx

G = nx.read_graphml('aggregated_proteins_v2.graphml')

node_attrs = set()
for _, node_data in G.nodes(data=True):
    node_attrs.update(node_data.keys())

# Initialize missing attributes with None or a default value
for _, node_data in G.nodes(data=True):
    for attr in node_attrs:
        if attr not in node_data:
            node_data[attr] = None  # or some default value

data = from_networkx(G)


In [9]:
from torch_geometric.loader import ClusterData
from torch_geometric.loader import ClusterLoader

torch.manual_seed(12345)
# Prepare cluster data
cluster_data = ClusterData(data, num_parts=10, recursive=False) 
# Create a loader to iterate over clusters
loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True)  

print()
total_num_nodes = 0
for step, sub_data in enumerate(loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of nodes in the current batch: {sub_data.num_nodes}')
    print(sub_data)
    print()
    total_num_nodes += sub_data.num_nodes

print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')





Computing METIS partitioning...
Done!


TypeError: expected Tensor as element 0 in argument 0, but got list

In [3]:
import torch
from torch_geometric.nn import GCNConv

class ClusterGCN(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super(ClusterGCN, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, x, edge_index):
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.dropout(x, p=0.5, train=self.training)
        x = self.conv2(x, edge_index)
        return x


In [7]:

model = ClusterGCN(num_features=200, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for cluster in loader:
    optimizer.zero_grad()
    out = model(cluster.x, cluster.edge_index)
    loss = criterion(out[cluster.train_mask], cluster.y[cluster.train_mask])
    loss.backward()
    optimizer.step()


TypeError: expected Tensor as element 0 in argument 0, but got list

# Stellargraph

In [2]:
# Print information about the graph (optional)
print(f"Loaded graph with {g.vcount()} nodes and {g.ecount()} edges.")

g.vs.attribute_names()
node_types = g.vs['schemaClass']
unique_node_types = set(node_types)
print("Unique Node Types:", unique_node_types)

NameError: name 'g' is not defined

In [3]:
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    ClusterNodeGenerator,
)
from stellargraph import StellarGraph
from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE

In [4]:
import networkx as nx
from stellargraph import StellarGraph

# Load the graph from a GraphML file
nx_graph = nx.read_gml('agg.gml')

# Convert the NetworkX graph to a StellarGraph object.
stellargraph = StellarGraph.from_networkx(nx_graph)
stellargraph.info()

'StellarGraph: Undirected multigraph\n Nodes: 2649, Edges: 2159\n\n Node types:\n  default: [2649]\n    Features: none\n    Edge types: default-default->default\n\n Edge types:\n    default-default->default: [2159]\n        Weights: all 1 (default)\n        Features: none'

In [5]:
hinsage_generator = HinSAGENodeGenerator(
    stellargraph, batch_size=1000, num_samples=[5]
)

hinsage_model = HinSAGE(
    layer_sizes=[128], activations=["relu"], generator=hinsage_generator
)
hinsage_acc = run_deep_graph_infomax(hinsage_model, hinsage_generator, epochs=epochs)

print(f"Test classification accuracy: {hinsage_acc}")

RuntimeError: This StellarGraph has no numeric feature attributes for nodesNode features are required for machine learning