In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from neo4j import GraphDatabase
import json
from py2neo import Graph
import random


In [None]:
bio_df = pd.read_excel('data/expression_data.xlsx', engine='openpyxl')

print(bio_df.head())


In [None]:
bio_df = bio_df.drop(bio_df.columns[0], axis=1)
proteins = bio_df.columns.tolist()


In [None]:
uniprot_to_reactome = pd.read_csv('data/MMU_Uniprot2Reactome.txt', sep='\t')

existing_proteins = uniprot_to_reactome[uniprot_to_reactome['V1'].isin(proteins)]['V1'].unique()

existing_proteins_list = existing_proteins.tolist()
len(existing_proteins_list)

In [None]:
existing_proteins_list


In [None]:



graph = Graph("bolt://localhost:7688", auth=("neo4j", "123456789"))


query = """
WITH "MATCH (r:Reaction)-[rel]-(connectedNodes)
WHERE r.schemaClass = 'Reaction' AND r.speciesName = 'Mus musculus'
RETURN connectedNodes,rel" AS query
CALL apoc.export.graphml.query(query, "mouse_python.graphml", {format:"gephi", useTypes:true, readLabels:True})
YIELD file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data
RETURN file, source, format, nodes, relationships, properties, time, rows, batchSize, batches, done, data;

"""
graph.run(query).data()




In [None]:

def fetch_subgraphs(driver, protein_names):
    with driver.session() as session:
        result = session.run("""
        UNWIND $proteinNames AS proteinName
        MATCH (p)-[rel]-(connectedNodes)
        WHERE ('EntityWithAccessionedSequence' IN labels(p) OR 'GenomeEncodedEntity' IN labels(p)) AND connectedNodes.speciesName = 'Mus musculus' AND ANY(name IN p.name WHERE name = proteinName)
        RETURN p AS protein, collect(rel) AS relationships, collect(connectedNodes) AS nodes
        """, proteinNames=protein_names)
        return list(result)

def create_subgraph(record):
    subgraph = nx.MultiGraph()
    protein_node = record['protein']
    relationships = record['relationships']
    connected_nodes = record['nodes']
    
    # Add the protein node with its properties
    subgraph.add_node(protein_node.id, **protein_node._properties)
    
    # Add connected nodes and relationships
    for rel, node in zip(relationships, connected_nodes):
        subgraph.add_node(node.id, **node._properties)
        subgraph.add_edge(protein_node.id, node.id, **rel._properties)
    
    return subgraph

def combine_subgraphs(results):

    subgraphs = [create_subgraph(record) for record in results]
    

    aggregated_graph = nx.compose_all(subgraphs)
    
    return aggregated_graph

def convert_attributes_to_strings(G):
    for node, data in G.nodes(data=True):
        for key, value in data.items():
            if isinstance(value, (list, dict)):  
                G.nodes[node][key] = json.dumps(value)
    
    for u, v, data in G.edges(data=True):
        for key, value in data.items():
            if isinstance(value, (list, dict)): 
                G.edges[u, v][key] = json.dumps(value)


driver = GraphDatabase.driver("bolt://localhost:7688", auth=("neo4j", "123456789"))


subgraphs = fetch_subgraphs(driver, existing_proteins_list[:400])
aggregated_graph = combine_subgraphs(subgraphs)
convert_attributes_to_strings(aggregated_graph)
nx.write_graphml(aggregated_graph, "aggregated_proteins_v2.graphml")

driver.close()


In [None]:
G = nx.read_graphml("aggregated_proteins_v2.graphml")

num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")


In [None]:
def find_nodes_at_distance(G, node, distance):
    """
    Find all unique nodes that are exactly 'distance' steps away from the given 'node'.
    Uses a BFS approach to avoid revisiting nodes.
    """
    if distance == 0:
        return [node]
    visited = {node}
    current_level = [node]
    for _ in range(distance):
        next_level = []
        for current_node in current_level:
            for neighbor in G.neighbors(current_node):
                if neighbor not in visited:
                    visited.add(neighbor)
                    next_level.append(neighbor)
        current_level = next_level
    return current_level



def random_walk_sampling(G, sample_size, distance):
    current_node = random.choice(list(G.nodes))
    sampled_nodes = [current_node]
    attempts = 0
    max_attempts = 1  # Adjust as necessary

    while len(sampled_nodes) < sample_size and attempts < max_attempts:
        potential_next_nodes = find_nodes_at_distance(G, current_node, distance)
        potential_next_nodes = [node for node in potential_next_nodes if node not in sampled_nodes]
        if not potential_next_nodes:
            attempts += 1
            print(f"Attempt {attempts}: No new nodes at distance {distance} from node {current_node}.")
            if attempts >= max_attempts:
                print("Maximum attempts reached. Sampling may be less than requested.")
            continue
        current_node = random.choice(potential_next_nodes)
        sampled_nodes.append(current_node)
        attempts = 0  # Reset attempts after a successful step

    return sampled_nodes



In [None]:

sample_size = 200
sampled_nodes = random_walk_sampling(G, sample_size, distance=2)
print("Sampled Nodes:", sampled_nodes)

# Stellargraph

In [2]:
# Print information about the graph (optional)
print(f"Loaded graph with {g.vcount()} nodes and {g.ecount()} edges.")

g.vs.attribute_names()
node_types = g.vs['schemaClass']
unique_node_types = set(node_types)
print("Unique Node Types:", unique_node_types)

NameError: name 'g' is not defined

In [3]:
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    ClusterNodeGenerator,
)
from stellargraph import StellarGraph
from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE

In [4]:
import networkx as nx
from stellargraph import StellarGraph

# Load the graph from a GraphML file
nx_graph = nx.read_gml('agg.gml')

# Convert the NetworkX graph to a StellarGraph object.
stellargraph = StellarGraph.from_networkx(nx_graph)
stellargraph.info()

'StellarGraph: Undirected multigraph\n Nodes: 2649, Edges: 2159\n\n Node types:\n  default: [2649]\n    Features: none\n    Edge types: default-default->default\n\n Edge types:\n    default-default->default: [2159]\n        Weights: all 1 (default)\n        Features: none'

In [5]:
hinsage_generator = HinSAGENodeGenerator(
    stellargraph, batch_size=1000, num_samples=[5]
)

hinsage_model = HinSAGE(
    layer_sizes=[128], activations=["relu"], generator=hinsage_generator
)
hinsage_acc = run_deep_graph_infomax(hinsage_model, hinsage_generator, epochs=epochs)

print(f"Test classification accuracy: {hinsage_acc}")

RuntimeError: This StellarGraph has no numeric feature attributes for nodesNode features are required for machine learning