# Benchmarking Aura features

In [10]:
import os
import timeit

import numpy as np
import seaborn as sns
from tqdm import tqdm

from graphdatascience.aura_sessions import AuraSessions
from graphdatascience.query_runner.aura_db_arrow_query_runner import AuraDbConnectionInfo

In [2]:
# Aura API credentials and the database password.
# They are read from environment variables so that secrets do not have to be
# typed into (and saved with) the notebook; when a variable is unset the value
# falls back to the empty placeholder, which must then be filled in by hand.
CLIENT_ID = os.environ.get("AURA_API_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("AURA_API_CLIENT_SECRET", "")
DB_PASSWORD = os.environ.get("AURA_DB_PASSWORD", "")
DB_ID = "347d1f74"
# AURA_ENV is used below to build the database URI
# (presumably it is also read by the Aura client library -- confirm).
os.environ["AURA_ENV"] = "devstrawberry"

# Connection details for the AuraDB instance: bolt URI plus (user, password).
db_connection_info = AuraDbConnectionInfo(
    f"neo4j+s://{DB_ID}-{os.environ['AURA_ENV']}.databases.neo4j-dev.io",
    ("neo4j", DB_PASSWORD),
)

In [23]:
# Attach to the benchmark session when one with this name already exists,
# otherwise create a fresh 24GB session.
session_name = "benchmark-session"
session_pw = "my-password"
sessions = AuraSessions(db_connection_info, (CLIENT_ID, CLIENT_SECRET))

print("Starting GDS session")
gds = (
    sessions.connect(session_name, session_pw)
    if any(existing.name == session_name for existing in sessions.list_sessions())
    else sessions.create_gds(session_name, session_pw, "24GB")
)

Starting GDS session


In [24]:
def measure(func, setup, iterations, warmup_iterations):
    """Time ``func`` with ``timeit`` after a number of discarded warm-up runs.

    Args:
        func: Zero-argument callable to benchmark.
        setup: Passed to ``timeit.repeat`` as ``setup``. Because every repeat
            uses ``number=1``, it is executed before each single run and is
            not part of the reported time.
        iterations: Number of measured runs.
        warmup_iterations: Number of runs executed first; their timings are
            thrown away.

    Returns:
        A dict with:
          * ``"iterations"``: list of per-run timings as returned by
            ``timeit.repeat``,
          * ``"mean"``: ``np.mean`` of those timings,
          * ``"avg"``: ``np.average`` of those timings (unweighted, so it is
            the same value as ``"mean"``; kept so existing consumers of
            either key keep working).
    """
    # Using the bar as a context manager guarantees it is closed even when a
    # run raises (e.g. a dropped database connection), so a failed benchmark
    # does not leave a dangling progress bar behind.
    with tqdm(total=iterations + warmup_iterations) as pbar:

        def wrapper():
            # NOTE: the progress update runs inside the timed callable, so its
            # (small) cost is included in every measurement.
            pbar.update(1)
            return func()

        pbar.set_description("Warmup")
        # Warm-up timings are intentionally discarded.
        timeit.repeat(
            wrapper,
            setup=setup,
            number=1,
            repeat=warmup_iterations,
        )

        pbar.set_description("Measurement")
        measurement = timeit.repeat(
            wrapper,
            setup=setup,
            number=1,
            repeat=iterations,
        )

    return {"iterations": measurement, "mean": np.mean(measurement), "avg": np.average(measurement)}

## Projection

In [25]:
def run_remote_projection(query, concurrency):
    """Run one remote projection of ``query`` into the graph named "graph".

    The value returned by ``gds.graph.project.remoteDb`` is used as a context
    manager and nothing happens inside the ``with`` body, so a call consists
    only of creating the projection and entering/leaving that context
    (presumably leaving the context drops the projected graph -- confirm
    against the client library).

    Args:
        query: Cypher projection query handed to ``remoteDb``.
        concurrency: Forwarded as the ``concurrency`` keyword argument.
    """
    projection = gds.graph.project.remoteDb("graph", query, concurrency=concurrency)
    with projection:
        pass

In [26]:
# One-off projection of all nodes and relationships with the parallel Cypher
# runtime and concurrency 1, run before the benchmarks.
run_remote_projection(
    query="""
    CYPHER runtime = parallel
    MATCH (u)
    OPTIONAL MATCH (u)-[r]->(t)
    RETURN gds.graph.project.remote(u, t, {})
    """,
    concurrency=1,
)


ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.graph.project.remoteDb`: Caused by: com.neo4j.gds.shaded.org.apache.arrow.flight.FlightRuntimeException: UNKNOWN: org.apache.arrow.flight.FlightRuntimeException: UNKNOWN: Failure allocating buffer.}

### Project entire graph, structure only

#### Without Parallel Runtime

In [11]:
# Benchmark the structure-only projection (query without the
# `CYPHER runtime = parallel` prefix) for several concurrency settings.
# `data` maps each concurrency value to the dict returned by `measure`.
data = {}
for concurrency in [1, 2, 4, 8]:
    result = measure(
        lambda: run_remote_projection(
            """
            MATCH (u)
            OPTIONAL MATCH (u)-[r]->(t)
            RETURN gds.graph.project.remote(u, t, {})
            """,
            concurrency=concurrency
        ),
        # Setup, run before every timed projection: make sure no graph named
        # "graph" is left over.
        lambda: gds.graph.drop("graph", failIfMissing=False),
        iterations=10,
        warmup_iterations=5
    )
    data[concurrency] = result

# One bar per concurrency setting, showing the mean of the measured runs.
plot = sns.barplot({k: v["mean"] for k, v in data.items()})
plot.set(xlabel="concurrency", ylabel="average runtime")


  0%|          | 0/15 [00:00<?, ?it/s]Failed to read from defunct connection ResolvedIPv4Address(('35.241.161.236', 7687)) (ResolvedIPv4Address(('35.241.161.236', 7687)))
Unable to retrieve routing information
Unable to connect to the Neo4j DBMS. Trying again...
  7%|▋         | 1/15 [00:02<00:30,  2.16s/it]Failed to read from defunct connection IPv4Address(('6b3b0e05-devstrawberry.databases.neo4j-dev.io', 7687)) (ResolvedIPv4Address(('35.241.161.236', 7687)))


SessionExpired: Failed to read from defunct connection IPv4Address(('6b3b0e05-devstrawberry.databases.neo4j-dev.io', 7687)) (ResolvedIPv4Address(('35.241.161.236', 7687)))

#### Parallel Runtime

In [None]:
# Benchmark the structure-only projection with the parallel Cypher runtime
# for several concurrency settings.
# `data` maps each concurrency value to the dict returned by `measure`.
data = {}
for concurrency in [1, 2, 4, 8]:
    result = measure(
        lambda: run_remote_projection(
            """
            CYPHER runtime = parallel
            MATCH (u)
            OPTIONAL MATCH (u)-[r]->(t)
            RETURN gds.graph.project.remote(u, t, {})
            """,
            concurrency=concurrency
        ),
        # No per-run setup is needed here.
        lambda: (),
        iterations=10,
        warmup_iterations=5
    )
    data[concurrency] = result

# One bar per concurrency setting, showing the mean of the measured runs.
plot = sns.barplot({k: v["mean"] for k, v in data.items()})
plot.set(xlabel="concurrency", ylabel="average runtime")

### Project entire graph, with properties and labels

In [None]:
# Projection query that also sends node labels, the relationship type and an
# `id` property for source nodes, target nodes and relationships.
query = """
            MATCH (u)
            OPTIONAL MATCH (u)-[r]->(t)
            RETURN gds.graph.project.remote(u, t, {
                sourceNodeLabels: labels(u),
                sourceNodeProperties: {id: id(u)},
                targetNodeLabels: labels(t),
                targetNodeProperties: {id: id(t)},
                relationshipType: type(r),
                relationshipProperties: {id: id(r)}
            })
        """

# `data` maps each concurrency value to the dict returned by `measure`.
data = {}
for concurrency in [1, 2, 4, 8]:
    result = measure(
        lambda: run_remote_projection(
            query,
            concurrency=concurrency
        ),
        # No per-run setup is needed here.
        lambda: (),
        iterations=10,
        warmup_iterations=5
    )
    data[concurrency] = result

# One bar per concurrency setting, showing the mean of the measured runs.
plot = sns.barplot({k: v["mean"] for k, v in data.items()})
plot.set(xlabel="concurrency", ylabel="average runtime")

## Write back

In [None]:
# Project all nodes and relationships once (parallel Cypher runtime,
# concurrency 4). Only the graph object, the first element of the returned
# pair, is used afterwards.
G, _ = gds.graph.project.remoteDb(
    "graph",
    """
    CYPHER runtime = parallel
    MATCH (u)
    OPTIONAL MATCH (u)-[r]->(t)
    RETURN gds.graph.project.remote(u, t, {})
    """,
    concurrency=4,
)

# Create the data that the write-back benchmarks below write to the database:
# a scalar "degree" property, an "embedding" property and "KNN_RELS"
# relationships carrying a "score" property.
gds.degree.mutate(G, mutateProperty="degree")
gds.fastRP.mutate(
    G,
    mutateProperty="embedding",
    embeddingDimension=128,
    iterationWeights=[1.0],
)
gds.knn.mutate(
    G,
    topK=4,
    mutateRelationshipType="KNN_RELS",
    mutateProperty="score",
    maxIterations=1,
    nodeProperties=["embedding"],
)

In [None]:
# Time writing the scalar "degree" node property back to the database.
#
# `timeit.repeat` takes the timed statement first and the setup second, so the
# write-back must be `stmt` and the clean-up query must be `setup`; with
# `number=1` the setup runs before every timed write and is not measured.
iterations = 10  # number of timed write-backs (same count as the projection benchmarks)
result = timeit.repeat(
    stmt=lambda: gds.graph.nodeProperties.write(G, node_properties=["degree"]),
    setup=lambda: gds.run_cypher("MATCH (n) SET n.degree = null"),
    number=1,
    repeat=iterations,
)

print(f"scalar property: {result}")

In [None]:
# Time writing the "embedding" node property back to the database.
#
# `timeit.repeat` takes the timed statement first and the setup second, so the
# write-back must be `stmt` and the clean-up query must be `setup`; with
# `number=1` the setup runs before every timed write and is not measured.
iterations = 10  # number of timed write-backs (same count as the projection benchmarks)
result = timeit.repeat(
    stmt=lambda: gds.graph.nodeProperties.write(G, node_properties=["embedding"]),
    setup=lambda: gds.run_cypher("MATCH (n) SET n.embedding = null"),
    number=1,
    repeat=iterations,
)

print(f"array property: {result}")

In [None]:
# Time writing the "KNN_RELS" relationships (with their "score" property) back
# to the database.
#
# `timeit.repeat` takes the timed statement first and the setup second, so the
# write-back must be `stmt` and the clean-up query must be `setup`; with
# `number=1` the setup runs before every timed write and is not measured.
iterations = 10  # number of timed write-backs (same count as the projection benchmarks)
result = timeit.repeat(
    stmt=lambda: gds.graph.relationship.write(G, relationship_type="KNN_RELS", relationship_property="score"),
    setup=lambda: gds.run_cypher("MATCH (n)-[r:KNN_RELS]->() DELETE r"),
    number=1,
    repeat=iterations,
)

print(f"relationships: {result}")

## Cleanup

In [None]:
# Drop the graph object `G` that was projected for the write-back benchmarks.
G.drop()

In [15]:
# Delete the GDS session identified by `session_name`.
sessions.delete_gds(session_name)

True

In [None]:
# Cleanup: delete the Aura instance `DB_ID` through the Aura API.
# NOTE(review): this looks irreversible -- only run this cell when the
# benchmark database is no longer needed.

from graphdatascience.aura_api import AuraApi

aura_api = AuraApi(CLIENT_ID, CLIENT_SECRET)
aura_api.delete_instance(DB_ID)