In [28]:
from langchain_community.graphs import Neo4jGraph
from dotenv import load_dotenv
import os

load_dotenv()
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

## Build Graph Communities

In [29]:
from graphdatascience import GraphDataScience
# project graph

gds = GraphDataScience(
    os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USER"], os.environ["NEO4J_PASSWORD"])
)

In [30]:
from graphdatascience import GraphDataScience

G, result = gds.graph.project(
    "communities",  #  Graph name
    "*",  #  Node projection
    {
        "_ALL_": {
            "type": "*",
            "orientation": "UNDIRECTED",
            "properties": {"weight": {"property": "*", "aggregation": "COUNT"}},
        }
    },
)

The authors employed the Leiden algorithm, a hierarchical clustering method, to identify communities within the graph. One advantage of using a hierarchical community detection algorithm is the ability to examine communities at multiple levels of granularity. The authors suggest summarizing all communities at each level, providing a comprehensive understanding of the graph’s structure.

First, we will use the Weakly Connected Components (WCC) algorithm to assess the connectivity of our graph. This algorithm identifies isolated sections within the graph, meaning it detects subsets of nodes or components that are connected to each other but not to the rest of the graph. These components help us understand the fragmentation within the network and identify groups of nodes that are independent from others. WCC is vital for analyzing the overall structure and connectivity of the graph.

In [31]:
wcc = gds.wcc.stats(G)
print(f"Component count: {wcc['componentCount']}")
print(f"Component distribution: {wcc['componentDistribution']}")

Component count: 60
Component distribution: {'min': 6, 'p5': 6, 'max': 166178, 'p999': 166178, 'p99': 166178, 'p1': 6, 'p10': 6, 'p90': 15, 'p50': 8, 'p25': 6, 'p75': 12, 'p95': 16, 'mean': 2779.016666666667}


Next, we will run the Leiden algorithm, which is also available in the GDS library, and enable the includeIntermediateCommunities parameter to return and store communities at all levels. We have also included a relationshipWeightProperty parameter to run the weighted variant of the Leiden algorithm. Using the write mode of the algorithm stores the results as a node property.

In [32]:
gds.leiden.write(
    G,
    writeProperty="communities",
    includeIntermediateCommunities=True,
    relationshipWeightProperty="weight",
)

ranLevels                                                                6
didConverge                                                           True
nodeCount                                                           166741
communityCount                                                         249
preProcessingMillis                                                      3
computeMillis                                                          457
postProcessingMillis                                                     4
writeMillis                                                            798
nodePropertiesWritten                                               166741
communityDistribution    {'min': 6, 'p5': 6, 'max': 5579, 'p999': 5579,...
modularities             [0.5811948831561885, 0.7638721701592919, 0.814...
modularity                                                        0.826609
configuration            {'writeProperty': 'communities', 'theta': 0.01...
Name: 0, dtype: object

Now, we create a distinct node for each community and represent their hierarchical structure as an interconnected graph. Later, we will also store community summaries and other attributes as node properties.

In [42]:
graph.query("CREATE CONSTRAINT IF NOT EXISTS FOR (c:__Community__) REQUIRE c.id IS UNIQUE;")

[]

In [44]:
graph.query("""
MATCH (e)
UNWIND range(0, size(e.communities) - 1 , 1) AS index
CALL {
  WITH e, index
  WITH e, index
  WHERE index = 0
  MERGE (c:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET c.level = index
  MERGE (e)-[:IN_COMMUNITY]->(c)
  RETURN count(*) AS count_0
}
CALL {
  WITH e, index
  WITH e, index
  WHERE index > 0
  MERGE (current:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET current.level = index
  MERGE (previous:`__Community__` {id: toString(index - 1) + '-' + toString(e.communities[index - 1])})
  ON CREATE SET previous.level = index - 1
  MERGE (previous)-[:IN_COMMUNITY]->(current)
  RETURN count(*) AS count_1
}
RETURN count(*)
""")

[{'count(*)': 1000446}]

The authors also introduce a community rank, indicating the number of distinct text chunks in which the entities within the community appear:

In [48]:
graph.query("""
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(:__Entity__)<-[:MENTIONS]-(d:Document)
WITH c, count(distinct d) AS rank
SET c.community_rank = rank;
""")

[]

In [51]:
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD
)

community_info = graph.query("""
MATCH (c:`__Community__`)<-[:IN_COMMUNITY*]-(n)
WHERE c.level IN [0,1,4]
WITH c, collect(n) AS nodes
WHERE size(nodes) > 1
CALL apoc.path.subgraphAll(nodes[0], {
    whitelistNodes: nodes
})
YIELD relationships
RETURN c.id AS communityId,
       [node IN nodes | {id: node.id, description: node.description, type: head(labels(node))}] AS nodes,
       [r IN relationships | {start: startNode(r).id, type: type(r), end: endNode(r).id, description: r.description}] AS rels
""")

In [54]:
community_info[5000]

{'communityId': '0-97068',
 'nodes': [{'id': None, 'description': None, 'type': 'MOF'},
  {'id': None, 'description': None, 'type': 'Paper'},
  {'id': None, 'description': None, 'type': 'Text'},
  {'id': None, 'description': None, 'type': 'Precursor'},
  {'id': None, 'description': None, 'type': 'Text'},
  {'id': None, 'description': None, 'type': 'Text'},
  {'id': None, 'description': None, 'type': 'Property'},
  {'id': None, 'description': None, 'type': 'Text'},
  {'id': None, 'description': None, 'type': 'Property'},
  {'id': None, 'description': None, 'type': 'Text'},
  {'id': None, 'description': None, 'type': 'Property'},
  {'id': None, 'description': None, 'type': 'Text'},
  {'id': None, 'description': None, 'type': 'Text'}],
 'rels': [{'start': None,
   'description': None,
   'type': 'HAS_SOURCE',
   'end': None},
  {'start': None, 'description': None, 'type': 'HAS_PROPERTY', 'end': None},
  {'start': None, 'description': None, 'type': 'HAS_PROPERTY', 'end': None},
  {'start':

## Generate Community Summaries

In [34]:
import os
from dotenv import load_dotenv
load_dotenv()
print("API Key:", os.getenv("OPENAI_API_KEY"))

API Key: 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


In [35]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI


llm = ChatOpenAI(temperature=0, model_name="gpt-5-mini")


community_template = """Based on the provided nodes and relationships that belong to the same graph community,
generate a natural language summary of the provided information:
{community_info}

Summary:"""  # noqa: E501

community_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Given an input triples, generate the information summary. No pre-amble.",
        ),
        ("human", community_template),
    ]
)

community_chain = community_prompt | llm | StrOutputParser()


In [36]:
def prepare_string(data):
    nodes_str = "Nodes are:\n"
    for node in data['nodes']:
        node_id = node['id']
        node_type = node['type']
        if 'description' in node and node['description']:
            node_description = f", description: {node['description']}"
        else:
            node_description = ""
        nodes_str += f"id: {node_id}, type: {node_type}{node_description}\n"

    rels_str = "Relationships are:\n"
    for rel in data['rels']:
        start = rel['start']
        end = rel['end']
        rel_type = rel['type']
        if 'description' in rel and rel['description']:
            description = f", description: {rel['description']}"
        else:
            description = ""
        rels_str += f"({start})-[:{rel_type}]->({end}){description}\n"

    return nodes_str + "\n" + rels_str

def process_community(community):
    stringify_info = prepare_string(community)
    summary = community_chain.invoke({'community_info': stringify_info})
    return {"community": community['communityId'], "summary": summary}

In [37]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

summaries = []
limited_communities = community_info[:15]
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(process_community, community): community for community in limited_communities}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing communities"):
        summaries.append(future.result())

Processing communities: 0it [00:00, ?it/s]


In [38]:
summaries

[]