In [1]:
import json
from neo4j import GraphDatabase
from langchain_community.graphs import Neo4jGraph
from graphdatascience import GraphDataScience
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file; the return value is bound to `_` so the
# cell shows no output.
_ = load_dotenv()

# Resolves to None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [7]:
# LangChain graph wrapper used for the plain Cypher queries in this notebook.
# Schema introspection is skipped at construction time (refresh_schema=False).
# NOTE(review): connection settings are presumably read from the environment
# by Neo4jGraph itself — confirm.
graph = Neo4jGraph(refresh_schema=False)

# Graph Data Science client, connected with the NEO4J_* environment variables.
# A missing variable raises KeyError here rather than failing later.
gds = GraphDataScience(
    os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
)

graph_name = "communities"

# gds.graph.exists returns one row with the fields (graphName, exists).
# Tuple-unpacking that row binds the graph *name* first, which is a non-empty
# string and therefore always truthy, so the projection branch could never
# run. Read the boolean field by name instead.
graph_exists = bool(gds.graph.exists(graph_name)["exists"])

if graph_exists:
    # Reuse the in-memory projection left over from a previous run.
    G = gds.graph.get(graph_name)
else:
    # Project every relationship between `__Entity__` nodes as undirected.
    # The "weight" property uses COUNT aggregation over "*", i.e. it is
    # intended to hold the number of parallel relationships per node pair.
    G, result = gds.graph.project(
        graph_name,  # Graph name
        "__Entity__",  # Node projection
        {
            "_ALL_": {
                "type": "*",
                "orientation": "UNDIRECTED",
                "properties": {"weight": {"property": "*", "aggregation": "COUNT"}},
            }
        },
    )

In [8]:
# Weakly-connected-component statistics for the projected graph, computed in
# stats mode and printed one per line.
wcc = gds.wcc.stats(G)
print(
    f"Component count: {wcc['componentCount']}",
    f"Component distribution: {wcc['componentDistribution']}",
    sep="\n",
)

Component count: 766
Component distribution: {'min': 1, 'p5': 1, 'max': 3941, 'p999': 3941, 'p99': 8, 'p1': 1, 'p10': 1, 'p90': 3, 'p50': 1, 'p25': 1, 'p75': 2, 'p95': 4, 'mean': 6.673629242819843}


In [9]:
# Leiden configuration:
#   - write the result to the `communities` node property,
#   - keep the community id of every hierarchy level, not only the last one,
#   - weight relationships by the projected `weight` property.
leiden_config = {
    "writeProperty": "communities",
    "includeIntermediateCommunities": True,
    "relationshipWeightProperty": "weight",
}

# Last expression of the cell, so the run statistics are displayed.
gds.leiden.write(G, **leiden_config)

writeMillis                                                             71
nodePropertiesWritten                                                 5112
ranLevels                                                                5
didConverge                                                           True
nodeCount                                                             5112
communityCount                                                         799
communityDistribution    {'min': 1, 'p5': 1, 'max': 291, 'p999': 291, '...
modularity                                                         0.76171
modularities             [0.6224430347602946, 0.7223060643391526, 0.758...
postProcessingMillis                                                     3
preProcessingMillis                                                      0
computeMillis                                                           97
configuration            {'writeProperty': 'communities', 'theta': 0.01...
Name: 0, dtype: object

In [10]:
# Materialise the Leiden hierarchy stored in `e.communities` as
# `__Community__` nodes, one node per (level, community id) pair with the id
# "<level>-<communityId>".
#
# For every entity and every index into its `communities` list:
#   * index = 0: MERGE the level-0 community and link the entity to it with
#     IN_COMMUNITY.
#   * index > 0: MERGE the community at this level and the one at the level
#     below, then link lower -> higher with IN_COMMUNITY.
#
# Each subquery repeats `WITH e, index` so that a WHERE filter can follow the
# importing WITH, and returns an aggregate (count(*)) so that it yields a row
# even when the filter rejects the input — otherwise the outer row would be
# dropped. The final count is the number of (entity, level) pairs processed.
graph.query("""
MATCH (e:`__Entity__`)
UNWIND range(0, size(e.communities) - 1 , 1) AS index
CALL {
  WITH e, index
  WITH e, index
  WHERE index = 0
  MERGE (c:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET c.level = index
  MERGE (e)-[:IN_COMMUNITY]->(c)
  RETURN count(*) AS count_0
}
CALL {
  WITH e, index
  WITH e, index
  WHERE index > 0
  MERGE (current:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET current.level = index
  MERGE (previous:`__Community__` {id: toString(index - 1) + '-' + toString(e.communities[index - 1])})
  ON CREATE SET previous.level = index - 1
  MERGE (previous)-[:IN_COMMUNITY]->(current)
  RETURN count(*) AS count_1
}
RETURN count(*)
""")

[{'count(*)': 25560}]

In [11]:
# Rank each community by the number of distinct `__Chunk__` nodes its member
# entities (reached through any depth of IN_COMMUNITY) are linked to via
# APPEARED_IN, and store that number as `community_rank`.
# The query returns no rows; communities with no matching path are not
# touched by the SET.
graph.query("""
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(:__Entity__)-[:APPEARED_IN]->(d:__Chunk__)
WITH c, count(distinct d) AS rank
SET c.community_rank = rank;
""")

[]

In [12]:
# Entity count per community; the level is taken from the id prefix
# ("<level>-<communityId>"), so it arrives as a string.
community_size = graph.query("""
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(e:__Entity__)
WITH c, count(distinct e) AS entities
RETURN split(c.id, '-')[0] AS level, entities
""")
community_size_df = pd.DataFrame.from_records(community_size)

# One summary row per level, in order of first appearance in the result.
level_column = community_size_df["level"]
percentiles_data = []
for level in level_column.unique():
    sizes = community_size_df.loc[level_column == level, "entities"]
    q25, q50, q75, q90, q99 = np.percentile(sizes, [25, 50, 75, 90, 99])
    percentiles_data.append([level, len(sizes), q25, q50, q75, q90, q99, max(sizes)])

# Tabulate the per-level size distribution and display it.
percentile_columns = ["Level", "Number of communities"]
percentile_columns += [f"{q}th Percentile" for q in (25, 50, 75, 90, 99)]
percentile_columns.append("Max")
percentiles_df = pd.DataFrame(percentiles_data, columns=percentile_columns)
percentiles_df

Unnamed: 0,Level,Number of communities,25th Percentile,50th Percentile,75th Percentile,90th Percentile,99th Percentile,Max
0,0,1480,1.0,2.0,3.0,6.0,33.0,188
1,1,1377,1.0,2.0,5.0,14.0,129.72,382
2,2,992,2.0,4.0,8.0,25.9,372.17,632
3,3,847,6.0,10.0,19.0,86.2,542.74,870
4,4,799,7.0,10.0,19.0,88.4,526.62,885


In [13]:
# Collect, for every community at level 0, 1 or 4 that has more than one
# member entity, the member nodes and the relationships among them.
# The relationships come from apoc.path.subgraphAll, started at the first
# collected member and restricted to member nodes via `whitelistNodes`.
# Each result row has: communityId, nodes [{id, description, type}] and
# rels [{start, type, end, description}]; `type` is the first label other
# than `__Entity__`.
# NOTE(review): the outputs recorded in this notebook show `id` and
# `description` as None for every node and relationship endpoint, so the
# entities apparently do not carry properties with these names — confirm the
# property names before generating summaries from this data.
community_info = graph.query("""
MATCH (c:`__Community__`)<-[:IN_COMMUNITY*]-(e:__Entity__)
WHERE c.level IN [0,1,4]
WITH c, collect(e ) AS nodes
WHERE size(nodes) > 1
CALL apoc.path.subgraphAll(nodes[0], {
	whitelistNodes:nodes
})
YIELD relationships
RETURN c.id AS communityId,
       [n in nodes | {id: n.id, description: n.description, type: [el in labels(n) WHERE el <> '__Entity__'][0]}] AS nodes,
       [r in relationships | {start: startNode(r).id, type: type(r), end: endNode(r).id, description: r.description}] AS rels
""")

In [14]:
# Spot-check one record to see the shape of the data handed to the LLM.
community_info[5]

{'communityId': '0-15',
 'nodes': [{'id': None, 'description': None, 'type': 'Publication'},
  {'id': None, 'description': None, 'type': 'Type'},
  {'id': None, 'description': None, 'type': 'Model'},
  {'id': None, 'description': None, 'type': 'TimePeriod'},
  {'id': None, 'description': None, 'type': 'Publication'},
  {'id': None, 'description': None, 'type': 'Figure'}],
 'rels': [{'start': None,
   'description': None,
   'type': 'isUsedIn',
   'end': None},
  {'start': None, 'description': None, 'type': 'isGeneratedBy', 'end': None},
  {'start': None, 'description': None, 'type': 'isTimePeriodOf', 'end': None},
  {'start': None, 'description': None, 'type': 'isCitedIn', 'end': None},
  {'start': None, 'description': None, 'type': 'produces', 'end': None}]}

In [15]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Chat model used for the summaries, with temperature fixed at 0.
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini-2024-07-18")

# Human-turn template; {community_info} receives the stringified community.
community_template = """Based on the provided nodes and relationships that belong to the same graph community,
generate a natural language summary of the provided information:
{community_info}

Summary:"""  # noqa: E501

# System instruction followed by the templated human message.
community_prompt = ChatPromptTemplate.from_messages([
    ("system", "Given an input triples, generate the information summary. No pre-amble."),
    ("human", community_template),
])

# Pipeline: prompt -> chat model -> plain string.
to_text = StrOutputParser()
community_chain = community_prompt | llm | to_text

In [18]:
import time
import random
def prepare_string(data):
    """Render one community record as plain text for the LLM prompt.

    Args:
        data: mapping with a 'nodes' list (items with 'id', 'type' and an
            optional 'description') and a 'rels' list (items with 'start',
            'end', 'type' and an optional 'description').

    Returns:
        A "Nodes are:" section and a "Relationships are:" section separated
        by a blank line, one line per node / relationship. A description is
        appended only when it is present and truthy.

    Raises:
        KeyError: if a required key is missing from `data` or one of its items.
    """
    def description_suffix(item):
        # Missing, None and empty descriptions all produce no suffix.
        text = item.get('description')
        return f", description: {text}" if text else ""

    node_lines = []
    for node in data['nodes']:
        ident, kind = node['id'], node['type']
        node_lines.append(f"id: {ident}, type: {kind}{description_suffix(node)}\n")

    rel_lines = []
    for rel in data['rels']:
        source, target, kind = rel['start'], rel['end'], rel['type']
        rel_lines.append(f"({source})-[:{kind}]->({target}){description_suffix(rel)}\n")

    nodes_section = "Nodes are:\n" + "".join(node_lines)
    rels_section = "Relationships are:\n" + "".join(rel_lines)
    return nodes_section + "\n" + rels_section

def process_community(community):
    """Summarise a single community with the LLM chain.

    Args:
        community: record with 'communityId', 'nodes' and 'rels'.

    Returns:
        dict with the community id under "community" and the generated text
        under "summary".
    """
    chain_input = {'community_info': prepare_string(community)}
    generated = community_chain.invoke(chain_input)
    return dict(community=community['communityId'], summary=generated)

In [19]:
# Preview the exact text that would be placed into the prompt for one community.
print(prepare_string(community_info[3]))

Nodes are:
id: None, type: Project
id: None, type: Acronym
id: None, type: Publication
id: None, type: __unknown__
id: None, type: Publication
id: None, type: Acronym

Relationships are:
(None)-[:hasData]->(None)
(None)-[:hasData]->(None)
(None)-[:hasData]->(None)
(None)-[:hasPublication]->(None)
(None)-[:hasPublication]->(None)



In [20]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Semaphore
from tqdm import tqdm

# Upper bound on simultaneous in-flight requests; tune to the API rate limit.
MAX_CONCURRENT_REQUESTS = 5
# Pause, in seconds, applied after every request.
DELAY_BETWEEN_REQUESTS = 0.2

# Counting semaphore handing out MAX_CONCURRENT_REQUESTS slots.
semaphore = Semaphore(value=MAX_CONCURRENT_REQUESTS)

def process_community_with_throttle(community):
    """Run process_community while holding one slot of the shared semaphore.

    The slot is kept for DELAY_BETWEEN_REQUESTS seconds after the call
    returns, which spaces out consecutive requests. The slot is released even
    if the call raises.

    Returns:
        Whatever process_community returns for `community`.
    """
    semaphore.acquire()
    try:
        outcome = process_community(community)
        time.sleep(DELAY_BETWEEN_REQUESTS)  # keep the slot briefly to pace requests
    finally:
        semaphore.release()
    return outcome

# Fan the communities out over the worker pool and gather the summaries in
# completion order.
#
# A failed request is recorded instead of re-raised. Previously the first
# exception from future.result() aborted the collection loop; the executor
# still ran every remaining request on exit, but their summaries were lost.
summaries = []
failed_communities = []  # one {"community", "error"} entry per failed request
with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
    # Map each future back to its input so failures can be attributed.
    futures = {
        executor.submit(process_community_with_throttle, community): community
        for community in community_info
    }

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing communities"):
        try:
            summaries.append(future.result())
        except Exception as exc:  # KeyboardInterrupt etc. still propagate
            failed_communities.append(
                {"community": futures[future].get("communityId"), "error": repr(exc)}
            )

if failed_communities:
    print(
        f"{len(failed_communities)} of {len(futures)} communities failed; "
        "inspect `failed_communities` and re-run them before writing summaries."
    )

Processing communities: 100%|██████████| 2601/2601 [14:35<00:00,  2.97it/s]


In [21]:
# Persist the generated summaries: one `summary` property per community,
# matched by id. The whole list is sent as a single `$data` parameter and
# unwound server-side. MERGE would create a community node if the id did not
# already exist.
graph.query("""
UNWIND $data AS row
MERGE (c:__Community__ {id:row.community})
SET c.summary = row.summary
""", params={"data": summaries})

[]