In [1]:
!pip install neo4j openai PyMuPDF

Collecting neo4j
  Downloading neo4j-5.27.0-py3-none-any.whl.metadata (5.9 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading neo4j-5.27.0-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF, neo4j
Successfully installed PyMuPDF-1.25.1 neo4j-5.27.0


In [2]:
!pip install pymupdf



In [8]:
!pip install openai==0.28


Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m71.7/76.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.5
    Uninstalling openai-1.54.5:
      Successfully uninstalled openai-1.54.5
Successfully installed openai-0.28.0


In [1]:
pdf_path = "/content/David Romer - Advanced Macroeconomics-McGraw-Hill Education (2018).pdf"

In [None]:
from neo4j import GraphDatabase
import fitz  # PyMuPDF for PDF processing
import openai
import json

# Set OpenAI API key
def set_openai_api_key():
    openai.api_key = input("Enter your OpenAI API key: ")

# Function to split text into smaller chunks that fit within the token limit
def split_text_into_chunks(text, max_tokens=4096):
    words = text.split()
    chunks = []
    current_chunk = []
    token_count = 0

    for word in words:
        token_count += 1
        if token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            token_count = 1
        else:
            current_chunk.append(word)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return text

# Step 2: Summarize the text using OpenAI with chunking
def summarize_text(text):
    chunks = split_text_into_chunks(text)
    summaries = []

    for chunk in chunks:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a summarization assistant."},
                {"role": "user", "content": f"Summarize the following text:\n{chunk}"}
            ]
        )
        summaries.append(response['choices'][0]['message']['content'])

    return "\n".join(summaries)

# Step 3: Extract key nodes and relationships using OpenAI with chunking
def extract_nodes_and_relationships(summary):
    chunks = split_text_into_chunks(summary)
    all_nodes = []
    all_relationships = []

    for chunk in chunks:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an information extraction assistant."},
                {"role": "user", "content": f"Extract the key nodes and relationships in structured format (JSON) from the following summary:\n{chunk}. Each node should have a name, and relationships should define source, target, and type."}
            ]
        )

        extracted = response['choices'][0]['message']['content']
        print(f"OpenAI Response:\n{extracted}\n")

        try:
            parsed_data = json.loads(extracted)
            nodes = parsed_data.get("nodes", [])
            relationships = parsed_data.get("relationships", [])
            all_nodes.extend(nodes)
            all_relationships.extend(relationships)
        except json.JSONDecodeError:
            print("Failed to decode OpenAI response as JSON. Response was:", extracted)

    return all_nodes, all_relationships

# Function to add nodes to Neo4j
def add_node(tx, node):
    # Check if 'name' or 'id' exists in the node
    node_name = node.get("name") or node.get("id")  # Use 'id' if 'name' is not present
    query = "CREATE (n:Node {name: $name})"
    tx.run(query, name=node_name)  # Use 'name' or 'id' as the node name


# Function to add relationships to Neo4j
def add_relationship(tx, relationship):
    relationship_type = relationship.get("type", "default_type")  # Default type if 'type' is missing
    query = (
        "MATCH (a:Node {name: $source}), (b:Node {name: $target}) "
        "CREATE (a)-[r:`$type`]->(b)"
    )
    tx.run(query, source=relationship["source"], target=relationship["target"], type=relationship_type)

# Function to populate Neo4j with nodes and relationships
def populate_neo4j(nodes, relationships, driver):
    with driver.session() as session:
        for node in nodes:
            session.write_transaction(add_node, node)
        for relationship in relationships:
            session.write_transaction(add_relationship, relationship)

# Main function
def main():
    set_openai_api_key()

    pdf_path = "/content/David Romer - Advanced Macroeconomics-McGraw-Hill Education (2018).pdf"  # Update with your PDF path

    # Extract text by chapter
    chapters = extract_text_from_pdf(pdf_path)

    # Neo4j connection
    uri = "neo4j+s://0aa8f456.databases.neo4j.io"
    username = "neo4j"
    password = "3MsV_P37NaEw0GjM4PLLoV4nQt01irr_wo48HOpd0L4"
    driver = GraphDatabase.driver(uri, auth=(username, password))

    for i, chapter_text in enumerate(chapters):
        print(f"Processing Chapter {i + 1}...")

        # Summarize the chapter text
        summary = summarize_text(chapter_text)
        print(f"Summary for Chapter {i + 1}: {summary}\n")

        # Extract nodes and relationships
        nodes, relationships = extract_nodes_and_relationships(summary)

        # Populate Neo4j with nodes and relationships
        populate_neo4j(nodes, relationships, driver)

    print("Knowledge graph created in Neo4j.")
    driver.close()

if __name__ == "__main__":
    main()



Processing Chapter 1...
Summary for Chapter 1: 

Processing Chapter 2...
Summary for Chapter 2: The text is about advanced macroeconomics, likely discussing more complex economic concepts and theories. It is the fifth edition of the book.

OpenAI Response:
```json
{
    "nodes": [
        {"name": "Advanced Macroeconomics", "type": "concept"},
        {"name": "Economic Concepts", "type": "sub-concept"},
        {"name": "Theories", "type": "sub-concept"},
        {"name": "Book", "type": "resource"},
        {"name": "Fifth Edition", "type": "edition"}
    ],
    "relationships": [
        {"source": "Advanced Macroeconomics", "target": "Economic Concepts", "type": "includes"},
        {"source": "Advanced Macroeconomics", "target": "Theories", "type": "includes"},
        {"source": "Book", "target": "Advanced Macroeconomics", "type": "features"},
        {"source": "Book", "target": "Fifth Edition", "type": "version"}
    ]
}
```

Failed to decode OpenAI response as JSON. Response w

  session.write_transaction(add_node, node)
  session.write_transaction(add_relationship, relationship)


[1;30;43mStreaming af output blev afkortet til de sidste 5000 linjer.[0m
Processing Chapter 256...
Summary for Chapter 256: The text discusses deriving the log-linearized equation of motion for capital, showing relationships between different variables and parameters such as growth rate, interest rate, and ratios on the balanced growth path. It also discusses redoing a regression analysis with different data sets and variables.

OpenAI Response:
```json
{
  "nodes": [
    {
      "id": "Capital",
      "label": "Capital"
    },
    {
      "id": "Growth Rate",
      "label": "Growth Rate"
    },
    {
      "id": "Interest Rate",
      "label": "Interest Rate"
    },
    {
      "id": "Ratios",
      "label": "Ratios"
    },
    {
      "id": "Log-linearized Equation",
      "label": "Log-linearized Equation"
    },
    {
      "id": "Balanced Growth Path",
      "label": "Balanced Growth Path"
    },
    {
      "id": "Regression Analysis",
      "label": "Regression Analysis"
    }