In [46]:
import os
import re
from pathlib import Path

from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer


def chunk_by_page_tags(company_name, extra=100, data_dir="../data"):
    """
    Split a company's OCR text file into one chunk per page tag.

    Reads ``{data_dir}/{company_name}_ocr.txt`` and returns chunks where each
    chunk contains:
    - the <page_number>X</page_number> tag
    - all text until the next <page_number> tag
    - PLUS ``extra`` characters (default = 100) taken from the start of the
      following page, so consecutive chunks overlap

    Text that appears before the first page tag is not part of any chunk, and
    a file without any page tag yields an empty list.

    Parameters
    ----------
    company_name : str
        Prefix of the OCR file name (``<company_name>_ocr.txt``).
    extra : int, optional
        Number of characters to append after the end of each page.
    data_dir : str or path-like, optional
        Directory holding the OCR files. Defaults to ``"../data"``, the
        location that was previously hard-coded.

    Returns
    -------
    list of str
        One whitespace-stripped chunk per page tag, in document order.
    """
    ocr_path = Path(data_dir) / f"{company_name}_ocr.txt"
    text = ocr_path.read_text(encoding="utf-8")

    # Every page starts at its tag; it ends where the next tag starts
    # (or at the end of the document for the last page).
    pattern = r"<page_number>\s*\d+\s*</page_number>"
    page_starts = [match.start() for match in re.finditer(pattern, text)]
    page_ends = page_starts[1:] + [len(text)]

    chunks = [
        # Extend each page by `extra` characters, clamped to the document end.
        text[start:min(len(text), end + extra)].strip()
        for start, end in zip(page_starts, page_ends)
    ]

    print(f"succesfully created chunks for {company_name}")
    print(len(chunks))

    return chunks




# One list of page chunks per company report, using the default overlap.
chunks_meta = chunk_by_page_tags(company_name="meta")
chunks_nvidia = chunk_by_page_tags(company_name="nvidia")
chunks_google = chunk_by_page_tags(company_name="google")


succesfully created chunks for meta
69
succesfully created chunks for nvidia
37
succesfully created chunks for google
99


In [55]:
# Sentence-embedding model shared by indexing (below) and querying (rag_query).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

# Encode every page chunk; each result lines up index-for-index with its chunk list.
embeddings_meta = model.encode(chunks_meta)
embeddings_nvidia = model.encode(chunks_nvidia)
embeddings_google = model.encode(chunks_google)

In [70]:
# Connection settings are read from the environment when present so real
# credentials do not have to live in the notebook; the fallbacks reproduce the
# local instance this notebook was originally run against.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "neo4j://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "graphgraph"),
    ),
)

def populate_graph(company_name, driver, chunks, embeddings):
    """
    Store text chunks and their embeddings as ``Chunk`` nodes with a vector index.

    Opens a session on the database named ``company_name``, creates (if it does
    not exist yet) a cosine vector index that is also named ``company_name`` on
    ``Chunk.embedding``, then merges one ``Chunk`` node per position ``i`` with
    ``text = chunks[i]`` and ``embedding = embeddings[i]``.

    Parameters
    ----------
    company_name : str
        Used both as the database name and as the vector index name.
    driver
        Object exposing ``session(database=...)`` as a context manager
        (a Neo4j driver in this notebook).
    chunks : sequence of str
        Chunk texts.
    embeddings : sequence of vectors
        One vector per chunk. Objects exposing ``tolist()`` (such as numpy
        arrays) are converted to plain Python lists before being sent.

    Raises
    ------
    ValueError
        If ``chunks`` and ``embeddings`` do not have the same length. Without
        this check the Cypher lookup ``$embeddings[i]`` would quietly produce
        null embeddings for the surplus chunks.
    """
    # Send plain Python lists so the call does not depend on the driver
    # knowing how to serialize array types.
    if hasattr(embeddings, "tolist"):
        embeddings = embeddings.tolist()
    vectors = [
        vector.tolist() if hasattr(vector, "tolist") else list(vector)
        for vector in embeddings
    ]
    chunks = list(chunks)

    if len(chunks) != len(vectors):
        raise ValueError(
            f"chunks and embeddings differ in length: {len(chunks)} != {len(vectors)}"
        )

    # Size the index from the data; 384 (the previous fixed value) is kept as
    # the fallback when there is nothing to measure.
    dimensions = len(vectors[0]) if vectors else 384

    # The index name cannot be passed as a query parameter here, so it is
    # backtick-quoted (with embedded backticks doubled) before interpolation.
    index_name = "`" + company_name.replace("`", "``") + "`"

    with driver.session(database=company_name) as session:

        # Create vector index. consume() drains the result so the statement is
        # fully executed and any server-side failure is raised at this point.
        session.run(
            f"""
            CREATE VECTOR INDEX {index_name} IF NOT EXISTS
            FOR (c:Chunk)
            ON (c.embedding)
            OPTIONS {{
                indexConfig: {{
                    `vector.dimensions`: {dimensions},
                    `vector.similarity_function`: "cosine"
                }}
            }}
            """
        ).consume()

        # Insert data: MERGE on the chunk position keeps re-runs from
        # duplicating nodes; text and embedding are overwritten each time.
        session.run(
            """
            UNWIND range(0, size($chunks)-1) AS i
            MERGE (c:Chunk { index: i })
            SET c.text = $chunks[i],
                c.embedding = $embeddings[i]
            """,
            chunks=chunks,
            embeddings=vectors
        ).consume()



# Index the Google report in its own database.
populate_graph(
    "google",
    driver=driver,
    chunks=chunks_google,
    embeddings=embeddings_google,
)


In [71]:
# Index the remaining reports, NVIDIA first and then Meta.
populate_graph(
    "nvidia",
    driver=driver,
    chunks=chunks_nvidia,
    embeddings=embeddings_nvidia,
)
populate_graph(
    "meta",
    driver=driver,
    chunks=chunks_meta,
    embeddings=embeddings_meta,
)

In [76]:
def rag_query(question, model, driver, company_name, k=4):
    """
    Print the stored chunks most similar to ``question``.

    Encodes the question with ``model``, queries the vector index named
    ``company_name`` in the database of the same name, and prints each hit's
    text followed by its similarity score and chunk index.

    Parameters
    ----------
    question : str
        Free-text query.
    model
        Object whose ``encode(list_of_str)`` returns one vector per input
        string (the SentenceTransformer instance in this notebook).
    driver
        Object exposing ``session(database=...)`` as a context manager
        (a Neo4j driver in this notebook).
    company_name : str
        Name of both the database and the vector index to search.
    k : int, optional
        Number of nearest chunks to retrieve. Defaults to 4, the value that
        was previously fixed inside the function.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    question_embedding = model.encode([question])[0]
    # Send a plain Python list so the call does not depend on the driver
    # knowing how to serialize array types.
    if hasattr(question_embedding, "tolist"):
        question_embedding = question_embedding.tolist()

    # The index name travels as a query parameter instead of being formatted
    # into the Cypher text, so it cannot alter the statement.
    query = """
    CALL db.index.vector.queryNodes($index_name, $k, $question_embedding)
    YIELD node AS hits, score
    RETURN hits.text AS text, score, hits.index AS index
    """

    with driver.session(database=company_name) as session:
        result = session.run(
            query,
            index_name=company_name,
            question_embedding=question_embedding,
            k=k
        )
        # Materialize inside the session; the result is not read after it closes.
        records = list(result)

    for record in records:
        print(record["text"])
        print(record["score"], record["index"])
        print("======")

# Example retrieval against the Google report.
question = "emission scope 1 google table"
rag_query(
    question,
    model=model,
    driver=driver,
    company_name="google",
)

<page_number>105</page_number># Carbon intensity

| GHG emissions by type | Unit | Scope 1 | Scope 2 (market-based) | Scope 2 (location-based) |
|-----------------------|------|---------|------------------------|--------------------------|
| CO₂                   | tCO₂e | 50,700  | 3,027,400              | 11,207,600              |
| CH₄                   | tCO₂e | 100     | 4,200                 | 22,300                  |
| N₂O                   | tCO₂e | 200     | 8,500                 | 34,300                  |
| HFCs                  | tCO₂e | 22,100  | 19,000                | 19,000                  |
| Total                 | tCO₂e | 73,100  | 3,059,100             | 11,283,200              |

| GHG emissions by region | Unit | Scope 1 | Scope 2 (market-based) | Scope 2 (location-based) |
|-------------------------|------|---------|------------------------|--------------------------|
| North America           | tCO₂e | 53,600  | 1,161,200              | 8,293,200              