In [1]:
import os
import re

from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
# Connection settings are read from the environment when present so that
# credentials do not have to be edited into the notebook; the fallbacks are
# the original local-development values.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "graphgraph")
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Sentence-embedding model shared by the chunk-encoding and query cells.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:


def chunk_by_page_tags(company_name, extra=100, data_dir="../data"):
    """
    Split a company's OCR text file into one chunk per page.

    Reads ``{data_dir}/{company_name}_ocr.txt`` and returns chunks where each
    chunk contains:
    - the <page_number>X</page_number> tag
    - all text until the next <page_number> tag
    - PLUS up to `extra` characters (default = 100) of whatever follows, so
      consecutive chunks overlap slightly

    Text that appears before the first page tag is not part of any chunk, and
    a file without any page tag yields an empty list.

    Args:
        company_name: File-name prefix, e.g. "meta" for "meta_ocr.txt".
        extra: Number of characters past the page end to append to each chunk.
        data_dir: Directory that holds the OCR text files.

    Returns:
        list[str]: The whitespace-stripped chunks in document order.

    Raises:
        OSError: If the OCR file cannot be opened.
    """

    with open(f"{data_dir}/{company_name}_ocr.txt", "r", encoding="utf-8") as f:
        text = f.read()

    # Offset at which every page tag begins.
    pattern = r"<page_number>\s*\d+\s*</page_number>"
    page_starts = [match.start() for match in re.finditer(pattern, text)]

    # A page ends where the next tag starts; the last page runs to end of text.
    page_ends = page_starts[1:] + [len(text)]

    chunks = [
        # Clamp the overlap so it never reads past the end of the document.
        text[start:min(len(text), end + extra)].strip()
        for start, end in zip(page_starts, page_ends)
    ]
    print(f"succesfully created chunks for {company_name}")
    print(len(chunks))

    return chunks




# Build the per-page chunks for each company; the tuple is evaluated left to
# right, so the files are read (and the summaries printed) in the same order.
chunks_meta, chunks_nvidia, chunks_google = (
    chunk_by_page_tags("meta"),
    chunk_by_page_tags("nvidia"),
    chunk_by_page_tags("google"),
)


succesfully created chunks for meta
69
succesfully created chunks for nvidia
37
succesfully created chunks for google
99


In [10]:

# Embed every page chunk with the shared sentence-transformer model; the
# tuple is evaluated left to right, so the encoding order is unchanged.
embeddings_meta, embeddings_nvidia, embeddings_google = (
    model.encode(chunks_meta),
    model.encode(chunks_nvidia),
    model.encode(chunks_google),
)

In [11]:
# Rebinds `driver` using the bolt:// scheme instead of the neo4j:// scheme used
# when the notebook first connected. Settings can be overridden through the
# environment; the fallbacks are the original local-development values.
# NOTE(review): the previously bound driver is not closed before rebinding.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_BOLT_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "graphgraph"),
    ),
)

def _as_plain_lists(vectors):
    """Convert array-like embeddings into nested Python lists."""
    if hasattr(vectors, "tolist"):
        return vectors.tolist()
    return [v.tolist() if hasattr(v, "tolist") else list(v) for v in vectors]


def populate_graph(company_name, driver, chunks, embeddings):
    """
    Store text chunks and their embeddings as `Chunk` nodes and make sure a
    vector index over `Chunk.embedding` exists.

    Args:
        company_name: Used both as the target database name and as the name of
            the vector index. It is interpolated into the CREATE INDEX
            statement, so it must be a trusted, valid Cypher identifier.
        driver: Object exposing `session(database=...)` as a context manager
            whose sessions provide `run(query, **params)`.
        chunks: Sequence of chunk texts; position `i` becomes `Chunk.index`.
        embeddings: One embedding per chunk, in the same order. Array-like
            objects exposing `tolist()` are converted to plain lists.

    Raises:
        ValueError: If `chunks` and `embeddings` differ in length.
    """
    chunks = list(chunks)
    # Plain lists of floats avoid depending on the driver being able to
    # serialize array objects.
    # NOTE(review): array support in the driver is version-dependent; confirm.
    vectors = _as_plain_lists(embeddings)

    if len(chunks) != len(vectors):
        # Without this check the Cypher below would index past the shorter
        # list and leave some nodes without text or embedding.
        raise ValueError(
            f"chunks ({len(chunks)}) and embeddings ({len(vectors)}) "
            "must have the same length"
        )

    # Size the index from the data; 384 is the previous fixed value and is
    # kept as the fallback when there is nothing to measure.
    dimensions = len(vectors[0]) if vectors else 384

    with driver.session(database=company_name) as session:

        # Create vector index
        # consume() waits for the statement to finish so that a failure is
        # raised here. NOTE(review): relies on the Neo4j driver's documented
        # Result.consume() semantics.
        session.run(
            f"""
            CREATE VECTOR INDEX {company_name} IF NOT EXISTS
            FOR (c:Chunk)
            ON (c.embedding)
            OPTIONS {{
                indexConfig: {{
                    `vector.dimensions`: {dimensions},
                    `vector.similarity_function`: "cosine"
                }}
            }}
            """
        ).consume()

        # Insert data
        # MERGE on the chunk index keeps re-runs from duplicating nodes.
        session.run(
            """
            UNWIND range(0, size($chunks)-1) AS i
            MERGE (c:Chunk { index: i })
            SET c.text = $chunks[i],
                c.embedding = $embeddings[i]
            """,
            chunks=chunks,
            embeddings=vectors
        ).consume()



# Load Google's page chunks and embeddings into the "google" database.
populate_graph(
    "google",
    driver=driver,
    chunks=chunks_google,
    embeddings=embeddings_google,
)


In [12]:
# Load the remaining two companies, NVIDIA first and then Meta.
populate_graph(
    "nvidia",
    driver=driver,
    chunks=chunks_nvidia,
    embeddings=embeddings_nvidia,
)
populate_graph(
    "meta",
    driver=driver,
    chunks=chunks_meta,
    embeddings=embeddings_meta,
)

In [13]:
def rag_query(question, model, driver, company_name, k=4):
    """
    Print the chunks most similar to a question from a company's vector index.

    The question is embedded with `model`, the index named `company_name` in
    the database `company_name` is queried for the `k` nearest chunks, and for
    every hit the chunk text, its similarity score and its chunk index are
    printed, followed by a "======" separator line.

    Args:
        question: Question text to embed.
        model: Object whose `encode(list_of_texts)` returns one embedding per
            text.
        driver: Object exposing `session(database=...)` as a context manager
            whose sessions provide `run(query, **params)`.
        company_name: Name of both the database and the vector index.
        k: Number of nearest chunks to retrieve (default 4).

    Returns:
        None. Results are printed only.
    """

    question_embedding = model.encode([question])[0]
    # Send a plain list of floats rather than an array object.
    # NOTE(review): array support in the driver is version-dependent; confirm.
    if hasattr(question_embedding, "tolist"):
        question_embedding = question_embedding.tolist()

    # The index name is passed as a query parameter instead of being
    # formatted into the Cypher text.
    query = """
    CALL db.index.vector.queryNodes($index_name, $k, $question_embedding)
    YIELD node AS hits, score
    RETURN hits.text AS text, score, hits.index AS index
    """

    with driver.session(database=company_name) as session:
        result = session.run(
            query,
            index_name=company_name,
            question_embedding=question_embedding,
            k=k
        )
        # Materialize inside the session; records are read after it closes.
        records = list(result)

    for record in records:
        print(record["text"])
        print(record["score"], record["index"])
        print("======")

# Sample retrieval against the "meta" database; the matching chunks are
# printed by rag_query itself.
question = "Meta Scope 1 emissions data"
rag_query(
    question,
    model=model,
    driver=driver,
    company_name="meta",
)

<page_number>14</page_number>

# Understanding Emissions

Comprehensive data and advanced tooling provide the fundamental building blocks to fully understand and effectively manage our emissions.

Identifying the sources of our emissions enables us to prioritize and deploy interventions to reduce emissions where we can make the most meaningful progress on our path to net zero.

As Meta seeks to decarbonize our value chain, the data and tooling that drive our climate work will continue to evolve and improve, particularly given the challenge to accurately measure and influence emissions reduction deep in our entire value chain. We leverage our expertise in data science to create emissions management tools for our teams and improve the granularity, accuracy, and measurement of our GHG data.

## Forecasting Future-State Emissions

Our Net Zero Hub tool forecasts our future-state emissions based on numerous data inputs, such as planned business growth, to model emissions scenarios and provi