### We set up the connection

In [104]:
from langchain_community.graphs import Neo4jGraph

import os

# Connection settings are read from the environment when present so that real
# credentials never have to be stored in the notebook; the fallbacks are the
# local development values.
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

# Graph handle used to run the Cypher queries in this notebook.
graph = Neo4jGraph(url=url, username=username, password=password)

In [106]:
from langchain_ollama import OllamaEmbeddings

# Embedding client for the "llama3.2" model served by Ollama.
embeddings = OllamaEmbeddings(model="llama3.2")

In [107]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from typing import List, Dict

In [108]:
def _extract_keywords(text: str, top_n: int = 5) -> List[str]:

    import re
    from collections import Counter

    words = re.findall(r"\w+", text.lower())

    stop_words = set(
        [
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
        ]
    )
    filtered_words = [
        word for word in words if word not in stop_words and len(word) > 2
    ]

    return [word for word, count in Counter(filtered_words).most_common(top_n)]

In [109]:
def load_and_process_pdf(
    pdf_path: str, chunk_size: int = 1000, chunk_overlap: int = 200
) -> List[Dict]:
    """Load a PDF, split it into overlapping chunks and describe each chunk.

    Args:
        pdf_path: Path of the PDF file to load.
        chunk_size: Target chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        One dict per chunk with the keys ``"text"`` (the chunk content) and
        ``"metadata"``. The metadata holds ``chunk_id`` (position in the split
        sequence), ``source``, ``page_number`` (the ``"page"`` entry of the
        chunk's own metadata, or ``None``), ``total_length``, ``keywords`` and
        ``text_preview`` (the first 100 characters followed by ``"..."`` when
        the chunk is longer than that).
    """
    documents = PyPDFLoader(pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )

    def describe(index, piece):
        # Build the record for a single split document.
        body = piece.page_content
        if len(body) > 100:
            preview = body[:100] + "..."
        else:
            preview = body
        return {
            "text": body,
            "metadata": {
                "chunk_id": index,
                "source": pdf_path,
                "page_number": piece.metadata.get("page", None),
                "total_length": len(body),
                "keywords": _extract_keywords(body),
                "text_preview": preview,
            },
        }

    return [
        describe(index, piece)
        for index, piece in enumerate(splitter.split_documents(documents))
    ]

In [112]:
# PDF to ingest; the bare file name is resolved against the working directory.
pdf_path = "Grokking Deep Reinforcement Learning by Miguel Morales 1.pdf"


# Split the whole document into chunk dicts ({"text": ..., "metadata": ...}).
chunks = load_and_process_pdf(pdf_path)

print(f"Total chunks created: {len(chunks)}")

# Print the preview, keywords and page of the first three chunks as a sanity check.
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i}:")
    print(f"Text Preview: {chunk['metadata']['text_preview']}")
    print(f"Keywords: {chunk['metadata']['keywords']}")
    print(f"Page Number: {chunk['metadata']['page_number']}")

Total chunks created: 1281

Chunk 0:
Text Preview: grokking
Deep 
Reinforcement 
Learning
Keywords: ['grokking', 'deep', 'reinforcement', 'learning']
Page Number: 2

Chunk 1:
Text Preview: grokking
Deep 
Reinforcement 
Learning
Miguel Morales
Foreword by Charles Isbell, Jr.
MANNING
Shelte...
Keywords: ['grokking', 'deep', 'reinforcement', 'learning', 'miguel']
Page Number: 4

Chunk 2:
Text Preview: For online information and ordering of this and other Manning books, please visit
www  .manning  .co...
Keywords: ['manning', 'this', 'publications', 'designations', 'information']
Page Number: 5


### We use only a subset of chunks and create a knowledge graph

In [88]:
def create_graph_from_chunks(chunks: List[Dict]):
    """Rebuild the graph from ``chunks``.

    Every node (with its relationships) currently in the database is deleted
    first. Then one ``Chunk`` node is merged per chunk, keyed by ``chunk_id``,
    and each of its keywords is merged as a ``Keyword`` node linked through a
    ``HAS_KEYWORD`` relationship.

    Args:
        chunks: Dicts with a ``"text"`` entry and a ``"metadata"`` entry
            providing ``chunk_id``, ``source``, ``page_number``,
            ``total_length``, ``text_preview`` and ``keywords``.
    """
    # Wipes the entire database, not only previously created chunks.
    graph.query("MATCH (n) DETACH DELETE n")

    create_chunk_query = """
    MERGE (chunk:Chunk {chunk_id: $chunk_id})
    ON CREATE SET 
        chunk.source = $source,
        chunk.page_number = $page_number,
        chunk.total_length = $total_length,
        chunk.text_preview = $text_preview,
        chunk.full_text = $full_text
    
    // Create keyword nodes and relationships
    WITH chunk
    UNWIND $keywords AS keyword
    MERGE (kw:Keyword {name: keyword})
    MERGE (chunk)-[:HAS_KEYWORD]->(kw)
    
    RETURN chunk
    """

    # One query per chunk; the properties are only written when the node is new.
    for entry in chunks:
        meta = entry["metadata"]
        graph.query(
            create_chunk_query,
            params={
                "chunk_id": meta["chunk_id"],
                "source": meta["source"],
                "page_number": meta["page_number"],
                "total_length": meta["total_length"],
                "text_preview": meta["text_preview"],
                "full_text": entry["text"],
                "keywords": meta["keywords"],
            },
        )


# Only the first 200 chunks are written to the graph.
create_graph_from_chunks(chunks[:200])

In [None]:
# Count the Chunk nodes to confirm that the load produced what was expected.
verification_query = """
MATCH (c:Chunk)
RETURN count(c) as chunk_count
"""
result = graph.query(verification_query)
print(f"Number of chunks created: {result[0]['chunk_count']}")

# Fetch a single chunk together with the names of its linked keywords.
sample_query = """
MATCH (c:Chunk)-[:HAS_KEYWORD]->(k:Keyword)
WITH c, collect(k.name) as keywords
RETURN c.chunk_id, c.text_preview, keywords
LIMIT 1
"""
result = graph.query(sample_query)
print("\nSample chunk:", result[0])

Number of chunks created: 200

Sample chunk: {'c.chunk_id': 0, 'c.text_preview': 'grokking\nDeep \nReinforcement \nLearning', 'keywords': ['learning', 'deep', 'reinforcement', 'grokking']}


In [90]:
# Enforce a single Chunk node per chunk_id; IF NOT EXISTS lets the cell be re-run.
graph.query(
    """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE
"""
)

# Vector length used for the vector index and for padding/truncating embeddings.
# NOTE(review): presumably the output size of the llama3.2 embedding model - confirm.
embedding_dim = 3072

In [91]:
def generate_embedding(text: str) -> List[float]:
    """Embed ``text`` as a unit-length vector of exactly ``embedding_dim`` floats.

    The raw model output is first padded with zeros or truncated to
    ``embedding_dim`` and only then L2-normalised, so the result has unit
    length even when truncation was required.

    Args:
        text: Text to embed.

    Returns:
        A list of ``embedding_dim`` floats. If anything fails, the error is
        printed and an all-zero vector of the same length is returned.
    """
    try:
        embedding = [float(x) for x in embeddings.embed_query(text)]

        # Resize before normalising: truncating an already normalised vector
        # would leave it shorter than unit length.
        if len(embedding) < embedding_dim:
            embedding.extend([0.0] * (embedding_dim - len(embedding)))
        elif len(embedding) > embedding_dim:
            embedding = embedding[:embedding_dim]

        magnitude = sum(x * x for x in embedding) ** 0.5
        # A zero vector cannot be normalised; it is returned unchanged.
        if magnitude > 0:
            embedding = [x / magnitude for x in embedding]

        return embedding

    except Exception as e:
        # Deliberate best-effort fallback: callers always receive a vector of
        # the expected length, and the failure is reported on stdout.
        print(f"Error generating embedding: {e}")
        return [0.0] * embedding_dim

In [92]:
def create_vector_index(chunks: List[Dict]):
    """Recreate ``chunk_vector_index`` and store an embedding on each chunk node.

    The index is dropped if it exists and created again on the ``embedding``
    property of ``Chunk`` nodes with ``embedding_dim`` dimensions and cosine
    similarity. The chunks are then embedded and written back in batches of
    ten, matched by ``chunk_id``, with a progress line printed per batch.

    Args:
        chunks: Dicts with a ``"text"`` entry and a ``"metadata"`` entry that
            contains ``chunk_id``.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    try:
        graph.query(
            """
            DROP INDEX chunk_vector_index IF EXISTS 
        """
        )

        graph.query(
            """
            CALL db.index.vector.createNodeIndex(
                'chunk_vector_index',
                'Chunk',
                'embedding',
                $dim,
                'cosine'
            )
            """,
            params={"dim": embedding_dim},
        )

        # The update statement is the same for every batch.
        batch_update_query = """
            UNWIND $batch AS item
            MATCH (chunk:Chunk {chunk_id: item.chunk_id})
            SET chunk.embedding = item.embedding
            """

        def embed_row(item):
            # Pair a chunk's id with its freshly computed embedding.
            vector = generate_embedding(item["text"])
            return {"chunk_id": item["metadata"]["chunk_id"], "embedding": vector}

        batch_size = 10
        total = len(chunks)
        done = 0

        for start in range(0, total, batch_size):
            group = chunks[start : start + batch_size]
            rows = [embed_row(item) for item in group]

            graph.query(batch_update_query, params={"batch": rows})

            done += len(group)
            print(f"Processed {done}/{total} chunks")

    except Exception as e:
        print(f"Error creating vector index: {e}")
        raise


# Index and embed the same 200 chunks that were loaded into the graph; a
# failure is reported here instead of stopping the notebook.
try:
    create_vector_index(chunks[:200])
except Exception as e:
    print(f"Failed to create vector index: {e}")

Processed 10/200 chunks
Processed 20/200 chunks
Processed 30/200 chunks
Processed 40/200 chunks
Processed 50/200 chunks
Processed 60/200 chunks
Processed 70/200 chunks
Processed 80/200 chunks
Processed 90/200 chunks
Processed 100/200 chunks
Processed 110/200 chunks
Processed 120/200 chunks
Processed 130/200 chunks
Processed 140/200 chunks
Processed 150/200 chunks
Processed 160/200 chunks
Processed 170/200 chunks
Processed 180/200 chunks
Processed 190/200 chunks
Processed 200/200 chunks


In [93]:
def verify_vector_index():
    """Return the ``SHOW INDEXES`` rows describing ``chunk_vector_index``.

    Each row carries the index name, type, labels, properties and options.
    """
    index_lookup = """
    SHOW INDEXES
    YIELD name, type, labelsOrTypes, properties, options
    WHERE name = 'chunk_vector_index'
    """
    rows = graph.query(index_lookup)
    return rows


def vector_search(query: str, top_k: int = 3, min_score: float = 0.7) -> List[Dict]:
    """Return the chunks most similar to ``query`` using ``chunk_vector_index``.

    The query text is embedded and the vector index is asked for its ``top_k``
    nearest ``Chunk`` nodes, so the lookup no longer scans every chunk. Hits
    whose score is not above ``min_score`` are dropped.

    Args:
        query: Natural-language search text.
        top_k: Maximum number of chunks to return; values below 1 give ``[]``.
        min_score: Exclusive lower bound on the similarity score reported by
            the index.

    Returns:
        Dicts with ``chunk_id``, ``source``, ``page_number``, ``text_preview``,
        ``full_text``, ``total_length`` and ``score``, best match first. An
        empty list is returned when nothing qualifies or when an error occurs
        (the error is printed).
    """
    if top_k <= 0:
        # The index procedure needs a positive neighbour count.
        return []

    try:
        query_embedding = embeddings.embed_query(query)

        search_query = """
        CALL db.index.vector.queryNodes('chunk_vector_index', $limit, $embedding)
        YIELD node AS c, score
        WHERE score > $min_score
        RETURN
            c.chunk_id AS chunk_id,
            c.source AS source,
            c.page_number AS page_number,
            c.text_preview AS text_preview,
            c.full_text AS full_text,
            c.total_length AS total_length,
            score
        ORDER BY score DESC
        LIMIT $limit
        """

        results = graph.query(
            search_query,
            params={
                "embedding": query_embedding,
                "limit": top_k,
                "min_score": min_score,
            },
        )

        return results

    except Exception as e:
        print(f"Vector search error: {e}")
        return []

In [95]:
# Show the index definition, then run a sample similarity search.
print(verify_vector_index())
results = vector_search("what is the definition of deep reinforcement learning")

# Bare expression so that the notebook displays the hits.
results

[{'name': 'chunk_vector_index', 'type': 'VECTOR', 'labelsOrTypes': ['Chunk'], 'properties': ['embedding'], 'options': {'indexProvider': 'vector-2.0', 'indexConfig': {'vector.hnsw.m': 16, 'vector.hnsw.ef_construction': 100, 'vector.dimensions': 3072, 'vector.similarity_function': 'COSINE', 'vector.quantization.enabled': True}}}]


[{'chunk_id': 0,
  'source': 'Grokking Deep Reinforcement Learning by Miguel Morales 1.pdf',
  'page_number': 2,
  'text_preview': 'grokking\nDeep \nReinforcement \nLearning',
  'full_text': 'grokking\nDeep \nReinforcement \nLearning',
  'total_length': 38,
  'score': 0.8552869558334351},
 {'chunk_id': 97,
  'source': 'Grokking Deep Reinforcement Learning by Miguel Morales 1.pdf',
  'page_number': 49,
  'text_preview': '26 Chapter 1  I  Introduction to deep reinforcement learning\nThe examples in these chapters are repe...',
  'full_text': '26 Chapter 1  I  Introduction to deep reinforcement learning\nThe examples in these chapters are repeated throughout agents of the same type to make \ncomparing and contrasting agents more accessible. You still explore fundamentally different \nkinds of problems, from small, continuous to image-based state spaces, and from discrete to \ncontinuous action spaces. But, the book’s focus isn’t about modeling problems, which is a \nskill of its own; inst

In [96]:
from langchain_community.vectorstores import Neo4jVector

# Wrap the existing Chunk nodes as a LangChain vector store: it is pointed at
# the same index name, reads text from `full_text` and vectors from `embedding`.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=embeddings,  
    url=url,
    username=username,
    password=password,
    index_name='chunk_vector_index',  
    node_label='Chunk',  
    text_node_properties=['full_text'], 
    embedding_node_property='embedding'
)

In [97]:
# Retriever view of the vector store; it supplies the context for the RAG chain.
retriever = neo4j_vector_store.as_retriever()

In [111]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Initialize the Ollama model
llm = Ollama(model="llama3.2")

# Prompt that restricts the answer to the supplied context, caps it at three
# sentences and asks for a fixed closing phrase.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

# Pipeline: the question is passed through unchanged and also sent to the
# retriever, whose documents are joined into the context; the filled prompt
# goes to the LLM and the output is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# Sample question; as the last expression its answer is displayed by the notebook.
rag_chain.invoke("what is a markov decision process?")

'A Markov Decision Process (MDP) is a mathematical model used in decision-making problems under uncertainty, characterized by its state space S, action space A, transition function T, reward signal R, initial state distribution Sθ, discount factor γ, and horizon H.\n\nThanks for asking!'