In [4]:
from semantic_router.encoders import HuggingFaceEncoder
from semantic_chunkers import StatisticalChunker

# Hugging Face model id of the embedding model handed to the encoder.
EMBEDDING_MODEL_NAME = "NeuML/pubmedbert-base-embeddings"

# The encoder produces the embeddings; the statistical chunker is built on top
# of it and is reused by the cells below.
encoder = HuggingFaceEncoder(name=EMBEDDING_MODEL_NAME)
chunker = StatisticalChunker(encoder=encoder)


In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

def generate_chunks(file_path: str, chunker)-> list:
    """Load a PDF and cut its text into chunks with ``chunker``.

    Args:
        file_path: Path of the PDF file to read.
        chunker: Callable invoked as ``chunker(docs=[text])``. The first
            element of its result is iterated, and every item in it must
            expose a ``splits`` sequence of strings.

    Returns:
        A list of ``Document`` objects, one per chunk, whose ``page_content``
        is that chunk's splits joined with single spaces. No metadata is set.
    """
    # NOTE(review): load_and_split() runs LangChain's default text splitter
    # before the pieces are joined back together below -- confirm that this
    # pre-split (and any overlap it introduces) is intended.
    pdf_parts = PyPDFLoader(file_path).load_and_split()
    full_text = "\n".join(part.page_content for part in pdf_parts)

    # A single document is submitted, so only the first result is consumed.
    doc_chunks = chunker(docs=[full_text])[0]

    return [
        Document(page_content=" ".join(piece.splits))
        for piece in doc_chunks
    ]

In [6]:
# Chunk the PDF with the chunker configured above.
chunked_data = generate_chunks(
    file_path="../../dataset/pdfs/anesthesia/3.pdf",
    chunker=chunker,
)

[32m2024-10-16 12:54:21 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.[0m


  0%|          | 0/1619 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
from langchain.graphs import Neo4jGraph
from dotenv import load_dotenv
import os 

# Load variables from a local .env file into the process environment so the
# credentials below can be read from it.
load_dotenv()

# Connection handle to the Neo4j database; username and password come from
# the environment (None when a variable is unset).
graph = Neo4jGraph(
    url="neo4j+s://2e177e83.databases.neo4j.io",
    username=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
)

In [14]:
import json

# Load the JSON payload that the following cell turns into graph documents.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which would mis-decode non-ASCII text on
# systems whose default encoding is not UTF-8).
with open('4.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [15]:
from langchain.graphs.graph_document import GraphDocument, Node, Relationship
from langchain.schema import Document

# Build one GraphDocument per entry of `data`, using the chunk at the same
# position in `chunked_data` as its source text.
#
# The pairing is purely positional, so fail up front with a clear message when
# `data` has more entries than there are chunks, rather than hitting a bare
# IndexError part-way through the loop. Chunks beyond len(data) are never read.
if len(data) > len(chunked_data):
    raise ValueError(
        f"{len(data)} graph entries were loaded but only {len(chunked_data)} "
        "chunks are available; entries are matched to chunks by position."
    )

graph_docs = []

for entry, source_chunk in zip(data, chunked_data):
    # Convert each raw node dict into a Node. A separate name is used for the
    # raw dict so the loop variable is not rebound to the object built from it.
    node_list = [
        Node(id=raw_node['id'], type=raw_node['type'], properties=raw_node['properties'])
        for raw_node in entry['nodes']
    ]

    # No relationships are attached here: every graph document is stored with
    # nodes only.
    relationship_list = []

    # Fresh Document carrying the chunk's text and metadata.
    source_doc = Document(
        page_content=source_chunk.page_content,
        metadata=source_chunk.metadata,
    )

    graph_docs.append(
        GraphDocument(
            nodes=node_list,
            relationships=relationship_list,
            source=source_doc,
        )
    )

In [17]:
# Write the assembled graph documents to Neo4j.
graph.add_graph_documents(
    graph_docs,
    include_source=True,   # also store each source chunk alongside its nodes
    baseEntityLabel=True,  # give every node the shared base entity label
)