In [1]:
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain


from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough

In [2]:
# Load variables from a .env file into the process environment before any of
# the clients below are created (presumably that is where their API keys live
# -- confirm against the project's .env).
load_dotenv()

# Folder scanned for source PDFs by the loading cell.
directory_path = "pdfs"

# Splitter used to chunk the loaded PDF documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=24,
)

# Embedding model handed to the vector store.
emb_model = MistralAIEmbeddings(model="mistral-embed")

# Chat model; it is also the model passed to LLMGraphTransformer further down.
# NOTE(review): a non-zero temperature makes graph extraction non-deterministic
# between runs -- confirm 0.7 is intended for that use.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-0125",
    temperature=0.7,
)

In [3]:
# Collect every PDF in `directory_path`, load it, and split the result into chunks.
#
# os.listdir() returns names in arbitrary order, so the listing is sorted to keep
# the order of `all_docs` / `documents` reproducible across runs and machines.
# The extension check is case-insensitive so files such as "REPORT.PDF" are not
# silently skipped.
pdf_files = sorted(f for f in os.listdir(directory_path) if f.lower().endswith('.pdf'))

all_docs = []
for pdf_file in pdf_files:
    full_path = os.path.join(directory_path, pdf_file)
    # load() returns a list of documents for this file; keep them all, in order.
    docs = PyPDFLoader(file_path=full_path).load()
    all_docs.extend(docs)

# `all_docs` stays unsplit (the graph cell consumes it); `documents` holds the
# chunked copy that goes to the vector store.
documents = text_splitter.split_documents(all_docs)
# Presumably strips metadata values the vector store cannot persist -- confirm
# against the filter_complex_metadata documentation.
documents = filter_complex_metadata(documents)

In [4]:
# build the graph database
# Neo4jGraph() takes no arguments here, so its connection settings must come
# from outside this cell (presumably environment variables loaded by
# load_dotenv() -- confirm).
graph = Neo4jGraph()

# The chat model from the config cell extracts the graph structure from each document.
llm_transformer = LLMGraphTransformer(llm=llm)

# NOTE(review): extraction runs on the unsplit `all_docs`, not on the chunked
# `documents` -- confirm that is intended.
graph_documents = llm_transformer.convert_to_graph_documents(all_docs)

# Write the extracted graph documents to Neo4j with both options switched on.
graph.add_graph_documents(graph_documents, include_source=True, baseEntityLabel=True)

In [5]:
# build the standard vector database
#
# Chroma.from_documents() adds to whatever collection already lives in the
# persist directory, so re-running this cell (or Restart & Run All) would
# embed and store every chunk again, leaving duplicates in the store.
# Reuse the persisted store when one is present; delete the directory to
# force a rebuild after the PDFs change or after an interrupted first run.
chroma_dir = os.path.join(directory_path, "chroma_db")  # same location as before

if os.path.isdir(chroma_dir) and os.listdir(chroma_dir):
    # Existing on-disk store: open it with the same embedding model, no re-embedding.
    vector_store = Chroma(
        persist_directory=chroma_dir,
        embedding_function=emb_model,
    )
else:
    # First run (or directory removed): embed all chunks and persist them.
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=emb_model,
        persist_directory=chroma_dir,
    )