# LANGCHAIN RAG PIPELINE

In this implementation, classic RAG and Ontology RAG share the same pipeline steps and are therefore implemented together, whereas RAG over a knowledge graph (KG) requires its own, separate block of code.

# CLASSIC RAG AND ONTOLOGY RAG

## Import

In [3]:
import os
import json
import nest_asyncio

from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.vectorstores import FAISS  
from langchain.chains import RetrievalQA 
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.document_loaders import (
    TextLoader, 
    PyPDFLoader, 
    CSVLoader, 
    JSONLoader 
)

## Configuration

In [None]:
# Populate the process environment from a local .env file, if one exists.
load_dotenv()

# Patch asyncio so that nested event loops are allowed.
nest_asyncio.apply()

# OpenAI API key read from the environment (None when the variable is unset).
openai_api_key = os.environ.get("OPENAI_API_KEY")

## File loading

In [4]:
# Path of the document to index. Set the RAG_DOCUMENT_PATH environment
# variable (for example in the .env file) to point at your own file; when it
# is unset, the original hard-coded location is used.
document_path = os.getenv(
    "RAG_DOCUMENT_PATH",
    'C:/Users/filow/OneDrive/Desktop/Phd/02_Secondo anno/PAPERS/03_Paper_CAiSE_Technical_2025_[Submitted]/OFF_CODE_01_25/document/caise2025_paper_238.pdf',
)

# Load the PDF document with PyPDFLoader. For other formats, replace this
# line with one of the alternative loaders below.
loader = PyPDFLoader(document_path)
# loader = TextLoader('path/to/your/directory/document.txt')
# loader = CSVLoader('path/to/your/directory/document.csv')
# JSONLoader also needs a jq expression that selects the content to load:
# loader = JSONLoader('path/to/your/directory/document.json', jq_schema='.', text_content=False)

# `document` holds the Document objects produced by the loader.
document = loader.load()


### Parsing, Embedding, Indexing

In [5]:
# Break the loaded document into overlapping chunks: at most 1000 characters
# each, with 200 characters shared between consecutive chunks so that context
# is not lost at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(document)

# An index cannot be built from zero chunks (e.g. a PDF with no extractable
# text); fail here with a clear message instead of deep inside FAISS.
if not docs:
    raise ValueError(
        "No text chunks were produced from the document; nothing to index."
    )

# Create embeddings using the "text-embedding-3-large" model, with the same
# API key that is handed to the chat model below.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=openai_api_key,
)

# Embed every chunk and store the vectors in a FAISS index for similarity search.
vectorstore = FAISS.from_documents(docs, embeddings)

# Chat model used to generate the final answer.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,          # 0 = most deterministic; OpenAI accepts 0 to 2
    max_tokens=2000,        # upper bound on the tokens generated per answer
    top_p=0.8,              # nucleus-sampling probability mass, from 0 to 1
    api_key=openai_api_key,
)

# RetrievalQA chain: fetch relevant chunks from the vector store, then let
# the LLM answer the question from them.
qa_chain = RetrievalQA.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
)

## Query and response

In [6]:
# Ask the user for a question and run it through the retrieval QA chain.
user_query = input("Enter your query: ")
response = qa_chain.invoke(user_query)

# The chain returns a dict; the generated answer is stored under "result".
# Print it explicitly, because an assignment alone produces no cell output.
print(response["result"])

# KG RAG

## Import

In [1]:
import os

from dotenv import load_dotenv
from neo4j import GraphDatabase

from langchain_openai import ChatOpenAI 
from langchain.chains import GraphCypherQAChain 
from langchain.graphs import Neo4jGraph 
from langchain.prompts import PromptTemplate 

## Configuration and KG loading

In [None]:
# Populate the process environment from a .env file.
load_dotenv()

# API key, database endpoint and credentials, all read from the environment
# (each is None when its variable is unset).
openai_api_key = os.environ.get("OPENAI_API_KEY")
URI = os.environ.get("NEO4J_URI")
neo4j_username = os.environ.get("NEO4J_USERNAME")
neo4j_password = os.environ.get("NEO4J_PASSWORD")

# (username, password) pair for Neo4j authentication.
# NOTE(review): AUTH is not referenced by any code shown here; presumably it
# is meant for the raw neo4j driver -- confirm or remove.
AUTH = (neo4j_username, neo4j_password)

# Connect to the database through LangChain's Neo4jGraph wrapper, with the
# enhanced schema option switched on.
graph = Neo4jGraph(
    enhanced_schema=True,
    url=URI,
    username=neo4j_username,
    password=neo4j_password,
)

# Template Cypher statement that loads a CSV file and is meant to create
# nodes/relationships from its rows. Comment out this assignment and the
# graph.query call below if the graph is already populated.
# NOTE(review): as written the statement contains only LOAD CSV followed by
# comments and a placeholder URL; presumably Neo4j rejects it until real MERGE
# clauses and a real URL are filled in -- confirm before running.
cypher_query = """
LOAD CSV WITH HEADERS FROM 'specific URL to load CSV file' AS row

// Replace the following MERGE clauses with your specific data modeling logic.
// Use MERGE to create nodes and relationships based on the data in the CSV.
// Add your domain-specific MERGE clauses here
"""
graph.query(cypher_query)

## Pipeline set-up

In [3]:
# Chat model shared by the chain for Cypher generation and answer writing.
llm = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-4o",
    temperature=0,          # 0 = most deterministic; OpenAI accepts 0 to 2
    max_tokens=2000,        # upper bound on the tokens generated per answer
    top_p=0.8,              # nucleus-sampling probability mass, from 0 to 1
)

# Placeholder prompt: the template text is deliberately empty so that it can
# be filled in later for a specific use case.
chat_prompt = PromptTemplate(
    template="",
    input_variables=["user_input"],
)

# Chain that turns a natural-language question into Cypher, runs it against
# the Neo4j graph and phrases the answer.
# NOTE(review): from_llm documents `qa_prompt` and `cypher_prompt` as its
# prompt parameters; confirm that the `prompt` keyword below is actually
# picked up rather than silently dropped.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    prompt=chat_prompt,
    top_k=5,                         # cap on the number of query results used
    validate_cypher=True,            # check the generated Cypher before running it
    return_intermediate_steps=True,  # expose intermediate steps in the output
    verbose=True,                    # log the chain's progress
    allow_dangerous_requests=True,   # explicit opt-in: LLM-written Cypher is executed
)

## Query and response

In [None]:
# Ask the user for a question and run it through the graph QA chain.
user_query = input("Enter your query: ")
response = chain.invoke(user_query)

# The chain returns a dict; the generated answer is stored under "result".
# Print it explicitly, because an assignment alone produces no cell output.
print(response["result"])