In [None]:
## TODO: add installation of the required libraries

# LLAMA INDEX RAG PIPELINE

Import, configuration and file loading are shared by all three frameworks (classic RAG, RAG with KG and RAG with ontologies). The "Query and response" block is repeated for each framework for simplicity, but the same block works for all of them.

## Import

In [2]:
import openai  
import os  
import fitz  
import nest_asyncio

from rdflib import Graph
from dotenv import load_dotenv

from llama_index.core import (
    Document, 
    VectorStoreIndex, 
    PropertyGraphIndex,
    SimpleDirectoryReader
)
from llama_index.llms.openai import OpenAI 

## Configuration

In [None]:
# Allow asyncio event loops to be nested.
nest_asyncio.apply()

# Pull the variables defined in a .env file into the process environment.
load_dotenv()

# Read the OpenAI API key from the environment; the value is None when the
# variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")

## File Loading

In [15]:
# Directory that holds the source documents. Set the RAG_DOCUMENT_DIR
# environment variable (e.g. in the .env file) to point somewhere else; the
# fallback is the location originally used by this notebook.
DOCUMENT_DIR = os.getenv(
    "RAG_DOCUMENT_DIR",
    "C:/Users/filow/OneDrive/Desktop/Phd/02_Secondo anno/PAPERS/03_Paper_CAiSE_Technical_2025_[Submitted]/OFF_CODE_01_25/document",
)

# Optional extension filter, e.g. [".pdf", ".md", ".txt", ".csv"].
# Leave the list empty to accept every file type.
required_exts = []

# Build the reader.
# NOTE: per the SimpleDirectoryReader documentation, `input_files` overrides
# `input_dir`; `input_dir` is only used if `input_files` is removed.
# An empty extension list is passed as None so that it means "no filter"
# rather than "no extension is allowed" when the directory is scanned.
reader = SimpleDirectoryReader(
    input_dir=DOCUMENT_DIR,
    input_files=[f"{DOCUMENT_DIR}/ont_sm.owl"],
    required_exts=required_exts or None,
)

# Load and parse the selected file(s) into llama_index documents.
document = reader.load_data(num_workers=4)

In [None]:
# Sanity check: print the path of the first file held by the reader.
first_file = str(reader.input_files[0])
print(first_file)

# CLASSIC RAG

## Pipeline set-up

In [6]:
# Build a vector index over the documents loaded in the file-loading cell.
index = VectorStoreIndex.from_documents(document)

# Settings for the OpenAI model that generates the answers.
llm_options = {
    "model": "gpt-4-turbo",
    "strict": True,
    "temperature": 0,
    "max_tokens": 2000,   # upper bound on the length of a generated answer
    "top_p": 0.8,
}
llm = OpenAI(**llm_options)

# Options for the query engine that answers questions from the index.
engine_options = {
    "llm": llm,
    "response_mode": "compact",
    "streaming": True,
    "similarity_top_k": 2,
    "verbose": True,
}
query_engine = index.as_query_engine(**engine_options)

## Query and response

In [None]:
# Ask the user to type a query.
user_query = input("Enter your query: ")

# Run the query and print the answer.
# NOTE: `query_engine` is whichever engine the most recently executed
# "Pipeline set-up" cell assigned.
response = query_engine.query(user_query)
print(f"Answer: {response}")

# RAG WITH KG

## Pipeline set-up

In [None]:
# Build a property graph index over the documents loaded in the file-loading cell.
index = PropertyGraphIndex.from_documents(document)

# Settings for the OpenAI model that generates the answers.
llm_options = {
    "model": "gpt-4-turbo",
    "strict": True,
    "temperature": 0,
    "max_tokens": 2000,   # upper bound on the length of a generated answer
    "top_p": 0.8,
}
llm = OpenAI(**llm_options)

# Options for the query engine that answers questions from the property graph index.
engine_options = {
    "llm": llm,
    "response_mode": "compact",
    "streaming": True,
    "similarity_top_k": 2,
    "verbose": True,
}
query_engine = index.as_query_engine(**engine_options)

## Query and response

In [None]:
# Ask the user to type a query.
user_query = input("Enter your query: ")

# Run the query and print the answer.
# NOTE: `query_engine` is whichever engine the most recently executed
# "Pipeline set-up" cell assigned.
response = query_engine.query(user_query)
print(f"Answer: {response}")

# ONTOLOGY RAG

## Pipeline set-up

In [None]:
# Parse the first file held by the reader into an in-memory RDF graph.
# format="xml" selects rdflib's RDF/XML parser, so the file must be
# serialized as RDF/XML.
g = Graph()
g.parse(str(reader.input_files[0]), format="xml")

# Flatten every (subject, predicate, object) triple of the graph into a dict.
nodes = [
    {"subject": str(s), "predicate": str(p), "object": str(o)}
    for s, p, o in g
]

# Turn each triple into one text Document.
documents_ontology = [
    Document(
        text=f"Subject: {node['subject']}, Predicate: {node['predicate']}, Object: {node['object']}"
    )
    for node in nodes
]

# Index the triple documents into a PropertyGraphIndex.
index_ontology = PropertyGraphIndex.from_documents(documents_ontology)

# OpenAI model used for response generation.
# (No trailing comma after the call: a trailing comma would turn `llm` into a
# one-element tuple instead of an OpenAI instance.)
llm = OpenAI(
    model="gpt-4-turbo",
    strict=True,
    temperature=0,
    max_tokens=2000,    # upper bound on the length of a generated answer
    top_p=0.8)

# Query engine over the ontology index. It uses the `llm` configured above,
# as the other set-up cells do, instead of `Settings.llm` (`Settings` is never
# imported in this notebook).
query_engine = index_ontology.as_query_engine(
    llm=llm,
    response_mode="compact",
    streaming=True,
    similarity_top_k=2,
    verbose=True)

## Query and response

In [None]:
# Ask the user to type a query.
user_query = input("Enter your query: ")

# Run the query and print the answer.
# NOTE: `query_engine` is whichever engine the most recently executed
# "Pipeline set-up" cell assigned.
response = query_engine.query(user_query)
print(f"Answer: {response}")
