In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from fastapi import FastAPI
from pydantic import BaseModel
from dotenv import load_dotenv
from chromadb.config import Settings
import time

In [None]:
# Directory name reserved for an on-disk Chroma store. The Chroma constructor
# call further down has its `persist_directory` argument commented out, so this
# value is currently not handed to the vector store.
persist_directory = "chroma_db"

# Load variables from a local .env file into the process environment
# (presumably the OpenAI credentials used by the clients below -- confirm).
load_dotenv()

# Disabled: explicit Chroma client settings bound to the directory above.
# settings = Settings(persist_directory=persist_directory)


In [None]:
# Embedding model used to turn chunks and queries into vectors.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Vector store backed by Chroma. The `persist_directory` and `client_settings`
# arguments are deliberately not passed, so no on-disk location is configured
# for this store.
vector_db = Chroma(embedding_function=embeddings)

# Retriever view over the store, using the library's default search settings.
retriever = vector_db.as_retriever()

# Chat model; temperature=0 asks for the least random sampling.
llm = ChatOpenAI(model="gpt-4o-mini-2024-07-18", temperature=0)


: 

In [None]:
# Ingest every PDF found in `pdf_dir`: load the pages, split them into
# overlapping chunks, and add the chunks to the vector store.
pdf_dir = "data"

# Sorted so the chunk order is reproducible (os.listdir returns entries in
# arbitrary order); the extension match is case-insensitive so "X.PDF" is
# picked up as well.
pdf_files = sorted(
    os.path.join(pdf_dir, f)
    for f in os.listdir(pdf_dir)
    if f.lower().endswith(".pdf")
)

# One Document per PDF page, accumulated across all files.
docs = []
for path in pdf_files:
    loader = PyPDFLoader(path)
    docs.extend(loader.load())

print("Start processing...")
# Chunks of up to 1000 characters, with 200 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(docs)

# Report only the number of chunks; printing the whole list floods the cell output.
print(len(texts))

if texts:
    print("Add to db...")
    # NOTE: no ids are supplied, so running this cell again on the same kernel
    # is expected to add the same chunks a second time.
    vector_db.add_documents(texts)
    # No explicit persist() call here: the `Chroma` class from `langchain_chroma`
    # does not provide one, so calling it raised AttributeError after the
    # embeddings had already been computed.
    print("Ingestione completata con successo.")
else:
    # Nothing to embed (no PDFs, or PDFs without extractable text): skip the
    # vector-store call instead of sending it an empty batch.
    print(f"No PDF content found in {pdf_dir!r}; nothing was added to the vector store.")


🔍 Caricamento PDF...
 → Carico: data\[RL1]IntroRL-1-8.pdf
✂️  Split in chunk...
📄 Totale chunk generati: 8

🔹 Esempi di chunk:

--- CHUNK 1 ---
Intelligenza Artificiale
a.a. 2024/2025
Reinforcement Learning

--- CHUNK 2 ---
Intelligenza Artificiale                                                                                                     a.a. 2024/2025
Libri di testo 
 Richard S. Sutton and Andrew G. Barto, Reinforcement 
Learning: An Introduction, Second Edition, MIT Press 
(disponibile online) 
 Reinforcem

--- CHUNK 3 ---
Intelligenza Artificiale                                                                                                     a.a. 2024/2025
Outline
 Introduzione al Reinforcement Learning
 Formalizzazione del problema
 Agente basato su RL
 Problematiche

⚙️  Setup embedding e Vector DB...

🚀 Calcolo embedding di tutti i chunk in batch e salvataggio nel vector DB...


In [None]:
# Sample question used to check what the retriever returns from the store.
query = "How are graphs used to improve reasoning or retrieval in GraphRAG?"

# Ask the retriever for the documents it ranks as most relevant to the question.
retrieved_docs = retriever.invoke(query)

# Print a numbered preview of each retrieved document.
for i, doc in enumerate(retrieved_docs):
    header = f"\n--- Document {i + 1} ---"
    preview = doc.page_content[:1000]  # show only the first 1000 characters
    print(header)
    print(preview)
