# Set up

In [1]:
import itertools

import pandas as pd
from dotenv import load_dotenv
from llama_index.core import Document
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine # build a query engine around a specific retriever
from llama_index.core.settings import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Read the cleaned plain-text corpus into memory
data_path = "../data/plain_text/plain_text.txt"

with open(data_path, encoding="utf-8") as text_file:
    content = text_file.read()

# Toggle for exporting result tables to CSV
export_csv = False

# Load variables from a .env file (presumably the OpenAI API key -- confirm)
load_dotenv()

# Default LLM for LlamaIndex; temperature 0 is requested for the answers
llm = OpenAI(model="gpt-4o-mini", temperature=0)
Settings.llm = llm

# Default embedding model for LlamaIndex
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Define the functions that build the score DataFrame

In [2]:
# Auxiliary function
def get_responses_similarity(query_str, query_engine, top_k):
    """Run a query and collect the source nodes the query engine returns.

    Args:
        query_str: Question sent to the query engine.
        query_engine: Object exposing ``query(str)``; the returned response
            must expose ``source_nodes``, each item having ``.node.get_content()``
            and ``.score``.
        top_k: Number of nodes requested from the retriever. Kept for
            interface compatibility; the number of rows follows the nodes
            that were actually returned.

    Returns:
        dict of equal-length lists with the keys ``query``, ``position_node``
        (1-based rank), ``node_content``, ``node_score`` and ``response``
        (the response object, repeated once per node).
    """
    response = query_engine.query(query_str)
    source_nodes = list(response.source_nodes)

    # Size the repeated columns by the nodes actually returned, not by top_k:
    # a retriever may return fewer nodes than requested, and columns of
    # different lengths cannot be turned into a DataFrame.
    n_nodes = len(source_nodes)

    return {
        "query": [query_str] * n_nodes,
        "position_node": list(range(1, n_nodes + 1)),
        "node_content": [hit.node.get_content() for hit in source_nodes],
        "node_score": [hit.score for hit in source_nodes],
        "response": [response] * n_nodes,
    }

def create_score_DataFrame(
    content_text,
    chunk_sizes,
    chunk_overlap = 20,
    top_k = 10,
    save_storage = False,
    load_storage = False,
    storage_path = "../data/index_storage/",
    queries = None
):
    """Collect retriever scores for several chunk sizes in one tidy DataFrame.

    For every chunk size the text is split into nodes, indexed in a
    ``VectorStoreIndex`` and queried with each entry of ``queries``.

    Args:
        content_text: Raw text to index; surrounding whitespace is stripped.
        chunk_sizes: Iterable of chunk sizes to test.
        chunk_overlap: Overlap passed to the node parser.
        top_k: Number of nodes the retriever is asked for per query.
        save_storage: Persist each freshly built index to ``storage_path``.
        load_storage: Load the index from ``storage_path`` instead of building
            it. Ignored when ``save_storage`` is also set, because the index
            is then rebuilt and persisted anyway.
        storage_path: Directory used for persisting / loading the index.
        queries: Questions to evaluate; ``None`` means no queries.

    Returns:
        pandas.DataFrame with the columns ``query``, ``position_node``,
        ``node_content``, ``node_score``, ``response``, ``chunk_size`` and
        ``chunk_overlap``. It is empty when there is nothing to evaluate.
    """
    score_columns = [
        "query", "position_node", "node_content", "node_score",
        "response", "chunk_size", "chunk_overlap",
    ]
    chunk_sizes = list(chunk_sizes)
    queries = list(queries) if queries is not None else []

    # Nothing to evaluate: return early, before any embedding call is made
    # (pd.concat would otherwise fail on an empty list at the very end).
    if not chunk_sizes or not queries:
        return pd.DataFrame(columns=score_columns)

    # Use the function argument, not the notebook-level `content` variable.
    pdf_doc = Document(text=content_text.strip())

    # The embedding model does not depend on the chunk size, so set it once.
    Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

    all_rows = []
    for chunk_size in chunk_sizes:
        if load_storage and not save_storage:
            # NOTE: every chunk size reads the same directory, so the loaded
            # index reflects whatever chunking was persisted there.
            storage_context = StorageContext.from_defaults(persist_dir=storage_path)
            index = load_index_from_storage(storage_context)
        else:
            parser = SimpleNodeParser.from_defaults(
                chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            nodes = parser.get_nodes_from_documents([pdf_doc])
            index = VectorStoreIndex(nodes)
            if save_storage:
                # NOTE: each chunk size overwrites the previous one on disk.
                index.storage_context.persist(persist_dir=storage_path)

        # Retriever and query engine for this chunk size
        retriever = index.as_retriever(similarity_top_k=top_k)
        query_engine_VSI = RetrieverQueryEngine.from_args(retriever=retriever)

        for query in queries:
            query_info = get_responses_similarity(
                query, query_engine=query_engine_VSI, top_k=top_k
            )
            chunk_df = pd.DataFrame(query_info)
            # Scalars are broadcast to however many rows were returned.
            chunk_df["chunk_size"] = chunk_size
            chunk_df["chunk_overlap"] = chunk_overlap
            all_rows.append(chunk_df)

    return pd.concat(all_rows, ignore_index=True)

# Test on different chunk sizes

In [3]:
# Chunk sizes under test and the benchmark questions sent to the retriever
chunk_sizes = [200, 300, 500]
queries = [
    "What does collect() function do in pyspark?",
    "What is the role of the Driver component?",
    "What does a 'broadcast join' mean?",
    "How does persist() differ from cache() in PySpark?"
]

# Score every chunk size, asking the retriever for 5 nodes per query
score_df = create_score_DataFrame(content, chunk_sizes, top_k=5, queries=queries)

In [4]:
# Write the per-node scores to CSV only when the export_csv flag is enabled.
# TODO(review): the original author marked this export as pending review.
if export_csv:
    score_df.to_csv("../data/scores_retriever/scores_by_chunksize.csv", sep = ";")

### Resumen de resultados

In [5]:
def analyze_score_retriever(df):
    """Summarise node scores per chunk size.

    Args:
        df: DataFrame with the columns ``chunk_size``, ``position_node``
            and ``node_score``.

    Returns:
        DataFrame sorted by ``chunk_size`` with ``mean_score_all_positions``
        (mean over every retrieved node) and ``mean_score_top1`` (mean over
        the rows whose ``position_node`` equals 1).
    """
    # Mean score over every retrieved position
    overall_mean = (
        df.groupby("chunk_size")["node_score"]
        .mean()
        .rename("mean_score_all_positions")
        .reset_index()
    )

    # Mean score of the best-ranked node only
    is_top_hit = df["position_node"] == 1
    top_hit_mean = (
        df.loc[is_top_hit]
        .groupby("chunk_size")["node_score"]
        .mean()
        .rename("mean_score_top1")
        .reset_index()
    )

    summary = overall_mean.merge(top_hit_mean, on="chunk_size")
    return summary.sort_values("chunk_size")

In [6]:
# Mean similarity score per chunk_size: over all positions and for the top-1 node only.
analyze_score_retriever(score_df)

Unnamed: 0,chunk_size,mean_score_all_positions,mean_score_top1
0,200,0.568016,0.629079
1,300,0.520839,0.588275
2,500,0.480259,0.575582


A menor chunk_size, mayor score; al menos esa parece ser la tendencia. Vamos a probar valores de chunk_size más pequeños (por ejemplo, 100 y 150) para comprobarlo.

In [7]:
# Smaller chunk sizes, with 200 repeated as the reference point
chunk_sizes = [100, 150, 200]

# Same queries as the previous run, 5 nodes per query
score_df = create_score_DataFrame(content, chunk_sizes, top_k=5, queries=queries)

In [8]:
# Mean similarity score per chunk_size for the smaller chunk sizes.
analyze_score_retriever(score_df)

Unnamed: 0,chunk_size,mean_score_all_positions,mean_score_top1
0,100,0.576822,0.618374
1,150,0.572786,0.607733
2,200,0.568125,0.628987


# Exportar los resultados del mejor chunk_size (sin prueba de chunk_overlap)

```
# Wrap the cleaned text in a single Document
pdf_doc = Document(text=content.strip())

# Split the document into nodes with the selected chunking parameters
chunk_size = 200
chunk_overlap = 20

parser = SimpleNodeParser().from_defaults(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
nodes = parser.get_nodes_from_documents([pdf_doc])

# Final embedding model (disabled in this snippet)
# Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Build the final index
index = VectorStoreIndex(nodes)

# Persist the final index for the RAG pipeline
storage_path = "../data/index_storage/"
index.storage_context.persist(persist_dir=storage_path)

print("✅ VectorStoreIndex final creado y guardado correctamente en:", storage_path)

```

```
# Quick retrieval check against the index built above
retriever= index.as_retriever(similarity_top_k =5)
query = "What does collect() function do in PySpark?"
# NOTE(review): `nodes` is not read again in this snippet; the nodes printed
# below come from response.source_nodes.
nodes = retriever.retrieve(query)
query_engine =  RetrieverQueryEngine.from_args(retriever=retriever)
response = query_engine.query(query)

# Show the returned chunks
print(f"\nConsulta: {query}\n{'='*60}")

# One entry per source node: 1-based rank, similarity score and node text
for i, node in enumerate(response.source_nodes):
    print(f"\n--- Nodo {i+1} ---")
    print(f"Score: {node.score:.4f}")
    print(node.node.get_content())
```

# Iterar entre chunk_size y chunk_overlap -- PRUEBA FINAL

In [11]:

def create_score_DataFrame_v2(
    content_text,
    chunk_sizes,
    chunk_overlaps,
    top_k = 10,
    save_storage = False,
    load_storage = False,
    storage_path = "../data/index_storage/",
    queries = None
):
    """Collect retriever scores for every (chunk_size, chunk_overlap) pair.

    For each combination the text is split into nodes, indexed in a
    ``VectorStoreIndex`` and queried with each entry of ``queries``.

    Args:
        content_text: Raw text to index; surrounding whitespace is stripped.
        chunk_sizes: Iterable of chunk sizes to test.
        chunk_overlaps: Iterable of chunk overlaps to test.
        top_k: Number of nodes the retriever is asked for per query.
        save_storage: Persist each freshly built index to ``storage_path``.
        load_storage: Load the index from ``storage_path`` instead of building
            it. Ignored when ``save_storage`` is also set, because the index
            is then rebuilt and persisted anyway.
        storage_path: Directory used for persisting / loading the index.
        queries: Questions to evaluate; ``None`` means no queries.

    Returns:
        pandas.DataFrame with the columns ``query``, ``position_node``,
        ``node_content``, ``node_score``, ``response``, ``chunk_size`` and
        ``chunk_overlap``. It is empty when there is nothing to evaluate.
    """
    score_columns = [
        "query", "position_node", "node_content", "node_score",
        "response", "chunk_size", "chunk_overlap",
    ]
    queries = list(queries) if queries is not None else []

    # Every chunk_size / chunk_overlap combination to evaluate
    combinations = list(itertools.product(chunk_sizes, chunk_overlaps))

    # Nothing to evaluate: return early, before any embedding call is made
    # (pd.concat would otherwise fail on an empty list at the very end).
    if not combinations or not queries:
        return pd.DataFrame(columns=score_columns)

    pdf_doc = Document(text=content_text.strip())

    # The embedding model does not depend on the combination, so set it once.
    Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

    all_rows = []
    for chunk_size, chunk_overlap in combinations:
        if load_storage and not save_storage:
            # NOTE: every combination reads the same directory, so the loaded
            # index reflects whatever chunking was persisted there.
            storage_context = StorageContext.from_defaults(persist_dir=storage_path)
            index = load_index_from_storage(storage_context)
        else:
            parser = SimpleNodeParser.from_defaults(
                chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            nodes = parser.get_nodes_from_documents([pdf_doc])
            index = VectorStoreIndex(nodes)
            if save_storage:
                # NOTE: each combination overwrites the previous one on disk.
                index.storage_context.persist(persist_dir=storage_path)

        # Retriever and query engine for this combination
        retriever = index.as_retriever(similarity_top_k=top_k)
        query_engine_VSI = RetrieverQueryEngine.from_args(retriever=retriever)

        for query in queries:
            query_info = get_responses_similarity(
                query, query_engine=query_engine_VSI, top_k=top_k
            )
            chunk_df = pd.DataFrame(query_info)
            # Scalars are broadcast to however many rows were returned.
            chunk_df["chunk_size"] = chunk_size
            chunk_df["chunk_overlap"] = chunk_overlap
            all_rows.append(chunk_df)

    # Stack the results of every combination
    return pd.concat(all_rows, ignore_index=True)

def analyze_score_retriever_v2(df):
    """Summarise node scores per (chunk_size, chunk_overlap) pair.

    Args:
        df: DataFrame with the columns ``chunk_size``, ``chunk_overlap``,
            ``position_node`` and ``node_score``.

    Returns:
        DataFrame sorted by ``chunk_size`` then ``chunk_overlap``, with a
        fresh 0-based index, holding ``mean_score_all_positions`` (mean over
        every retrieved node) and ``mean_score_top1`` (mean over the rows
        whose ``position_node`` equals 1).
    """
    group_keys = ["chunk_size", "chunk_overlap"]

    # Mean score over every retrieved position
    overall_mean = (
        df.groupby(group_keys)["node_score"]
        .mean()
        .rename("mean_score_all_positions")
        .reset_index()
    )

    # Mean score of the best-ranked node only
    is_top_hit = df["position_node"] == 1
    top_hit_mean = (
        df.loc[is_top_hit]
        .groupby(group_keys)["node_score"]
        .mean()
        .rename("mean_score_top1")
        .reset_index()
    )

    # Join both metrics and order the rows for easier reading
    summary = overall_mean.merge(top_hit_mean, on=group_keys)
    return summary.sort_values(group_keys).reset_index(drop=True)


In [12]:
# Grid of chunk sizes and overlaps, plus the benchmark questions for this run
chunk_sizes = [150, 200, 300, 500]
chunk_overlaps = [20, 50, 90]
queries = [
    "What does collect() function do in pyspark?",
    "What is the role of the Driver component?",
    "What does a 'broadcast join' mean?"
]

# Score every (chunk_size, chunk_overlap) pair, 5 nodes per query
df_scores = create_score_DataFrame_v2(
    content, chunk_sizes, chunk_overlaps, top_k=5, queries=queries
)

In [13]:
# Write the per-node grid scores to CSV only when the export_csv flag is enabled.
if export_csv:
    df_scores.to_csv("../data/scores_retriever/scores_by_chunksize_chunkoverlap.csv", sep = ";")

In [14]:
# Aggregate per (chunk_size, chunk_overlap) and show the best-scoring pairs first
results = analyze_score_retriever_v2(df_scores)
results.sort_values(
    by=["mean_score_all_positions", "mean_score_top1"],
    ascending=False,
)

Unnamed: 0,chunk_size,chunk_overlap,mean_score_all_positions,mean_score_top1
2,150,90,0.570297,0.620202
5,200,90,0.55573,0.609322
0,150,20,0.548184,0.589283
4,200,50,0.545805,0.603085
1,150,50,0.541294,0.589623
3,200,20,0.541042,0.609723
8,300,90,0.527067,0.607967
7,300,50,0.519124,0.593811
6,300,20,0.497696,0.558183
11,500,90,0.486329,0.54121


In [15]:
# Export the index built with chunk_size=200 and chunk_overlap=90.
# NOTE(review): in the ranking table above, (150, 90) has the highest
# mean_score_all_positions; confirm that (200, 90) is the intended choice.
chunk_size, chunk_overlap = 200, 90

# Directory where the final index is persisted for the RAG pipeline.
# NOTE(review): the saved output of this cell shows a different directory
# (data/index_storage_200_90/) -- presumably from an earlier run; confirm.
storage_path = "../data/index_storage/"

# Wrap the cleaned text in a single Document and split it into nodes
pdf_doc = Document(text=content.strip())
parser = SimpleNodeParser().from_defaults(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
nodes = parser.get_nodes_from_documents([pdf_doc])

# Final embedding model (left disabled; the model set earlier stays active)
# Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Build the final index and write it to disk
index = VectorStoreIndex(nodes)
index.storage_context.persist(persist_dir=storage_path)

print("✅ VectorStoreIndex final creado y guardado correctamente en:", storage_path)

✅ VectorStoreIndex final creado y guardado correctamente en: data/index_storage_200_90/
