In [1]:
import pandas as pd
import os

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
    MarkdownNodeParser,
)

from llama_index.core import Document
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding


In [2]:
datapath = "../../data"


In [3]:
embed_model_name = "text-embedding-3-large"

embed_model = OpenAIEmbedding(
    model=embed_model_name,
    embed_batch_size=10,
    max_retries=10,
    timeout=180,
    reuse_client=False,
)


In [4]:
chroma_client = chromadb.EphemeralClient()
# delete collection if it exists
if any(coll.name == "test" for coll in chroma_client.list_collections()):
    chroma_client.delete_collection("test")
chroma_collection = chroma_client.create_collection("test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)


In [5]:
# SEMANTIC SPLITTER
buffer_size = 1
splitter = SemanticSplitterNodeParser(
    buffer_size=buffer_size,
    breakpoint_percentile_threshold=81,
    include_prev_next_rel=False,
    embed_model=embed_model,
)


In [6]:
# create a simple ingestion pipeline: chunk the documents and create embeddings
pipeline = IngestionPipeline(
    transformations=[
        splitter,
        embed_model,
    ]
)


In [7]:
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)


## Load Files


In [8]:
origin_paths = [f"{datapath}/out_sep_4/from_html/", f"{datapath}/out_sep_4/from_pdf/"]

# Read the document names from the directories:
files_list = [path + item for path in origin_paths for item in os.listdir(path)]

files_list.sort()
files_list[0:10]


['../../data/out_sep_4/from_html/-Admission-Requirements.md',
 '../../data/out_sep_4/from_html/-After-PathwayConnect.md',
 '../../data/out_sep_4/from_html/-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.md',
 '../../data/out_sep_4/from_html/-Application-Process.md',
 '../../data/out_sep_4/from_html/-Assistance-for-Students-with-Disabilities.md',
 '../../data/out_sep_4/from_html/-BYU-Idaho-Course-Exceptions.md',
 '../../data/out_sep_4/from_html/-BYU-Pathway-Support.md',
 '../../data/out_sep_4/from_html/-BYU-Pathway-Worldwide-Website.md',
 '../../data/out_sep_4/from_html/-Common-Misconceptions-about-Choosing-Certificates.md',
 '../../data/out_sep_4/from_html/-Communication-Resources.md']

In [9]:
documents = []

for i, filepath in enumerate(files_list):
    with open(filepath, "r", encoding="utf-8") as file:
        document = Document(text=file.read(), metadata={"filepath": filepath})

        # add the document to a single entry list
        documents.append(document)


In [10]:
# run the pipeline to generate nodes
nodes = pipeline.run(documents=documents)
# print('nodes', len(nodes))

# Inserta nodos en el índice
index.insert_nodes(nodes)

print(f"Nodes inserted: {len(nodes)}")


Nodes inserted: 2660


In [11]:
query_mode = VectorStoreQueryMode.DEFAULT

top_k = 16
sparse_top_k = 16 * 5

# create a retriever from the index
retriever = index.as_retriever(
    vector_store_query_mode=query_mode,
    similarity_top_k=top_k,
    sparse_top_k=sparse_top_k,
)


## Retrieve the data from CSV


In [15]:
df = pd.read_csv(f"{datapath}/temporary/index_single_quotes.csv")

df.head()


Unnamed: 0,Question,Ideal Answer,Link,Quote
0,How do I know if a student has a scholarship?,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,Discounts/Scholarships Tab\nView the scholarsh...
1,How do I know if a student is registered for a...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,Checking for Institute Registration for Colleg...
2,How do I know if student is member of the church?,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,Verify a Learner’s Membership Status\n1. In th...
3,What information should I track for each student?,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,Monitoring Student Progress\nTime in Course—ho...
4,What to do If a student has already taken this...,For a student that is in a course that they ha...,https://pathway-missionary.powerappsportals.co...,Link 1:\nWHAT TO WATCH FOR\nWhen you start a n...


In [17]:
# drop the empty questions
df = df.dropna(subset=["Question"])

# cuenta cuantas rows hay
print(f"Number of rows: {len(df)}")


Number of rows: 112


In [19]:
import numpy as np

retrieved_chunks_dict = {}  # Para almacenar los chunks recuperados para cada pregunta
error_count = 0

# Procesa cada pregunta y recupera los chunks
for idx, row in df.iterrows():
    question = row["Question"]
    quote = row["Quote"]
    link = row["Link"]

    # Asegurarse de que quote sea una cadena de texto, si no, omitir la pregunta
    if pd.isna(quote):
        print(f"Skipping question {question} due to missing quote.")
        continue

    quote = str(quote)  # Convertir quote a cadena de texto, si es necesario

    # Recuperar chunks relacionados con la pregunta
    try:
        retrieved_chunks = retriever.retrieve(question)
    except Exception as e:
        error_count += 1
        print(f"Error retrieving chunks for question: {question}")
        print(e)
        continue

    # Calcular el recall: cuántos chunks recuperados contienen el quote
    relevant_chunks = []
    for chunk in retrieved_chunks:
        chunk_text = (
            str(chunk.text) if isinstance(chunk.text, str) else ""
        )  # Asegurarse de que el chunk sea texto
        if quote in chunk_text:
            relevant_chunks.append(chunk)

    recall = len(relevant_chunks) / len(retrieved_chunks) if retrieved_chunks else 0

    # Almacenar los datos en el diccionario
    retrieved_chunks_dict[question] = {
        "quote": quote,
        "link": link,
        "retrieved_chunks": [chunk.text for chunk in retrieved_chunks],
        "recall": recall,
    }

print(f"Total errors encountered: {error_count}")


Skipping question How do students contact a tutor?  due to missing quote.
Skipping question How can a student change the name on their PathwayConnect certificate? due to missing quote.
Skipping question Is there an exception request link for online degree students? due to missing quote.
Skipping question How can I give missionaries or a BOM to friends of the church? due to missing quote.
Skipping question What scholarships are available? due to missing quote.
Skipping question How can I help students pay tuition? due to missing quote.
Skipping question What to do after the gathering due to missing quote.
Total errors encountered: 0


In [32]:
# Guardar los resultados en un archivo CSV
# las columnas son: Question,Quote,Quote URL (Link),retrieved chunks,recall

output_path = f"{datapath}/temporary/retrieved_chunks.csv"
output_df = pd.DataFrame.from_dict(retrieved_chunks_dict, orient="index")

# the actual index must be a column
output_df.reset_index(inplace=True)

# Add the titles

output_df.columns = ["Question", "Quote", "Link", "Retrieved Chunks", "Recall"]

# ignore the index when saving

output_df.to_csv(output_path, index=False)

output_df.head()


Unnamed: 0,Question,Quote,Link,Retrieved Chunks,Recall
0,How do I know if a student has a scholarship?,Discounts/Scholarships Tab\nView the scholarsh...,https://missionaries.prod.byu-pathway.psdops.c...,"[1. , Go to the Discounts/Scholarships tab on ...",0.0
1,How do I know if a student is registered for a...,Checking for Institute Registration for Colleg...,https://missionaries.prod.byu-pathway.psdops.c...,[The best method is to have the student ask th...,0.0
2,How do I know if student is member of the church?,Verify a Learner’s Membership Status\n1. In th...,https://missionaries.prod.byu-pathway.psdops.c...,[1. Go to the Details tab in the Student Infor...,0.0
3,What information should I track for each student?,Monitoring Student Progress\nTime in Course—ho...,https://missionaries.prod.byu-pathway.psdops.c...,[```markdown\n# Monitoring Student Progress Pa...,0.0
4,What to do If a student has already taken this...,How a Student Withdraws From a Program\nPURPOS...,https://missionaries.prod.byu-pathway.psdops.c...,[# 3.4 Course Retakes and Returning Students\n...,0.0


In [18]:
retrieved_chunks_dict = {}  # Para almacenar los chunks recuperados para cada pregunta
error_count = 0

# Procesa cada pregunta y recupera los chunks
for idx, row in df.iterrows():
    question = row["Question"]
    quote = row["Quote"]
    link = row["Link"]

    # Recuperar chunks relacionados con la pregunta
    try:
        retrieved_chunks = retriever.retrieve(question)
    except Exception as e:
        error_count += 1
        print(f"Error retrieving chunks for question: {question}")
        print(e)
        continue

    # Calcular el recall: cuántos chunks recuperados contienen el quote
    relevant_chunks = [chunk for chunk in retrieved_chunks if quote in chunk.text]
    recall = len(relevant_chunks) / len(retrieved_chunks) if retrieved_chunks else 0

    # Almacenar los datos en el diccionario
    retrieved_chunks_dict[question] = {
        "quote": quote,
        "link": link,
        "retrieved_chunks": [chunk.text for chunk in retrieved_chunks],
        "recall": recall,
    }


TypeError: 'in <string>' requires string as left operand, not float