In [1]:
import pandas as pd
import os
import re

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
)

from llama_index.core import Document
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/isaiaszc/pathway/pathway-
[nltk_data]     indexer/.venv/lib/python3.12/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
datapath = "../../data"


In [3]:
embed_model_name = "text-embedding-3-large"

embed_model = OpenAIEmbedding(
    model=embed_model_name,
    embed_batch_size=10,
    max_retries=10,
    timeout=180,
    reuse_client=False,
)


In [5]:
chroma_client = chromadb.PersistentClient()
# delete collection if it exists
if any(coll.name == "test" for coll in chroma_client.list_collections()):
    chroma_client.delete_collection("test")
chroma_collection = chroma_client.create_collection("test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)


In [6]:
# SEMANTIC SPLITTER
buffer_size = 1
splitter = SemanticSplitterNodeParser(
    buffer_size=buffer_size,
    breakpoint_percentile_threshold=81,
    include_prev_next_rel=False,
    embed_model=embed_model,
)


In [7]:
# create a simple ingestion pipeline: chunk the documents and create embeddings
pipeline = IngestionPipeline(
    transformations=[
        splitter,
        embed_model,
    ]
)


In [8]:
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)


## Load Files


In [9]:
origin_paths = [f"{datapath}/testdata/"]

# Read the document names from the directories:
files_list = [path + item for path in origin_paths for item in os.listdir(path)]

files_list.sort()
files_list[0:10]


['../../data/testdata/-Admission-Requirements.md',
 '../../data/testdata/-After-PathwayConnect.md',
 '../../data/testdata/-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.md',
 '../../data/testdata/-Application-Process.md',
 '../../data/testdata/-Assistance-for-Students-with-Disabilities.md',
 '../../data/testdata/-BYU-Idaho-Course-Exceptions.md',
 '../../data/testdata/-BYU-Pathway-Support.md',
 '../../data/testdata/-BYU-Pathway-Worldwide-Website.md',
 '../../data/testdata/-Common-Misconceptions-about-Choosing-Certificates.md',
 '../../data/testdata/-Communication-Resources.md']

In [10]:
documents = []

for i, filepath in enumerate(files_list):
    with open(filepath, "r", encoding="utf-8") as file:

        ## Hay metadata dentro de --- ---, necesito obtener el url: que hay dentro de los ---
        readed = file.read()
        url = ""
        heading = ""

        # split the metadata from the text
        try:
            metadata, text = readed.split("---\n", 2)[1:]

            # get the url from the metadata
            url = metadata.split("url: ")[1].split("\n")[0]
            # get heading
            heading = metadata.split("heading: ")[1].split("\n")[0]
            # get subheading
            subheading = metadata.split("subheading: ")[1].split("\n")[0]
        except:
            print(f"not enough values in file {filepath}. Something went wrong.")
            # continue

        document = Document(
            text=readed,
            metadata={"url": url, "title": heading, "subheading": subheading},
        )

        # add the document to a single entry list
        documents.append(document)


not enough values in file ../../data/testdata/ACM-Council-Agendas.md. Something went wrong.
not enough values in file ../../data/testdata/Policy-Executive-Secretaries--Communication.txt. Something went wrong.


In [10]:
# split_nodes = splitter.get_nodes_from_documents(documents)


In [11]:
# run the pipeline to generate nodes
nodes = pipeline.run(documents=documents)
# print('nodes', len(nodes))


In [12]:
# Inserta nodos en el índice
index.insert_nodes(nodes)

print(f"Nodes inserted: {len(nodes)}")


Nodes inserted: 2660


In [13]:
nodes[0]


TextNode(id_='b1c4ed33-0bde-4e86-bf6a-60e2542b1598', embedding=[0.016047151759266853, -0.005430771969258785, -0.01018548384308815, 0.023446671664714813, -0.03132166340947151, 0.01349149364978075, 0.0006690956652164459, 0.020044080913066864, -0.014100691303610802, 0.02062356099486351, 0.017339838668704033, 0.0016465046210214496, -0.005623932462185621, -0.018127337098121643, -0.012079939246177673, 0.022094549611210823, -0.015824275091290474, 0.03188628330826759, -0.008677348494529724, -0.004513261374086142, -0.013446918688714504, -0.005304474849253893, -0.016195736825466156, 0.003941209986805916, 0.00908595696091652, -0.0002890437317546457, -0.022971199825406075, -0.000924939988180995, 0.0048884376883506775, 0.022406578063964844, 0.010118620470166206, -0.00347874010913074, 0.017027810215950012, -0.0037889108061790466, 0.019999505952000618, -0.05438201501965523, 0.01927144080400467, 0.018885120749473572, -0.07601594924926758, 0.040920235216617584, -0.008409895934164524, 0.0024906515609472

## Chunks File


We must create a file with: Link, chunk metadata, chunk text. Saved as `chunks.csv`.


In [34]:
# now, i need to save all chunks in a csv with its respective, url, chunk metadata, and chunk text

# create a dataframe with the chunks
chunks = []

for node in nodes:
    chunks.append(
        {
            "url": node.metadata["url"],
            "chunk_metadata": node.metadata,
            "chunk_text": node.text,
        }
    )


In [35]:
# convert the chunks to a Dataframe
df = pd.DataFrame(chunks)

df.head()


Unnamed: 0,url,chunk_metadata,chunk_text
0,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,---\nheading: Missionary Software & Uses\nsubh...
1,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Be a) at least 17 years old with a high scho...
2,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Attend a weekly gathering [(see 5.0 Gatherin...
3,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Complete an English Placement Assessment (fo...
4,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Strive to live by the [CES Honor Code](https...


In [36]:
output_path = f"{datapath}/temporary/chunks.csv"

df.to_csv(output_path, index=False)

df.head()


Unnamed: 0,url,chunk_metadata,chunk_text
0,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,---\nheading: Missionary Software & Uses\nsubh...
1,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Be a) at least 17 years old with a high scho...
2,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Attend a weekly gathering [(see 5.0 Gatherin...
3,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Complete an English Placement Assessment (fo...
4,https://www.byupathway.edu/policies/handbook/2...,{'url': 'https://www.byupathway.edu/policies/h...,* Strive to live by the [CES Honor Code](https...


## Retrieve the data from CSV


create the file `retrieved_chunks.csv`


In [37]:
query_mode = VectorStoreQueryMode.DEFAULT

top_k = 16
sparse_top_k = 16 * 5

# create a retriever from the index
retriever = index.as_retriever(
    vector_store_query_mode=query_mode,
    similarity_top_k=top_k,
    sparse_top_k=sparse_top_k,
)


In [38]:
df = pd.read_csv(f"{datapath}/temporary/index_single_quotes.csv")

df.head()


Unnamed: 0,Question,Ideal Answer,Link,Quote
0,How do I know if a student has a scholarship?,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,Discounts/Scholarships Tab\nView the scholarsh...
1,How do I know if a student is registered for a...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,Checking for Institute Registration for Colleg...
2,How do I know if student is member of the church?,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,Verify a Learner’s Membership Status\n1. In th...
3,What information should I track for each student?,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,Monitoring Student Progress\nTime in Course—ho...
4,What to do If a student has already taken this...,For a student that is in a course that they ha...,https://pathway-missionary.powerappsportals.co...,Link 1:\nWHAT TO WATCH FOR\nWhen you start a n...


In [39]:
# drop the empty questions
df = df.dropna(subset=["Question"])

# cuenta cuantas rows hay
print(f"Number of rows: {len(df)}")


Number of rows: 112


In [40]:
## ORIGINAL

import numpy as np

retrieved_chunks_dict = {}  # Para almacenar los chunks recuperados para cada pregunta
error_count = 0

# Procesa cada pregunta y recupera los chunks
for idx, row in df.iterrows():
    question = row["Question"]
    quote = row["Quote"]
    link = row["Link"]
    answer = row["Ideal Answer"]

    # Asegurarse de que quote sea una cadena de texto, si no, omitir la pregunta
    if pd.isna(quote):
        print(f"Skipping question {question} due to missing quote.")
        continue

    quote = str(quote)  # Convertir quote a cadena de texto, si es necesario


    # Recuperar chunks relacionados con la pregunta
    try:
        retrieved_chunks = retriever.retrieve(question)
    except Exception as e:
        error_count += 1
        print(f"Error retrieving chunks for question: {question}")
        print(e)
        continue

    # Almacenar los datos en el diccionario
    retrieved_chunks_dict[question] = {
        "quote": f"{link}\n\n{quote}",
        "normal_quote": quote,
        "answer": answer,
        "retrieved_chunks": '\n\n\n'.join([f'{chunk.metadata['url']}\n{chunk.text}' for chunk in retrieved_chunks]),
        "normal_chunks": [chunk.text for chunk in retrieved_chunks],
    }

print(f"Total errors encountered: {error_count}")


Skipping question How do students contact a tutor?  due to missing quote.
Skipping question How can a student change the name on their PathwayConnect certificate? due to missing quote.
Skipping question Is there an exception request link for online degree students? due to missing quote.
Skipping question How can I give missionaries or a BOM to friends of the church? due to missing quote.
Skipping question What scholarships are available? due to missing quote.
Skipping question How can I help students pay tuition? due to missing quote.
Skipping question What to do after the gathering due to missing quote.
Total errors encountered: 0


In [41]:
def tokenize(text):
    """Función para dividir el texto en palabras y normalizar el texto"""
    return re.findall(r"\w+", text.lower())  # Extrae palabras y convierte a minúsculas


In [42]:
def quote_in_chunk_percentage(quote, chunk_text):
    """Calcula el porcentaje del quote que está presente en el chunk text"""
    quote_tokens = set(tokenize(quote))  # Tokeniza el quote
    chunk_tokens = set(tokenize(chunk_text))  # Tokeniza el chunk

    # How many words are in the chunk
    common_tokens = quote_tokens.intersection(chunk_tokens)

    # calculate % of words in quote that are in chunk
    if len(quote_tokens) == 0:
        return 0
    return len(common_tokens) / len(quote_tokens)


In [43]:
# create a function to calculate the similarity recall between the quote and the chunk text
def similarity_recall(quote, chunk):
    # convert the chunk list to a string
    chunk_text = " ".join(chunk)
    return quote_in_chunk_percentage(quote, chunk_text)


In [44]:
output_df = pd.DataFrame.from_dict(retrieved_chunks_dict, orient="index")

# the actual index must be a column
output_df.reset_index(inplace=True)

# Add the titles

# output_df.columns = ["Question", "Quote", "Link", "Retrieved Chunks"]

output_df.head()


Unnamed: 0,index,quote,normal_quote,answer,retrieved_chunks,normal_chunks
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Discounts/Scholarships Tab\nView the scholarsh...,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,[Go to the Discounts/Scholarships tab on the S...
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...,Checking for Institute Registration for Colleg...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,[The best method is to have the student ask th...
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...,Verify a Learner’s Membership Status\n1. In th...,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,[1. Go to the Details tab in the Student Infor...
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...,Monitoring Student Progress\nTime in Course—ho...,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,[- **Date Of Birth (DOB)**: Recognize Students...
4,What to do If a student has already taken this...,https://missionaries.prod.byu-pathway.psdops.c...,How a Student Withdraws From a Program\nPURPOS...,For a student that is in a course that they ha...,https://missionaries.prod.byu-pathway.psdops.c...,"[For example, if the student received a poor g..."


In [45]:
# apply the function to the dataframe, the columns are Quote and "Retrieved Chunks"
output_df["Recall"] = output_df.apply(
    lambda x: similarity_recall(x["normal_quote"], x["retrieved_chunks"]), axis=1
)

# drop columns normal_quote and normal_chunks
output_df.drop(columns=["normal_quote", "normal_chunks"], inplace=True)

output_df.head()


Unnamed: 0,index,quote,answer,retrieved_chunks,Recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,0.0
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,0.068493
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,0.104167
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,0.026316
4,What to do If a student has already taken this...,https://missionaries.prod.byu-pathway.psdops.c...,For a student that is in a course that they ha...,https://missionaries.prod.byu-pathway.psdops.c...,0.05


In [97]:
# Guardar los resultados en un archivo CSV
# las columnas son: Question,Quote,Quote URL (Link),retrieved chunks,recall

output_path = f"{datapath}/temporary/chunks_recall.csv"

output_df.columns = ["Question", "Quote", "Answer", "Retrieved Chunks", "Recall"]
# ignore the index when saving

output_df.to_csv(output_path, index=False)

output_df.head()


Unnamed: 0,Question,Quote,Answer,Retrieved Chunks,Recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see if a student has a schola...,[https://missionaries.prod.byu-pathway.psdops....,1.0
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...,There is not a way for Missionaries to verify ...,[https://missionaries.prod.byu-pathway.psdops....,0.986301
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see the Church membership sta...,[https://missionaries.prod.byu-pathway.psdops....,0.979167
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...,The most important things Missionaries should ...,[https://missionaries.prod.byu-pathway.psdops....,1.0
4,What to do If a student has already taken this...,https://missionaries.prod.byu-pathway.psdops.c...,For a student that is in a course that they ha...,[https://missionaries.prod.byu-pathway.psdops....,0.9
