In [1]:
import pandas as pd
import os
import re

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
)
from transformers import GPT2Tokenizer
from llama_index.core import Document
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding

from utils.retrieve import (
    _generate_ngrams_from_texts,
    extract_question_ngrams,
    precision_recall,
)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/isaiaszc/pathway/pathway-
[nltk_data]     indexer/.venv/lib/python3.12/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datapath = "../../data"
ngram_size = 2


In [3]:
embed_model_name = "text-embedding-3-large"

embed_model = OpenAIEmbedding(
    model=embed_model_name,
    embed_batch_size=10,
    max_retries=10,
    timeout=180,
    reuse_client=False,
)


In [4]:
chroma_client = chromadb.EphemeralClient()
# delete collection if it exists
if any(coll.name == "test" for coll in chroma_client.list_collections()):
    chroma_client.delete_collection("test")
chroma_collection = chroma_client.create_collection("test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)


In [5]:
# SEMANTIC SPLITTER
buffer_size = 1
splitter = SemanticSplitterNodeParser(
    buffer_size=buffer_size,
    breakpoint_percentile_threshold=81,
    include_prev_next_rel=False,
    embed_model=embed_model,
)


In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


def split_text_by_tokens(text, max_tokens=8000):
    """Divide the text into chunks that do not exceed the maximum number of tokens"""
    words = text.split()
    chunks = []
    chunk = []
    tokens_count = 0

    for word in words:
        tokens_count += len(tokenizer.tokenize(word))
        if tokens_count >= max_tokens:
            chunks.append(" ".join(chunk))
            chunk = []
            tokens_count = len(tokenizer.tokenize(word))
        chunk.append(word)

    if chunk:
        chunks.append(" ".join(chunk))
    return chunks




In [7]:
# create a simple ingestion pipeline: chunk the documents and create embeddings
pipeline = IngestionPipeline(
    transformations=[
        splitter,
        embed_model,
    ]
)


In [8]:
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)


In [9]:
def format_metadata(d):
    return "---\n" + "\n".join(f"{k}: {v}" for k, v in d.items()) + "\n---"


## Load Files


In [10]:
origin_paths = [
    f"{datapath}/data_16_09_24/out/from_html/",
    f"{datapath}/data_16_09_24/out/from_pdf/",
]

# Read the document names from the directories:
files_list = [path + item for path in origin_paths for item in os.listdir(path)]

# drop txt files
files_list = [item for item in files_list if not item.endswith(".txt")]

files_list.sort()
files_list[0:10]


['../../data/data_16_09_24/out/from_html/105adb7a.md',
 '../../data/data_16_09_24/out/from_html/1376e9a9.md',
 '../../data/data_16_09_24/out/from_html/141279b1.md',
 '../../data/data_16_09_24/out/from_html/144bfd06.md',
 '../../data/data_16_09_24/out/from_html/14627860.md',
 '../../data/data_16_09_24/out/from_html/15c11982.md',
 '../../data/data_16_09_24/out/from_html/15ffa1be.md',
 '../../data/data_16_09_24/out/from_html/16ebd247.md',
 '../../data/data_16_09_24/out/from_html/179960e.md',
 '../../data/data_16_09_24/out/from_html/17e70845.md']

In [11]:
len(files_list)


539

In [12]:
documents = []

for i, filepath in enumerate(files_list):
    with open(filepath, "r", encoding="utf-8") as file:

        ## Hay metadata dentro de --- ---, necesito obtener el url: que hay dentro de los ---
        readed = file.read()
        url = ""
        title = ""
        subheading = ""
        heading = ""

        # split the metadata from the text
        try:
            metadata, text = readed.split("---\n", 2)[1:]
            # get the url from the metadata
            # get url only if exist
            if "url: " in metadata:
                url = metadata.split("url: ")[1].split("\n")[0]

            if "title: " not in metadata:
                title = metadata.split("title: ")[1].split("\n")[0]
            # get subheading
            if "subheading: " in metadata:
                subheading = metadata.split("subheading: ")[1].split("\n")[0]

            # get heading
            if "heading: " in metadata:
                heading = metadata.split("heading: ")[1].split("\n")[0]
        except:
            print(f"not enough values in file {filepath}. Something went wrong.")
            # continue

        # Verify if the text exceeds the maximum number of tokens allowed
        tokens = tokenizer.tokenize(text)

        if len(tokens) > 8000:
            print(f"File {filepath} has more than 8000 tokens. Splitting it.")
            chunks = split_text_by_tokens(text)
            for chunk in chunks:
                document = Document(
                    text=chunk,
                    metadata={
                        "heading": heading,
                        "subheading": subheading,
                        "url": url,
                        "title": title,
                    },  # , "subheading": subheading
                )
                documents.append(document)
        else:
            document = Document(
                text=text,
                metadata={
                    "heading": heading,
                    "subheading": subheading,
                    "url": url,
                    "title": title,
                },  # , "subheading": subheading
            )

        # add the document to a single entry list
        documents.append(document)


File ../../data/data_16_09_24/out/from_html/613dffc5.md has more than 8000 tokens. Splitting it.
File ../../data/data_16_09_24/out/from_html/9e88d1a9.md has more than 8000 tokens. Splitting it.


In [13]:
# run the pipeline to generate nodes
nodes = pipeline.run(documents=documents)
# print('nodes', len(nodes))


In [14]:
# Inserta nodos en el índice
index.insert_nodes(nodes)

print(f"Nodes inserted: {len(nodes)}")


Nodes inserted: 3148


In [15]:
nodes[0]


TextNode(id_='4e795fd4-74e7-4840-aada-2af8fa508c1b', embedding=[0.020620955154299736, 0.0018274312606081367, -0.009215765632689, 0.045651812106370926, 0.007581462152302265, -0.024906635284423828, -0.0015644286759197712, 0.03981335088610649, -0.021335234865546227, -0.014308893121778965, 0.013353931717574596, 0.0019953257869929075, -0.01336945965886116, -0.012624124065041542, -0.036024559289216995, 0.031102240085601807, -0.021537097170948982, 0.0327947735786438, -0.0019040999468415976, 0.0037441474851220846, -0.006129609886556864, -0.021661320701241493, 0.004798098932951689, 0.04096240922808647, 0.01781041920185089, 0.016738997772336006, -0.017049554735422134, -0.012895860709249973, 0.0005279461620375514, 0.001306278514675796, 0.013516973704099655, 0.04360214248299599, 0.001217963988892734, -0.011971955187618732, -0.01013967115432024, -0.02701842039823532, 0.037018340080976486, 0.024658190086483955, -0.08546516299247742, -0.01841600425541401, 0.020636484026908875, 0.001355773420073092, 0

## Chunks File


We must create a file with: Link, chunk metadata, chunk text. Saved as `chunks.csv`.


In [16]:
# now, i need to save all chunks in a csv with its respective, url, chunk metadata, and chunk text

# create a dataframe with the chunks
chunks = []

for node in nodes:
    chunks.append(
        {
            "url": node.metadata["url"],
            "chunk_metadata": node.metadata,
            "chunk_text": node.text,
        }
    )


In [17]:
# convert the chunks to a Dataframe
df = pd.DataFrame(chunks)

df.head()


Unnamed: 0,url,chunk_metadata,chunk_text
0,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",```markdown\n# 6.4 Pathway Certificates\n\n\n\...
1,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...","PC 101/101L, PC 102/102L, PC 103/103L\n2. GS 1..."
2,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",Log in to their BYU-Pathway Portal\n2. Click o...
3,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",Instructions are linked near the top of that p...
4,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",```\n\n


In [18]:
output_path = f"{datapath}/temporary/chunks.csv"

df.to_csv(output_path, index=False)

df.head()


Unnamed: 0,url,chunk_metadata,chunk_text
0,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",```markdown\n# 6.4 Pathway Certificates\n\n\n\...
1,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...","PC 101/101L, PC 102/102L, PC 103/103L\n2. GS 1..."
2,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",Log in to their BYU-Pathway Portal\n2. Click o...
3,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",Instructions are linked near the top of that p...
4,https://www.byupathway.edu/policies/handbook/6...,"{'heading': '6. Student Records', 'subheading'...",```\n\n


## Retrieve the data from CSV


create the file `retrieved_chunks.csv`


In [19]:
query_mode = VectorStoreQueryMode.DEFAULT

top_k = 16
sparse_top_k = 16 * 5

# create a retriever from the index
retriever = index.as_retriever(
    vector_store_query_mode=query_mode,
    similarity_top_k=top_k,
    sparse_top_k=sparse_top_k,
)


In [20]:
df = pd.read_csv(f"{datapath}/temporary/index_single_quotes.csv")

df.head()


Unnamed: 0,Question,Ideal Answer,Link,Quote
0,How do I know if a student has a scholarship?,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,Discounts/Scholarships Tab\nView the scholarsh...
1,How do I know if a student is registered for a...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,Checking for Institute Registration for Colleg...
2,How do I know if student is member of the church?,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,Verify a Learner’s Membership Status\n1. In th...
3,What information should I track for each student?,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,Monitoring Student Progress\nTime in Course—ho...
4,What to do If a student has already taken this...,For a student that is in a course that they ha...,https://pathway-missionary.powerappsportals.co...,Link 1:\nWHAT TO WATCH FOR\nWhen you start a n...


In [21]:
# drop the empty questions
df = df.dropna(subset=["Question"])

df = df[df["Quote"].notna() & (df["Quote"] != "")]

# cuenta cuantas rows hay
print(f"Number of rows: {len(df)}")


Number of rows: 105


In [22]:
# generate bigrams (ngram size=2) for each manual quote
# and store them in the question_ngrams dictionary
question_ngrams = extract_question_ngrams(df, ngram_size)


In [23]:
# asociate each question with its ngrams in the df
df["ngrams"] = df["Question"].apply(lambda x: question_ngrams[x])


In [24]:
# print rows with empty Quote
df[df["Quote"].isnull()]


Unnamed: 0,Question,Ideal Answer,Link,Quote,ngrams


In [25]:
## ORIGINAL

import numpy as np

retrieved_chunks_dict = {}  # Para almacenar los chunks recuperados para cada pregunta
error_count = 0

# Procesa cada pregunta y recupera los chunks
for idx, row in df.iterrows():
    question = row["Question"]
    quote = row["Quote"]
    link = row["Link"]
    answer = row["Ideal Answer"]
    true_ngrams = row["ngrams"]

    # Asegurarse de que quote sea una cadena de texto, si no, omitir la pregunta
    if pd.isna(quote):
        print(f"Skipping question {question} due to missing quote.")
        continue

    quote = str(quote)  # Convertir quote a cadena de texto, si es necesario


    # Recuperar chunks relacionados con la pregunta
    try:
        retrieved_chunks = retriever.retrieve(question)
    except Exception as e:
        error_count += 1
        print(f"Error retrieving chunks for question: {question}")
        print(e)
        continue

    # CALCULATE RECALL
    predicted_ngrams = _generate_ngrams_from_texts([node.text for node in retrieved_chunks], ngram_size=ngram_size)
    precision, recall = precision_recall(predicted_ngrams, true_ngrams)

    # Almacenar los datos en el diccionario
    retrieved_chunks_dict[question] = {
        "quote": f"{link}\n\n{quote}",
        "normal_quote": quote,
        "answer": answer,
        "retrieved_chunks": '\n\n\n'.join([f'{chunk.metadata['url']}\n{format_metadata(chunk.metadata)}\n{chunk.text}' for chunk in retrieved_chunks]),
        "original_retrieved": retrieved_chunks,
        "normal_chunks": [chunk.text for chunk in retrieved_chunks],
        "recall": recall,
    }

print(f"Total errors encountered: {error_count}")


Total errors encountered: 0


In [26]:
def tokenize(text):
    """Función para dividir el texto en palabras y normalizar el texto"""
    return re.findall(r"\w+", text.lower())  # Extrae palabras y convierte a minúsculas


In [27]:
def quote_in_chunk_percentage(quote, chunk_text):
    """Calcula el porcentaje del quote que está presente en el chunk text"""
    quote_tokens = set(tokenize(quote))  # Tokeniza el quote
    chunk_tokens = set(tokenize(chunk_text))  # Tokeniza el chunk

    # How many words are in the chunk
    common_tokens = quote_tokens.intersection(chunk_tokens)

    # calculate % of words in quote that are in chunk
    if len(quote_tokens) == 0:
        return 0
    return len(common_tokens) / len(quote_tokens)


In [28]:
# create a function to calculate the similarity recall between the quote and the chunk text
def similarity_recall(quote, chunk):
    # convert the chunk list to a string
    # chunk_text = " ".join(chunk)
    # imprime el type de chunk
    return quote_in_chunk_percentage(quote, chunk)


In [29]:
output_df = pd.DataFrame.from_dict(retrieved_chunks_dict, orient="index")

# the actual index must be a column
output_df.reset_index(inplace=True)

# Add the titles

# output_df.columns = ["Question", "Quote", "Link", "Retrieved Chunks"]

output_df.head()


Unnamed: 0,index,quote,normal_quote,answer,retrieved_chunks,original_retrieved,normal_chunks,recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Discounts/Scholarships Tab\nView the scholarsh...,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: d761fbcd-0d02-44d7-ac44-fab9ab22d34a...,[Go to the **Discounts/Scholarships** tab on t...,1.0
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...,Checking for Institute Registration for Colleg...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: f68ee6ae-e045-4369-bcf7-5c2edf925ab8...,[The best method is to have the student ask th...,0.952381
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...,Verify a Learner’s Membership Status\n1. In th...,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: 99cd2fc3-1622-482f-8865-446cabe22bfd...,[1. Go to the Details tab in the Student Infor...,0.952381
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...,Monitoring Student Progress\nTime in Course—ho...,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: b8ed8ae3-eda1-46a3-a883-59bf12e998fa...,[```markdown\n# Revised July New Student Visit...,0.26
4,What to do If a student has already taken this...,https://missionaries.prod.byu-pathway.psdops.c...,How a Student Withdraws From a Program\nPURPOS...,For a student that is in a course that they ha...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: 122d19a4-c461-4423-8bff-f4621ff6cc81...,"[If they did not mean to enroll in the course,...",0.375


In [30]:
# apply the function to the dataframe, the columns are Quote and "Retrieved Chunks"
output_df["recall"] = output_df.apply(
    lambda x: similarity_recall(x["normal_quote"], x["retrieved_chunks"]), axis=1
)


In [31]:
# drop columns normal_quote and normal_chunks
output_df.drop(
    columns=["normal_quote", "original_retrieved", "normal_chunks"], inplace=True
)

output_df.head()


Unnamed: 0,index,quote,answer,retrieved_chunks,recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,1.0
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,0.986301
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,0.979167
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,0.710526
4,What to do If a student has already taken this...,https://missionaries.prod.byu-pathway.psdops.c...,For a student that is in a course that they ha...,https://missionaries.prod.byu-pathway.psdops.c...,0.85


In [32]:
# Guardar los resultados en un archivo CSV
# las columnas son: Question,Quote,Quote URL (Link),retrieved chunks,recall

output_path = f"{datapath}/temporary/chunks_recall.csv"

output_df.columns = ["Question", "Quote", "Answer", "Retrieved Chunks", "Recall"]
# ignore the index when saving

# sort by recall
output_df = output_df.sort_values(by="Recall", ascending=False)

output_df.to_csv(output_path, index=False)

output_df.head()


Unnamed: 0,Question,Quote,Answer,Retrieved Chunks,Recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,1.0
37,When should open the zoom room before gathering?,https://missionaries.prod.byu-pathway.psdops.c...,Open the meeting room 20 minutes early to allo...,https://missionaries.prod.byu-pathway.psdops.c...,1.0
71,What is the Hall scholarship?,https://hall-foundation.org/about-scholarship/...,ABOUT THE HALL FOUNDATION SCHOLARSHIP\n\nBrad ...,https://hall-foundation.org/about-scholarship/...,1.0
70,Does BYU-I have a way to download the admissio...,https://www.byui.edu/student-records/enrollmen...,BYU-I has a tool that provides standard enroll...,https://www.byui.edu/student-records/enrollmen...,1.0
67,Is the Hall scholarship different from the Heb...,https://hall-foundation.org/about-scholarship/...,The Hall Foundation scholarship is different f...,https://hall-foundation.org/requirements/\n---...,1.0
