In [16]:
import pandas as pd
import os
import re

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
)
from transformers import GPT2Tokenizer
from llama_index.core import Document
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding

from utils.retrieve import (
    _generate_ngrams_from_texts,
    extract_question_ngrams,
    precision_recall,
)


In [2]:
datapath = "../../data"
ngram_size = 2


In [3]:
embed_model_name = "text-embedding-3-large"

embed_model = OpenAIEmbedding(
    model=embed_model_name,
    embed_batch_size=10,
    max_retries=10,
    timeout=180,
    reuse_client=False,
)


In [4]:
chroma_client = chromadb.EphemeralClient()
# delete collection if it exists
if any(coll.name == "test" for coll in chroma_client.list_collections()):
    chroma_client.delete_collection("test")
chroma_collection = chroma_client.create_collection("test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)


In [5]:
# SEMANTIC SPLITTER
buffer_size = 1
splitter = SemanticSplitterNodeParser(
    buffer_size=buffer_size,
    breakpoint_percentile_threshold=81,
    include_prev_next_rel=False,
    embed_model=embed_model,
)


In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


def split_text_by_tokens(text, max_tokens=8000):
    """Divide the text into chunks that do not exceed the maximum number of tokens"""
    words = text.split()
    chunks = []
    chunk = []
    tokens_count = 0

    for word in words:
        tokens_count += len(tokenizer.tokenize(word))
        if tokens_count >= max_tokens:
            chunks.append(" ".join(chunk))
            chunk = []
            tokens_count = len(tokenizer.tokenize(word))
        chunk.append(word)

    if chunk:
        chunks.append(" ".join(chunk))
    return chunks




In [7]:
# create a simple ingestion pipeline: chunk the documents and create embeddings
pipeline = IngestionPipeline(
    transformations=[
        splitter,
        embed_model,
    ]
)


In [8]:
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)


## Load Files


In [10]:
origin_paths = [
    f"{datapath}/data_09_12_24/out_sep_12/from_html/",
    f"{datapath}/data_09_12_24/out_sep_12/from_pdf/",
]

# Read the document names from the directories:
files_list = [path + item for path in origin_paths for item in os.listdir(path)]

# drop txt files
files_list = [item for item in files_list if not item.endswith(".txt")]

files_list.sort()
files_list[0:10]


['../../data/data_09_12_24/out_sep_12/from_html/-Admission-Requirements.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-After-PathwayConnect.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-Application-Process.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-Assistance-for-Students-with-Disabilities.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-BYU-Idaho-Course-Exceptions.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-BYU-Pathway-Support.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-BYU-Pathway-Worldwide-Website.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-Common-Misconceptions-about-Choosing-Certificates.md',
 '../../data/data_09_12_24/out_sep_12/from_html/-Communication-Resources.md']

In [11]:
len(files_list)


601

In [12]:
documents = []

for i, filepath in enumerate(files_list):
    with open(filepath, "r", encoding="utf-8") as file:

        ## Hay metadata dentro de --- ---, necesito obtener el url: que hay dentro de los ---
        readed = file.read()
        url = ""
        title = ""
        subheading = ""
        heading = ""

        # split the metadata from the text
        try:
            metadata, text = readed.split("---\n", 2)[1:]
            # get the url from the metadata
            # get url only if exist
            if "url: " in metadata:
                url = metadata.split("url: ")[1].split("\n")[0]

            if "title: " not in metadata:
                title = metadata.split("title: ")[1].split("\n")[0]
            # get subheading
            if "subheading: " in metadata:
                subheading = metadata.split("subheading: ")[1].split("\n")[0]

            # get heading
            if "heading: " in metadata:
                heading = metadata.split("heading: ")[1].split("\n")[0]
        except:
            print(f"not enough values in file {filepath}. Something went wrong.")
            # continue

        # Verify if the text exceeds the maximum number of tokens allowed
        tokens = tokenizer.tokenize(readed)

        if len(tokens) > 8000:
            print(f"File {filepath} has more than 8000 tokens. Splitting it.")
            chunks = split_text_by_tokens(readed)
            for chunk in chunks:
                document = Document(
                    text=chunk,
                    metadata={
                        "heading": heading,
                        "subheading": subheading,
                        "url": url,
                        "title": title,
                    },  # , "subheading": subheading
                )
                documents.append(document)
        else:
            document = Document(
                text=readed,
                metadata={
                    "heading": heading,
                    "subheading": subheading,
                    "url": url,
                    "title": title,
                },  # , "subheading": subheading
            )

        # add the document to a single entry list
        documents.append(document)


File ../../data/data_09_12_24/out_sep_12/from_html/Devotionals--Speeches.md has more than 8000 tokens. Splitting it.
File ../../data/data_09_12_24/out_sep_12/from_html/How-do-I-find-old-Pathway-devotionals.md has more than 8000 tokens. Splitting it.
File ../../data/data_09_12_24/out_sep_12/from_html/What-is-the-link-to-the-Missionary-Services-Website.md has more than 8000 tokens. Splitting it.
not enough values in file ../../data/data_09_12_24/out_sep_12/from_pdf/Learn-About-PC-.md. Something went wrong.


In [13]:
# run the pipeline to generate nodes
nodes = pipeline.run(documents=documents)
# print('nodes', len(nodes))


In [14]:
# Inserta nodos en el índice
index.insert_nodes(nodes)

print(f"Nodes inserted: {len(nodes)}")


Nodes inserted: 3744


In [15]:
nodes[0]


TextNode(id_='678774ac-3827-4d65-ac73-b820a7673b32', embedding=[0.03173452243208885, -0.018554003909230232, -0.006875109858810902, 0.02745823934674263, -0.01951054111123085, -0.013876819983124733, 0.0033302961383014917, 0.022788086906075478, -0.012350580655038357, -0.00185680715367198, 0.009466903284192085, 0.0165424644947052, -2.8957663744222373e-05, -0.029821448028087616, -0.017105132341384888, 0.01097907591611147, -0.02074841409921646, 0.0348854660987854, 0.00389999826438725, 0.010803241282701492, -0.01133777666836977, -0.005999456625431776, -0.01298358291387558, 0.0004050775896757841, 0.010859508067369461, 0.004466183949261904, -0.02150801569223404, 0.0016510813729837537, 0.0018409821204841137, 0.021170414984226227, 0.01472785696387291, 0.011394043453037739, 0.032719191163778305, -0.008974567987024784, -0.009375469759106636, -0.05750475451350212, 0.013131284154951572, 0.015796927735209465, -0.07331574708223343, 0.042790964245796204, 0.008179797790944576, 0.02292875573039055, 0.0170

## Chunks File


We must create a file with: Link, chunk metadata, chunk text. Saved as `chunks.csv`.


In [17]:
# now, i need to save all chunks in a csv with its respective, url, chunk metadata, and chunk text

# create a dataframe with the chunks
chunks = []

for node in nodes:
    chunks.append(
        {
            "url": node.metadata["url"],
            "chunk_metadata": node.metadata,
            "chunk_text": node.text,
        }
    )


In [18]:
# convert the chunks to a Dataframe
df = pd.DataFrame(chunks)

df.head()


Unnamed: 0,url,chunk_metadata,chunk_text
0,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",---\nheading: 2. Admission and Tuition\nsubhea...
1,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Be a) at least 17 years old with a high scho...
2,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Attend a weekly gathering [(see 5.0 Gatherin...
3,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Complete an English Placement Assessment (fo...
4,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Strive to live by the [CES Honor Code](https...


In [19]:
output_path = f"{datapath}/temporary/chunks.csv"

df.to_csv(output_path, index=False)

df.head()


Unnamed: 0,url,chunk_metadata,chunk_text
0,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",---\nheading: 2. Admission and Tuition\nsubhea...
1,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Be a) at least 17 years old with a high scho...
2,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Attend a weekly gathering [(see 5.0 Gatherin...
3,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Complete an English Placement Assessment (fo...
4,https://www.byupathway.edu/policies/handbook/2...,"{'heading': '2. Admission and Tuition', 'subhe...",* Strive to live by the [CES Honor Code](https...


## Retrieve the data from CSV


create the file `retrieved_chunks.csv`


In [20]:
query_mode = VectorStoreQueryMode.DEFAULT

top_k = 16
sparse_top_k = 16 * 5

# create a retriever from the index
retriever = index.as_retriever(
    vector_store_query_mode=query_mode,
    similarity_top_k=top_k,
    sparse_top_k=sparse_top_k,
)


In [21]:
df = pd.read_csv(f"{datapath}/temporary/index_single_quotes.csv")

df.head()


Unnamed: 0,Question,Ideal Answer,Link,Quote
0,How do I know if a student has a scholarship?,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,Discounts/Scholarships Tab\nView the scholarsh...
1,How do I know if a student is registered for a...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,Checking for Institute Registration for Colleg...
2,How do I know if student is member of the church?,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,Verify a Learner’s Membership Status\n1. In th...
3,What information should I track for each student?,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,Monitoring Student Progress\nTime in Course—ho...
4,What to do If a student has already taken this...,For a student that is in a course that they ha...,https://pathway-missionary.powerappsportals.co...,Link 1:\nWHAT TO WATCH FOR\nWhen you start a n...


In [22]:
# drop the empty questions
df = df.dropna(subset=["Question"])

df = df[df["Quote"].notna() & (df["Quote"] != "")]

# cuenta cuantas rows hay
print(f"Number of rows: {len(df)}")


Number of rows: 105


In [23]:
# generate bigrams (ngram size=2) for each manual quote
# and store them in the question_ngrams dictionary
question_ngrams = extract_question_ngrams(df, ngram_size)


In [24]:
# asociate each question with its ngrams in the df
df["ngrams"] = df["Question"].apply(lambda x: question_ngrams[x])


In [25]:
# print rows with empty Quote
df[df["Quote"].isnull()]


Unnamed: 0,Question,Ideal Answer,Link,Quote,ngrams


In [31]:
## ORIGINAL

import numpy as np

retrieved_chunks_dict = {}  # Para almacenar los chunks recuperados para cada pregunta
error_count = 0

# Procesa cada pregunta y recupera los chunks
for idx, row in df.iterrows():
    question = row["Question"]
    quote = row["Quote"]
    link = row["Link"]
    answer = row["Ideal Answer"]
    true_ngrams = row["ngrams"]

    # Asegurarse de que quote sea una cadena de texto, si no, omitir la pregunta
    if pd.isna(quote):
        print(f"Skipping question {question} due to missing quote.")
        continue

    quote = str(quote)  # Convertir quote a cadena de texto, si es necesario


    # Recuperar chunks relacionados con la pregunta
    try:
        retrieved_chunks = retriever.retrieve(question)
    except Exception as e:
        error_count += 1
        print(f"Error retrieving chunks for question: {question}")
        print(e)
        continue

    # CALCULATE RECALL
    predicted_ngrams = _generate_ngrams_from_texts([node.text for node in retrieved_chunks], ngram_size=ngram_size)
    precision, recall = precision_recall(predicted_ngrams, true_ngrams)

    # Almacenar los datos en el diccionario
    retrieved_chunks_dict[question] = {
        "quote": f"{link}\n\n{quote}",
        "normal_quote": quote,
        "answer": answer,
        "retrieved_chunks": '\n\n\n'.join([f'{chunk.metadata['url']}\n{chunk.text}' for chunk in retrieved_chunks]),
        "original_retrieved": retrieved_chunks,
        "normal_chunks": [chunk.text for chunk in retrieved_chunks],
        "recall": recall,
    }

print(f"Total errors encountered: {error_count}")


Total errors encountered: 0


In [27]:
def tokenize(text):
    """Función para dividir el texto en palabras y normalizar el texto"""
    return re.findall(r"\w+", text.lower())  # Extrae palabras y convierte a minúsculas


In [28]:
def quote_in_chunk_percentage(quote, chunk_text):
    """Calcula el porcentaje del quote que está presente en el chunk text"""
    quote_tokens = set(tokenize(quote))  # Tokeniza el quote
    chunk_tokens = set(tokenize(chunk_text))  # Tokeniza el chunk

    # How many words are in the chunk
    common_tokens = quote_tokens.intersection(chunk_tokens)

    # calculate % of words in quote that are in chunk
    if len(quote_tokens) == 0:
        return 0
    return len(common_tokens) / len(quote_tokens)


In [29]:
# create a function to calculate the similarity recall between the quote and the chunk text
def similarity_recall(quote, chunk):
    # convert the chunk list to a string
    # chunk_text = " ".join(chunk)
    # imprime el type de chunk
    return quote_in_chunk_percentage(quote, chunk)


In [41]:
output_df = pd.DataFrame.from_dict(retrieved_chunks_dict, orient="index")

# the actual index must be a column
output_df.reset_index(inplace=True)

# Add the titles

# output_df.columns = ["Question", "Quote", "Link", "Retrieved Chunks"]

output_df.head()


Unnamed: 0,index,quote,normal_quote,answer,retrieved_chunks,original_retrieved,normal_chunks,recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Discounts/Scholarships Tab\nView the scholarsh...,Missionaries can see if a student has a schola...,https://sites.google.com/view/bloom-mtl-onboar...,[Node ID: 85b7e14e-e887-4ad3-a09b-ff683811b1bc...,[* If the student wants the scholarship applie...,1.0
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...,Checking for Institute Registration for Colleg...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: 462bb77f-647c-475c-8e97-453c7d3e711a...,[The best method is to have the student ask th...,0.952381
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...,Verify a Learner’s Membership Status\n1. In th...,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: 9aa906e4-2102-4dd1-9dd9-51ea9caf657f...,[---\nheading: PATH\nsubheading: PATH for Path...,0.952381
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...,Monitoring Student Progress\nTime in Course—ho...,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,[Node ID: 34c00a22-6e98-4840-8d84-7f9657a07ccf...,[---\nheading: PATH\nsubheading: PATH for Path...,0.98
4,What to do If a student has already taken this...,https://missionaries.prod.byu-pathway.psdops.c...,How a Student Withdraws From a Program\nPURPOS...,For a student that is in a course that they ha...,https://pathway-missionary.powerappsportals.co...,[Node ID: c13eb7e0-1891-40b2-8af7-9f7be2a67748...,[---\nheading: PathwayConnect (PC)\nsubheading...,0.208333


In [42]:
# apply the function to the dataframe, the columns are Quote and "Retrieved Chunks"
output_df["recall"] = output_df.apply(
    lambda x: similarity_recall(x["normal_quote"], x["retrieved_chunks"]), axis=1
)


In [43]:
# drop columns normal_quote and normal_chunks
output_df.drop(
    columns=["normal_quote", "original_retrieved", "normal_chunks"], inplace=True
)

output_df.head()


Unnamed: 0,index,quote,answer,retrieved_chunks,recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see if a student has a schola...,https://sites.google.com/view/bloom-mtl-onboar...,1.0
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,0.986301
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,0.979167
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,1.0
4,What to do If a student has already taken this...,https://missionaries.prod.byu-pathway.psdops.c...,For a student that is in a course that they ha...,https://pathway-missionary.powerappsportals.co...,0.8


In [44]:
# Guardar los resultados en un archivo CSV
# las columnas son: Question,Quote,Quote URL (Link),retrieved chunks,recall

output_path = f"{datapath}/temporary/chunks_recall.csv"

output_df.columns = ["Question", "Quote", "Answer", "Retrieved Chunks", "Recall"]
# ignore the index when saving

# sort by recall
output_df = output_df.sort_values(by="Recall", ascending=False)

output_df.to_csv(output_path, index=False)

output_df.head()


Unnamed: 0,Question,Quote,Answer,Retrieved Chunks,Recall
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...,Missionaries can see if a student has a schola...,https://sites.google.com/view/bloom-mtl-onboar...,1.0
71,What is the Hall scholarship?,https://hall-foundation.org/about-scholarship/...,ABOUT THE HALL FOUNDATION SCHOLARSHIP\n\nBrad ...,https://hall-foundation.org/about-scholarship/...,1.0
67,Is the Hall scholarship different from the Heb...,https://hall-foundation.org/about-scholarship/...,The Hall Foundation scholarship is different f...,https://hall-foundation.org/about-scholarship/...,1.0
66,How often could one access Mentorship Bridge S...,https://sites.google.com/view/bloom-mtl-onboar...,The Mentor Bridge Scholarship was originally o...,https://sites.google.com/view/bloom-mtl-onboar...,1.0
65,Can a student that has applied for a mentor sc...,https://pathway-missionary.powerappsportals.co...,Students may apply for and receive multiple sc...,https://pathway-missionary.powerappsportals.co...,1.0
