In [1]:
import llama_index.core
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage, SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_parse import LlamaParse
import nest_asyncio; nest_asyncio.apply()

import json
import os
import re
from dotenv import load_dotenv
load_dotenv()

from fuzzywuzzy import fuzz # for checking the levenshtein distance between retrieved chunks and oracle chunks

from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [2]:
# Lines of the evaluation report are collected here and written to disk near the end of the notebook.
report = list()

# Number of nodes requested from the retriever for every query.
top_k = 10

#### Open paths to docs and oracles

In [3]:
# Paths of all source documents.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to make
# repeated runs (and the order of the per-oracle results in the report) reproducible.
LIST_OF_DOCS = ["./raw_texts/" + f for f in sorted(os.listdir("./raw_texts"))]
#print(LIST_OF_DOCS)

# Paths of the oracle files; anything that is not a .json file is ignored.
LIST_OF_ORACLES = ["./JSON_oracles/" + f for f in sorted(os.listdir("./JSON_oracles/")) if f.endswith('.json')]
#print(LIST_OF_ORACLES)

### Choose the parser and chunking method 

##### SimpleDirectoryReader (SDR)

In [4]:
# Plain SimpleDirectoryReader setup: the index is built once and cached in PERSIST_DIR.
parser_docs_type = "SDR"
PERSIST_DIR = "./storage_SimpleDirectoryReader"
report.append("with SDR")

if os.path.exists(PERSIST_DIR):
    # A persisted index is already on disk: reload it instead of rebuilding.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    # First run: read everything under raw_texts, build the vector index and persist it.
    documents = SimpleDirectoryReader("raw_texts").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)

##### SDR manually-split docs 

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Point the document list at the manually split texts and show what was found.
LIST_OF_DOCS = ["./raw_texts_split_manually/" + name for name in os.listdir("./raw_texts_split_manually/")]
print(LIST_OF_DOCS)
print(f"len(docs): {len(LIST_OF_DOCS)}")

report.append("with SDR on manually split docs")
PERSIST_DIR = "./storage_SimpleDirectoryReader_manually_split_docs"
parser_docs_type = "manually_split_docs"

if os.path.exists(PERSIST_DIR):
    # A persisted index is already on disk: reload it instead of rebuilding.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    documents = SimpleDirectoryReader("raw_texts_split_manually").load_data()

    # The large chunk size with zero overlap is intended to keep every manually split
    # document in a single node rather than having it split again.
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[SentenceSplitter(chunk_size=2048, chunk_overlap=0)],
    )

    # Cache the freshly built index for later runs.
    index.storage_context.persist(persist_dir=PERSIST_DIR)

##### SDR with semantic node splitting with default embedding model

In [4]:
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding

# Semantic chunking driven by text-embedding-ada-002; one persisted index per breakpoint threshold.
breakpoint_percentile_threshold = 70

embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

parser_docs_type = "on_semantic_chunking_ada-002"
PERSIST_DIR = f"./storage_SDR_semantic_chunking_various_thresholds/storage_SDR_sem_{breakpoint_percentile_threshold}_text-embedding-ada-002"
report.extend([
    "with SDR + semantic doc splittlng + text-embedding-ada-002",
    f"breakpoint_percentile_threshold: {breakpoint_percentile_threshold}",
])

if os.path.exists(PERSIST_DIR):
    # A persisted index is already on disk: reload it (the node count is not reported in this case).
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    # Split the raw documents into nodes with the semantic splitter.
    documents = SimpleDirectoryReader("raw_texts").load_data()
    splitter = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        embed_model=embed_model,
        include_metadata=True,
    )
    nodes = splitter.get_nodes_from_documents(documents)
    report.append(f"len(nodes): {len(nodes)}")

    # Build the index from the nodes and cache it for later runs.
    index = VectorStoreIndex(nodes)
    index.storage_context.persist(persist_dir=PERSIST_DIR)

##### SDR with semantic node splitting and a bigger embedding model

In [None]:
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding

# Semantic chunking driven by text-embedding-3-large; one persisted index per breakpoint threshold.
breakpoint_percentile_threshold = 70

embed_model = OpenAIEmbedding(model="text-embedding-3-large")

parser_docs_type = "on_semantic_chunking_large_embedding_model"
PERSIST_DIR = f"./storage_SDR_semantic_chunking_various_thresholds/storage_SDR_sem_{breakpoint_percentile_threshold}_text-embedding-3-large"
report.extend([
    "with SDR + semantic doc splittlng + text-embedding-3-large",
    f"breakpoint_percentile_threshold: {breakpoint_percentile_threshold}",
])

if os.path.exists(PERSIST_DIR):
    # A persisted index is already on disk: reload it (the node count is not reported in this case).
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    # Split the raw documents into nodes with the semantic splitter.
    documents = SimpleDirectoryReader("raw_texts").load_data()
    splitter = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        embed_model=embed_model,
        include_metadata=True,
    )
    nodes = splitter.get_nodes_from_documents(documents)
    report.append(f"len(nodes): {len(nodes)}")

    # NOTE(review): embed_model is only handed to the splitter; the index below and the
    # reload branch above are created without an explicit embed_model. Confirm which
    # model is meant to embed the nodes and the queries.
    index = VectorStoreIndex(nodes)
    index.storage_context.persist(persist_dir=PERSIST_DIR)

#### SDR with semantic node splitting at optimal BP=70 with top_k=10, to test retrieval at various thresholds

In [5]:
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding

# Semantic chunking at the chosen breakpoint threshold, used to test retrieval at
# various thresholds. `embedding_model` is the single switch for this cell: the
# splitter's embedding model, the persisted index directory, the report label and
# the report-file suffix are all derived from it so that they cannot disagree.
breakpoint_percentile_threshold = 70
top_k = 10
similarity_cutoff = 1
#embedding_model = "text-embedding-3-large"
embedding_model = "text-embedding-ada-002"

embed_model = OpenAIEmbedding(model=embedding_model)

PERSIST_DIR = f"./storage_SDR_semantic_chunking_various_thresholds/storage_SDR_sem_{breakpoint_percentile_threshold}_{embedding_model}"

# Report-file suffix per model; the two known models reuse the suffixes of the
# dedicated ada-002 and 3-large cells, any other model gets a suffix built from its name.
parser_docs_type = {
    "text-embedding-ada-002": "on_semantic_chunking_ada-002",
    "text-embedding-3-large": "on_semantic_chunking_large_embedding_model",
}.get(embedding_model, f"on_semantic_chunking_{embedding_model}")

report.append(f"with SDR + semantic doc splittlng + {embedding_model}")
report.append(f"breakpoint_percentile_threshold: {breakpoint_percentile_threshold}")

if not os.path.exists(PERSIST_DIR):
    # No cached index for this threshold/model yet: split the raw documents into nodes.
    documents = SimpleDirectoryReader("raw_texts").load_data()
    splitter = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        embed_model=embed_model,
        include_metadata=True,
    )
    nodes = splitter.get_nodes_from_documents(documents)
    report.append(f"len(nodes): {len(nodes)}")
    # Build the index from the nodes and store it for later runs.
    index = VectorStoreIndex(nodes)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # Load the existing index (the node count is not reported in this case).
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

### The retriever

In [6]:
# Default the similarity cutoff to 0.7 when no earlier cell has set it.
if "similarity_cutoff" not in globals():
    similarity_cutoff = 0.7

# Retriever returning the top_k most similar nodes from the current index.
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

# Query engine on top of the same retriever. The similarity cutoff is applied only here,
# through the post-processor; the bare `retriever` above is built without it.
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)],
    retriever=retriever,
)

print(f"similarity cutoff: {similarity_cutoff}")

similarity cutoff: 1


In [12]:
# OR basic: the retriever the index builds for itself.
# similarity_top_k is passed explicitly so that every query still returns top_k nodes;
# the counting cell further down indexes the per-question results up to top_k and the
# report records top_k as the retrieval depth.
retriever = index.as_retriever(similarity_top_k=top_k)

#### Run the retriever on self-created questions and count how many of the retrieved chunks are correct (based on the oracle)

In [7]:
# at least this percentage of the oracle chunk has to be in the retrieved node to be counted as correctly retrieved
# (compared against fuzz.partial_ratio below)
similarity_threshold = 70

# save total number of questions and correct answers separately for my own quizzes and GPT-made
# (kept for compatibility; this cell does not update them)
total_nquestions = 0
total_corr_chunks = 0

# One entry per oracle file, in the same order as LIST_OF_ORACLES. Each entry is a list with
# one sub-list per question, holding one True/False flag per retrieved node (in rank order).
document_node_titles = LIST_OF_ORACLES
chunks_from_right_doc_total = []
correctly_retrieved_chunks = []

# Run the retriever on every question of every oracle file and flag each retrieved node
for doc in LIST_OF_ORACLES:
    with open(doc, 'r', encoding='UTF-8') as file:
        oracle_JSON = json.load(file)

    # Itembank id taken from the oracle path: a digit, optionally followed by "a"/"b" and/or "-<digit>".
    # (The previous character class "[a|b]" also accepted a literal "|".)
    id_match = re.search(r"\d[ab]?(?:-\d)?", doc)
    if id_match is None:
        raise ValueError(f"cannot extract an itembank number from oracle path {doc!r}")
    itembank_number = id_match.group()
    print(f"processing oracle {itembank_number}")

    chunks_from_right_doc_this = []
    oracle_chunks_retrieved_this = []

    for question in oracle_JSON:

        # The query is the question text followed by all answer options, one per line.
        full_prompt = question['question'] + '\n' + '\n'.join(question['answers'])
        retrieved_nodes = retriever.retrieve(full_prompt)

        retrieved_titles = [n.node.metadata['file_name'] for n in retrieved_nodes]
        retrieved_chunks = [n.node.text.replace("\n", "") for n in retrieved_nodes]

        # Flag, per retrieved node, whether it comes from the document belonging to this itembank.
        # NOTE(review): only the first digit of the file name is compared, while itembank_number
        # may carry a letter or "-<digit>" suffix; such ids can never match here. Confirm the
        # document naming scheme before aligning the two patterns.
        from_right_doc = []
        for title in retrieved_titles:
            title_digit = re.search(r"\d", title)
            # A file name without any digit cannot belong to an itembank: count it as a miss
            # instead of failing on .group() of None.
            from_right_doc.append(title_digit is not None and title_digit.group() == itembank_number)
        chunks_from_right_doc_this.append(from_right_doc)

        # Flag, per retrieved node, whether it contains (fuzzily) the oracle's correct text chunk.
        oracle_chunks_retrieved_this.append([
            fuzz.partial_ratio(question['correct_text_chunk'], chunk) > similarity_threshold
            for chunk in retrieved_chunks
        ])

    chunks_from_right_doc_total.append(chunks_from_right_doc_this)
    correctly_retrieved_chunks.append(oracle_chunks_retrieved_this)

processing oracle 2
processing oracle 3
processing oracle 4
processing oracle 5
processing oracle 9


#### Counting and displaying the overall results

In [8]:
report.append("run number X")
report.append(f"retriever: {retriever}")
report.append(f"top_k = {top_k}")
# similarity_threshold is the fuzzy-match threshold used when flagging correct chunks,
# not the retriever's similarity cutoff, so it is labelled accordingly.
report.append(f"similarity threshold: {similarity_threshold}")


def _hits_per_rank(flags_per_question, n_ranks):
    """Count, for each rank position, how many questions have a True flag at that rank.

    flags_per_question: list with one list of booleans per question (rank order).
    n_ranks: number of rank positions to count.
    Returns a list of n_ranks integers. Questions with fewer than n_ranks flags
    simply contribute nothing to the missing ranks.
    """
    counts = [0] * n_ranks
    for flags in flags_per_question:
        for rank, hit in enumerate(flags[:n_ranks]):
            if hit:
                counts[rank] += 1
    return counts


# Totals over all oracles, accumulated while the per-oracle results are produced.
overall_correct_doc_counts = [0] * top_k
overall_correct_chunk_counts = [0] * top_k

for n, (correct_doc, correct_chunk) in enumerate(zip(chunks_from_right_doc_total, correctly_retrieved_chunks)):

    correct_doc_counts = _hits_per_rank(correct_doc, top_k)
    correct_chunk_counts = _hits_per_rank(correct_chunk, top_k)

    #print(f"\nResults for {document_node_titles[n]}:")
    report.append(f"\nResults for {document_node_titles[n]}:")

    # The denominator is the number of questions in this oracle.
    for i, count in enumerate(correct_doc_counts):
        report.append(f"from correct doc as chunk number {i+1}: {count}/{len(correct_doc)}")
    for i, count in enumerate(correct_chunk_counts):
        report.append(f"from correct chunk as chunk number {i+1}: {count}/{len(correct_chunk)}")

    for i in range(top_k):
        overall_correct_doc_counts[i] += correct_doc_counts[i]
        overall_correct_chunk_counts[i] += correct_chunk_counts[i]

# Number of questions over all oracles (denominator of the overall results).
total_doc_questions = sum(len(sublist) for sublist in chunks_from_right_doc_total)
total_chunk_questions = sum(len(sublist) for sublist in correctly_retrieved_chunks)

# Append overall results to the report (and echo them in the cell output)
report.append("\nOverall Results:")
for i, count in enumerate(overall_correct_doc_counts):
    line = f"Overall from correct doc as chunk number {i+1}: {count}/{total_doc_questions}"
    print(line)
    report.append(line)
for i, count in enumerate(overall_correct_chunk_counts):
    line = f"Overall from correct chunk as chunk number {i+1}: {count}/{total_chunk_questions}"
    print(line)
    report.append(line)

report.append("\n\n--------------------------\n\n")

Overall from correct doc as chunk number 1: 280/300
Overall from correct doc as chunk number 2: 244/300
Overall from correct doc as chunk number 3: 220/300
Overall from correct doc as chunk number 4: 223/300
Overall from correct doc as chunk number 5: 202/300
Overall from correct doc as chunk number 6: 200/300
Overall from correct doc as chunk number 7: 204/300
Overall from correct doc as chunk number 8: 216/300
Overall from correct doc as chunk number 9: 192/300
Overall from correct doc as chunk number 10: 197/300
Overall from correct chunk as chunk number 1: 161/300
Overall from correct chunk as chunk number 2: 43/300
Overall from correct chunk as chunk number 3: 39/300
Overall from correct chunk as chunk number 4: 15/300
Overall from correct chunk as chunk number 5: 7/300
Overall from correct chunk as chunk number 6: 10/300
Overall from correct chunk as chunk number 7: 5/300
Overall from correct chunk as chunk number 8: 5/300
Overall from correct chunk as chunk number 9: 6/300
Overa

#### Appending the results to the report

In [9]:
# Append the collected report lines to the results file of the active parser/chunking setup.
report_path = f"./results-comparison_with_oracle_reports_{parser_docs_type}.txt"
report_text = '\n'.join(report)
with open(report_path, 'a') as report_file:
    report_file.write(report_text)

### Debugging

In [11]:
# Inspect what the current retriever returns for a single oracle question
# (entry 24 of the itembank-2 oracle).
with open("./JSON_oracles/oracle-itembank-2.json", 'r', encoding='UTF-8') as file:
    oracle_JSON = json.load(file)

item = oracle_JSON[24]
answer_lines = '\n'.join(item['answers'])

# Same prompt layout as in the evaluation loop: question text, then one answer option per line.
question = item['question'] + '\n' + answer_lines
print(question)
nodes = retriever.retrieve(question)

# Show the source file and full text of every retrieved node.
separator = "------------------\n\n"
for hit in nodes:
    print(hit.node.metadata['file_name'])
    print(hit.node.text)
    print(separator)

Why is the /k/ in "dis[k]óvery" unaspirated?
a) Because it is at the end of the word
b) Because it is positioned after a nasal sound
c) Because it follows a fortis fricative
d) Because it is followed by a vowel
2-settext.pdf
So 
while we can find aspirated plosives in words like [ph]eak and [kh]ool, the plosives 
in s[p]eak and s[k]ool are unaspirated , and so are the  ones in dis[k]óvery and 
fíf[t]éen (cf. níne[th]éen). 

------------------


2-settext.pdf
So we could analyze the unaspirated voiceless 
plosives in s[p]eak, s[k]ool, dis[k]óvery, fíf[t]éen and káf[t]an phonologically as lenis 
plosives: s/b/eak, s/g/ool, dis/g/óvery, fíf/d/éen and káf/d/an. Actually, th e 
identification of such clusters as fortis fricative + lenis plosive clusters is reflected 
in the spelling of Wesh: ‘Spain’ is Sbaen and ‘school’ is ysgol. 

------------------


