### Importing libraries

In [1]:
import llama_index.core
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage, SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_parse import LlamaParse
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
import nest_asyncio; nest_asyncio.apply()

import json
import os
import re
from dotenv import load_dotenv
load_dotenv()

True

### Select the LLM

In [2]:
# Primary configuration: GPT-4o, the large embedding model and a 0.6 similarity cutoff.
model = "gpt-4o"  # also embedded in the names of the result files written by the evaluation loop
embedding_model = "text-embedding-3-large"  # used for the semantic splitter and the persist-dir name
similarity_cutoff = 0.6  # handed to SimilarityPostprocessor when the query engine is built

# set GPT's temperature to 0
# Pass `model` instead of repeating the literal so the LLM that is actually
# queried can never drift from the name recorded in the result files.
Settings.llm = OpenAI(temperature=0.0, model=model)

In [None]:
# Alternative configuration: GPT-4o-mini with the ada-002 embedding model and no similarity cutoff.
# NOTE: this cell reassigns the same names (and Settings.llm) as the previous
# configuration cell, so running it afterwards overrides that configuration.
model = "gpt-4o-mini"  # also embedded in the names of the result files written by the evaluation loop
embedding_model = "text-embedding-ada-002"  # used for the semantic splitter and the persist-dir name
similarity_cutoff = 0  # handed to SimilarityPostprocessor when the query engine is built

# set GPT's temperature to 0
# Pass `model` instead of repeating the literal so the LLM that is actually
# queried can never drift from the name recorded in the result files.
Settings.llm = OpenAI(temperature=0.0, model=model)

### Load index

In [3]:
BP_threshold = 70  # breakpoint percentile threshold passed to the semantic splitter
top_k = 10 # set how many chunks are given as context to GPT

# The persisted index is keyed by the chunking threshold and the embedding model name.
PERSIST_DIR = f"./storage_LlamaParse_semantic_chunking_final/storage_LlamaParse_sem_{BP_threshold}_{embedding_model}"

# Use ONE spelling of the source folder for both listing it and building the
# file paths; mixing "Raw_texts" and "raw_texts" only works on case-insensitive
# filesystems. The listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the document order irreproducible.
RAW_TEXTS_DIR = "Raw_texts"
LIST_OF_DOCS = [f"{RAW_TEXTS_DIR}/{f}" for f in sorted(os.listdir(RAW_TEXTS_DIR))]
print(LIST_OF_DOCS)

if not os.path.exists(PERSIST_DIR):
    # No cached index for this configuration: parse, chunk, embed and persist.
    embed_model_OpenAI = OpenAIEmbedding(model=embedding_model)
    documents = LlamaParse(result_type="text").load_data(LIST_OF_DOCS)
    splitter = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=BP_threshold,
        embed_model=embed_model_OpenAI,
        include_metadata=True,
    )
    nodes = splitter.get_nodes_from_documents(documents)
    # load the documents and create the index
    # NOTE(review): no embed_model is passed here, so the index presumably embeds
    # the nodes (and later the queries) with Settings.embed_model, which this
    # notebook never sets — confirm that `embedding_model` is meant to affect
    # only the chunking. Left unchanged so existing persisted indices stay valid.
    index = VectorStoreIndex(nodes)
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

['raw_texts/2-settext.pdf', 'raw_texts/3-settext.txt', 'raw_texts/4-settext.pdf', 'raw_texts/5-settext.pdf', 'raw_texts/9-settext.pdf']


### Open quizzes

In [5]:
def _quiz_paths(folder):
    """Return "<folder>/<name>" for every entry of ./<folder> whose name ends in .json."""
    return [folder + "/" + name for name in os.listdir("./" + folder) if name.endswith('.json')]


# Quizzes generated by GPT and quizzes written by hand live in sibling folders.
GPT_made_quizzes = _quiz_paths("JSON_quizzes/GPT-made_quizzes")
my_own_quizzes = _quiz_paths("JSON_quizzes/my_own_quizzes")

# GPT-made quizzes come first, followed by the hand-written ones.
LIST_OF_QUIZZES = [*GPT_made_quizzes, *my_own_quizzes]

print(LIST_OF_QUIZZES)
print("number of quizzes:", len(LIST_OF_QUIZZES))

['JSON_quizzes/GPT-made_quizzes/itembank-2.json', 'JSON_quizzes/GPT-made_quizzes/itembank-3.json', 'JSON_quizzes/GPT-made_quizzes/itembank-4.json', 'JSON_quizzes/GPT-made_quizzes/itembank-5.json', 'JSON_quizzes/GPT-made_quizzes/itembank-9.json', 'JSON_quizzes/my_own_quizzes/itembank-2.json', 'JSON_quizzes/my_own_quizzes/itembank-3.json', 'JSON_quizzes/my_own_quizzes/itembank-4.json', 'JSON_quizzes/my_own_quizzes/itembank-5.json', 'JSON_quizzes/my_own_quizzes/itembank-9.json']
number of quizzes: 10


### The retriever and prompt

In [6]:
from llama_index.core import PromptTemplate

# Text-QA prompt that replaces the query engine's default template.
# {context_str} receives the retrieved chunks, {query_str} the quiz question.
new_prompt = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query. The context is incredibly important, read and analyze it extremely carefully! "
    "Make sure that the answer is 100% correct because my life depends on it. "
    "Only answer with the letter and text of the answer which you think is correct.\n"
    "Question: {query_str}\n"
    "Answer: "
)
new_tmpl = PromptTemplate(new_prompt)

# Retrieval stage: ask the index for the top_k most similar chunks.
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

# Post-retrieval stage: similarity postprocessor configured with similarity_cutoff.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)

# Assemble the query engine from both stages, then swap in the custom prompt.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[similarity_filter],
)
query_engine.update_prompts({"response_synthesizer:text_qa_template": new_tmpl})

### Full loop

In [7]:
def _percent(n_correct, n_total):
    """Return n_correct / n_total as a percentage rounded to 2 decimals.

    Returns 0.0 when n_total is 0 (empty itembank or no quizzes of that type)
    instead of raising ZeroDivisionError after all the LLM calls were made.
    """
    return round(n_correct / n_total * 100, 2) if n_total else 0.0


# Make sure the output folders exist before spending time on LLM calls.
RESPONSES_DIR = './results/improved_RAG_responses/semantic_chunking'
os.makedirs(RESPONSES_DIR, exist_ok=True)

# Run the whole evaluation five times; every pass appends its own report.
for pass_n in range(5):
    print(f"\nPass over the quizzes number {pass_n+1}")

    #save total number of questions and correct answers separately for my own quizzes and GPT-made
    total_nquestions_own = 0
    total_ncorrects_own = 0
    total_nquestions_GPT = 0
    total_ncorrects_GPT = 0

    # Report header: the configuration used for this pass, then one line per itembank.
    itembank_reports = [
        f"breakpoint percentile: {BP_threshold}",
        f"top_k: {top_k}",
        f"embedding model: {embedding_model}",
        f"similarity cutoff: {similarity_cutoff}",
        f"\nPrompt:\n {new_prompt}\n",
    ]

    # Iterate over each question and get GPT's response
    for doc in LIST_OF_QUIZZES:
        with open(doc, 'r', encoding='UTF-8') as file:
            itembank_JSON = json.load(file)

        # Extract the itembank id (e.g. "2", "10", "3a", "4-1") from the file
        # name only, so digits in a directory name can never be picked up and
        # multi-digit numbers are not truncated to their first digit.
        file_name = os.path.basename(doc)
        number_match = re.search(r"\d+[ab]?(?:-\d+)?", file_name)
        itembank_number = number_match.group() if number_match else os.path.splitext(file_name)[0]

        # Quizzes under my_own_quizzes are hand-written; everything else is GPT-made.
        quiz_type = "own_questions" if "my_own_quizzes" in doc else "GPT-made_questions"
        this_ncorrects = 0 # for correct answers in each separate itembank
        this_nquestions = 0

        for question in itembank_JSON:
            # The query is the question followed by its answer options, one per line.
            full_prompt = question['question'] + '\n' + '\n'.join(question['answers'])
            given_answer = query_engine.query(full_prompt).response # Get the response from GPT
            # An answer counts as correct only on an exact match with the reference.
            is_correct = given_answer == question["correct answer(s)"]

            question["GPT's response"] = given_answer  # Append the response to the question
            question["correct"] = is_correct

            this_nquestions += 1
            if is_correct:
                this_ncorrects += 1

        # Fold this itembank's counts into the totals of its quiz category.
        if quiz_type == "own_questions":
            total_nquestions_own += this_nquestions
            total_ncorrects_own += this_ncorrects
        else:
            total_nquestions_GPT += this_nquestions
            total_ncorrects_GPT += this_ncorrects

        itembank_reports.append(f"itembank-{itembank_number}_{quiz_type}:\t\tCorrect answers {this_ncorrects} / total questions {this_nquestions} * 100 = {_percent(this_ncorrects, this_nquestions)}%")

        # Save the updated JSON data to a new file
        # NOTE: the file name does not contain pass_n and the file is opened in
        # 'w' mode, so each pass overwrites the responses of the previous pass.
        with open(f'./results/improved_RAG_responses/semantic_chunking/RAG_{model}_responses-semantic_{BP_threshold}-{itembank_number}_{quiz_type}.json', 'w', encoding='UTF-8') as outfile:
            json.dump(itembank_JSON, outfile, indent=2)

    itembank_reports.append(f"\nOverall correct answers for GPT-made {total_ncorrects_GPT} / total questions {total_nquestions_GPT} * 100 = {_percent(total_ncorrects_GPT, total_nquestions_GPT)}%")
    itembank_reports.append(f"Overall correct answers for own {total_ncorrects_own} / total questions {total_nquestions_own} * 100 = {_percent(total_ncorrects_own, total_nquestions_own)}%\n")
    itembank_reports.append("\n------------------------------------------------------------------------\n\n")

    # save the prompt and number of correct questions
    # Append mode: reports from all passes and earlier runs accumulate in one file.
    with open(f"./results/improved_rag_{model}_results.txt", 'a', encoding='UTF-8') as report_file:
        report_file.write('\n'.join(itembank_reports))


Pass over the quizzes number 1

Pass over the quizzes number 2

Pass over the quizzes number 3

Pass over the quizzes number 4

Pass over the quizzes number 5


### Observe with Phoenix (optional; not needed for the evaluation above)

In [None]:
# Send llama_index traces to Phoenix and run a single question through the
# query engine so its retrieval and LLM call can be inspected there.
phoenix_api_key = os.getenv('PHOENIX_API_KEY')
if not phoenix_api_key:
    # Without this check the header would silently become "api_key=None".
    raise RuntimeError("PHOENIX_API_KEY is not set; add it to the environment or the .env file before enabling Phoenix tracing.")
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={phoenix_api_key}"
llama_index.core.set_global_handler("arize_phoenix", endpoint="https://llamatrace.com/v1/traces")

with open('./JSON_quizzes/my_own_quizzes/itembank-4.json', 'r', encoding='UTF-8') as file:
    questions_JSON = json.load(file)

# Only the question at index 18 is traced; widen the slice to trace more.
for question in questions_JSON[18:19]:
    full_prompt = question['question'] + '\n' + '\n'.join(question['answers'])
    # Query through the same engine as the evaluation loop (the previously
    # called `pingGPT` is not defined anywhere in this notebook).
    given_answer = query_engine.query(full_prompt).response  # Get the response from GPT
    # Print inside the loop so an empty slice cannot leave given_answer undefined.
    print(given_answer)