In [1]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv


In [2]:
from collections import defaultdict

import nest_asyncio
import optuna
import pandas as pd

import os
from llama_index.core import Document

from utils.retrieve import (
    objective,
    extract_question_ngrams,
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# configure
# filename = "test.md"
qa_filename = "index_single_quotes.csv"
ngram_size = 2  # use 2 instead of 3 so we don't skip 2-word header chunks
f_beta = 5  # weight recall 3 times as important as precision in f-score
n_trials = 25  # number of Optuna trials

pd.set_option("display.max_colwidth", None)


Read the question-answers


In [4]:
# qa_df = pd.read_csv(f'data/{qa_filename}', na_filter=False)
qa_df = pd.read_csv(f"../data/temporary/{qa_filename}", na_filter=False)
print(len(qa_df))
qa_df.head(3)


112


Unnamed: 0,Question,Ideal Answer,Link,Quote
0,How do I know if a student has a scholarship?,"Missionaries can see if a student has a scholarship by clicking on the student's name in Path, clicking on the Discounts/Scholarships Tab and looking under History.\n\nInformation about the Discount or Scholarship includes; Discount/Scholarship, Category, Type, Percent, Amount, Term, Status.and Process.",https://missionaries.prod.byu-pathway.psdops.com/Finding-Student-Information-in-PATH%E2%80%8B,Discounts/Scholarships Tab\nView the scholarships and/or discounts the \nstudent has received.
1,How do I know if a student is registered for an institute class for credit?,"There is not a way for Missionaries to verify that a student's current Institute class, is registered for college credit for that class.\n\nMissionaries can see if a student is currently registered for an Institute class It should appear in the Enrollment tab in the lower right hand corner. There will be a red x indicating no credit if the course has not been completed. \nEach student needs to make sure that their Institute instructor knows and has recorded that the student is listed in the course as taking it for College Credit.\n\n",https://missionaries.prod.byu-pathway.psdops.com/institute-for-college-credit,"Checking for Institute Registration for College Credit\n2. Next, click on the Enrollment tab and scroll down\n4. You will see Institute courses taken by the student as you scroll. This\r\nstudent has taken many Institute courses but has not taken them for\r\ncollege credit. The student is currently enrolled in a Spring 2024 course but\r\nshows a x NO because the course is not yet completed. Once the course is\r\ncompleted, the x No will change to a green check mark with a Yes if the\r\ncourse qualifies for credit.\n5. This student is enrolled in an institute course for Spring 2024. It is not one\r\nof the cornerstone classes. The Student should ask the instructor if this\r\ncourse will provide college credit. If so, the No will change to Yes at the\r\ncompletion of the course. The previous 2023 course taken by this student has\r\nbeen given college credit"
2,How do I know if student is member of the church?,"Missionaries can see the Church membership status by clicking on the student's name in Path. On the Student Information Page, in the Details Tab, at the right side of the page, if there are Stake and a Ward names listed, the student is a member of the church.\n\nSometimes no Stake or Ward information will be listed, but when missionaries talk to the student they may find they are members after all. In that case, missionaries can ask the student if they have another Church Account. If they can't remember, they can search for a duplicate account. Students may have to get their membership number from the ward clerk to find the real church account.\n\n",https://missionaries.prod.byu-pathway.psdops.com/Verify-a-Learner's-Church-Member-Status,"Verify a Learner’s Membership Status\n1. In the Details tab in the Student Information page, verify that a learner’s\r\nward and stake are listed if they are a member of the church.\n2. A learner may show they are a member but have no ward and stake\r\nlisted. This is a good indicator that the learner has a duplicate account.\r\no Contact your zone or district leader to investigate this further."


In [5]:
# NOTE: we shouldn't include questions in the *test* set right now,
# but people are still adding the manual quotes,
# and since we have so few questions with manual quotes so far
# we will use all of them for this demo.

# keep only rows with at least 1 manual quote
qa_df = qa_df[qa_df["Quote"].notna() & (qa_df["Quote"] != "")]
print(len(qa_df))


105


In [6]:
# generate bigrams (ngram size=2) for each manual quote
# and store them in the question_ngrams dictionary
question_ngrams = extract_question_ngrams(qa_df, ngram_size)


In [7]:
question_ngrams


{'How do I know if a student has a scholarship?': [('discounts',
   'scholarships'),
  ('scholarships', 'tab'),
  ('tab', 'view'),
  ('view', 'the'),
  ('the', 'scholarships'),
  ('scholarships', 'and'),
  ('and', 'or'),
  ('or', 'discounts'),
  ('discounts', 'the'),
  ('the', 'student'),
  ('student', 'has'),
  ('has', 'received')],
 'How do I know if a student is registered for an institute class for credit?': [('checking',
   'for'),
  ('for', 'institute'),
  ('institute', 'registration'),
  ('registration', 'for'),
  ('for', 'college'),
  ('college', 'credit'),
  ('credit', '2'),
  ('2', 'next'),
  ('next', 'click'),
  ('click', 'on'),
  ('on', 'the'),
  ('the', 'enrollment'),
  ('enrollment', 'tab'),
  ('tab', 'and'),
  ('and', 'scroll'),
  ('scroll', 'down'),
  ('down', '4'),
  ('4', 'you'),
  ('you', 'will'),
  ('will', 'see'),
  ('see', 'institute'),
  ('institute', 'courses'),
  ('courses', 'taken'),
  ('taken', 'by'),
  ('by', 'the'),
  ('the', 'student'),
  ('student', 'as')

Read the documents from the md directory


In [8]:
# Read the document names from the directories:
# ../data/markwodn
# ../data/out/md

origin_paths = ["../data/out_sep_4/from_html/", "../data/out_sep_4/from_pdf/"]

# Read the document names from the directories:
files_list = [path + item for path in origin_paths for item in os.listdir(path)]


In [9]:
files_list.sort()
files_list[0:10]


['../data/out_sep_4/from_html/-Admission-Requirements.md',
 '../data/out_sep_4/from_html/-After-PathwayConnect.md',
 '../data/out_sep_4/from_html/-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.md',
 '../data/out_sep_4/from_html/-Application-Process.md',
 '../data/out_sep_4/from_html/-Assistance-for-Students-with-Disabilities.md',
 '../data/out_sep_4/from_html/-BYU-Idaho-Course-Exceptions.md',
 '../data/out_sep_4/from_html/-BYU-Pathway-Support.md',
 '../data/out_sep_4/from_html/-BYU-Pathway-Worldwide-Website.md',
 '../data/out_sep_4/from_html/-Common-Misconceptions-about-Choosing-Certificates.md',
 '../data/out_sep_4/from_html/-Communication-Resources.md']

Save the documents into an array


In [10]:
documents = []

for i, filepath in enumerate(files_list):
    with open(filepath, "r", encoding="utf-8") as file:
        document = Document(text=file.read(), metadata={"filepath": filepath})

        # add the document to a single entry list
        documents.append(document)


In [11]:
def get_objective_fn(documents, ngram_size, question_ngrams, f_beta):
    def inner(trial):
        return objective(trial, documents, ngram_size, question_ngrams, f_beta)

    return inner


In [12]:
# ask Optuna to find the best hyperparameters

study_name = "test"  # Unique identifier of the study.
storage_name = f"sqlite:///optuna-{study_name}.db"
print(
    f"To see a dashboard, open a terminal, activate the virtual environment, and run: optuna-dashboard {storage_name}"
)
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
    direction="maximize",
)
# study.optimize(
#     get_objective_fn(
#         documents=documents, ngram_size=ngram_size, question_ngrams=question_ngrams
#     ),
#     n_trials=n_trials,
# )
study.optimize(
    lambda trial: objective(
        trial, documents, ngram_size, question_ngrams, f_beta=f_beta
    ),
    n_trials=n_trials,
)

study.best_params


To see a dashboard, open a terminal, activate the virtual environment, and run: optuna-dashboard sqlite:///optuna-test.db


[I 2024-09-05 19:22:00,190] Using an existing study with name 'test' instead of creating a new one.


Nodes inserted: 2142


[I 2024-09-05 19:50:28,807] Trial 1 finished with value: 0.22025309614955627 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': True, 'buffer_size': 1, 'breakpoint_percentile_threshold': 86, 'index': 'chromadb', 'top_k': 47}. Best is trial 1 with value: 0.22025309614955627.


Nodes inserted: 993


[I 2024-09-05 19:53:35,438] Trial 2 finished with value: 0.143968623871961 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'sentence', 'include_prev_next_rel': False, 'chunk_size': 499, 'chunk_overlap': 193, 'index': 'chromadb', 'top_k': 50}. Best is trial 1 with value: 0.22025309614955627.


Nodes inserted: 632


[I 2024-09-05 19:55:54,990] Trial 3 finished with value: 0.20963701889455993 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'sentence', 'include_prev_next_rel': True, 'chunk_size': 848, 'chunk_overlap': 25, 'index': 'chromadb', 'top_k': 14}. Best is trial 1 with value: 0.22025309614955627.


Nodes inserted: 1659


[I 2024-09-05 20:00:17,827] Trial 4 finished with value: 0.1920337434947818 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': True, 'index': 'chromadb', 'top_k': 37}. Best is trial 1 with value: 0.22025309614955627.


Nodes inserted: 609


[I 2024-09-05 20:02:32,638] Trial 5 finished with value: 0.18190482385822515 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'sentence', 'include_prev_next_rel': False, 'chunk_size': 968, 'chunk_overlap': 131, 'index': 'chromadb', 'top_k': 19}. Best is trial 1 with value: 0.22025309614955627.


Nodes inserted: 2660


[I 2024-09-05 20:33:19,777] Trial 6 finished with value: 0.3130006907152221 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': False, 'buffer_size': 1, 'breakpoint_percentile_threshold': 81, 'index': 'chromadb', 'top_k': 16}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1216


[I 2024-09-05 20:57:34,707] Trial 7 finished with value: 0.16826666045857697 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': False, 'buffer_size': 1, 'breakpoint_percentile_threshold': 95, 'index': 'chromadb', 'top_k': 46}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-05 21:01:41,273] Trial 8 finished with value: 0.25484988126304065 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': False, 'index': 'chromadb', 'top_k': 18}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 705


[I 2024-09-05 21:04:11,663] Trial 9 finished with value: 0.21628840349784034 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'sentence', 'include_prev_next_rel': False, 'chunk_size': 682, 'chunk_overlap': 39, 'index': 'chromadb', 'top_k': 16}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-05 21:11:20,772] Trial 10 finished with value: 0.2739579160231638 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': True, 'index': 'chromadb', 'top_k': 12}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 4300


[I 2024-09-05 21:40:04,545] Trial 11 finished with value: 0.12930897379069278 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': False, 'buffer_size': 3, 'breakpoint_percentile_threshold': 65, 'index': 'chromadb', 'top_k': 2}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-05 21:43:39,452] Trial 12 finished with value: 0.2987378639048168 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': True, 'index': 'chromadb', 'top_k': 5}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-05 21:47:16,121] Trial 13 finished with value: 0.24647924396661128 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': True, 'index': 'chromadb', 'top_k': 2}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 3476


[I 2024-09-05 22:12:04,274] Trial 14 finished with value: 0.2877196820673843 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': True, 'buffer_size': 2, 'breakpoint_percentile_threshold': 73, 'index': 'chromadb', 'top_k': 27}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-05 22:15:45,442] Trial 15 finished with value: 0.29877035010811054 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': False, 'index': 'chromadb', 'top_k': 8}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 2722


[I 2024-09-05 22:38:48,995] Trial 16 finished with value: 0.2906865441122019 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': False, 'buffer_size': 1, 'breakpoint_percentile_threshold': 80, 'index': 'chromadb', 'top_k': 23}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-05 22:42:24,863] Trial 17 finished with value: 0.29182112853899317 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': False, 'index': 'chromadb', 'top_k': 9}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 4720


[I 2024-09-05 23:08:35,983] Trial 18 finished with value: 0.2961548152822784 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': False, 'buffer_size': 2, 'breakpoint_percentile_threshold': 61, 'index': 'chromadb', 'top_k': 30}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 2033


[I 2024-09-06 00:02:56,834] Trial 19 finished with value: 0.2934811295253365 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'semantic', 'include_prev_next_rel': False, 'buffer_size': 3, 'breakpoint_percentile_threshold': 87, 'index': 'chromadb', 'top_k': 9}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-06 00:06:24,911] Trial 20 finished with value: 0.19973302955268737 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': False, 'index': 'chromadb', 'top_k': 34}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-06 00:09:53,936] Trial 21 finished with value: 0.24178492879119987 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': False, 'index': 'chromadb', 'top_k': 21}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-06 00:13:22,091] Trial 22 finished with value: 0.2888461750861621 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': True, 'index': 'chromadb', 'top_k': 6}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-06 00:16:52,529] Trial 23 finished with value: 0.2871929330367342 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': True, 'index': 'chromadb', 'top_k': 8}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-06 00:20:25,346] Trial 24 finished with value: 0.275592973135385 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': True, 'index': 'chromadb', 'top_k': 12}. Best is trial 6 with value: 0.3130006907152221.


Nodes inserted: 1659


[I 2024-09-06 00:23:50,362] Trial 25 finished with value: 0.28474028476618973 and parameters: {'embed_model': 'text-embedding-3-large', 'splitter': 'markdown', 'include_prev_next_rel': False, 'index': 'chromadb', 'top_k': 6}. Best is trial 6 with value: 0.3130006907152221.


{'embed_model': 'text-embedding-3-large',
 'splitter': 'semantic',
 'include_prev_next_rel': False,
 'buffer_size': 1,
 'breakpoint_percentile_threshold': 81,
 'index': 'chromadb',
 'top_k': 16}