In [1]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv


In [2]:
from collections import defaultdict

import nest_asyncio
import optuna
import pandas as pd

import os
from llama_index.core import Document

from utils.retrieve import (
    objective,
    extract_question_ngrams,
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# configure
# filename = "test.md"
qa_filename = "question_list.csv"
ngram_size = 2  # use 2 instead of 3 so we don't skip 2-word header chunks
f_beta = 3  # weight recall 3 times as important as precision in f-score
n_trials = 25  # number of Optuna trials

pd.set_option("display.max_colwidth", None)


Read the question-answers


In [4]:
# qa_df = pd.read_csv(f'data/{qa_filename}', na_filter=False)
qa_df = pd.read_csv(f"../data/qa_list/{qa_filename}", na_filter=False)
print(len(qa_df))
qa_df.head(3)


171


Unnamed: 0,Initials,Questions,Ideal Answer,Link to Ideal Answer,Quotes
0,,Student Info,,,
1,,How do I know if a student has a scholarship?,"Missionaries can see if a student has a scholarship by clicking on the student's name in Path, clicking on the Discounts/Scholarships Tab and looking under History.\n\nInformation about the Discount or Scholarship includes; Discount/Scholarship, Category, Type, Percent, Amount, Term, Status.and Process.",https://missionaries.prod.byu-pathway.psdops.com/Finding-Student-Information-in-PATH%E2%80%8B,Discounts/Scholarships Tab\nView the scholarships and/or discounts the \nstudent has received.
2,,How do I know if a student is registered for an institute class for credit?,"There is not a way for Missionaries to verify that a student's current Institute class, is registered for college credit for that class.\n\nMissionaries can see if a student is currently registered for an Institute class It should appear in the Enrollment tab in the lower right hand corner. There will be a red x indicating no credit if the course has not been completed. \nEach student needs to make sure that their Institute instructor knows and has recorded that the student is listed in the course as taking it for College Credit.\n\n",https://missionaries.prod.byu-pathway.psdops.com/institute-for-college-credit,


In [5]:
# NOTE: we shouldn't include questions in the *test* set right now,
# but people are still adding the manual quotes,
# and since we have so few questions with manual quotes so far
# we will use all of them for this demo.

# keep only rows with at least 1 manual quote
qa_df = qa_df[qa_df["Quotes"].notna() & (qa_df["Quotes"] != "")]
print(len(qa_df))


1


In [6]:
# generate bigrams (ngram size=2) for each manual quote
# and store them in the question_ngrams dictionary
question_ngrams = extract_question_ngrams(qa_df, ngram_size)


In [7]:
question_ngrams


{'How do I know if a student has a scholarship?': [('discounts',
   'scholarships'),
  ('scholarships', 'tab'),
  ('tab', 'view'),
  ('view', 'the'),
  ('the', 'scholarships'),
  ('scholarships', 'and'),
  ('and', 'or'),
  ('or', 'discounts'),
  ('discounts', 'the'),
  ('the', 'student'),
  ('student', 'has'),
  ('has', 'received')]}

Read the documents from the md directory


In [8]:
# Read the document names from the directories:
# ../data/markwodn
# ../data/out/md

origin_paths = ["../data/markdown/", "../data/out/md/"]

# Read the document names from the directories:
files_list = [path + item for path in origin_paths for item in os.listdir(path)]


In [9]:
files_list.sort()
files_list[0:10]


['../data/markdown/-Admission-Requirements.md',
 '../data/markdown/-After-PathwayConnect.md',
 '../data/markdown/-Answers-to-Your-Questions-about-Ecclesiastical-Endorsement.md',
 '../data/markdown/-Application-Process.md',
 '../data/markdown/-Assistance-for-Students-with-Disabilities.md',
 '../data/markdown/-BYU-Idaho-Course-Exceptions.md',
 '../data/markdown/-BYU-Pathway-Support.md',
 '../data/markdown/-BYU-Pathway-Worldwide-Website.md',
 '../data/markdown/-Communication-Resources.md',
 '../data/markdown/-Confidentiality-of-Student-Records.md']

Save the documents into an array


In [10]:
documents = []

for i, filepath in enumerate(files_list[:5]):
    with open(filepath, "r", encoding="utf-8") as file:
        document = Document(text=file.read(), metadata={"filepath": filepath})

        # add the document to a single entry list
        documents.append(document)


In [11]:
def get_objective_fn(documents, ngram_size, question_ngrams):
    def inner(trial):
        return objective(trial, documents, ngram_size, question_ngrams)

    return inner


In [14]:
# ask Optuna to find the best hyperparameters

study_name = "test"  # Unique identifier of the study.
storage_name = f"sqlite:///optuna-{study_name}.db"
print(
    f"To see a dashboard, open a terminal, activate the virtual environment, and run: optuna-dashboard {storage_name}"
)
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
    direction="maximize",
)
# study.optimize(
#     get_objective_fn(
#         documents=documents, ngram_size=ngram_size, question_ngrams=question_ngrams
#     ),
#     n_trials=n_trials,
# )
study.optimize(
    lambda trial: objective(trial, documents, ngram_size, question_ngrams),
    n_trials=n_trials,
)

study.best_params


[I 2024-08-21 15:23:40,544] Using an existing study with name 'test' instead of creating a new one.
[W 2024-08-21 15:23:40,619] Trial 5 failed with parameters: {'embed_model': 'voyage-large-2-instruct'} because of the following error: AuthenticationError(message="No API key provided. You can set your API key in code using 'voyageai.api_key = <API-KEY>', or you can set the environment variable VOYAGE_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the voyageai module at it with 'voyageai.api_key_path = <PATH>'. You can generate API keys in Voyage AI's dashboard (https://dash.voyageai.com).", http_status=None, request_id=None).
Traceback (most recent call last):
  File "/home/isaiaszc/pathway/pathway-indexer/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_12875/109579318.py", line 21, in <lambda>
    lambda trial: objective(trial, docume

To see a dashboard, open a terminal, activate the virtual environment, and run: optuna-dashboard sqlite:///optuna-test.db


AuthenticationError: No API key provided. You can set your API key in code using 'voyageai.api_key = <API-KEY>', or you can set the environment variable VOYAGE_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the voyageai module at it with 'voyageai.api_key_path = <PATH>'. You can generate API keys in Voyage AI's dashboard (https://dash.voyageai.com).