Evaluate Retrieval Impact
===

Questions:
 - Does retrieval matter much?
 - Does it matter if the retrieval is relevant to the user's query?



In [65]:
import json
import os
import pickle
import random
import time
from pathlib import Path

import dotenv
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import scipy
import sklearn.metrics
import tiktoken
from tqdm import tqdm

from llm_math_education import prompt_utils, retrieval, retrieval_strategies
from llm_math_education.prompts import mathqa as mathqa_prompts

In [2]:
# Load environment variables from the repo-level .env file; the OpenAI API key
# is read from the environment in the cells below.
dotenv.load_dotenv("../.env")

True

In [3]:
# Sanity check that the key is present: display only its first three characters
# so the full secret never ends up in the saved notebook output.
os.environ["OPENAI_API_KEY"][:3]

'sk-'

In [4]:
# Hand the key from the environment to the openai client module.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [5]:
# Root of the project's data folder; everything below is read from / written under it.
data_dir = Path("../data")
assert data_dir.exists()

# Load the derived lesson table and show its (rows, columns) shape.
rori_lessons_path = data_dir / "derived" / "rori_lessons.json"
with rori_lessons_path.open() as lessons_file:
    df = pd.read_json(lessons_file)
df.shape

(67, 18)

### Load query data

In [13]:
mn_general_student_queries_filepath = data_dir / "app_data" / "mn_general_student_queries.csv"
query_df = pd.read_csv(mn_general_student_queries_filepath)

# Shuffle all rows with a fixed seed first, then keep only the posts labelled "general".
shuffled_query_df = query_df.sample(frac=1, random_state=87896)
general_posts = shuffled_query_df.loc[shuffled_query_df.is_respondable_query == "general", "post_content"]

# Trim surrounding whitespace and turn the "[Continued:]" marker into a line break.
student_queries = [post.strip().replace("[Continued:]", "\n") for post in general_posts]
len(student_queries)

51

In [14]:
# Peek at the first query (after the seeded shuffle) as a quick sanity check.
student_queries[0]

'Can I get the steps for factoring quadratics'

### Load dbs

In [None]:
# Directories the retrieval databases are loaded from.
app_embedding_dir = data_dir.joinpath("app_data")
recipe_embedding_dir = data_dir.joinpath("derived", "embeddings")

In [8]:
# Open one retrieval database per content source. The two math sources
# ("rori_microlesson", "openstax_subsection") are read from the app data
# directory; the "recipenlg" source is read from the derived embeddings directory.
# NOTE(review): the third argument is presumably the name of the text field
# stored in each db - confirm against retrieval.RetrievalDb.
rori_db = retrieval.RetrievalDb(app_embedding_dir, "rori_microlesson", "db_string")
openstax_db = retrieval.RetrievalDb(app_embedding_dir, "openstax_subsection", "db_string")
recipe_db = retrieval.RetrievalDb(recipe_embedding_dir, "recipenlg", "db_string")

In [23]:
# Per-source retrieval settings: which db to query, a token limit, and the
# sentence that introduces the retrieved text.
# NOTE(review): max_tokens is presumably the budget for retrieved text inserted
# into the prompt - confirm against MappedEmbeddingRetrievalStrategy.
rori_microlesson_db_info = dict(
    db=rori_db,
    max_tokens=1000,
    prefix="Here is some lesson content that might be relevant:\n",
)
openstax_subsection_db_info = dict(
    db=openstax_db,
    max_tokens=1000,
    prefix="Here are some excerpts from a math textbook. If they are relevant to the question, feel free to use language or examples from these excerpts:\n",
)
# NOTE(review): 2000 looks chosen to match the combined 1000 + 1000 limit of
# the two math sources - confirm.
recipenlg_db_info = dict(
    db=recipe_db,
    max_tokens=2000,
    prefix="Here are some step-by-step descriptions. If they are relevant to the question, feel free to use language or examples from these excerpts:\n",
)

# "math": both prompt slots are filled from the math content databases.
math_slot_sources = {
    "rori_microlesson_texts": rori_microlesson_db_info,
    "openstax_subsection_texts": openstax_subsection_db_info,
}
math_strategy = retrieval_strategies.MappedEmbeddingRetrievalStrategy(math_slot_sources)

# "recipe": the lesson-content slot is filled from the recipe database instead.
nomath_slot_sources = {
    "rori_microlesson_texts": recipenlg_db_info,
}
nomath_strategy = retrieval_strategies.MappedEmbeddingRetrievalStrategy(nomath_slot_sources)

# "none": baseline strategy class that performs no retrieval.
noretrieval_strategy = retrieval_strategies.NoRetrievalStrategy()

### Generate responses

In [None]:
# Accumulator for generated responses (one dict per query/strategy pair).
# Re-running this cell resets it to empty.
ds = []

In [37]:
# Generate one response per (student query, retrieval strategy) pair.
# The loop is resumable: pairs already present in `ds` are skipped, so the cell
# can be re-run after an interruption without repeating API calls.
strategies_by_name = [
    ("math", math_strategy),
    ("recipe", nomath_strategy),
    ("none", noretrieval_strategy),
]
# Loop-invariant: every prompt starts from the same intro messages.
intro_prompt_messages = mathqa_prompts.intro_prompts["general_math_qa_intro"]["messages"]
# Finished (query, strategy) pairs, kept as a set so the resume check is a
# constant-time lookup instead of a full rescan of `ds` for every pair.
generated_pairs = {(d["user_query"], d["retrieval_strategy"]) for d in ds}

for user_query in tqdm(student_queries, desc="Generating responses"):
    for retrieval_strategy_name, retrieval_strategy in strategies_by_name:
        if (user_query, retrieval_strategy_name) in generated_pairs:
            continue
        # A new PromptManager is built for every request (presumably so that
        # no state carries over between queries).
        prompt_manager = prompt_utils.PromptManager()
        prompt_manager.set_intro_messages(intro_prompt_messages)
        prompt_manager.set_retrieval_strategy(retrieval_strategy)
        messages = prompt_manager.build_query(user_query)
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0613",
            messages=messages,
        )
        assistant_message = completion["choices"][0]["message"]["content"]
        ds.append(
            {
                "user_query": user_query,
                "retrieval_strategy": retrieval_strategy_name,
                "response": assistant_message,
                "messages": messages,
            }
        )
        generated_pairs.add((user_query, retrieval_strategy_name))
        time.sleep(0.1)  # being a bit polite on repeated api calls

Generating responses: 100%|████████████████████████████████████████████████████████████████████████████████████████| 51/51 [01:23<00:00,  1.64s/it]


In [38]:
# Number of generated responses collected so far.
len(ds)

153

In [42]:
# Output directory for evaluation artifacts; created on first use.
eval_dir = data_dir.joinpath("derived", "eval")
eval_dir.mkdir(exist_ok=True)

In [44]:
# Persist the raw generation records so they can be reloaded without calling the API again.
generations_path = eval_dir / "generations_20230807.pkl"
with generations_path.open("wb") as generations_file:
    pickle.dump(ds, generations_file)

In [52]:
# load generated responses
# The loaded list must be bound to `ds`; without the assignment the result is
# discarded and the cells below fail (or use stale data) on a fresh kernel.
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
with open(eval_dir / "generations_20230807.pkl", "rb") as infile:
    ds = pickle.load(infile)

In [48]:
# One row per generated response. Only the three listed keys are kept from each
# record, and the 0-based row position becomes the `response_id` column.
response_columns = ["user_query", "retrieval_strategy", "response"]
df = pd.DataFrame(ds, columns=response_columns).rename_axis("response_id").reset_index()
df.shape

(153, 4)

In [50]:
# Spot-check three random rows (unseeded, so the rows shown change between runs).
df.sample(n=3)

Unnamed: 0,response_id,user_query,retrieval_strategy,response
5,5,I need a standard way of recognizing polynomia...,none,Of course! I can help you with that. A polynom...
49,49,if all the x values in a function are differnt...,recipe,"Yes, you are correct! In a function, every inp..."
139,139,How do you find the domain and range that are ...,recipe,"To find the domain and range of a function, yo..."


In [51]:
# Count responses per retrieval strategy.
df.retrieval_strategy.value_counts()

retrieval_strategy
math      51
recipe    51
none      51
Name: count, dtype: int64

### Construct surveys

Generate two surveys.

 - Randomize query order.
 - For each query, create two questions: one comparing recipe vs none and one comparing math vs none.
 - Put the two questions in different surveys, alternating which survey receives which comparison from one query to the next.

In [81]:
def get_question_with_random_order(group: pd.DataFrame, option_set: list, rng=None) -> dict:
    """Build one survey question whose responses appear in random order.

    Args:
        group: Rows for a single user query; needs `retrieval_strategy` and
            `response` columns.
        option_set: Retrieval strategy names to include, e.g. ["math", "none"].
        rng: Optional `random.Random` instance used for the shuffle. Defaults to
            the module-level `random` generator; pass a seeded instance to get a
            reproducible order.

    Returns:
        Dict with keys `response{i}` and `response{i}_retrieval_strategy` for
        i = 1..len(option_set), where position i holds the i-th shuffled option.

    Raises:
        IndexError: If `group` has no row for one of the strategies in `option_set`.
    """
    sampler = random if rng is None else rng
    question = {}
    for position, option in enumerate(sampler.sample(option_set, k=len(option_set)), start=1):
        matching_responses = group.loc[group.retrieval_strategy == option, "response"]
        if matching_responses.empty:
            raise IndexError(f"no response found for retrieval strategy {option!r}")
        # If several rows match, the first one is used.
        question[f"response{position}"] = matching_responses.iloc[0]
        question[f"response{position}_retrieval_strategy"] = option
    return question


# Seed the module-level RNG used by get_question_with_random_order so the
# left/right order of responses is reproducible, like the seeded query shuffle below.
random.seed(19881)

survey1 = []
survey2 = []
for user_query in df.user_query.drop_duplicates().sample(frac=1, random_state=19881):
    group = df[df.user_query == user_query]
    # Two questions per query, each pairing one retrieval condition with the no-retrieval baseline.
    q1 = get_question_with_random_order(group, ["recipe", "none"])
    q2 = get_question_with_random_order(group, ["math", "none"])
    q1 = {"user_query": user_query, **q1}
    q2 = {"user_query": user_query, **q2}
    survey1.append(q1)
    survey2.append(q2)
    # Swap the two names so the next query sends its recipe question to the
    # other list; each survey therefore alternates between the two comparisons.
    survey1, survey2 = survey2, survey1

In [82]:
# Same definition as in the response-generation section earlier in the notebook;
# repeated here, presumably so the survey section can be run on its own.
eval_dir = data_dir / "derived" / "eval"
eval_dir.mkdir(exist_ok=True)

In [83]:
# Write each survey to its own CSV file, without the DataFrame index column.
survey_outputs = {
    "survey1_20230807.csv": survey1,
    "survey2_20230807.csv": survey2,
}
for survey_filename, survey_questions in survey_outputs.items():
    pd.DataFrame(survey_questions).to_csv(eval_dir / survey_filename, index=False)