## Install the required packages
You also need to install [Ollama](https://ollama.com/) to run our project. After installing it, run `ollama run llama3` in the terminal to start the model.

In [None]:
%pip install langchain
%pip install pypdf
%pip install gpt4all
%pip install chromadb
%pip install pandas
%pip install numpy

## Setup Steps

In [None]:
from langchain_community.llms import Ollama
from langchain.embeddings import GPT4AllEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

import time as timer
import pandas as pd
import numpy as np


### Load the textbook data and split the data into chunks

In [None]:
# Load the data from the PDFs
pdf_paths = [
    "APBiology-OP.pdf",
    "Theodore E. Brown et al. - Chemistry_ The Central Science-Pearson (2017).pdf",
    "David Halliday, Robert Resnick, Jearl Walker - Fundamentals of Physics Extended-Wiley (2013).pdf"
]

# Initialize a list to hold all text chunks from all books
all_documents = []

# The vector store is created lazily from the first non-empty batch of chunks.
vector_store = None

# The splitter is identical for every book, so build it once outside the loop.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

for pdf_path in pdf_paths:
    # Load the data from each PDF
    loader = PyPDFLoader(pdf_path)
    data = loader.load()

    # Split the data into chunks
    all_splits = text_splitter.split_documents(data)
    if not all_splits:
        # Nothing to index for this book; avoid sending an empty batch to Chroma.
        continue

    # Collect all chunks in a single list
    all_documents.extend(all_splits)

    # Index one book at a time: create the store on the first batch, then append to it.
    if vector_store is None:
        vector_store = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())
    else:
        vector_store.add_documents(all_splits)

if vector_store is None:
    # Fail with an explicit message instead of a NameError on `vector_store` below.
    raise ValueError("No text chunks were extracted from the PDFs; cannot build the retriever.")

retriever = vector_store.as_retriever()

### Initialize the model

In [None]:
# Client for the locally served llama3 model. `format="json"` is forwarded to
# Ollama; the chains below parse the reply with a JSON output parser.
llm = Ollama(
    model="llama3",
    format="json",
)

### Import and format data

In [None]:
# Read the question set from disk.
df = pd.read_csv("train.csv")

# Question texts, one entry per row of the CSV.
questions = df["prompt"].tolist()

# Answer choices per row, as a list of [A, B, C, D, E] lists.
options = df[list("ABCDE")].values.tolist()

In [None]:
def format_mcq(question, choices):
    """
    Package a multiple-choice question and its answer choices as two dictionaries.

    Parameters:
        question (str): The text of the multiple-choice question.
        choices (list): The answer choices, in the order they should be lettered.

    Returns:
        tuple: ``(formatted_question, formatted_options)`` where
            - ``formatted_question`` is ``{"question": question}``;
            - ``formatted_options`` is ``{"options": {...}}``, mapping consecutive
              letters starting at 'A' to the choices in input order.
    """
    first_letter = ord("A")
    lettered_choices = {
        chr(first_letter + position): text
        for position, text in enumerate(choices)
    }
    return {"question": question}, {"options": lettered_choices}

### Define desired answer structure.
We require the model to output a single letter, since that is easy to process.

In [None]:
# Schema of the reply expected from the model: a single field, `answer`,
# holding the letter of the chosen option.
# NOTE: documented with comments rather than a class docstring, because pydantic
# copies a model docstring into the generated schema, which would change the
# format instructions sent to the model.
class Answer(BaseModel):
    # This description is part of the format instructions embedded in the prompt,
    # so it is text the model reads.
    answer: str = Field(description="your single capital letter of option that is the right answer, without any spaces or special characters.")

### Define the model
We provide two different types of models: one that uses the textbook embeddings and one that does not.

In [None]:
def ask_model_with_out_embeddings(llm, question, options, max_retries=None):
    """
    Ask the language model a multiple-choice question without any retrieved context.

    Parameters:
        llm: The language model placed between the prompt and the JSON output parser.
        question (dict | str): The question. It is interpolated into the prompt as-is;
            the experiment cells pass the ``{"question": ...}`` dict built by ``format_mcq``.
        options (dict | list): The answer options, interpolated into the prompt as-is;
            the experiment cells pass the ``{"options": {...}}`` dict built by ``format_mcq``.
        max_retries (int | None): Maximum number of additional attempts when a reply
            has no 'answer' key. ``None`` (the default) retries until a valid reply arrives.

    Returns:
        dict: The parsed reply from the language model. It always contains the key
              'answer', expected to hold a single letter.

    Raises:
        RuntimeError: If ``max_retries`` is set and exhausted without a valid reply.
    """
    parser = JsonOutputParser(pydantic_object=Answer)
    # Define the model behavior and prompt template (no context).
    # Only fields that PromptTemplate defines are passed; the rendered prompt is
    # driven entirely by `template`.
    prompt = PromptTemplate(
        template="Answer the following multiple choice question:\n{format_instructions}\n{question}\n{options}. \n You should give an answer in the form of a single letter, without any spaces or special characters.",
        input_variables=["question", "options"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    # Run the chain
    chain = prompt | llm | parser
    inputs = {"question": question, "options": options}

    retries = 0
    while True:
        results = chain.invoke(inputs)
        # Valid JSON is not necessarily an object, so check the type before the key.
        if isinstance(results, dict) and "answer" in results:
            return results
        if max_retries is not None and retries >= max_retries:
            raise RuntimeError(
                f"Model did not return an 'answer' field after {retries + 1} attempts."
            )
        retries += 1

In [None]:
def ask_model_with_embeddings(llm, question, options, retriever, max_retries=None):
    """
    Ask the language model a multiple-choice question, with retrieved context in the prompt.

    Parameters:
        llm: The language model placed between the prompt and the JSON output parser.
        question (dict | str): The question. A dict must carry the text under the key
            'question' (as built by ``format_mcq``); a plain string is used directly.
            The value is interpolated into the prompt as-is.
        options (dict | list): The answer options, interpolated into the prompt as-is.
        retriever: Object whose ``invoke(query)`` returns a sequence of documents
            exposing ``page_content``.
        max_retries (int | None): Maximum number of additional attempts when a reply
            has no 'answer' key. ``None`` (the default) retries until a valid reply arrives.

    Returns:
        dict: The parsed reply from the language model. It always contains the key
              'answer', expected to hold a single letter.

    Raises:
        RuntimeError: If ``max_retries`` is set and exhausted without a valid reply.
    """
    parser = JsonOutputParser(pydantic_object=Answer)
    # Define the model behavior and prompt template.
    # Only fields that PromptTemplate defines are passed; the rendered prompt is
    # driven entirely by `template`.
    prompt = PromptTemplate(
        template="Answer the following multiple choice question:\n{format_instructions} \n{context} \n{question}\n{options}. \n ",
        input_variables=["question", "options", "context"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    # Run the chain
    chain = prompt | llm | parser

    # Retrieve once; the same context is reused for every retry.
    query = question["question"] if isinstance(question, dict) else question
    docs = retriever.invoke(query)
    # Only the top-ranked document is used; fall back to an empty context when
    # the retriever returns nothing instead of failing on docs[0].
    context = docs[0].page_content if docs else ""
    inputs = {"question": question, "options": options, "context": context}

    retries = 0
    while True:
        results = chain.invoke(inputs)
        # Valid JSON is not necessarily an object, so check the type before the key.
        if isinstance(results, dict) and "answer" in results:
            return results
        if max_retries is not None and retries >= max_retries:
            raise RuntimeError(
                f"Model did not return an 'answer' field after {retries + 1} attempts."
            )
        retries += 1

## Experiment

In [None]:
# With embeddings: answer every question with retrieved textbook context
# and collect the predicted letters in row order.
results_embeded = []
for index, row in df.iterrows():
    print(index)  # progress indicator
    choices = [row[letter] for letter in "ABCDE"]
    formatted_question, formatted_options = format_mcq(row["prompt"], choices)
    answer = ask_model_with_embeddings(
        llm,
        formatted_question,
        formatted_options,
        retriever,
    )
    results_embeded.append(answer["answer"])
    

In [None]:
# Without embeddings: answer every question from the model alone
# and collect the predicted letters in row order.
results_no_embeded = []
for index, row in df.iterrows():
    print(index)  # progress indicator
    choices = [row[letter] for letter in "ABCDE"]
    formatted_question, formatted_options = format_mcq(row["prompt"], choices)
    answer = ask_model_with_out_embeddings(
        llm,
        formatted_question,
        formatted_options,
    )
    print(answer)
    results_no_embeded.append(answer["answer"])

In [None]:
# Persist both prediction lists as .npy files in the working directory
np.save(file="results_embeded.npy", arr=results_embeded)
np.save(file="results_no_embeded.npy", arr=results_no_embeded)

### Calculate the accuracy
The following code only calculates the accuracy for a single run of the experiment. We recommend running the model several times and taking the average accuracy to get more reliable results.

In [None]:
# Ground-truth letters, in the same row order as the prediction lists.
answers = df["answer"].tolist()

# Accuracy = exact matches between prediction and ground truth / number of questions.
embedded_hits = sum(
    1 for i, expected in enumerate(answers) if expected == results_embeded[i]
)
embedded_accuracy = embedded_hits / len(answers)
print(f"Embedded Accuracy: {embedded_accuracy}")

no_embedded_hits = sum(
    1 for i, expected in enumerate(answers) if expected == results_no_embeded[i]
)
no_embedded_accuracy = no_embedded_hits / len(answers)
print(f"Without Embedded Accuracy: {no_embedded_accuracy}")