In [None]:
%pip install langchain

In [None]:
%pip install pypdf

In [None]:
%pip install gpt4all

In [None]:
%pip install chromadb

### Model setup

In [None]:
from langchain_community.llms import Ollama
from langchain.embeddings import GPT4AllEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

import time as timer
import pandas as pd


In [None]:
# Load the source PDF; the resulting documents are split into chunks in the next cell.
# The path is relative, so the file has to be in the notebook's working directory.
pdf_path = "APBiology-OP.pdf"
loader = PyPDFLoader(pdf_path)
# presumably one document per PDF page — TODO confirm against the PyPDFLoader docs
data = loader.load()

In [None]:
# Split the loaded documents into overlapping chunks before embedding them.
# chunk_size / chunk_overlap are presumably measured in characters — TODO confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(data)

In [None]:
# Embed every chunk with GPT4All embeddings and index the vectors in a Chroma store.
vector_store  = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())

In [None]:
# llama3 served through Ollama, with the JSON output format requested.
# NOTE(review): assumes an Ollama server with the llama3 model is available — confirm.
llm = Ollama(model = "llama3", format = "json")

In [None]:
# Load the multiple-choice question set; later cells read the
# 'prompt', 'A', 'B', 'C', 'D' and 'E' columns from this frame.
df = pd.read_csv("train.csv")

# Show the column names as a quick sanity check of the file layout.
print(df.keys())

In [None]:
# Question texts as a plain Python list, one entry per row of the CSV.
questions = df['prompt'].tolist()
# Matching answer choices: one five-element list [A, B, C, D, E] per row.
options = df[['A', 'B', 'C', 'D', 'E']].values.tolist()

In [None]:
def format_mcq(question, choices):
    """Package a question and its answer choices as two prompt-ready dicts.

    Args:
        question: The question text (stored as-is).
        choices: Iterable of answer choices; they are labelled with
            consecutive letters starting at "A" in iteration order.

    Returns:
        A tuple ``({"question": question}, {"options": {letter: choice, ...}})``.
    """
    lettered = {
        chr(ord("A") + position): text
        for position, text in enumerate(choices)
    }
    return {"question": question}, {"options": lettered}

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
# Schema for the model's JSON reply; passed to JsonOutputParser as `pydantic_object`.
# NOTE(review): documented with comments rather than a class docstring, because
# pydantic presumably copies a docstring into the generated schema, which would
# alter the format instructions sent to the model — confirm before adding one.
class Answer(BaseModel):
    # Disabled variant that attaches a description to the field:
    # answer: str = Field(description="your single letter of option that is the right answer, without any spaces or special characters.")
    # The model's reply; the ask_model prompt requests a single option letter here.
    answer: str

In [None]:

# Define the output parser; it also supplies the JSON format instructions
# that are injected into the prompt below.
parser = JsonOutputParser(pydantic_object=Answer)

# Define the model behavior and prompt template.
# The "Question:" / "Options:" / "Context:" labels are written into the template
# itself: `question=` and `options=` are not PromptTemplate fields, so labels
# passed that way do not become part of the rendered prompt.
prompt = PromptTemplate(
    template=(
        "Answer the following multiple choice question:\n"
        "{format_instructions}\n"
        "Question: {question}\n"
        "Options:\n{options}\n"
        "Context:\n{context}"
    ),
    input_variables=["question", "options", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Define a sample question. Dedicated names are used so the notebook-level
# `questions` / `options` lists built from train.csv are not overwritten.
sample_question = "What is the function of the Golgi apparatus?"
sample_options = [
    "A) Protein synthesis",
    "B) Lipid synthesis",
    "C) Carbohydrate synthesis",
    "D) Protein modification",
    "E) DNA replication",
]

context = "The Golgi apparatus is an organelle found in most eukaryotic cells. It is made up of membrane-bound sacs called cisternae. The Golgi apparatus is responsible for modifying, sorting, and packaging proteins for secretion. It also plays a role in lipid synthesis and carbohydrate synthesis. The Golgi apparatus is involved in the transport of proteins and other molecules within the cell."

# Run the chain and time the single call.
chain = prompt | llm | parser
start = timer.time()
results = chain.invoke(
    {
        "question": sample_question,
        # One option per line reads better in the prompt than a list repr.
        "options": "\n".join(sample_options),
        "context": context,
    }
)
end = timer.time()

# Print the results
print(results['answer'])
print("Time: ", end - start)

In [None]:
def ask_model(llm, question, options, max_retries=5):
    """Ask the model one multiple-choice question and return its parsed JSON reply.

    A fresh ``prompt | llm | parser`` chain is built on every call. The call is
    repeated until the parsed reply is a dict containing an ``"answer"`` key,
    but only a bounded number of times so an uncooperative model cannot hang
    the notebook.

    Args:
        llm: Model object used as the middle stage of the chain.
        question: Question content substituted into the prompt's ``{question}`` slot.
        options: Answer choices substituted into the prompt's ``{options}`` slot.
        max_retries: How many additional attempts to make after the first one
            when the reply lacks an ``"answer"`` key. Values below zero are
            treated as zero (a single attempt).

    Returns:
        The parsed reply, a dict that contains an ``"answer"`` key.

    Raises:
        RuntimeError: If no attempt produced a dict with an ``"answer"`` key.
    """
    # Define the output parser
    parser = JsonOutputParser(pydantic_object=Answer)
    # Define the model behavior and prompt template.
    # The "Question:" / "Options:" labels are part of the template text:
    # `question=` and `options=` are not PromptTemplate fields, so labels passed
    # that way do not become part of the rendered prompt.
    prompt = PromptTemplate(
        template=(
            "Answer the following multiple choice question:\n"
            "{format_instructions}\n"
            "Question: {question}\n"
            "Options:\n{options}. \n"
            " You should give an answer in the form of a single letter, without any spaces or special characters."
        ),
        input_variables=["question", "options"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    # Run the chain
    chain = prompt | llm | parser
    inputs = {"question": question, "options": options}
    attempts = max(0, max_retries) + 1
    for _ in range(attempts):
        results = chain.invoke(inputs)
        # Require a dict: a bare string or list could satisfy `"answer" in ...`
        # by accident, and a number would raise TypeError on the membership test.
        if isinstance(results, dict) and "answer" in results:
            return results
    raise RuntimeError(
        f"Model returned no 'answer' field after {attempts} attempt(s); last reply: {results!r}"
    )

In [None]:
# Number of questions (taken from the top of train.csv) to send to the model.
MAX_QUESTIONS = 100
CHOICE_COLUMNS = ['A', 'B', 'C', 'D', 'E']

# Slice by position rather than comparing index labels, so the limit holds
# even if `df` does not carry the default 0..n-1 index.
evaluated = df.head(MAX_QUESTIONS)

results = []
for _, row in evaluated.iterrows():
    formatted_question, formatted_options = format_mcq(row['prompt'], [row[col] for col in CHOICE_COLUMNS])
    print(formatted_question, formatted_options)
    answer = ask_model(llm, formatted_question, formatted_options)
    # Keep every reply; previously the answers were printed and then discarded.
    results.append(answer)
    print("answer:",answer)
    print()

# Attach the replies to the rows that were actually evaluated. A new frame is
# built instead of assigning into `df`, whose length can exceed MAX_QUESTIONS.
model_answers = evaluated.assign(**{'Model Answer': results})
print(model_answers[['prompt', 'Model Answer']].head())

In [None]:
# Free-form question (no answer choices) used by the timed query in the next cell.
query = "What is the function of the Golgi apparatus?"

In [None]:
# Answer the free-form query with retrieval over the indexed PDF chunks.
# The notebook's `chain` (prompt | llm | parser) expects `question`, `options`
# and `context` inputs, not `query`, so a dedicated RetrievalQA chain is built
# on top of the Chroma store under its own name.
# NOTE(review): `llm` was created with format="json", so the generated text is
# presumably JSON-formatted — confirm that is acceptable for this query.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever())

print("Query: ", query)
start = timer.time()
answer = qa_chain.invoke({"query": query})
end = timer.time()
print("Answer: ", answer)
print("Time: ", end-start)