In [None]:
%pip install langchain

In [None]:
%pip install pypdf

In [None]:
%pip install gpt4all

In [None]:
%pip install chromadb

### Model setup

In [23]:
from langchain_community.llms import Ollama
from langchain.embeddings import GPT4AllEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

import time as timer
import pandas as pd


In [16]:
# Load the source PDF that the retriever will later search over.
# The path is relative, so the file is resolved against the kernel's working directory.
pdf_path = "APBiology-OP.pdf"
loader = PyPDFLoader(pdf_path)
# `data` holds the documents produced by the loader and is the input to the text splitter.
data = loader.load()

In [17]:
# Cut the loaded documents into overlapping chunks for embedding:
# chunk_size=1000 caps each chunk, chunk_overlap=100 is shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
all_splits = text_splitter.split_documents(data)

In [18]:
# Embed every chunk with GPT4AllEmbeddings and index the vectors in a Chroma store.
# No persist directory is passed here, so nothing is explicitly written to disk by this call.
embedding_model = GPT4AllEmbeddings()
vector_store = Chroma.from_documents(documents=all_splits, embedding=embedding_model)

In [40]:
# Ollama-served llama3 model; format="json" requests JSON-formatted replies,
# which the JsonOutputParser used further down relies on.
llm = Ollama(model="llama3", format="json")

In [20]:
# Read the multiple-choice question set and show which columns it provides
df = pd.read_csv("train.csv")

print(df.columns)

Index(['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'], dtype='object')


In [21]:
# Question stems as a flat list, and the answer choices as one [A, B, C, D, E] row per question
questions = df["prompt"].tolist()
options = df[["A", "B", "C", "D", "E"]].to_numpy().tolist()

In [41]:
def format_mcq(question, choices):
    """Render a question and its answer choices as one prompt-ready string.

    Args:
        question: The question stem.
        choices: Iterable of answer choices. They are paired with the letters
            A-E in order; pairing stops at whichever runs out first, so extra
            choices beyond five are dropped.

    Returns:
        The question, an "Options:" header, and one "<letter>) <choice>" line
        per paired choice, each terminated by a newline.
    """
    option_lines = [f"{letter}) {text}\n" for letter, text in zip("ABCDE", choices)]
    return f"{question}\nOptions:\n" + "".join(option_lines)

In [57]:
from langchain_core.pydantic_v1 import BaseModel, Field
# Schema of the structured reply expected from the model: a single `answer` string.
# The JsonOutputParser created in the next cell is built from this class.
# NOTE(review): documentation is kept in `#` comments on purpose — a class docstring
# may be surfaced by pydantic as the schema description and so alter the prompt; confirm before adding one.
class Answer(BaseModel):
    # The Field description is runtime text (presumably forwarded to the model via the
    # parser's format instructions), so treat any change to it as a prompt change.
    answer: str = Field(description="your single letter of option that is the right answer, without any spaces or special characters")


In [60]:

# Define the output parser: turns the model's JSON reply into a dict shaped like `Answer`
parser = JsonOutputParser(pydantic_object=Answer)

# Define the model behavior and prompt template.
# The section labels live in the template text itself: the earlier `question=` and
# `options=` keyword arguments are not PromptTemplate fields, so the labels they
# carried never reached the rendered prompt. `input_variables` is a list, as the
# PromptTemplate API documents, rather than a set.
prompt = PromptTemplate(
    template=(
        "Answer the following multiple choice question:\n"
        "{format_instructions}\n"
        "Question: {question}\n"
        "Options:\n{options}\n"
        "Context:\n{context}"
    ),
    input_variables=["question", "options", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Hand-written sample used to smoke-test the chain. It uses its own names so the
# `questions` / `options` lists built from train.csv are not overwritten, and it is
# plain text so the prompt contains the question itself rather than a dict repr.
sample_question = "What is the function of the Golgi apparatus?"
sample_options = [
    "A) Protein synthesis",
    "B) Lipid synthesis",
    "C) Carbohydrate synthesis",
    "D) Protein modification",
    "E) DNA replication",
]

context = "The Golgi apparatus is an organelle found in most eukaryotic cells. It is made up of membrane-bound sacs called cisternae. The Golgi apparatus is responsible for modifying, sorting, and packaging proteins for secretion. It also plays a role in lipid synthesis and carbohydrate synthesis. The Golgi apparatus is involved in the transport of proteins and other molecules within the cell."

# Run the chain
chain = prompt | llm | parser
start = timer.time()
results = chain.invoke(
    {
        "question": sample_question,
        # One option per line so the model sees a readable list
        "options": "\n".join(sample_options),
        "context": context,
    }
)
end = timer.time()

# Print the results, including the latency that was measured above
print(results['answer'])
print("Time: ", end - start)

D


In [10]:
def ask_model(llm, formatted_question, retriever=None):
    """Ask a retrieval-augmented chain to rank the options of one question.

    Args:
        llm: LangChain LLM used to generate the answer.
        formatted_question: Question text with its lettered options, e.g. the
            string returned by `format_mcq`.
        retriever: Optional retriever supplying context documents. When omitted,
            the notebook-level `vector_store` is used via `as_retriever()`.

    Returns:
        The "result" entry of the RetrievalQA output, i.e. the model's answer.
    """
    # NOTE(review): the notebook's `llm` is created with format="json", which may
    # conflict with asking for a plain ranking of letters — confirm the intent.
    instruction = (
        'Answer the following question by outputting the letters A, B, C, D, and E '
        'in order of the most likely to be correct to the least likely to be correct.\n\n'
    )
    if retriever is None:
        retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever, verbose=True)
    # `Chain.run` is deprecated in favour of `invoke`, which returns a dict;
    # RetrievalQA reads its input from "query" and writes its answer to "result".
    response = qa_chain.invoke({"query": instruction + formatted_question})
    return response["result"]

In [11]:
# How many leading questions to send to the model; kept small while iterating.
N_QUESTIONS = 2

# `head` selects rows by position, so the limit holds whatever the index labels are.
question_subset = df.head(N_QUESTIONS)

results = []
for _, row in question_subset.iterrows():
    formatted_question = format_mcq(row['prompt'], [row['A'], row['B'], row['C'], row['D'], row['E']])
    print(formatted_question)
    answer = ask_model(llm, formatted_question)
    results.append(answer)
    print("answer:", answer)

# Attach the answers only to the rows that were actually asked; assigning them to
# the full `df` would fail with a length mismatch whenever N_QUESTIONS < len(df).
model_answers = question_subset.assign(**{"Model Answer": results})
model_answers[['prompt', 'Model Answer']]

Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?
Options:
A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
B) MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
C) MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
D) MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 2.
E) MOND is a theory that eliminates the observed mi

In [None]:
# Free-form question (no answer options) that the next cell sends to the chain
query = "What is the function of the Golgi apparatus?"

In [None]:
# The notebook-level `chain` is the `prompt | llm | parser` pipeline built earlier:
# it is not callable and expects question/options/context inputs, not a "query" key.
# A free-form query is answered by a RetrievalQA chain over the vector store instead.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vector_store.as_retriever())

print("Query: ", query)
start = timer.time()
# RetrievalQA takes its input under "query" and returns a dict with the answer under "result"
answer = qa_chain.invoke({"query": query})
end = timer.time()
print("Answer: ", answer["result"])
print("Time: ", end-start)