In [None]:
%pip install langchain

In [None]:
%pip install pypdf

In [None]:
%pip install gpt4all

In [None]:
%pip install chromadb

### Model setup

In [None]:
from langchain_community.llms import Ollama
from langchain.embeddings import GPT4AllEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

import time as timer
import pandas as pd


In [None]:
# Load the source PDF from the working directory. `data` holds whatever
# PyPDFLoader.load() returns and is chunked by the text splitter in a later cell.
pdf_path = "APBiology-OP.pdf"
loader = PyPDFLoader(pdf_path)
data = loader.load()

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100);
# the overlap repeats the tail of one chunk at the head of the next.
# `all_splits` is what gets embedded into the vector store.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(data)

In [None]:
# Embed every chunk with GPT4AllEmbeddings and index the result in a Chroma
# vector store; `vector_store` is later used as the retriever for the QA chain.
# NOTE(review): no persist directory is passed, so the index is presumably
# rebuilt on every run — confirm whether caching is wanted.
vector_store  = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())

In [None]:
# Language model used to answer questions, selected by name ("llama3").
# NOTE(review): presumably requires a running Ollama server with this model
# already pulled — verify in the target environment.
llm = Ollama(model = "llama3")

In [None]:
# Read the multiple-choice test set and show which columns it provides.
df = pd.read_csv("test.csv")

print(df.columns)

In [None]:
# Question texts as a flat list, and the five answer columns as one inner list
# of choices per question.
questions = df['prompt'].to_list()
options = df.loc[:, ['A', 'B', 'C', 'D', 'E']].to_numpy().tolist()

In [None]:
def format_mcq(question, choices, labels=("A", "B", "C", "D", "E")):
    """Render a multiple-choice question as a single prompt string.

    Args:
        question: The question text, placed on the first line.
        choices: Iterable of answer texts, in label order.
        labels: Labels paired positionally with ``choices``. Defaults to the
            five letters A-E. Pairing stops at the shorter of the two, so
            surplus choices (or surplus labels) are dropped.

    Returns:
        The question, an ``Options:`` header, and one ``LABEL) choice`` line
        per pair; every line, including the last, ends with a newline.
    """
    option_lines = [f"{label}) {choice}\n" for label, choice in zip(labels, choices)]
    return f"{question}\nOptions:\n" + "".join(option_lines)

In [None]:
def ask_model(llm, formatted_question):
    """Answer one formatted multiple-choice question via retrieval-augmented QA.

    Appends the instruction "Which option is correct?" to the question, builds
    a RetrievalQA chain over the notebook-level ``vector_store`` and returns
    the chain's answer.

    Args:
        llm: Language model handed to ``RetrievalQA.from_chain_type``.
        formatted_question: Question text with its options, e.g. the output
            of ``format_mcq``.

    Returns:
        The value stored under the ``'result'`` key of the chain output.
    """
    prompt = formatted_question + "\nWhich option is correct?"
    chain = RetrievalQA.from_chain_type(llm, retriever=vector_store.as_retriever(), verbose=True)
    # Calling a chain directly (``chain({...})``) is deprecated in LangChain;
    # ``invoke`` is the supported entry point and yields the same output dict.
    response = chain.invoke({"query": prompt})
    return response['result']

In [None]:
# Smoke-test the pipeline on the first MAX_QUESTIONS rows only; set it to None
# to run the whole test set (each question triggers a retrieval + LLM call).
MAX_QUESTIONS = 1
subset = df if MAX_QUESTIONS is None else df.head(MAX_QUESTIONS)

results = []
for index, row in subset.iterrows():
    formatted_question = format_mcq(row['prompt'], [row['A'], row['B'], row['C'], row['D'], row['E']])
    print(formatted_question)
    answer = ask_model(llm, formatted_question)
    results.append(answer)
    print(answer)

# Pair each answered prompt with the model's answer in a new frame rather than
# writing into `df`: `results` only covers `subset`, so a direct column
# assignment on `df` would fail whenever the run is limited.
answers_df = subset[['prompt']].assign(**{'Model Answer': results})
answers_df.head()

In [None]:
# Free-form (non multiple-choice) question used by the timing cell below.
query = "What is the function of the Golgi apparatus?"

In [None]:
# The only other `chain` in this notebook is a local variable inside
# ask_model(), so build a notebook-level retrieval chain here; otherwise this
# cell raises NameError on a fresh kernel.
chain = RetrievalQA.from_chain_type(llm, retriever=vector_store.as_retriever())

print("Query: ", query)
# perf_counter() is the clock intended for measuring elapsed intervals.
start = timer.perf_counter()
answer = chain.invoke({"query": query})
end = timer.perf_counter()
print("Answer: ", answer)
print("Time: ", end-start)