In [None]:
import os

from glob import glob

from epicstuff import Dict
from taml import taml

from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Load secrets and service endpoints (API keys, URLs) from the local `env.taml` file.
# Attributes read later in this notebook: ai_endpoint, ai_api, search_endpoint, search_api.
env = Dict(taml.load('env.taml'))
# Azure OpenAI API version handed to the embeddings client.
env.ai_api_ver = '2024-02-01'
# Embedding model name; the embeddings client also uses it as the Azure deployment name.
env.ai_model = 'text-embedding-ada-002'

In [None]:
# convert pdf to txt
def pdf_to_text(pdf_path):
	"""Extract the text of every page of a PDF and return it as one string.

	Args:
		pdf_path: Path of the PDF file to read.

	Returns:
		The text of all pages concatenated in page order (no separator is
		inserted between pages); an empty string when no page yields text.
	"""
	# Imported lazily so PyPDF2 is only needed when PDFs are actually converted.
	from PyPDF2 import PdfReader
	reader = PdfReader(pdf_path)
	# `or ''` guards against a page whose extraction yields None, which would
	# otherwise raise TypeError during concatenation.
	return ''.join(page.extract_text() or '' for page in reader.pages)

def convert_pdfs_in_folder(pdf_folder, output_folder):
	"""Convert every PDF directly inside `pdf_folder` into a UTF-8 text file.

	Each `<name>.pdf` becomes `<name>.txt` in `output_folder`; an existing text
	file of the same name is overwritten. PDFs that yield no text are reported
	and skipped. Sub-folders are not traversed.

	Args:
		pdf_folder: Folder containing the source PDF files.
		output_folder: Folder that receives the text files (created if missing).
	"""
	# Ensure the output folder exists
	os.makedirs(output_folder, exist_ok=True)

	# Sorted so files are processed in the same order on every run.
	for filename in sorted(os.listdir(pdf_folder)):
		# Case-insensitive match so files named `*.PDF` are not silently skipped.
		if not filename.lower().endswith(".pdf"):
			continue

		text = pdf_to_text(os.path.join(pdf_folder, filename))
		if not text:
			print(f"No text found in {filename}")
			continue

		# Create a text file with the same base name as the PDF
		output_filename = os.path.splitext(filename)[0] + ".txt"
		output_path = os.path.join(output_folder, output_filename)
		with open(output_path, "w", encoding="utf-8") as text_file:
			text_file.write(text)
		print(f"Converted {filename} to {output_filename}")


# Input location of the source PDFs (relative to the notebook's working directory).
pdf_folder = "pdfs/2023-2024 research articles"  # Folder containing PDFs
# Output location of the extracted text; the ingestion cell globs `*.txt` from here.
output_folder = "Final/texts"  # Folder to save text files

# Run the conversion; text files already present under the same name are overwritten.
convert_pdfs_in_folder(pdf_folder, output_folder)

In [None]:
# azure search object from azuresearch.ipynb

from azure.search.documents.indexes.models import FreshnessScoringFunction, FreshnessScoringParameters, ScoringProfile, SearchableField, SearchField, SearchFieldDataType, \
	SimpleField, TextWeights

# Azure OpenAI embeddings client, configured entirely from `env`.
# The deployment name is taken from the model name stored in `env.ai_model`.
embeddings = AzureOpenAIEmbeddings(
	azure_deployment=env.ai_model,
	openai_api_version=env.ai_api_ver,
	azure_endpoint=env.ai_endpoint,
	api_key=env.ai_api,
)
# Single-text embedding callable; used when defining the index fields to probe the vector length.
embedding_function = embeddings.embed_query

# Index schema: key, chunk text, chunk embedding, raw metadata, plus three
# extra fields (title, source, last_update) referenced by the scoring profile
# and available for filtering.
fields = [
	# Document key
	SimpleField(
		name='id',
		type=SearchFieldDataType.String,
		key=True,
		filterable=True,
	),
	# Chunk text, full-text searchable
	SearchableField(
		name='content',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	# Chunk embedding; the dimension is discovered by embedding a probe string,
	# which costs one call to the embeddings service when this cell runs.
	SearchField(
		name='content_vector',
		type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
		searchable=True,
		vector_search_dimensions=len(embedding_function('Text')),
		vector_search_profile_name='myHnswProfile',
	),
	SearchableField(
		name='metadata',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	# Additional field to store the title
	SearchableField(
		name='title',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	# Additional field for filtering on document source
	SimpleField(
		name='source',
		type=SearchFieldDataType.String,
		filterable=True,
	),
	# Additional data field for last doc update.
	# `searchable=True` was dropped here: a date field is not full-text
	# searchable and SimpleField builds a non-searchable field, so the flag
	# only misled readers.
	SimpleField(
		name='last_update',
		type=SearchFieldDataType.DateTimeOffset,
		filterable=True,
	),
]
# Adding a custom scoring profile with a freshness function
# The profile weights text matches in `title` (weight 5) and adds a freshness
# boost based on the `last_update` field; function scores are aggregated by sum.
sc_name = 'scoring_profile'
sc = ScoringProfile(
	name=sc_name,
	text_weights=TextWeights(weights={'title': 5}),
	function_aggregation='sum',
	functions=[
		FreshnessScoringFunction(
			field_name='last_update',
			boost=100,
			# NOTE(review): 'P2D' reads as an ISO 8601 duration of two days — confirm this
			# window is intended for a corpus of 2023-2024 articles.
			parameters=FreshnessScoringParameters(boosting_duration='P2D'),
			interpolation='linear',
		)
	],
)

# Vector store bound to the Azure AI Search index named below. It uses the custom
# field schema and registers the scoring profile as the index default.
# NOTE(review): presumably the index is created on first use if it does not exist — confirm.
vector_store: AzureSearch = AzureSearch(
	azure_search_endpoint=env.search_endpoint,
	azure_search_key=env.search_api,
	index_name='langchain-vector-demo-custom-scoring-profile',
	# Same bound method that `embedding_function` refers to.
	embedding_function=embeddings.embed_query,
	fields=fields,
	scoring_profiles=[sc],
	default_scoring_profile=sc_name,
)

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# Split every extracted text file into overlapping chunks.
# `doc_list` holds one list of chunks per file, in sorted file-name order.
# The splitter configuration does not change between files, so it is built once.
text_splitter = CharacterTextSplitter(chunk_size=700, chunk_overlap=50, separator='.')

doc_list = []
# Sorted because glob() returns files in arbitrary order; without it an index
# into `doc_list` could refer to a different file on each run.
for file in sorted(glob(f'{output_folder}/*.txt')):
	documents = TextLoader(file, encoding="utf-8").load()
	docs = text_splitter.split_documents(documents)
	doc_list.append(docs)
	# Report the chunk count per file instead of dumping every chunk.
	print(f'{file}: {len(docs)} chunks')
	# NOTE(review): the upload is disabled — presumably the index was populated in an
	# earlier run. Uncomment to (re)load the chunks into Azure AI Search.
	#vector_store.add_documents(documents=docs)

In [None]:
# Print every chunk of one document, separated by a rule, to inspect how the text was split.
# NOTE(review): index 14 is hard-coded; it raises IndexError when fewer than 15 files were loaded.
for doc in doc_list[14]:   # see how the document was split
    print(doc.page_content)
    print('-----------------------------------------------------------------------------------------------')

In [None]:
from langchain_community.llms import Cohere
from langchain_core.prompts  import PromptTemplate
#from langchain_cohere import Cohere

# Cohere completion model used to answer the questions (model "command", temperature 0.1).
# The API key is read from `env.taml` like the other secrets instead of being
# hard-coded in the notebook: add a `cohere_api` entry there. The key that was
# previously embedded in this cell should be revoked, since it has been exposed.
cohere_llm = Cohere(model="command",
                    temperature=0.1,
                    cohere_api_key=env.cohere_api)

# Prompt for the RAG chain, with placeholders {context} (retrieved chunks) and
# {question} (question, choices and answering instruction).
# Two stray characters that were sent to the model verbatim were removed: a
# dangling double quote after the first sentence and a '?' right after {context}.
# Scores obtained with the earlier wording may need to be re-measured.
prompt_template = """Answer the question with the provided context. \n\n
                Context: \n {context}\n
                Question: \n {question} \n
                Answer:"""


prompt = PromptTemplate.from_template(template=prompt_template)

# Formatting the docs for the RAG chain
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Takes the `page_content` of each item in `docs`, in order, and joins the
    pieces with a blank line between them.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

In [None]:
# Sanity-check retrieval: request the 3 chunks most similar to the query and display
# the text of the second returned hit (index [1]), not the top one.
#vector_store.similarity_search(query="How can the degree of autonomy of SDL be broken down into different levels?", k=3, search_type="similarity")[0].page_content
vector_store.similarity_search(query="What are the main challenges in the translation of protocols of self-driving labs?", k=3, search_type="similarity")[1].page_content


In [None]:
# Evaluation set: ten multiple-choice questions about the indexed articles.
# `questionN` is used alone as the retrieval query; `multi_choiceN` holds the options.
# Typos that ended up inside the prompts were fixed ('Alhpa', 'form', 'ssC.', doubled 'is').
question1 = 'Alpha flow is a self driving lab system. What learning method does the system use?'
multi_choice1 = 'A. Active learning, B. Unsupervised learning, C. Reinforcement learning, D. English learning'  #correct answer is C
question2 = 'sciclops is used in one of the self driving lab systems. What is sciclops?'
multi_choice2 = 'A. a robotic arm, B. a microchip, C. a software, D. a microscope'    #correct answer is A
question3 = 'BMLP enables rapid optimisation of metabolic models and offers a realistic approach to a self-driving lab for microbial engineering. What does BMLP stand for?'
multi_choice3 = 'A. Bayesian Metabolic Learning Platform, B. Bayesian Metabolic Learning Process, C. Boolean Matrix Logic Programming , D. Bayesian Matrix Learning Protocol'  #correct answer is C
question4 = 'In the Adam system, what is recommended from King et al.?'
multi_choice4 = 'A. routine supervision, B. more advanced hardware, C. complete autonomous lab, D. manual operation'   #correct answer is A
question5 = 'BRAD is a state-of-the-art chatbot and agentic system that integrates a suite of tools to handle bioinformatics tasks. What does BRAD stand for?'
multi_choice5 = 'A. Bioinformatics Research and Development, B.  Bioinformatics Retrieval Augmented Data, C. Bayesian Rapid Automated Discovery, D. Bioinformatics Rapid Automated Discovery'  #correct answer is B
question6 = 'What scale is the chemical space of all possible molecules often estimated at?'
multi_choice6 = 'A. 10^40, B. 10^50, C. 10^60, D. 10^70'  #correct answer is C
question7 = 'In the paper, GPC quantifies alterations in the hydrodynamic radius associated with molecular weight. What does GPC stand for?'
# NOTE(review): the earlier note said D, but GPC conventionally expands to Gel Permeation
# Chromatography (option A) — confirm against the paper and re-check scores graded against D.
multi_choice7 = 'A. Gel Permeation Chromatography, B. Gel Permeation Coefficient, C. Gas Permeation Constant, D. Gas Permeation Chromatography'  #correct answer is A
question8 = 'What are the main challenges in the translation of protocols of self-driving labs?'
multi_choice8 = 'A. data, hardware, and software; B. language, syntax, and semantics; C. data, autonomy, and execution; D. syntax, semantics, and execution'  #correct answer is D
question9 = 'In the study that uses social networking services to operate scanning probe microscopy measurement systems, when user message is judged as not executable, how does the system respond?'
multi_choice9 = 'A. the system does not respond; B. the system still tries to execute the command; C. the system prints none and give the reason; D. the system will pause for manual operation'  #correct answer is C
question10 = 'The concept of SDLs has its roots in the broader field of laboratory automation, began in:'
multi_choice10 = 'A. mid-19th century, B. early-20th century, C. mid-20th century, D. in 21st century'  #correct answer is C


In [None]:
# RAG Chain
def generate_answer(question, multi_choice, k=4):
    """Answer a multiple-choice question with retrieval-augmented generation.

    Only `question` is used as the retrieval query, so the answer options do
    not influence which chunks are fetched. The options and the answering
    instruction are appended afterwards to form the question sent to the LLM.

    Args:
        question: The question text; also the similarity-search query.
        multi_choice: The answer options, e.g. 'A. ..., B. ..., C. ..., D. ...'.
        k: Number of chunks to retrieve as context (default 4, the value that
            was previously hard-coded).

    Returns:
        The completion text returned by the Cohere LLM.
    """
    answer_instruction = '\n Only one answer is correct. Only print out the one answer and the one letter (A,B,C,D) that represents it, and nothing else.'

    retrieved = vector_store.similarity_search(query=question, k=k, search_type="similarity")
    # Strip NUL characters from the retrieved text (presumably artefacts of the
    # PDF extraction) before it goes into the prompt.
    context = format_docs(retrieved).replace('\x00', '')

    # Put the options on their own line; plain concatenation glued the first
    # option onto the end of the question ("...use?A. Active learning").
    full_question = question + '\n' + multi_choice + answer_instruction

    # truncate='START' allows Cohere to truncate an over-long prompt.
    # `invoke` replaces calling the LLM object directly, which LangChain deprecated.
    return cohere_llm.invoke(prompt.format(context=context, question=full_question), truncate='START')


def without_rag(question):
    """Ask the LLM the question directly, with no retrieved context (baseline).

    Args:
        question: The question text, already including the answer options.

    Returns:
        The completion text returned by the Cohere LLM.
    """
    answer_instruction = 'Only one answer is correct, only print out the one answer and the one letter (A,B,C,D) that represents it.'
    # Start the instruction on a new line; plain concatenation glued it onto
    # the last answer option ("...D. English learningOnly one answer...").
    full_question = question + '\n' + answer_instruction
    # `invoke` replaces calling the LLM object directly, which LangChain deprecated.
    return cohere_llm.invoke(full_question, truncate='START')

# Run every question through both pipelines and print the answers one under the
# other: first with RAG, then the plain LLM baseline.
questions = [question1, question2, question3, question4, question5, question6, question7, question8, question9, question10]
choices = [multi_choice1, multi_choice2, multi_choice3, multi_choice4, multi_choice5, multi_choice6, multi_choice7, multi_choice8, multi_choice9, multi_choice10]

# enumerate() numbers the questions from 1, replacing the hand-maintained counter.
for n, (question, multi_choice) in enumerate(zip(questions, choices), start=1):
    print(n)
    print(generate_answer(question, multi_choice))
    print('------------------')
    # Options go on their own line; plain concatenation glued the first option
    # onto the end of the question.
    print(without_rag(question + '\n' + multi_choice))
    print('==================')



# question = question8
# multi_choice = multi_choice8

# print(generate_answer(question, multi_choice))
# print('------------------')
# print(without_rag(question+multi_choice))



The LLM was tested with and without RAG on the 10 questions above, and the whole test was repeated 5 times. With RAG, the LLM scored 9/10 in four runs and 8/10 in one run. Without RAG, it scored 3/10 in all five runs.