In [15]:
import os

from glob import glob

from epicstuff import Dict
from taml import taml

from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings

In [26]:
# load stuff like api keys
env = Dict(taml.load('Final/env.taml'))
env.ai_api_ver = '2024-02-01'
env.ai_model = 'text-embedding-ada-002'

In [20]:
# convert pdf to txt
def pdf_to_text(pdf_path):
	from PyPDF2 import PdfReader
	reader = PdfReader(pdf_path)
	text = ''
	for page in reader.pages:
		text += page.extract_text()
	return text

def convert_pdfs_in_folder(pdf_folder, output_folder):
	# Ensure the output folder exists
	os.makedirs(output_folder, exist_ok=True)

	# Loop through all files in the directory
	for filename in os.listdir(pdf_folder):
		if filename.endswith(".pdf"):
			pdf_path = os.path.join(pdf_folder, filename)
			text = pdf_to_text(pdf_path)

			if text:  # If text was extracted
				# Create a text file with the same name as the PDF
				output_filename = os.path.splitext(filename)[0] + ".txt"
				output_path = os.path.join(output_folder, output_filename)

				# Save the extracted text to the file
				with open(output_path, "w", encoding="utf-8") as text_file:
					text_file.write(text)
				print(f"Converted {filename} to {output_filename}")
			else:
				print(f"No text found in {filename}")


pdf_folder = "Final/pdfs"  # Folder containing PDFs
output_folder = "Final/texts"  # Folder to save text files

convert_pdfs_in_folder(pdf_folder, output_folder)

Converted s41467-024-45569-5.pdf to s41467-024-45569-5.txt
Converted 1-s2.0-S2590238524001954-main.pdf to 1-s2.0-S2590238524001954-main.txt
Converted Advanced Intelligent Systems - 2022 - Delgado-Licona - Research Acceleration in Self‐Driving Labs  Technological Roadmap.pdf to Advanced Intelligent Systems - 2022 - Delgado-Licona - Research Acceleration in Self‐Driving Labs  Technological Roadmap.txt
Converted tom-et-al-2024-self-driving-laboratories-for-chemistry-and-materials-science.pdf to tom-et-al-2024-self-driving-laboratories-for-chemistry-and-materials-science.txt
Converted s44160-022-00231-0.pdf to s44160-022-00231-0.txt


In [29]:
# azure search object from azuresearch.ipynb

from azure.search.documents.indexes.models import FreshnessScoringFunction, FreshnessScoringParameters, ScoringProfile, SearchableField, SearchField, SearchFieldDataType, \
	SimpleField, TextWeights

embeddings = AzureOpenAIEmbeddings(
	azure_deployment=env.ai_model,
	openai_api_version=env.ai_api_ver,
	azure_endpoint=env.ai_endpoint,
	api_key=env.ai_api,
)
embedding_function = embeddings.embed_query

fields = [
	SimpleField(
		name='id',
		type=SearchFieldDataType.String,
		key=True,
		filterable=True,
	),
	SearchableField(
		name='content',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	SearchField(
		name='content_vector',
		type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
		searchable=True,
		vector_search_dimensions=len(embedding_function('Text')),
		vector_search_profile_name='myHnswProfile',
	),
	SearchableField(
		name='metadata',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	# Additional field to store the title
	SearchableField(
		name='title',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	# Additional field for filtering on document source
	SimpleField(
		name='source',
		type=SearchFieldDataType.String,
		filterable=True,
	),
	# Additional data field for last doc update
	SimpleField(
		name='last_update',
		type=SearchFieldDataType.DateTimeOffset,
		searchable=True,
		filterable=True,
	),
]
# Adding a custom scoring profile with a freshness function
sc_name = 'scoring_profile'
sc = ScoringProfile(
	name=sc_name,
	text_weights=TextWeights(weights={'title': 5}),
	function_aggregation='sum',
	functions=[
		FreshnessScoringFunction(
			field_name='last_update',
			boost=100,
			parameters=FreshnessScoringParameters(boosting_duration='P2D'),
			interpolation='linear',
		)
	],
)

vector_store: AzureSearch = AzureSearch(
	azure_search_endpoint=env.search_endpoint,
	azure_search_key=env.search_api,
	index_name='langchain-vector-demo-custom-scoring-profile',
	embedding_function=embeddings.embed_query,
	fields=fields,
	scoring_profiles=[sc],
	default_scoring_profile=sc_name,
)

In [30]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# load each text document into azure search
for file in glob(f'{output_folder}/*.txt'):

	loader = TextLoader(file, encoding="utf-8")

	documents = loader.load()
	text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
	docs = text_splitter.split_documents(documents)

	vector_store.add_documents(documents=docs)

In [32]:
# find embending "group" with greatest similarity to query
vector_store.similarity_search(query="s of the reaction conditions, su", k=3, search_type="similarity")

[Document(metadata={'id': 'NDNmMDQ0MmYtMjQyOS00YmY2LWI4OTAtNDdhZGE5MTllMjA5', 'source': 'Final/texts/tom-et-al-2024-self-driving-laboratories-for-chemistry-and-materials-science.txt'}, page_content='Self-Driving Laboratories forChemistry andMaterials Science\nGaryTom,*StefanP.Schmid,SterlingG.Baird,YangCao,KouroshDarvish,HanHao,StanleyLo,\nSergioPablo-García, EllaM.Rajaonson, MartaSkreta,NarukiYoshikawa, Samantha Corapi,\nGunDenizAkkoc,FelixStrieth-Kalthoff,* MartinSeifrid,*andAlánAspuru-Guzik*\nCiteThis:Chem.Rev. 2024, 124, 9633−9732\n ReadOnline\nACCESS\n Metrics &More\n Article Recommendations\nABSTRACT: Self-drivinglaboratories (SDLs)promiseanacceleratedapplicationofthe\nscientificmethod.Throughtheautomation ofexperimental workflows,alongwith\nautonomous experimental planning,SDLsholdthepotentialtogreatlyaccelerateresearch\ninchemistryandmaterialsdiscovery.Thisreviewprovidesanin-depthanalysisofthestate-\nof-the-artinSDLtechnology,itsapplications acrossvariousscientificdisciplines,a