In [None]:
import os

from glob import glob

from epicstuff import Dict
from taml import taml

from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Load settings such as API keys and endpoints from env.taml into `env`.
# Later cells read env.ai_endpoint, env.ai_api, env.search_endpoint and env.search_api from it.
env = Dict(taml.load('env.taml'))
# Azure OpenAI API version and embedding deployment name handed to AzureOpenAIEmbeddings below.
env.ai_api_ver = '2024-02-01'
env.ai_model = 'text-embedding-ada-002'

In [None]:
# convert pdf to txt
def pdf_to_text(pdf_path):
	"""Return the text of every page of the PDF at ``pdf_path`` as one string.

	Page texts are concatenated in page order with no separator between them.
	A page whose ``extract_text()`` result is falsy (``None`` or ``''``)
	contributes nothing instead of raising ``TypeError``.

	Args:
		pdf_path: Path of the PDF file, passed straight to ``PyPDF2.PdfReader``.

	Returns:
		The concatenated text; ``''`` when no page yields any text.
	"""
	# Function-scope import: PyPDF2 is only loaded when this function runs.
	from PyPDF2 import PdfReader
	reader = PdfReader(pdf_path)
	# join() builds the result in one pass instead of repeated `+=` on a growing string.
	return ''.join(page.extract_text() or '' for page in reader.pages)

def convert_pdfs_in_folder(pdf_folder, output_folder):
	"""Extract the text of every PDF in ``pdf_folder`` into ``output_folder``.

	Each ``<name>.pdf`` (extension matched case-insensitively, so ``.PDF`` works
	too) is written to ``<output_folder>/<name>.txt`` as UTF-8. PDFs that yield
	no text are reported and produce no output file. Files are processed in
	sorted name order; sub-directories are ignored even if their name ends in
	``.pdf``.

	Args:
		pdf_folder: Directory to scan (not recursive).
		output_folder: Directory for the ``.txt`` files; created if missing.
	"""
	# Ensure the output folder exists
	os.makedirs(output_folder, exist_ok=True)

	# sorted() makes the processing order independent of the filesystem's listing order
	for filename in sorted(os.listdir(pdf_folder)):
		if not filename.lower().endswith(".pdf"):
			continue
		pdf_path = os.path.join(pdf_folder, filename)
		if not os.path.isfile(pdf_path):
			continue  # a directory that merely looks like a PDF

		text = pdf_to_text(pdf_path)
		if not text:
			print(f"No text found in {filename}")
			continue

		# Create a text file with the same base name as the PDF
		output_filename = os.path.splitext(filename)[0] + ".txt"
		output_path = os.path.join(output_folder, output_filename)
		with open(output_path, "w", encoding="utf-8") as text_file:
			text_file.write(text)
		print(f"Converted {filename} to {output_filename}")


# Source directory holding the PDFs, and destination directory for the extracted .txt files.
pdf_folder, output_folder = "pdfs", "Final/texts"

# Run the batch conversion; `output_folder` is read again later when the texts are indexed.
convert_pdfs_in_folder(pdf_folder=pdf_folder, output_folder=output_folder)

In [None]:
# azure search object from azuresearch.ipynb

from azure.search.documents.indexes.models import FreshnessScoringFunction, FreshnessScoringParameters, ScoringProfile, SearchableField, SearchField, SearchFieldDataType, \
	SimpleField, TextWeights

# Embedding client for the Azure OpenAI deployment configured in `env`.
embeddings = AzureOpenAIEmbeddings(
	azure_deployment=env.ai_model,
	openai_api_version=env.ai_api_ver,
	azure_endpoint=env.ai_endpoint,
	api_key=env.ai_api,
)
embedding_function = embeddings.embed_query

# Embed a throwaway string once to learn the vector length produced by the
# deployment (this calls the embedding service) instead of hard-coding it.
embedding_dimensions = len(embedding_function('Text'))

# Index schema: the document key, its text, its embedding vector, a metadata
# string, plus three extra fields (title, source, last_update).
fields = [
	SimpleField(
		name='id',
		type=SearchFieldDataType.String,
		key=True,
		filterable=True,
	),
	SearchableField(
		name='content',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	SearchField(
		name='content_vector',
		type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
		searchable=True,
		vector_search_dimensions=embedding_dimensions,
		vector_search_profile_name='myHnswProfile',
	),
	SearchableField(
		name='metadata',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	# Additional field to store the title
	SearchableField(
		name='title',
		type=SearchFieldDataType.String,
		searchable=True,
	),
	# Additional field for filtering on document source
	SimpleField(
		name='source',
		type=SearchFieldDataType.String,
		filterable=True,
	),
	# Additional date field for the last document update.
	# `searchable=True` was dropped here: a date field is filtered/scored, not
	# full-text searched. NOTE(review): SimpleField appears to ignore that flag
	# anyway - confirm against the installed azure-search-documents version.
	SimpleField(
		name='last_update',
		type=SearchFieldDataType.DateTimeOffset,
		filterable=True,
	),
]

# Custom scoring profile: weight matches in `title` by 5 and add a freshness
# boost driven by `last_update` ('P2D' is an ISO-8601 duration of two days).
sc_name = 'scoring_profile'
sc = ScoringProfile(
	name=sc_name,
	text_weights=TextWeights(weights={'title': 5}),
	function_aggregation='sum',
	functions=[
		FreshnessScoringFunction(
			field_name='last_update',
			boost=100,
			parameters=FreshnessScoringParameters(boosting_duration='P2D'),
			interpolation='linear',
		)
	],
)

# Vector store bound to the index; reuses the same embedding callable that was
# used to size the vector field, and makes the profile above the default.
vector_store: AzureSearch = AzureSearch(
	azure_search_endpoint=env.search_endpoint,
	azure_search_key=env.search_api,
	index_name='langchain-vector-demo-custom-scoring-profile',
	embedding_function=embedding_function,
	fields=fields,
	scoring_profiles=[sc],
	default_scoring_profile=sc_name,
)

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# One splitter for all files (it was rebuilt on every iteration before):
# split on '.', chunk_size=700, chunk_overlap=50.
text_splitter = CharacterTextSplitter(chunk_size=700, chunk_overlap=50, separator='.')

# doc_list[i] holds the list of chunks produced from the i-th text file.
doc_list = []
# load each text document into azure search
# sorted() fixes the file order so that doc_list[0] is the same file on every run.
# NOTE(review): re-running this cell presumably uploads the same chunks again - confirm.
for file in sorted(glob(f'{output_folder}/*.txt')):
	loader = TextLoader(file, encoding="utf-8")
	documents = loader.load()
	docs = text_splitter.split_documents(documents)
	doc_list.append(docs)
	# Report only the chunk count per file; the chunk text itself is shown in the next cell.
	print(f'{file}: {len(docs)} chunks')
	if docs:  # skip the upload for a file that produced no chunks
		vector_store.add_documents(documents=docs)

In [None]:
# Show how the first loaded file was split: each chunk's text followed by a divider line.
for chunk in doc_list[0]:
    print(
        chunk.page_content,
        '-----------------------------------------------------------------------------------------------',
        sep='\n',
    )

In [None]:
from langchain_community.llms import Cohere
from langchain_core.prompts  import PromptTemplate
#from langchain_cohere import Cohere

# SECURITY: the API key must not live in the notebook. It is read from the
# COHERE_API_KEY environment variable; a missing variable fails fast with KeyError.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be revoked and replaced.
cohere_llm = Cohere(
    model="command",
    temperature=0.1,
    cohere_api_key=os.environ["COHERE_API_KEY"],
)

# Prompt for the RAG answer step, with {context} and {question} placeholders.
# Built from adjacent literals so that no source-code indentation ends up inside
# the prompt, and without the stray '?' that used to be appended to the context.
# The fallback phrase "answer not available in context" is kept verbatim.
prompt_template = (
    'Answer the question as precisely as possible using the provided context. '
    'If the answer is not contained in the context, say "answer not available in context".\n\n'
    'Context:\n{context}\n\n'
    'Question:\n{question}\n\n'
    'Answer:'
)


# Wrap the template string in a LangChain prompt; it is filled via prompt.format(context=..., question=...).
prompt = PromptTemplate.from_template(prompt_template)

# Formatting the docs for the RAG chain
def format_docs(docs):
    """Join retrieved documents into one context string.

    Each item contributes its ``page_content`` attribute when it has one
    (e.g. search results); an item without that attribute is used as-is, so
    plain strings keep working. Items are separated by a blank line.

    Args:
        docs: Iterable of document-like objects or strings.

    Returns:
        The joined text; ``''`` for an empty iterable.
    """
    return "\n\n".join(getattr(doc, "page_content", doc) for doc in docs)

In [None]:
# Find the embedded chunks closest to a sample query and display the text of the first hit.
# Alternative query: "How can the degree of autonomy of SDL be broken down into different levels?"
sample_query = "How long does it take to build an SDL?"
top_matches = vector_store.similarity_search(query=sample_query, k=3, search_type="similarity")
top_matches[0].page_content


In [None]:
# RAG Chain
def generate_answer(question):
    """Answer ``question`` with the Cohere LLM, using the first search hit as context.

    Runs a similarity search against ``vector_store``, fills ``prompt`` with
    the first hit's text and the question, and sends it to ``cohere_llm``.

    Args:
        question: The user's question as a string.

    Returns:
        The LLM's completion, or ``"answer not available in context"`` when the
        search returns no hits (previously this raised ``IndexError``).
    """
    llm = cohere_llm
    matches = vector_store.similarity_search(query=question, k=3, search_type="similarity")
    if not matches:
        # Nothing to ground the answer on: return the same fallback phrase the
        # prompt asks the model to use, without calling the LLM.
        return "answer not available in context"

    # NOTE(review): three hits are requested but only the first is used as
    # context (behaviour unchanged) - confirm whether all should be included.
    context = matches[0].page_content

    # allow prompt truncation; .invoke() replaces the deprecated direct llm(...) call
    return llm.invoke(prompt.format(context=context, question=question), truncate='START')

# Generate an answer for a sample question.
# Alternative question: "What challenges are associated with optimizing SDL performance across different studies?"
sample_question = 'How can the degree of autonomy of SDL be broken down into different levels?'
generate_answer(sample_question)