## STAGING: Split the RAMQ PDF into multiple documents or annexes

In [None]:
import PyPDF2

def split_pdf_by_sections(input_pdf, sections, output_prefix):
    """Split a PDF into one output file per page range.

    Args:
        input_pdf: Path of the PDF file to read.
        sections: Iterable of ``(start, end)`` pairs of zero-based page
            indices; both bounds are inclusive.
        output_prefix: Path prefix for the outputs. Section ``k`` (counted
            from 1) is written to ``{output_prefix}_section_{k}.pdf``.

    Raises:
        ValueError: If a section has ``start`` greater than ``end``.
        IndexError: If a section refers to a page outside the document.
    """
    # Materialize once so that a generator argument survives both the
    # validation pass and the writing pass.
    sections = list(sections)

    # Open the input PDF file
    with open(input_pdf, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(reader.pages)

        # Validate every range before writing anything, so that a bad range
        # cannot leave a partial set of output files behind.
        for number, (start, end) in enumerate(sections, start=1):
            if start > end:
                raise ValueError(
                    f"Section {number}: start page {start} is after "
                    f"end page {end}"
                )
            if start < 0 or end >= page_count:
                raise IndexError(
                    f"Section {number}: pages {start}-{end} fall outside "
                    f"the document (valid indices are 0-{page_count - 1})"
                )

        for number, (start, end) in enumerate(sections, start=1):
            writer = PyPDF2.PdfWriter()

            # Add pages to the writer for the current section
            # (end + 1 because the upper bound is inclusive).
            for page_number in range(start, end + 1):
                writer.add_page(reader.pages[page_number])

            # Write the section to a new PDF file
            output_filename = f"{output_prefix}_section_{number}.pdf"
            with open(output_filename, 'wb') as output_pdf:
                writer.write(output_pdf)
            print(f"Section {number} saved as {output_filename}")

# Example usage: split the source PDF into its sections.
input_pdf = './raw/liste_med_2024-12-12_fr.pdf'

# Page ranges for each section, as (first_page, last_page) index pairs.
# split_pdf_by_sections treats both bounds as inclusive, zero-based indices.
sections = [
    (0, 19),
    (20, 21),
    (22, 23),
    (24, 39),
    (40, 217),
    (218, 229),
    (230, 233),
    (234, 765),
]
output_prefix = './raw/output'

split_pdf_by_sections(
    input_pdf=input_pdf,
    sections=sections,
    output_prefix=output_prefix,
)

_RAG Embedding_

In [1]:
# import
from llama_index.core import (
    StorageContext,
    VectorStoreIndex,
    SimpleDirectoryReader,
    PromptTemplate,
    Settings
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from IPython.display import Markdown, display

In [2]:
# Compute or reuse the duckdb database store:
# True rebuilds the index from the source documents, False loads it from disk.
live = False

# define embedding function
Settings.embed_model = OllamaEmbedding(
    model_name="mxbai-embed-large",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)
# define llm model to interact with
# NOTE: the Ollama LLM wrapper takes extra model options through
# `additional_kwargs`; `ollama_additional_kwargs` is the field name used by
# OllamaEmbedding only, so passing it here did not reach the model.
Settings.llm = Ollama(
    model="llama3.2",
    base_url="http://localhost:11434",
    additional_kwargs={"mirostat": 0},
)
# Split documents into chunks of 512 tokens with a 20-token overlap
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.chunk_size = 512
Settings.num_output = 512
Settings.context_window = 3900

def get_meta(file_path):
    """Return the metadata dict attached to a document loaded from *file_path*.

    The province and authority entries are fixed; only ``file_path`` varies.
    """
    metadata = {"province": "Quebec", "Authority": "RAMQ"}
    metadata["file_path"] = file_path
    return metadata

# Either reuse the persisted index (live is False) or rebuild it (live is True).
if not live:
    # Load embedding from disk
    vector_store = DuckDBVectorStore.from_local("./duckdb/knowledge_base")
    knowledge_base = VectorStoreIndex.from_vector_store(vector_store)
else:
    # LIVE: load every document under ./ramq/, tagging each with get_meta()
    documents = SimpleDirectoryReader(
        input_dir="./ramq/",
        file_metadata=get_meta,
    ).load_data()
    # Background reading: https://motherduck.com/blog/search-using-duckdb-part-2/
    # NOTE(review): this branch persists under ./duckdb_md/ while the reuse
    # branch reads ./duckdb/knowledge_base - confirm the mismatch is intended.
    vector_store = DuckDBVectorStore(
        database_name="knowledge_base",
        persist_dir="./duckdb_md/",
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    knowledge_base = VectorStoreIndex.from_documents(
        documents,
        embed_model=Settings.embed_model,
        storage_context=storage_context,
        show_progress=True,
    )



In [None]:
import duckdb

# Inspect the persisted vector store directly.
# The context manager closes the connection when the cell finishes, so the
# database file is not left held open by this ad-hoc inspection.
with duckdb.connect("./duckdb/knowledge_base") as con:
    # A notebook cell only renders its final bare expression, so each result
    # is printed explicitly; otherwise the SHOW TABLES output is dropped.
    con.sql("SHOW TABLES;").show()
    con.sql("SELECT * FROM documents").show()

In [None]:

from llama_index.llms.ollama import Ollama
# Ask the model directly, without going through the knowledge base.
user_query = "Quelle est la marge bénéficiaire de McKesson ?"
llm = Ollama(
    model="llama3.2",
    request_timeout=120.0,
)
resp = llm.complete(user_query)
# Render the completion text as Markdown in the notebook output.
display(Markdown(str(resp)))

In [None]:
user_query= "Quelle est la marge bénéficiaire de McKesson et des autres ?"
# Alternative, to inspect raw retrieval without answer synthesis:
#   knowledge_base.as_retriever(similarity_top_k=3).retrieve(user_query)
query_engine = knowledge_base.as_query_engine()

# Run a query
answer = query_engine.query(user_query)

# List the files the retrieved chunks came from, so the "Sources:" label is
# followed by actual sources. dict.fromkeys de-duplicates and keeps order.
source_paths = dict.fromkeys(
    hit.node.metadata.get("file_path", "unknown") for hit in answer.source_nodes
)
source_lines = "\n".join(f"- `{path}`" for path in source_paths)

display(Markdown(f"{answer.response}, \n\n Sources: \n\n{source_lines}"))

In [None]:
# The query engine

user_query= "Que sais-tu du ADÉFOVIR DIPIVOXIL ?"
# Alternative, to inspect raw retrieval without answer synthesis:
#   knowledge_base.as_retriever(similarity_top_k=3).retrieve(user_query)
query_engine = knowledge_base.as_query_engine()

# Run a query
answer = query_engine.query(user_query)

# List the files the retrieved chunks came from, so the "Sources:" label is
# followed by actual sources. dict.fromkeys de-duplicates and keeps order.
source_paths = dict.fromkeys(
    hit.node.metadata.get("file_path", "unknown") for hit in answer.source_nodes
)
source_lines = "\n".join(f"- `{path}`" for path in source_paths)

display(Markdown(f"{answer.response}, \n\n Sources: \n\n{source_lines}"))

In [None]:
#TODO 
# https://docs.llamaindex.ai/en/stable/examples/llm/ollama/#structured-outputs

In [None]:
# Chat over the knowledge base. `user_query` is whatever an earlier cell
# assigned last, so run one of the query cells first.
chatllm = knowledge_base.as_chat_engine()
msg = chatllm.chat(user_query)

# List the files behind the answer so the "Sources:" label is not left empty.
# dict.fromkeys de-duplicates while keeping first-seen order.
chat_source_paths = dict.fromkeys(
    hit.node.metadata.get("file_path", "unknown") for hit in msg.source_nodes
)
chat_source_lines = "\n".join(f"- `{path}`" for path in chat_source_paths)

display(Markdown(f"{msg.response}, \n\n Sources: \n\n{chat_source_lines}"))