# Open-Ollama-RAG-ChatApp Notebook

In [1]:
from markitdown import MarkItDown
# Convert the source PDF to Markdown text with MarkItDown.
md = MarkItDown()
result_pdf = md.convert("raw/liste_med_2024-12-12_fr.pdf")

# Write through a separately named handle: reusing `md` for the file would
# leave `md` bound to a closed file instead of the converter after this cell.
with open("ramq/liste_med_2024-12-12_fr.md", "w", encoding="utf-8") as md_file:
    md_file.write(result_pdf.text_content)

In [5]:
import PyPDF2

def split_pdf_by_sections(input_pdf, sections, output_prefix):
    """Split a PDF into one output file per page range.

    Args:
        input_pdf: Path of the PDF file to read.
        sections: Iterable of ``(start, end)`` page-index pairs. Both bounds
            are zero-based and inclusive.
        output_prefix: Path prefix of the output files; the k-th section
            (counting from 1) is written to
            ``f"{output_prefix}_section_{k}.pdf"``.

    Raises:
        ValueError: If a range has a negative start or ``start > end``.
        IndexError: If a range extends past the last page of ``input_pdf``.
    """
    with open(input_pdf, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(reader.pages)

        # Validate every range before writing anything, so that a bad range
        # cannot leave a partial set of output files behind. Negative indices
        # are rejected because they would silently wrap around to the end.
        sections = list(sections)
        for start, end in sections:
            if start < 0 or start > end:
                raise ValueError(f"Invalid page range ({start}, {end})")
            if end >= page_count:
                raise IndexError(
                    f"Page range ({start}, {end}) exceeds the last page index {page_count - 1}"
                )

        for section_number, (start, end) in enumerate(sections, start=1):
            writer = PyPDF2.PdfWriter()

            # Copy the pages of the current section (end is inclusive)
            for page_number in range(start, end + 1):
                writer.add_page(reader.pages[page_number])

            # Write the section to its own PDF file
            output_filename = f"{output_prefix}_section_{section_number}.pdf"
            with open(output_filename, 'wb') as output_pdf:
                writer.write(output_pdf)
            print(f"Section {section_number} saved as {output_filename}")

# Example usage: cut the source PDF into eight section files
input_pdf = './raw/liste_med_2024-12-12_fr.pdf'
output_prefix = './raw/output'

# (first_page, last_page) index pairs, one per section; both bounds are inclusive
sections = [
    (0, 19),
    (20, 21),
    (22, 23),
    (24, 39),
    (40, 217),
    (218, 229),
    (230, 233),
    (234, 765),
]

split_pdf_by_sections(input_pdf, sections, output_prefix)



Section 1 saved as ./raw/output_section_1.pdf
Section 2 saved as ./raw/output_section_2.pdf
Section 3 saved as ./raw/output_section_3.pdf
Section 4 saved as ./raw/output_section_4.pdf
Section 5 saved as ./raw/output_section_5.pdf
Section 6 saved as ./raw/output_section_6.pdf
Section 7 saved as ./raw/output_section_7.pdf
Section 8 saved as ./raw/output_section_8.pdf


In [None]:
from markitdown import MarkItDown
# Convert the Word version of the source document to Markdown text.
md = MarkItDown()
result_docx = md.convert("raw/liste_med_2024-12-12_fr.docx")

# Write through a separately named handle: reusing `md` for the file would
# leave `md` bound to a closed file instead of the converter after this cell.
with open("ramq/liste_med_2024-12-12_fr_winword.md", "w", encoding="utf-8") as md_file:
    md_file.write(result_docx.text_content)

In [1]:
## Build the vector database?
# Set to True on the first run of the notebook, or after the Markdown files
# changed, so that the vector database is built from the current chunks.
initial_db = True

In [47]:
## Paths and Ollama connection settings
DATA_PATH = "ramq/"      # folder holding the Markdown files to index
CHROMA_PATH = "chroma/"  # folder where the Chroma database is persisted
OLLAMA_URL = "http://localhost:11434"
OLLAMA_MODEL = "llama3.2"

## langchain split config
# Markdown header levels 1-4, each paired with its metadata label:
# ("#", "Header 1") ... ("####", "Header 4")
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 5)]

# Character-level chunking parameters
chunk_size = 500
chunk_overlap = 100

## Create chunks from md files

In [24]:
#from langchain.document_loaders import TextLoader
from langchain.document_loaders import TextLoader
import os

documents = []

# Load only Markdown files, in sorted order: os.listdir() returns entries in
# arbitrary order and also lists sub-directories and unrelated files, which
# would make the run non-reproducible or crash the loader.
for file in sorted(os.listdir(DATA_PATH)):
    path = os.path.join(DATA_PATH, file)
    if not (file.lower().endswith(".md") and os.path.isfile(path)):
        continue
    loader = TextLoader(path, encoding="utf-8")
    documents.extend(loader.load())

In [None]:
# Inspect the first loaded document. A cell only displays its last expression,
# so both values are returned together as one tuple: (metadata, text length).
documents[0].metadata, len(documents[0].page_content)

In [26]:
# for doc in documents:
#     print(doc.metadata)

In [27]:
from langchain.text_splitter import MarkdownHeaderTextSplitter


# Split each document at its Markdown headers; strip_headers=False is passed so
# the header lines stay inside the chunk text.
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
chunks_array = []

for doc in documents:
    header_chunks = text_splitter.split_text(doc.page_content)
    for chunk in header_chunks:
        # Merge instead of overwrite: plain assignment of doc.metadata would
        # discard the header metadata set by the splitter and make every chunk
        # share one and the same dict object.
        chunk.metadata = {**doc.metadata, **chunk.metadata}
    chunks_array.append(header_chunks)



In [None]:
# Number of header-split chunk lists collected (one list is appended per loaded document)
len(chunks_array)

In [29]:
# Char-level splits
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter


# Character-level splitter settings; sizes come from the config cell and
# lengths are measured with len(). NOTE: this rebinds `text_splitter`, which
# previously referred to the Markdown header splitter.
char_split_options = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**char_split_options)


In [30]:
# Run the character-level splitter over every header-level chunk; each entry
# of chunks_array_txt_base is the list of splits produced from one such chunk.
chunks_array_txt_base = [
    text_splitter.split_documents([header_chunk])
    for header_chunks in chunks_array
    for header_chunk in header_chunks
]

# One entry was produced per header-level chunk, so this is the chunk count
counter = len(chunks_array_txt_base)
        

In [None]:
# counter: how many header-level chunks went through the character splitter
print(counter)
# Number of split lists collected (one list per header-level chunk)
len(chunks_array_txt_base)

In [32]:
# Flatten the per-chunk split lists into one flat list of chunks
all_document_chunks = []
for splits in chunks_array_txt_base:
    all_document_chunks.extend(splits)

In [None]:
# Total number of chunks in the flattened list
print(len(all_document_chunks))

In [None]:
# Text of the second chunk (index 1) in the flattened list
all_document_chunks[1].page_content

## Connect to the Ollama backend

In [None]:
# TEST OLLAMA CONNECTION ##
from langchain_ollama import ChatOllama

# Send one question straight to the model (no retrieval involved) and print
# the reply, to confirm that the Ollama server answers
llm = ChatOllama(model=OLLAMA_MODEL, base_url=OLLAMA_URL)

test_reply = llm.invoke("Quelle est la durée maximale autorisation avec FRMANEZUMAB?")
print(test_reply)

## Create the vector database

In [None]:
# Show the first chunk of the flattened list (displayed as the cell output)
all_document_chunks[0]

In [36]:
# Build the Chroma vector database from the chunks (only when initial_db is set)
if initial_db:
    from langchain_ollama import OllamaEmbeddings
    from langchain.vectorstores import Chroma

    emb = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_MODEL)

    # Drop any collection already persisted at CHROMA_PATH first:
    # from_documents() adds to an existing collection, so re-running this cell
    # would otherwise store every chunk again and similarity search would
    # start returning duplicates.
    Chroma(persist_directory=CHROMA_PATH, embedding_function=emb).delete_collection()

    vectorstore = Chroma.from_documents(all_document_chunks, emb, persist_directory=CHROMA_PATH)

In [54]:
## load chroma db from disk
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma

# Open the database persisted at CHROMA_PATH, embedding queries with the
# Ollama model configured above
ollama_embeddings = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_MODEL)
vectorstore = Chroma(embedding_function=ollama_embeddings, persist_directory=CHROMA_PATH)

In [None]:
# Test the similarity search: look up the chunks most similar to a sample question
query = "Quelle est la durée maximale autorisation avec FRMANEZUMAB?"

# The bare `result_docs` on the last line displays the retrieved documents
result_docs = vectorstore.similarity_search(query)
result_docs


## Gradio frontend

In [42]:
from langchain_community.llms import Ollama

def chat_ollama(message, history):
    """Answer a question with the Ollama model, grounded in retrieved chunks.

    Args:
        message: The user's question.
        history: The chat history; it is inserted into the prompt via ``str()``.

    Returns:
        The model's answer followed by a ``References:`` list that names the
        source of each retrieved chunk, numbered like the knowledge entries
        in the prompt.
    """
    # initiate ollama
    ollama = Ollama(base_url=OLLAMA_URL, model=OLLAMA_MODEL)

    # Search the Chroma store for similar chunks. The store is bound to
    # `vectorstore` in this notebook; the previous name `chroma_db` was never
    # defined and raised a NameError.
    result_chunks = vectorstore.similarity_search(message)

    # Numbered knowledge entries for the prompt: "[1] \n<text>\n[2] \n<text>\n..."
    chroma_knowledge = "".join(
        f"[{source_id}] \n{chunk.page_content}\n"
        for source_id, chunk in enumerate(result_chunks, start=1)
    )

    # Matching numbered source list; tolerate chunks without a "source" entry
    sources = "".join(
        f"[{source_id}] \n{chunk.metadata.get('source', 'unknown source')}\n"
        for source_id, chunk in enumerate(result_chunks, start=1)
    )

    prompt = (
        "Answer the following question using the provided knowledge and the chat history:\n\n###KNOWLEDGE: "
        + chroma_knowledge
        + "\n###CHAT-HISTORY: "
        + str(history)
        + "\n\n###QUESTION: "
        + message
    )

    # invoke() replaces the deprecated direct call ollama(prompt)
    result = ollama.invoke(prompt) + "\n\n\nReferences:\n" + sources

    return result

In [None]:
# Manual check of the chat function outside the UI; an empty string stands in for the chat history
chat_ollama("Quelle est la durée maximale d'autorisation du FRAMANEZUMAB?", "")

In [None]:
import gradio as gr
# Create the UI components first, then pass them to the chat interface
chat_window = gr.Chatbot()
question_box = gr.Textbox(placeholder="Example: Who is Alice?", container=False, scale=7)

# Optional ChatInterface arguments kept for reference (currently disabled):
#   retry_btn=None, undo_btn="Delete Previous", clear_btn="Clear"
gradio_interface = gr.ChatInterface(
    chat_ollama,
    chatbot=chat_window,
    textbox=question_box,
    title="The Ollama test chatbot",
    description=f"Ask the {OLLAMA_MODEL} chatbot a question!",
    theme='gradio/base',  # themes at https://huggingface.co/spaces/gradio/theme-gallery
)



## Run app

In [None]:
# Start the Gradio chat interface defined above
gradio_interface.launch()

In [24]:
### END OF FILE ###