In [2]:
import os
import gradio as gr
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_parse import LlamaParse
import json

Setup LLM with Ollama

In [3]:
# Optional cloud credentials, read from the environment (None when unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LLAMACLOUD_API_KEY = os.environ.get("LLAMACLOUD_API_KEY")

# LLM served through Ollama; registered as the global default further down.
# NOTE(review): api_key is the literal string "OLLAMA_API_KEY", not a value
# read from the environment — confirm whether it is needed at all.
llm = Ollama(
    model="phi3.5:3.8b-mini-instruct-q8_0",
    api_key="OLLAMA_API_KEY",
)

Setup Embedding Model

In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model, loaded by its Hugging Face model name.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

Global Settings Config

In [5]:
from llama_index.core import Settings

# Register the embedding model and the LLM on the global Settings object so
# later cells do not have to pass them explicitly.
Settings.embed_model = embed_model
Settings.llm = llm

Ebook upload and import function

In [17]:
import os
import tkinter as tk
from tkinter import filedialog
from ebooklib import epub
import fitz  # PyMuPDF

def upload_and_import_ebook(file_path=None, save_directory="data"):
    """Extract the text of an EPUB or PDF and save it as a UTF-8 ``.txt`` file.

    Args:
        file_path: Path to an ``.epub`` or ``.pdf`` file. When falsy, a Tk
            file dialog is opened so the user can pick one.
        save_directory: Directory the extracted text is written to. It is
            created if it does not exist.

    Returns:
        The path of the written text file (``<save_directory>/<basename>.txt``),
        or ``None`` when no file was selected or the file extension is neither
        ``.epub`` nor ``.pdf``.
    """
    if not file_path:
        # Only create a Tk root when a dialog is actually needed, so calls
        # with an explicit path also work where no display is available.
        root = tk.Tk()
        root.withdraw()  # Hide the root window
        try:
            file_path = filedialog.askopenfilename(
                title="Select an Ebook or PDF",
                filetypes=[("Ebook and PDF Files", "*.epub *.pdf")],
            )
        finally:
            # Release the hidden window instead of leaking it on every call.
            root.destroy()

    # The dialog returns an empty value when the user cancels.
    if not file_path:
        return None

    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == ".epub":
        # The notebook only does `from ebooklib import epub`, so the package
        # itself (which holds ITEM_DOCUMENT) has to be imported here.
        import ebooklib

        book = epub.read_epub(file_path)
        content = "".join(
            item.get_body_content().decode("utf-8")
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        )
    elif file_extension == ".pdf":
        pdf_document = fitz.open(file_path)
        try:
            content = "".join(
                pdf_document.load_page(page_num).get_text()
                for page_num in range(pdf_document.page_count)
            )
        finally:
            # Always release the file handle, even if a page fails to parse.
            pdf_document.close()
    else:
        # Unsupported type: do not write an empty .txt that would later be
        # indexed as if it were a real document.
        return None

    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)

    # Save the content next to other imports, keeping the original file name
    save_path = os.path.join(save_directory, os.path.basename(file_path) + ".txt")
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(content)

    return save_path

# Example usage with file dialog: ask for a file, then report what happened.
ebook_path = upload_and_import_ebook()
print(
    f"Ebook content imported successfully to {ebook_path}."
    if ebook_path
    else "No ebook or PDF selected."
)

# Example usage with file path
# ebook_path = upload_and_import_ebook("path/to/your/ebook_or_pdf.epub")
# if ebook_path:
#     print(f"Ebook content imported successfully to {ebook_path}.")
# else:
#     print("Invalid file path.")

No ebook or PDF selected.


Document Loader

In [18]:
# Fall back to a previously imported book when nothing was selected above.
if not ebook_path:
    ebook_path = "data/The Theory That Would Not Die How Bayes Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy by Sharon Bertsch McGrayne.pdf.txt"

# Load the extracted text file into documents for indexing.
docs = SimpleDirectoryReader(input_files=[ebook_path]).load_data()

Advanced RAG (Routing)

In [19]:
from llama_index.core import VectorStoreIndex

# Build a vector index from the loaded documents.
index = VectorStoreIndex.from_documents(
    docs,
)

# Baseline query engine configured with similarity_top_k=5.
# Note: `query_engine` is reassigned to the router engine in a later cell.
query_engine = index.as_query_engine(
    similarity_top_k=5,
)

In [20]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Tool 1: default query engine, described to the router as fact lookup.
fact_lookup_engine = index.as_query_engine()
vector_tool = QueryEngineTool(
    fact_lookup_engine,
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for specific facts.",
    ),
)

# Tool 2: same index, but answers are produced with the "tree_summarize"
# response mode; described to the router as whole-document summarization.
summarizing_engine = index.as_query_engine(response_mode="tree_summarize")
summary_tool = QueryEngineTool(
    summarizing_engine,
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarizing an entire document.",
    ),
)

In [21]:
from llama_index.core.query_engine import RouterQueryEngine

# Router that picks exactly one of the two tools per query (select_multi=False)
# and logs its choice (verbose=True). Replaces the earlier `query_engine`.
query_engine = RouterQueryEngine.from_defaults(
    [vector_tool, summary_tool],
    select_multi=False,
    verbose=True,
    llm=llm,
)

In [22]:
# Broad, whole-book question sent through the router.
overview_question = "Provide a comprehensive overview of how Bayesian probability evolved from a theoretical concept to a critical wartime intelligence tool"
response = query_engine.query(overview_question)

Selecting query engine 1: Summarizing the evolution of Bayesian probability from a theoretical concept to its application as a critical wartime intelligence tool requires an overview that encompasses historical development, key milestones in both theory and practice, and significant impacts. This summary would provide condensed information about how these aspects converged during times of conflict..

In [23]:
# Show the answer text for the overview question.
print(response)

The journey of Bayesian probability began in the mid-18th century with Reverend Thomas Bayes, who formulated an initial theorem on updating beliefs based on new evidence. Despite his groundbreaking work being initially overlooked and buried by academia due to prevailing views against subjectivity, it laid the foundation for a revolutionary approach in decision-making under uncertainty.

Pierre Simon Laplace independently rediscovered Bayes' insights around 1774 while grappling with vast amounts of data. His application led him to draw conclusions on natural laws such as sex ratios, showcasing the potential and power of this methodology in practical terms beyond theoretical boundaries.

During a period when academia dismissed Laplace's work for being too subjective during its early development stages, it was actually practitioners who turned to Bayesian methods out of necessity due to their effectiveness at solving real-world problems and emergencies. A notable instance occurred in the 

In [24]:
# Narrow, fact-seeking question sent through the router.
turing_question = "Tell me about the specific details about Alan Turing's cryptographic methods at Bletchley Park"
response = query_engine.query(turing_question)

Selecting query engine 0: The question asks about specific details regarding Alan Turing's cryptographic methods. Choice (1) is most relevant as it pertains to searching for these particular facts..

In [25]:
# Show the answer text for the Alan Turing question.
print(response)

Alan Turing arrived in Bletchley Park where he was involved with various Enigma systems initially working on army codes before shifting his focus to German naval codes, which were more complex. He played a key role in the development of an electromechanical machine known as "bombe." This device dramatically increased efficiency by testing every possible wheel arrangement for enemy messages rapidly compared to manual methods. Turing's bombe significantly advanced British code-breaking capabilities and was essential when Bletchley Park faced increasingly sophisticated German Enigma machines during the war, particularly concerning naval codes vital in combating U-boat threats.
Throughout his time at Bletchley Park, Turing also engaged with colleagues like Gordon Welchman and Harold "Doc" Keen to refine bombe's design further into a metal cabinet prototype that was critical for deciphering the naval codes effectively. The machine utilized Bayesian scoring methods based on bans which later 

Adding Chat History to RAG

In [None]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine

# Conversation memory with a 3900-token limit.
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Chat engine over the ebook index, with memory so follow-up questions can
# refer back to earlier turns.
chat_engine = CondensePlusContextChatEngine.from_defaults(
    index.as_retriever(),
    memory=memory,
    llm=llm,
    # The prompt previously named an unrelated topic left over from a tutorial;
    # it now refers to the uploaded ebook, which is what the index contains.
    # {context_str} is the placeholder for the retrieved passages.
    context_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk"
        " about the ebook the user has uploaded.\n"
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous chat history, or the context above, to interact and help the user."
    ),
    verbose=True,
)