In [3]:
import json
import os

import ebooklib
import gradio as gr
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.tools import QueryEngineTool
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_parse import LlamaParse

  from .autonotebook import tqdm as notebook_tqdm


Setup LLM with Ollama

In [4]:
# Read API keys from the environment; os.getenv yields None when a variable is unset.
# Neither key is referenced again in the visible cells.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LLAMACLOUD_API_KEY = os.getenv("LLAMACLOUD_API_KEY")

# NOTE(review): api_key is the literal string "OLLAMA_API_KEY", not a value read
# from the environment — confirm whether the Ollama client needs or uses it.
llm = Ollama(model="phi3.5:3.8b-mini-instruct-q8_0", api_key="OLLAMA_API_KEY")

Setup Embedding Model

In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model, selected by its Hugging Face model name; registered on Settings below.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

Global Settings Config

In [6]:
from llama_index.core import Settings

# Register the LLM and embedding model on the global Settings object.
# Presumably components that are not given an explicit model fall back to these — confirm.
Settings.llm = llm
Settings.embed_model = embed_model

Upload and Import ebooks function

In [21]:
import os
import tkinter as tk
from tkinter import filedialog
from ebooklib import epub
import fitz  # PyMuPDF

def upload_and_import_ebook(file_path=None, save_directory="data"):
    """Extract the text of an EPUB or PDF and save it as a UTF-8 text file.

    Args:
        file_path: Path to the ebook or PDF. When falsy, a Tkinter file dialog
            is shown so the user can pick a file.
        save_directory: Directory that receives the extracted text. It is
            created when it does not exist.

    Returns:
        The path of the written file, ``<save_directory>/<source basename>.txt``,
        or ``None`` when no file was provided or selected.

    Note:
        Files whose extension is neither ``.epub`` nor ``.pdf`` are not parsed;
        an empty text file is still written for them.
    """
    if not file_path:
        # Create the Tk root only when the dialog is needed, so calls with an
        # explicit path also work in environments without a display.
        root = tk.Tk()
        root.withdraw()  # Hide the root window
        try:
            file_path = filedialog.askopenfilename(
                title="Select an Ebook or PDF",
                filetypes=[("Ebook and PDF Files", "*.epub *.pdf")]
            )
        finally:
            # Release the hidden window instead of leaking one per call.
            root.destroy()

    if not file_path:
        # Dialog was cancelled (or an empty path was passed in).
        return None

    content_parts = []
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == ".epub":
        book = epub.read_epub(file_path)
        for item in book.get_items():
            # ITEM_DOCUMENT lives on the top-level ebooklib package, not on epub.
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                content_parts.append(item.get_body_content().decode('utf-8'))

    elif file_extension == ".pdf":
        # The context manager closes the PDF handle even if extraction fails.
        with fitz.open(file_path) as pdf_document:
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                content_parts.append(page.get_text())

    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)

    # Save the content next to the other imported books, keeping the source
    # file name and appending ".txt".
    save_path = os.path.join(save_directory, os.path.basename(file_path) + ".txt")
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("".join(content_parts))

    return save_path

# Example usage with file dialog
# Let the user pick a book through the file dialog, then report the outcome.
ebook_path = upload_and_import_ebook()
print(
    f"Ebook content imported successfully to {ebook_path}."
    if ebook_path
    else "No ebook or PDF selected."
)

# Example usage with file path
# ebook_path = upload_and_import_ebook("path/to/your/ebook_or_pdf.epub")
# if ebook_path:
#     print(f"Ebook content imported successfully to {ebook_path}.")
# else:
#     print("Invalid file path.")

2024-12-04 14:55:11.403 python[3648:47500388] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Ebook content imported successfully to data/The Theory That Would Not Die How Bayes Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy by Sharon Bertsch McGrayne.pdf.txt.


Document Loader

In [22]:
# Fall back to the bundled sample book when nothing was imported above,
# then load whichever text file was chosen.
if not ebook_path:
    ebook_path = "sample_books/The Theory That Would Not Die How Bayes Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy by Sharon Bertsch McGrayne.pdf.txt"
docs = SimpleDirectoryReader(input_files=[ebook_path]).load_data()

Advanced RAG (Routing)

In [23]:
from llama_index.core import VectorStoreIndex

# Build a vector index over the loaded documents.
index = VectorStoreIndex.from_documents(docs)
# NOTE(review): `query_engine` is rebound to the router in a later cell, and the
# tools there call index.as_query_engine() themselves, so this
# similarity_top_k=5 engine is not used by any visible query — confirm intent.
query_engine = index.as_query_engine(similarity_top_k=5)

In [24]:
# Custom function to include citations in the response
def include_citations(response, docs):
    """Return ``response`` as text followed by a "Citations:" line.

    Each item in ``docs`` contributes at most one label: "Page <n>" when it has
    a ``page_number`` attribute, otherwise "Section <s>" when it has a
    ``section`` attribute. Items with neither attribute are skipped. The
    "Citations:" line is emitted even when no labels were found.
    """
    def _label_for(source):
        # page_number takes precedence over section.
        if hasattr(source, 'page_number'):
            return f"Page {source.page_number}"
        if hasattr(source, 'section'):
            return f"Section {source.section}"
        return None

    labels = [label for label in map(_label_for, docs) if label is not None]
    return f"{response}\n\nCitations: {', '.join(labels)}"

# Custom query engine tool with citation inclusion
class CitationQueryEngineTool(QueryEngineTool):
    """Query-engine tool with a ``query`` helper that appends citations.

    NOTE(review): the router answers printed in this notebook carry no
    "Citations:" suffix, which suggests RouterQueryEngine talks to
    ``tool.query_engine`` directly and never calls ``query`` — confirm before
    relying on this method for routed queries.
    """

    def query(self, query):
        """Run ``query`` on the wrapped engine and return the answer with citations.

        Args:
            query: The question forwarded to the wrapped query engine.

        Returns:
            The string produced by ``include_citations`` for the response and
            the nodes retrieved for it.
        """
        # QueryEngineTool exposes the wrapped engine as `query_engine`; it does
        # not define a `query` method or an `index` attribute of its own.
        response = self.query_engine.query(query)
        # Cite what was retrieved for this answer. Entries that wrap a node
        # (scored hits) are unwrapped; anything else is passed through as-is.
        retrieved = getattr(response, "source_nodes", None) or []
        cited = [getattr(hit, "node", hit) for hit in retrieved]
        return include_citations(response, cited)

In [27]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# vector_tool = QueryEngineTool(
#     index.as_query_engine(),
#     metadata=ToolMetadata(
#         name="vector_search",
#         description="Useful for searching for specific facts.",
#     ),
# )

# Create tools with citation support
# Fact-lookup tool: wraps a default query engine built from the vector index.
# The name and description in ToolMetadata are the text shown to the router's selector.
vector_tool = CitationQueryEngineTool(
    index.as_query_engine(),
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for specific facts with citations.",
    ),
)

# summary_tool = QueryEngineTool(
#     index.as_query_engine(response_mode="tree_summarize"),
#     metadata=ToolMetadata(
#         name="summary",
#         description="Useful for summarizing an entire document.",
#     ),
# )

# Summarization tool: same index, but the engine is created with
# response_mode="tree_summarize".
summary_tool = CitationQueryEngineTool(
    index.as_query_engine(response_mode="tree_summarize"),
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarizing an entire document with citations.",
    ),
)

In [28]:
from llama_index.core.query_engine import RouterQueryEngine

# Rebind `query_engine` to a router over the two tools, using the local `llm`.
# select_multi=False presumably limits each query to a single tool (the verbose
# log below shows one engine being selected per query) — confirm.
query_engine = RouterQueryEngine.from_defaults(
    [vector_tool, summary_tool], select_multi=False, verbose=True, llm=llm
)

In [29]:
# Broad, whole-book question sent through the router; the two adjacent string
# literals are concatenated into a single query.
response = query_engine.query(
    "Provide a comprehensive overview of how Bayesian probability evolved from"
    " a theoretical concept to a critical wartime intelligence tool"
)

[1;3;38;5;200mSelecting query engine 1: Summarizing the evolution of Bayesian probability requires an overview that encapsulates its theoretical origins and practical applications. Choice (2) is most relevant as it specifically mentions summarizing a document with citations, which implies covering historical contexts along with key developments in detail..
[0m

In [None]:
# Show the answer to the overview question as plain text.
print(response)

Bayesian probability began as an intellectual debate within academia during its inception by Reverend Thomas Bayes, who initially worked on the theorem independently. However, it was Pierre Simon Laplace who later discovered and further developed this rule to address vast amounts of data around him—a testament to his stature among mathematicians and scientists like himself.

Despite its intellectual promise shown by both men's work in theory formulation during the 18th century, Bayesian probability was initially met with skepticism from academic circles who criticized it as subjective or impractical for precise scientific inquiry—a sentiment that persisted into subsequent centuries.

Parallel to this theoretical development and controversy were practical applications of Bayes' rule in real-world emergencies, notably during the Second World War. Alan Turing utilized a formulation derived from Laplace’s work known as 'Bayesian inference', applying it successfully against Nazi Germany by breaking their Enigma code—a feat that significantly contributed to Britain winning the war and also marked an important milestone in computing history, leading towards modern electronic computers.

Thus over time, Bayes' rule evolved from a theoretical concept mired in academic controversy into one of practical significance for decision-making amidst uncertainty during critical wartime intelligence tasks—a journey that underscores the dynamic interaction between theory and real-world application across different domains including computing.

In [31]:
# Narrow, fact-seeking question sent through the same router; rebinds `response`.
response = query_engine.query(
    "Tell me about the specific details about Alan Turing's cryptographic methods at Bletchley Park"
)

[1;3;38;5;200mSelecting query engine 0: The request pertains to finding specific facts about Alan Turing's cryptographic methods with the possibility of needing citations. Choice (1) directly addresses this requirement as it is useful for searching and providing factual information along with references..
[0m

In [None]:
# Show the answer to the Turing question as plain text.
print(response)

Alan Turing played a significant role in cracking various codes during his time at Bletchley Park. He was involved with studying probability theory and Enigma codes, which were initially managed by other analysts like Dillwyn Knox who had solved simpler Italian naval code systems. Upon joining the GC&CS research center upon declaring war on Germany in 1939, Turing quickly became a key player due to his unique set of interests that spanned from abstract mathematics and topology to applied probability and machine thinking concepts.

One notable contribution by Turing was designing an advanced electromechanical device named the "bombe." This innovative creation significantly improved upon previous models, drastically increasing Bletchley Park's ability to break German codes more efficiently than before through faster testing of possible wheel arrangements in Enigma machines.

In addition to his work with bombes for deciphering messages encrypted by the Army and air force units during World War II, Turing also made significant strides towards tackling complex naval communications encoded using intricate German systems called Tunny-Lorenz codes—systems that were nearly impossible to crack manually due to their complexity.

Turing's efforts did not stop at mechanical improvements; he applied his knowledge of Bayesian probability and introduced a scoring system based on bans, which became integral for the functioning of large digital computers called Colossi later developed by engineer Thomas H. Flowers. These machines were capable of breaking naval Enigma codes much faster than previous methods due to their programmable nature enhanced with Turing's Bayesian model and contributed significantly towards shortening war duration in Europe according to Gen. Dwight "Ike" Eisenhower, who made key decisions based on intel provided by the Colossi during critical moments like planning for Normandy invasion.

Through his work at Bletchley Park alongside other intellectuals such as Donald Michie and mathematician Gordon Welchman—the latter also having been a student of Turing's earlier in Cambridge University, which had indirectly influenced their shared obsession with machine learning —he played an instrumental role not only during World War II but set the foundation for modern computing. His contributions to cryptography were groundbreaking and paved way towards advancements that would continue even after his time at Bletchley Park was over, including moving onto voice encryption duties later in 1945.

Chat History to RAG

In [None]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine

# Conversation memory for the chat engine, created with a 3900-token limit.
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Chat engine that retrieves from the ebook index and answers with the local LLM.
# The context prompt describes the imported book (it previously named an
# unrelated topic) and separates the persona sentence from the context header,
# which used to run together as "beef.Here".
chat_engine = CondensePlusContextChatEngine.from_defaults(
    index.as_retriever(),
    memory=memory,
    llm=llm,
    context_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk"
        " about the ebook or PDF that was imported.\n"
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous chat history, or the context above, to interact and help the user."
    ),
    verbose=True,
)