In [1]:
import os
import gradio as gr
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_parse import LlamaParse
import json
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


Set up the LLM with Ollama

In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# LlamaCloud key read from the environment; None when the variable is unset.
# NOTE(review): presumably intended for LlamaParse — it is not used in the cells shown here.
LLAMACLOUD_API_KEY = os.getenv("LLAMACLOUD_API_KEY")

# Ollama-served model used for routing and answer synthesis.
# - request_timeout is raised because the tree-summarize query further down in
#   this notebook failed with "ReadTimeout: timed out".
# - No api_key is passed: the previous value was the literal placeholder text
#   "OLLAMA_API_KEY", not a credential read from the environment.
llm = Ollama(
    model="phi3.5:3.8b-mini-instruct-q8_0",
    request_timeout=300.0,
)

Set up the Embedding Model

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face embedding model; it is registered on `Settings.embed_model`
# in a later cell.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

Upload and import an ebook (EPUB or PDF)

In [18]:
import os
import tkinter as tk
from tkinter import filedialog
from ebooklib import epub
import fitz  # PyMuPDF

def upload_and_import_ebook(file_path=None, save_directory="data"):
    """Extract the text of an EPUB or PDF and save it as a UTF-8 ``.txt`` file.

    Args:
        file_path: Path to an ``.epub`` or ``.pdf`` file. When falsy, a Tkinter
            file dialog is opened so the user can pick one.
        save_directory: Directory that receives the extracted text; it is
            created if it does not exist.

    Returns:
        The path of the written ``<original file name>.txt`` file, or ``None``
        when no file was chosen or the extension is neither ``.epub`` nor
        ``.pdf`` (nothing is written in that case).
    """
    if not file_path:
        # Create the hidden Tk root only when a dialog is really needed, so
        # calls with an explicit path do not depend on a display, and always
        # destroy it afterwards so no stray window/interpreter is left behind.
        root = tk.Tk()
        root.withdraw()
        try:
            file_path = filedialog.askopenfilename(
                title="Select an Ebook or PDF",
                filetypes=[("Ebook and PDF Files", "*.epub *.pdf")],
            )
        finally:
            root.destroy()

    if not file_path:
        return None

    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension not in (".epub", ".pdf"):
        # Unsupported type: report "nothing imported" instead of writing an
        # empty text file that would later be indexed as a document.
        return None

    parts = []
    if file_extension == ".epub":
        # `from ebooklib import epub` does not bind the name `ebooklib`, which
        # is needed for the ITEM_DOCUMENT constant, so import it here.
        import ebooklib

        book = epub.read_epub(file_path)
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                # Body content is returned as bytes; decode to text.
                parts.append(item.get_body_content().decode('utf-8'))
    else:
        pdf_document = fitz.open(file_path)
        try:
            for page_num in range(pdf_document.page_count):
                parts.append(pdf_document.load_page(page_num).get_text())
        finally:
            # Release the file handle even if text extraction fails.
            pdf_document.close()

    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)

    # Save the content next to other imports as "<source file name>.txt"
    save_path = os.path.join(save_directory, os.path.basename(file_path) + ".txt")
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    return save_path

# Example usage with file dialog: no argument is passed, so the picker opens.
ebook_path = upload_and_import_ebook()
print(
    f"Ebook content imported successfully to {ebook_path}."
    if ebook_path
    else "No ebook or PDF selected."
)

# Example usage with file path
# ebook_path = upload_and_import_ebook("path/to/your/ebook_or_pdf.epub")
# print(
#     f"Ebook content imported successfully to {ebook_path}."
#     if ebook_path
#     else "Invalid file path."
# )

No ebook or PDF selected.


Document Loader

In [19]:
# When nothing was imported above, fall back to the sample text under
# sample_books/ so the rest of the notebook still has a document to index.
if not ebook_path:
    ebook_path = "sample_books/The Theory That Would Not Die How Bayes Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy by Sharon Bertsch McGrayne.pdf.txt"

# Single load path for both cases.
docs = SimpleDirectoryReader(input_files=[ebook_path]).load_data()

In [20]:
# Show a compact summary instead of the repr of every Document: printing the
# whole list dumps the full text of the book into the notebook output.
print(f"Loaded {len(docs)} document(s) from {ebook_path}")
if docs:
    # Short preview of the first document only.
    print(docs[0].text[:500])



In [21]:
from llama_index.core import Settings

# Register the models created above on llama_index's shared Settings object so
# components built later can pick them up without explicit arguments.
Settings.embed_model = embed_model
Settings.llm = llm

Advanced RAG (Routing)

In [22]:
from llama_index.core import VectorStoreIndex

# Build the vector index from the loaded documents.
index = VectorStoreIndex.from_documents(docs)

# Baseline query engine that retrieves the top-5 most similar nodes.
# Note: the name `query_engine` is rebound to the router engine in a later cell.
SIMILARITY_TOP_K = 5
query_engine = index.as_query_engine(similarity_top_k=SIMILARITY_TOP_K)

In [25]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Define the GetSources function
def GetSources(response):
    """Format the source nodes of a query response as citation lines.

    Args:
        response: Object exposing ``source_nodes``; each entry must have a
            ``node`` attribute whose ``metadata`` is a mapping.

    Returns:
        One ``"Source: <file name, max 30 chars>..., Page: <page>"`` line per
        source node, joined with newlines (empty string when there are none).
        A missing file name is shown as ``unknown`` and a missing page label as
        ``n/a``.
    """
    citations = []
    for scored_node in response.source_nodes:
        # Metadata lives on the wrapped node, not on the scored wrapper.
        metadata = scored_node.node.metadata or {}

        # `file_name` may be absent; slicing None would raise TypeError.
        file_name = metadata.get('file_name') or "unknown"

        # `page_label` is not guaranteed to be present (e.g. plain-text input).
        page = metadata.get('page_label')
        if page is None:
            page = "n/a"

        citations.append(f"Source: {str(file_name)[:30]}..., Page: {page}")
    return "\n".join(citations)

# Create a custom query engine tool with citation inclusion
class CitationQueryEngineTool(QueryEngineTool):
    """QueryEngineTool that can return an answer followed by its citations."""

    def query(self, query):
        """Run `query` on the wrapped query engine and append source citations.

        Args:
            query: The question passed to the wrapped query engine.

        Returns:
            A string made of the response text, a blank line, the heading
            ``Citations:`` and one line per source node (see ``GetSources``).
        """
        # QueryEngineTool exposes the wrapped engine via `query_engine` and
        # defines no `query` method itself, so `super().query(...)` would fail
        # with AttributeError; delegate to the wrapped engine instead.
        # NOTE(review): RouterQueryEngine appears to use each tool's
        # `query_engine` directly, so this method is only reached when called
        # explicitly (the routed answers printed in this notebook contain no
        # "Citations:" section) — confirm before relying on it for routing.
        response = self.query_engine.query(query)
        sources = GetSources(response)
        return f"{response}\n\nCitations:\n{sources}"

# Create tools with citation support.
# The descriptions are what the router's selector reads to choose a tool.
vector_metadata = ToolMetadata(
    name="vector_search",
    description="Useful for searching for specific facts with citations.",
)
summary_metadata = ToolMetadata(
    name="summary",
    description="Useful for summarizing an entire document with citations.",
)

# Fact lookup: default query engine over the index.
vector_tool = CitationQueryEngineTool(
    index.as_query_engine(),
    metadata=vector_metadata,
)

# Whole-document summaries: same index, tree_summarize response mode.
summary_tool = CitationQueryEngineTool(
    index.as_query_engine(response_mode="tree_summarize"),
    metadata=summary_metadata,
)

In [None]:
# from llama_index.core.tools import QueryEngineTool, ToolMetadata
    
# vector_tool = QueryEngineTool(
#     index.as_query_engine(),
#     metadata=ToolMetadata(
#         name="vector_search",
#         description="Useful for searching for specific facts.",
#     ),
# )

# summary_tool = QueryEngineTool(
#     index.as_query_engine(response_mode="tree_summarize"),
#     metadata=ToolMetadata(
#         name="summary",
#         description="Useful for summarizing an entire document.",
#     ),
# )

In [26]:
from llama_index.core.query_engine import RouterQueryEngine

# Tools offered to the router; the verbose log refers to them by position
# (0 = vector_tool, 1 = summary_tool).
routed_tools = [vector_tool, summary_tool]

# Router that lets the LLM pick exactly one tool per question.
query_engine = RouterQueryEngine.from_defaults(
    routed_tools,
    llm=llm,
    select_multi=False,
    verbose=True,
)

In [27]:
# Broad question; the recorded run routed it to the summary tool (engine 1).
overview_question = (
    "Provide a comprehensive overview of how Bayesian probability evolved from"
    " a theoretical concept to a critical wartime intelligence tool"
)
response = query_engine.query(overview_question)

[1;3;38;5;200mSelecting query engine 1: The question requests a comprehensive overview of the evolution and application of Bayesian probability. Choice (2) is most relevant as it pertains to summarizing an entire document which likely contains historical context, theoretical development, practical applications including wartime intelligence use..
[0m

ReadTimeout: timed out

In [13]:
# Print the text form of the answer to the overview question.
print(str(response))

Bayesian probability began as an intellectual exploration within statistical theory before transitioning into practical applications across various fields, including warfare. Initially discussed in the contexts such as neuroscience and cognitive science (as shown by works from Sharon Bertsch McGrayne to Judea Pearl's "Probabilistic Reasoning"), Bayesian probability provided a framework for understanding how knowledge could be updated with new evidence, which is inherently useful in decision-making processes.

The utilization of Bayes theorem and related methodologies saw an evolution from purely academic studies to being employed practically during significant historical events such as World War II (notably by John von Neumann) for code breaking efforts against the Enigma cipher, mentioned implicitly within documents referencing its impact on intelligence systems. This period marked a pivotal moment where Bayesian methods were recognized not just in theoretical discussions but also in 

Bayesian probability has seen an evolutionary journey that spans several centuries. Its story begins with Thomas Bayes, who discovered the foundational theorem in the mid-18th century amidst religious controversies over rational conclusions about God through empirical evidence from our world. Despite his groundbreaking work being initially recognized by a friend and editor named Richard Price posthumously due to its lack of immediate significance during Baye's time, this principle laid dormant for several decades following Laplace’s independent discovery in the 1770s.

Laplace refined Bayes theorem into what is widely used today by applying it extensively across vast amounts of data to draw conclusions on natural laws and phenomena—such as understanding why more boys than girls are born, which he identified using this statistical methodology despite societal conventions suggesting otherwise at that time.

However, the broader scientific community during Bayes' era considered his contributions subjective due largely to their lack of empirical application beyond religious debates and philosophies surrounding probability theory—an outlook not entirely changed even after Laplace’s death until practical applications came into focus elsewhere in history. 

The turnaround from being deemed a theoretical curiosity occurred during the Second World War, where Bayesian methods were employed by Alan Turing to crack Enigma's code for Allied forces—this effort dramatically altered wartime strategies and ultimately saved Britain while also propelling developments in computing technology.

Post-war applications expanded further into various fields; Jerome Cornfield used it clinically, Arthur Bailey made contributions within the realm of statistics education, The Cold War era saw its use by figures like John von Neumann who applied Bayesian methods to nuclear strategy and decision making during an arms race fraught with uncertainty.

In modern times, this probabilistic framework is integral not only in intelligence operations but also across business decisions (e.g., stock market predictions), medical diagnostics through the application of algorithms for interpreting mammograms or assessing heart attack risks using Bayesian statistics as described by researchers Goodman and Heckerman; it has even found its way into consumer experiences with search engines like Amazon optimizing recommendations based on user behavior.

Bayesian probability's journey from a mathematical curiosity to an indispensable tool in decision-making processes underlines the transformative power of theoretical concepts when applied practically—a narrative that underscores both its historical significance and contemporary relevance across various realms, including science, business, technology, medicine, military intelligence operations.

In [56]:
# Narrow factual question; the recorded run routed it to the vector-search tool (engine 0).
turing_question = (
    "Tell me about the specific details about Alan Turing's cryptographic methods at Bletchley Park"
)
response = query_engine.query(turing_question)

[1;3;38;5;200mSelecting query engine 0: The request asks for specific details about Alan Turing's cryptographic methods at Bletchley Park. Choice (1) is most relevant as it pertains to searching for and finding precise facts with citations that would include such detailed information..
[0m

In [None]:
# Print the text form of the answer to the Turing question.
print(str(response))

At Bletchley Park during World War II, Alan Turing developed a unique approach to tackle complex coding challenges. He was instrumental in designing an innovative machine known as the "bombe," which significantly accelerated code-breaking efforts against German military communications encoded by Enigma machines. This high-speed electromechanical device tested various wheel arrangements for potential correct decryption, rapidly eliminating impossible combinations and focusing on likely ones to find matches with intercepted messages.

Turing's work didn't stop there; he also improved the existing bombe design alongside colleagues Gordon Welchman and Harold "Doc" Keen. Their collective effort resulted in a more advanced prototype that enhanced Bletchley Park’s ability to decipher intricate naval codes, which were notoriously difficult due to their complex configurations involving multiple reflectors, rotors, plugboards, clip positions around the machine's wheels, and starting position pos

: 

At Bletchley Park during World War II, Alan Turing developed a unique approach to tackle complex coding challenges. He was instrumental in designing an innovative machine known as the "bombe," which significantly accelerated code-breaking efforts against German military communications encoded by Enigma machines. This high-speed electromechanical device tested various wheel arrangements for potential correct decryption, rapidly eliminating impossible combinations and focusing on likely ones to find matches with intercepted messages.

Turing's work didn't stop there; he also improved the existing bombe design alongside colleagues Gordon Welchman and Harold "Doc" Keen. Their collective effort resulted in a more advanced prototype that enhanced Bletchley Park’s ability to decipher intricate naval codes, which were notoriously difficult due to their complex configurations involving multiple reflectors, rotors, plugboards, clip positions around the machine's wheels, and starting position possibilities.

In parallel with mechanical advancements like bombe machines, Turing explored theoretical foundations that would later influence computational methods applied in code-breaking activities at Bletchley Park—including Bayesian probability scoring systems for pattern recognition within encrypted messages. This integration of probabilistic models into the cryptographic process marked an important intellectual contribution by enhancing machine efficiency and precision during critical wartime decryption tasks, eventually leading to groundbreaking achievements like shortening World War II in Europe significantly through timely intelligence gathered from intercepted communications.



Add Chat History to RAG

In [None]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine

# Conversation memory buffer, capped at 3900 tokens.
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Chat engine over the same index, with conversation memory.
# The context prompt now refers to the ebook indexed in this notebook (the
# earlier wording was left over from an unrelated example) and has a newline
# after the first sentence, which previously ran straight into "Here are ...".
chat_engine = CondensePlusContextChatEngine.from_defaults(
    index.as_retriever(),
    memory=memory,
    llm=llm,
    context_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk"
        " about the ebook that was loaded into the index.\n"
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous chat history, or the context above, to interact and help the user."
    ),
    verbose=True,
)