In [10]:
import os
import gradio as gr
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_parse import LlamaParse
import json
from dotenv import load_dotenv

Set up the LLM with Ollama

In [11]:
# Local generation through Ollama can take a long time for whole-document
# summaries; the previous 60 s limit shows up as "ReadTimeout: timed out" in
# the router query output further down, so allow a much longer request window.
llm = Ollama(model="phi3.5:3.8b-mini-instruct-q8_0", request_timeout=300)
# Alternative model:
# llm = Ollama(model="qwen2.5:14b", request_timeout=300)

Set up the Embedding Model

In [12]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face embedding model shared by the rest of the notebook: it is
# registered on Settings and passed explicitly when the indexes are built.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [13]:
from llama_index.core import Settings

# Register the LLM and the embedding model on llama_index's global Settings
# object so they act as the defaults for components built later.
Settings.llm = llm
Settings.embed_model = embed_model

Function to upload and import ebooks

In [None]:
import os
import tkinter as tk
from tkinter import filedialog
from ebooklib import epub
import fitz  # PyMuPDF

def upload_and_import_ebook(file_path=None, save_directory="data"):
    """Extract the text of an EPUB or PDF file and save it as a ``.txt`` file.

    Args:
        file_path: Path to an ``.epub`` or ``.pdf`` file. When omitted or
            empty, a Tkinter file dialog asks the user to pick one.
        save_directory: Directory the extracted text is written to. It is
            created when it does not exist yet.

    Returns:
        The path of the written ``.txt`` file, or ``None`` when no file was
        selected in the dialog.

    Raises:
        ValueError: If the file extension is neither ``.epub`` nor ``.pdf``
            (previously an empty text file was silently written).
    """
    if not file_path:
        # Only start Tk when a dialog is actually needed, and always destroy
        # the hidden root afterwards so no stray window/interpreter lingers.
        root = tk.Tk()
        root.withdraw()  # Hide the root window
        try:
            file_path = filedialog.askopenfilename(
                title="Select an Ebook or PDF",
                filetypes=[("Ebook and PDF Files", "*.epub *.pdf")]
            )
        finally:
            root.destroy()

    if not file_path:
        return None

    # splitext handles extensions of any length; slicing off four characters
    # turned "book.epub" into "book." and produced "book..txt".
    base_name, file_extension = os.path.splitext(os.path.basename(file_path))
    file_extension = file_extension.lower()

    if file_extension == ".epub":
        # The notebook only imports ``ebooklib.epub``; ITEM_DOCUMENT lives on
        # the top-level package, so bring it into scope here.
        import ebooklib

        book = epub.read_epub(file_path)
        content = "".join(
            item.get_body_content().decode('utf-8')
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        )
    elif file_extension == ".pdf":
        pdf_document = fitz.open(file_path)
        try:
            content = "".join(
                pdf_document.load_page(page_num).get_text()
                for page_num in range(pdf_document.page_count)
            )
        finally:
            # Release the file handle even if text extraction fails.
            pdf_document.close()
    else:
        raise ValueError(
            f"Unsupported file type {file_extension!r}; expected .epub or .pdf"
        )

    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)

    # Save the content to a file in the save directory
    save_path = os.path.join(save_directory, base_name + ".txt")
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(content)

    return save_path

# Example usage: with no argument the file dialog is shown.
ebook_path = upload_and_import_ebook()
print(
    f"Ebook content imported successfully to {ebook_path}."
    if ebook_path
    else "No ebook or PDF selected."
)

Document Loader

In [6]:
# Fall back to the bundled sample book when nothing was imported, then load
# whichever path is in effect with a single reader call.
if not ebook_path:
    ebook_path = "sample_books/The Theory That Would Not Die How Bayes Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy by Sharon Bertsch McGrayne.txt"
docs = SimpleDirectoryReader(input_files=[ebook_path]).load_data()

In [14]:
# Function to upload and import ebook content
def upload_ebook(file):
    """Copy an uploaded file into the ``data`` directory.

    Args:
        file: Either an object exposing the source path as ``.name`` or a
            plain path string.

    Returns:
        The path of the copy inside ``data``.
    """
    import shutil  # local import: not part of the notebook's import cell

    upload_dir = "data"
    os.makedirs(upload_dir, exist_ok=True)

    # Accept both file-like upload objects and bare path strings.
    source_path = getattr(file, "name", file)
    filename = os.path.basename(source_path)

    # Create the destination path
    destination_path = os.path.join(upload_dir, filename)

    # Actually place the file there; previously only the path was computed, so
    # the returned location never existed. Skip the copy when the upload
    # already lives at the destination (shutil would raise SameFileError).
    if os.path.abspath(source_path) != os.path.abspath(destination_path):
        shutil.copy(source_path, destination_path)

    return destination_path

def handle_file_upload(files):
    """Hand the uploaded file to ``upload_ebook`` and return what it returns."""
    return upload_ebook(files)

def select_file():
    """Pick the document to index.

    Returns:
        The path (including the ``data`` directory) of the alphabetically
        first regular, non-hidden file in ``data``; the bundled sample book
        path when ``data`` is missing or holds no such file.
    """
    upload_dir = "data"
    sample_book = """sample_books/The Theory That Would Not Die How Bayes Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy by Sharon Bertsch McGrayne.txt"""

    # A fresh checkout may not have the upload directory yet.
    if not os.path.isdir(upload_dir):
        return sample_book

    # Sort for a deterministic choice; skip subdirectories and dotfiles such
    # as .gitkeep, which cannot be indexed as a book.
    candidates = sorted(
        name
        for name in os.listdir(upload_dir)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(upload_dir, name))
    )
    if not candidates:
        return sample_book

    # Return a usable path: the bare file name is relative to ``data``, not to
    # the working directory the reader resolves it against.
    return os.path.join(upload_dir, candidates[0])

# Choose the document to index, show which one was picked, and load it.
# NOTE: this rebinds `docs`, replacing whatever the Document Loader cell above
# loaded.
file = select_file()
print(file)
docs = SimpleDirectoryReader(input_files=[file]).load_data()

sample_books/The Theory That Would Not Die How Bayes Rule Cracked the Enigma Code, Hunted Down Russian Submarines, and Emerged Triumphant from Two Centuries of Controversy by Sharon Bertsch McGrayne.txt


Advanced RAG (Routing)

In [15]:
from llama_index.core import VectorStoreIndex, SummaryIndex

# Build a vector index
vector_index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)

# Build a summary index
summary_index = SummaryIndex.from_documents(docs, embed_model=embed_model)

# Define query engines
# NOTE(review): these two engines are not referenced again in the visible
# cells; the tool cells below call as_query_engine() themselves. Confirm
# whether they are still needed.
vector_query_engine = vector_index.as_query_engine()
summary_query_engine = summary_index.as_query_engine()

In [8]:
from pydantic import BaseModel

class Citation(BaseModel):
    """Provenance of one retrieved passage, filled in from node metadata by GetSources."""

    # Value of the node's 'file_name' metadata entry.
    source: str
    # Value of the node's 'page_label' metadata entry.
    page_number: str
    # Value of the node's 'section_name' metadata entry.
    section_name: str

def GetSources(response):
    """Return one ``Citation`` per source node attached to ``response``.

    Metadata keys that are absent are replaced by 'Unknown ...' placeholders.
    """
    def _as_citation(metadata):
        # Translate a node's metadata mapping into a Citation record.
        return Citation(
            source=metadata.get('file_name', 'Unknown Source'),
            page_number=metadata.get('page_label', 'Unknown Page'),
            section_name=metadata.get('section_name', 'Unknown Section'),
        )

    return [_as_citation(scored.node.metadata) for scored in response.source_nodes]

In [9]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine

class CitationQueryEngineTool(QueryEngineTool):
    """QueryEngineTool variant whose ``query`` appends a citation list to the answer."""

    def query(self, query, timeout=100):
        """Run ``query`` on the wrapped engine and append its citations.

        Args:
            query: The question to answer.
            timeout: Kept for backward compatibility only; it is not
                forwarded because the wrapped engine's ``query`` takes no
                timeout argument.

        Returns:
            The answer text followed by a "Citations:" section with one line
            per source node.
        """
        # QueryEngineTool has no ``query`` method of its own, so delegating to
        # ``super().query`` fails; go through the wrapped engine instead.
        # NOTE(review): RouterQueryEngine appears to query each tool's engine
        # directly, so this method only runs when called explicitly - confirm.
        response = self.query_engine.query(query)
        sources = GetSources(response)
        citations = "\n".join(
            f"Source: {source.source}, Page: {source.page_number}, Section: {source.section_name}"
            for source in sources
        )
        return f"{response}\n\nCitations:\n{citations}"

# Create tools with citation support.
# The notebook never defines a plain `index`; use the vector and summary
# indexes built in the "Advanced RAG (Routing)" cell instead.
vector_tool = CitationQueryEngineTool(
    vector_index.as_query_engine(),
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for specific facts with citations.",
    ),
)
summary_tool = CitationQueryEngineTool(
    summary_index.as_query_engine(response_mode="tree_summarize"),
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarizing an entire document with citations.",
    ),
)

# Create the router query engine (single selection between the two tools)
query_engine = RouterQueryEngine.from_defaults(
    [vector_tool, summary_tool], select_multi=False, verbose=True, llm=llm
)

In [16]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine

# Create a custom query engine tool with citation inclusion
class CitationQueryEngineTool(QueryEngineTool):
    """QueryEngineTool variant whose ``query`` appends a citation list to the answer."""

    @staticmethod
    def GetSources(response):
        """Format the source nodes of ``response`` as citation lines.

        Returns:
            One "Source: ..., Page: ..., Section: ..." line per source node,
            joined by newlines. Missing metadata is shown as 'Unknown ...'.
        """
        # Declared static: the function takes no ``self`` yet is invoked as
        # ``self.GetSources(response)``, which otherwise raises TypeError.
        sources = []
        for node in response.source_nodes:
            metadata = node.node.metadata
            # Defaults keep the slice below from failing on a missing name.
            source = str(metadata.get('file_name', 'Unknown Source'))
            page = metadata.get('page_label', 'Unknown Page')
            section = metadata.get('section_name', 'Unknown Section')
            sources.append(f"Source: {source[:30]}..., Page: {page}, Section: {section}")
        return "\n".join(sources)

    def query(self, query, timeout=100):
        """Run ``query`` on the wrapped engine and append its citations.

        Args:
            query: The question to answer.
            timeout: Kept for backward compatibility only; it is not
                forwarded because the wrapped engine's ``query`` takes no
                timeout argument.

        Returns:
            The answer text followed by a "Citations:" section.
        """
        # QueryEngineTool has no ``query`` method of its own, so delegating to
        # ``super().query`` fails; go through the wrapped engine instead.
        # NOTE(review): RouterQueryEngine appears to query each tool's engine
        # directly, so this method only runs when called explicitly - confirm.
        response = self.query_engine.query(query)
        sources = self.GetSources(response)
        return f"{response}\n\nCitations:\n{sources}"

# Create tools with citation support
# One tool per index: the vector index for fact lookups, the summary index
# (with tree_summarize response mode) for whole-document summaries.
vector_tool = CitationQueryEngineTool(
    vector_index.as_query_engine(),
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for specific facts with citations.",
    ),
)
summary_tool = CitationQueryEngineTool(
    summary_index.as_query_engine(response_mode="tree_summarize"),
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarizing an entire document with citations.",
    ),
)

# Create the router query engine
# select_multi=False: exactly one of the two tools is chosen per query.
query_engine = RouterQueryEngine.from_defaults(
    [vector_tool, summary_tool], select_multi=False, verbose=True, llm=llm
)

In [None]:
# from llama_index.core.tools import QueryEngineTool, ToolMetadata
# from llama_index.core.query_engine import RouterQueryEngine
    
# vector_tool = QueryEngineTool(
#     index.as_query_engine(),
#     metadata=ToolMetadata(
#         name="vector_search",
#         description="Useful for searching for specific facts.",
#     ),
# )

# summary_tool = QueryEngineTool(
#     index.as_query_engine(response_mode="tree_summarize"),
#     metadata=ToolMetadata(
#         name="summary",
#         description="Useful for summarizing an entire document.",
#     ),
# )

# query_engine = RouterQueryEngine.from_defaults(
#     [vector_tool, summary_tool], select_multi=False, verbose=True, llm=llm
# )

In [17]:
# Broad, whole-book question sent through the router query engine.
response = query_engine.query(
    "Provide a comprehensive overview of how Bayesian probability evolved from"
    " a theoretical concept to a critical wartime intelligence tool"
)

[1;3;38;5;200mSelecting query engine 1: The question asks for a comprehensive overview of the evolution of Bayesian probability, which requires summarizing historical development and its practical application during wartime. This aligns more closely with providing an overall summary rather than pinpointing specific facts..
[0m

ReadTimeout: timed out

In [None]:
print(response)

Bayesian probability's journey from a theoretical concept to a critical wartime intelligence tool is marked by periods of development and neglect followed by resurgence. Initially conceived in an era of religious and scientific debate, the framework was developed independently but simultaneously by Thomas Bayes and Pierre Simon Laplace. Though Bayes' work faded into obscurity due to his reluctance to publish it during his lifetime, Laplace's rigorous application of Bayesian methods laid a strong foundation for future use.

In the early 20th century, Bayesian probability faced criticism from statisticians who favored frequentist approaches, leading to its marginalization within academic circles. However, practical problem solvers continued to find value in Bayesian methods for dealing with uncertainty and making decisions under incomplete information. This pragmatic approach came into sharp focus during World War II.

Alan Turing, a pivotal figure of that era, recognized the potential of Bayesian probability in decrypting German naval codes through the Enigma machine. By integrating Bayesian principles, Turing was able to systematically update probabilities based on intercepted code fragments, leading to significant breakthroughs in intelligence gathering. This application not only played a crucial role in Allied victory but also contributed to advancements in computer science and software development.

Thus, while Bayesian probability initially struggled for acceptance due to theoretical debates and personal beliefs, its effectiveness in real-world problem-solving scenarios ultimately led to its recognition as an indispensable tool during critical historical periods such as World War II.

In [None]:
def GetSources(response):
    """Print the source, section and page of every source node in ``response``.

    Metadata entries that are missing are printed as 'Unknown ...'
    placeholders instead of raising on the file-name slice.

    Returns:
        None; the information is written to standard output.
    """
    for node in response.source_nodes:
        # Access the TextNode object directly
        text_node = node.node

        # Provenance is read from the TextNode's metadata mapping; defaults
        # keep the [:30] slice below from failing when 'file_name' is absent.
        source = str(text_node.metadata.get('file_name', 'Unknown Source'))
        page = text_node.metadata.get('page_label', 'Unknown Page')
        section = text_node.metadata.get('section_name', 'Unknown Section')

        print(f"Source: {source[:30]}...")
        print(f"Section: {section}")
        print(f"Page: {page}")

GetSources(response)

In [None]:
# Narrow, fact-seeking question sent through the same router query engine.
response = query_engine.query(
    "Tell me about the specific details about Alan Turing's cryptographic methods at Bletchley Park"
)

In [None]:
print(response)

At Bletchley Park during World War II, a young mathematician named Alan Turing made significant contributions to code-breaking efforts against German communications. Initially assigned various tasks including testing possibilities of wheel arrangements in Enigma machines using his unique design known as the "bombe," which was an electromechanical device that significantly accelerated decoding processes, he quickly demonstrated exceptional skill and innovation.

Turing's work went beyond simple trial-and-error techniques; it involved theoretical mathematics applied to practical problems in cryptography. He leveraged his understanding of probability theory alongside machine learning principles which were discussed during walks with colleagues Donald Michie and Gordon Welchman among others, sparking ideas about artificial intelligence that would later influence computing technology's evolution.

The Bombe was a critical advancement over previous manual methods because it enabled the rapid testing for Enigma settings by simulating all potential combinations in parallel rather than one at a time—a daunting task when faced with numerous permutations due to complex encryption mechanisms employed, such as multiple rotors and plugboard configurations.

Moreover, Turing developed scoring systems based on Bayesian statistics that enhanced the Bombe's efficiency further by prioritizing certain patterns or hypotheses over others in a probabilistic manner—an approach which was integral when dealing with vast amounts of data generated during decoding efforts at Bletchley Park.

His contribution played an essential role not only to break Enigma codes but also set the foundation for advances toward electronic digital computing and programming, as evidenced by his work's influence on later developments like Colossus computers—the world’s first large-scale electronic digital machines built during wartime.

In summary, Alan Turing at Bletchley Park employed a blend of probability theory, computational innovation with the bombe machine and Bayesian methods to break complex German codes which proved instrumental in winning crucial battles by providing vital intelligence about enemy movements—ultimately shortening World War II's duration.

Chat History to RAG

In [None]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine
import requests

# Initialize the memory buffer with a token limit
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Create the chat engine with context and memory.
# The retriever comes from `vector_index` (no plain `index` exists in this
# notebook). The former second construction, which passed
# `query_engine=query_engine.query` and no retriever, was removed:
# from_defaults needs a retriever as its first argument, so that call could
# not succeed and only shadowed this engine.
chat_engine = CondensePlusContextChatEngine.from_defaults(
    vector_index.as_retriever(),
    memory=memory,
    llm=llm,
    context_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk"
        " about the ebook/document that was uploaded from the user."
        " Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous chat history, or the context above, to interact and help the user."
    ),
    verbose=True,
)


# Function to chat with the LLM
def chat_with_llm(user_input, max_retries=1):
    """Send ``user_input`` to the chat engine, retrying on request timeouts.

    Args:
        user_input: The user's message.
        max_retries: How many additional attempts to make after a
            ``requests.Timeout``. The default of 1 matches the previous
            hard-coded single retry.

    Returns:
        The chat engine's response.

    Raises:
        requests.Timeout: If the final attempt also times out.
    """
    # NOTE(review): the "ReadTimeout: timed out" shown earlier in this notebook
    # may not be a requests exception - confirm which HTTP client the Ollama
    # integration raises from, and widen the except clause if needed.
    retries_left = max_retries
    while True:
        try:
            return chat_engine.chat(user_input)
        except requests.Timeout:
            if retries_left <= 0:
                raise
            retries_left -= 1

# Example usage
# Same overview question as the router example, this time sent through the
# chat engine.
user_input = "Provide a comprehensive overview of how Bayesian probability evolved from a theoretical concept to a critical wartime intelligence tool"
response = chat_with_llm(user_input)
print(response)

In [None]:
# NOTE(review): this sends an empty message to the chat engine; it looks like
# a placeholder for a follow-up question - TODO confirm and fill in.
user_input2 = ""
response2 = chat_with_llm(user_input2)

Bayesian probability originates with Reverend Thomas Bayes (1702-1761), an English minister and mathematician, who formulated the principles that now bear his name. However, it was not until Pierre-Simon Laplace developed what is known as 'Laplacian statistics' in 1812 that these concepts were further formalized into a more rigorous mathematical framework for probability theory.

Bayesian methods involve using probabilities to represent the degree of belief or certainty about uncertain events and incorporating new evidence over time through an updating process described by Bayes’ Theorem: P(H|E) = [P(E|H)*P(H)] / P(E), where H represents a hypothesis, E stands for observed data, P(H|E) is the probability of the hypothesis given the evidence (posteriori probability), and P(E|H) is the likelihood of observing said evidence if our hypothesized situation were true.

For many years following its formalization in mathematics, Bayesian approaches to statistics remained more or less on the sidelines due partly because frequentist methods dominated statistical thought at that time – a methodology based strictly upon long-run frequencies and not subjective probabilities about uncertain events (as per Laplace's philosophy).

The turning point for Bayes’ theorem in practical application came during World War II, where it played an essential role as part of codebreaking efforts. Alan Turing famously applied a type of machine learning algorithm based on probabilistic inference to crack the German Enigma codes – essentially applying what we now understand as early forms of neural networks and Bayesian methods for pattern recognition amidst chaos in encrypted signals (incorrectly credited often, but not actually pioneered by him).

Moreover, William Feller's work on probabilistic models laid down the mathematical foundation necessary to apply these techniques more broadly. In his book "An Introduction to Probability Theory and Its Applications", he presented Bayesian methods as a way of incorporating prior knowledge into probability calculations – an idea that has proven enormously influential across various fields, from machine learning algorithms in artificial intelligence (AI) systems like Cognitive Tutors mentioned earlier on, which depend heavily upon these principles for decision-making processes.

In the context of wartime efforts during WWII and beyond into modern times where Bayesian methods are ubiquitous – they've allowed us to make sense out complex data streams (like monitoring neurons firing within a brain or controlling prosthetic limbs) by treating them as probabilistic entities rather than deterministically predicting their behavior.
  
In conclusion, the journey of Bayesian probability from its inception through academic debate and mathematical formalization to becoming an indispensable tool during World War II for intelligence work demonstrates how theoretical ideas can transform into practical applications when circumstances demand it - reshaping our understanding not only about mathematics but also broader fields like neuroscience, AI development etc.