In [1]:
!pip install llama-index==0.10.37 llama-index-readers-smart-pdf-loader pymupdf llamasherpa

Collecting pymupdf
  Using cached pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)


ERROR: Could not find a version that satisfies the requirement llamasherpa (from versions: none)
ERROR: No matching distribution found for llamasherpa


In [4]:
!pip install --upgrade --quiet llmsherpa

In [6]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

nest_asyncio.apply()

load_dotenv()

True

In [8]:
# Read the Cohere API key from the environment (which load_dotenv() above may have
# populated from a .env file); prompt interactively when it is missing or empty.
# os.environ.get is required here: os.environ[...] raises KeyError for a missing
# variable, which would prevent the getpass fallback from ever running.
CO_API_KEY = os.environ.get('CO_API_KEY') or getpass("Enter your Cohere API key: ")

In [7]:
def clean(
    text: str,
    extra_whitespace: bool = False,
    broken_paragraphs: bool = False,
    bullets: bool = False,
    ascii: bool = False,
    lowercase: bool = False,
    citations: bool = False,
    merge_split_words: bool = False,
) -> str:
    """Apply the selected cleaning steps to ``text`` and strip the result.

    Each enabled flag runs one step; steps always run in this fixed order:
    lowercase -> ascii -> citations -> extra_whitespace -> bullets ->
    merge_split_words. Leading/trailing whitespace is stripped at the end
    regardless of the flags.

    Parameters:
        text (str): The text to clean.
        extra_whitespace (bool): Run ``clean_extra_whitespace``.
        broken_paragraphs (bool): Accepted for interface compatibility; this
            function does not currently act on it.
        bullets (bool): Run ``clean_bullets``.
        ascii (bool): Run ``clean_non_ascii_chars``.
        lowercase (bool): Lowercase the text with ``str.lower``.
        citations (bool): Run ``remove_citations``.
        merge_split_words (bool): Run ``merge_hyphenated_words``.

    Returns:
        str: The cleaned, stripped text.

    Note:
        The helper functions named above are not defined in this cell; they
        must already exist in the notebook namespace when their flag is set.
    """
    result = text

    if lowercase:
        result = result.lower()
    if ascii:
        result = clean_non_ascii_chars(result)
    if citations:
        result = remove_citations(result)
    if extra_whitespace:
        result = clean_extra_whitespace(result)
    if bullets:
        result = clean_bullets(result)
    if merge_split_words:
        result = merge_hyphenated_words(result)

    return result.strip()

In [34]:
# Relative path to the source PDF; it is passed to SimpleDirectoryReader in a later cell.
PDF_PATH = "../data/almanack_of_naval_ravikant.pdf"

In [None]:
# Relative path to a plain-text input; it is passed to SimpleDirectoryReader in a later cell.
# NOTE(review): this path is under "./data" while PDF_PATH uses "../data" — confirm
# that both resolve from the notebook's working directory.
TEXT_PATH = "./data/pg10763.txt"

# NOTE(review): `url` holds the literal string "copyright" rather than a URL and is
# not referenced in the visible cells — confirm intent.
url = "copyright"

In [36]:
from llama_index.core import SimpleDirectoryReader
# Alternative loaders, currently disabled:
# from llama_index.readers.file import PDFReader
# from llama_index.readers.smart_pdf_loader import SmartPDFLoader

# Load the plain-text file given by TEXT_PATH.
text_loaded = SimpleDirectoryReader(input_files=[TEXT_PATH]).load_data()

# Load the PDF given by PDF_PATH; the result is indexed by position in the next cell
# (pdf_loaded[100].get_content()).
pdf_loaded = SimpleDirectoryReader(input_files=[PDF_PATH]).load_data()
# NOTE(review): LLMSHERPA_API_URL is not defined in the visible cells; it must be
# defined before the line below is re-enabled.
# smart_pdf_loader_docs = SmartPDFLoader(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(PDF_PATH)

# pdf_reader_docs = PDFReader().load_data(PDF_PATH)

In [37]:
# Spot-check the raw extraction by printing the content of one loaded entry.
# NOTE(review): 100 is a hard-coded sample index; it raises IndexError if fewer
# than 101 entries were loaded.
print(pdf_loaded[100].get_content())

BUILDING  JUDGMENT ·  101SHED YOUR IDENTITY TO SEE REALITY
Our egos are constructed in our formative years—our first 
two decades. They get constructed by our environment, our 
parents, society. Then, we spend the rest of our life trying to 
make our ego happy. We interpret anything new through our 
ego: “How do I change the external world to make it more how 
I would like it to be?” [8]
“Tension is who you think you should be.  
Relaxation is who you are.”
—Buddhist saying
You absolutely need habits to function. You cannot solve every 
problem in life as if it is the first time it’s thrown at you. We 
accumulate all these habits. We put them in the bundle of 
identity, ego, ourselves, and then we get attached to them. “I’m 
Naval. This is the way I am.”
It’ s really important to be able to uncondition yourself, to be 
able to take your habits apart and say, “Okay, this is a habit I 
probably picked up when I was a toddler trying to get my par-
ent’s attention. Now I’ve reinforced it a

In [38]:
def handle_chapter_headers_footers(strings, flag):
    """
    Drop empty entries, optionally trim header/footer entries, and join the rest.

    Empty (falsy) strings are always removed first. Trimming is only applied when
    more than three non-empty entries remain, so that short pages are never
    reduced to almost nothing. The surviving entries are joined with single
    spaces and the joined string is stripped.

    Parameters:
        strings (list of str): The entries (typically the lines of one page).
        flag (str): Which entries to trim when trimming applies:
            - 'remove_first': drop the first entry.
            - 'remove_last': drop the last entry.
            - 'remove_first_last': drop the first and the last entry.
            - 'remove_first_two': drop the first two entries.
            - any other value: trim nothing.

    Returns:
        str: The remaining entries joined by spaces, without leading or trailing
        whitespace.
    """
    # Each rule pairs a flag value with the slice of entries that is kept.
    trim_rules = (
        ('remove_first', slice(1, None)),
        ('remove_last', slice(None, -1)),
        ('remove_first_last', slice(1, -1)),
        ('remove_first_two', slice(2, None)),
    )

    non_empty = list(filter(None, strings))

    if len(non_empty) > 3:
        for rule_name, kept_window in trim_rules:
            if flag == rule_name:
                non_empty = non_empty[kept_window]
                break

    joined = ' '.join(non_empty)
    return joined.strip()

In [39]:
def extract_text(page, file_name, title, author, flag, opt="text"):
    """
    Extract one page's text, clean it, and return it together with metadata.

    The page text is obtained with ``page.get_text(opt, sort=True)``, split on
    newlines, passed through ``handle_chapter_headers_footers`` (which removes
    empty lines and, depending on ``flag``, header/footer lines), and finally
    run through ``clean`` with a fixed set of options.

    Parameters:
        page (fitz.Page): The page object to read from; it must provide
            ``get_text`` and ``number``.
        file_name (str): File name stored in the metadata.
        title (str): Document title stored in the metadata.
        author (str): Document author stored in the metadata.
        flag (str): Forwarded to ``handle_chapter_headers_footers`` to select
            which header/footer lines are dropped.
        opt (str, optional): Extraction mode forwarded to ``get_text``.
            Defaults to "text". Because the result is split on newlines, the
            chosen mode has to return a string.

    Returns:
        dict: A dictionary with two keys:
            - 'text': the cleaned page text.
            - 'metadata': a dictionary with 'page_number' (taken from
              ``page.number``), 'file_name', 'title' and 'author'.
    """
    raw_lines = page.get_text(opt, sort=True).split("\n")

    # Collapse the page's lines into one string, minus blanks and header/footer lines.
    page_body = handle_chapter_headers_footers(raw_lines, flag)

    # Fixed cleaning configuration used for every page.
    cleaning_options = dict(
        extra_whitespace=True,
        broken_paragraphs=True,
        bullets=True,
        ascii=True,
        lowercase=False,
        citations=True,
        merge_split_words=True,
    )
    cleaned_body = clean(page_body, **cleaning_options)

    page_metadata = {
        "page_number": page.number,
        "file_name": file_name,
        "title": title,
        "author": author,
    }
    return {"text": cleaned_body, "metadata": page_metadata}

def extract_texts_from_pdf(file_path, title, author, pages, flag):
    """
    Extract cleaned text and metadata for the pages of a PDF file.

    Parameters:
        file_path (str): Path to the PDF file.
        title (str): Document title stored in each page's metadata.
        author (str): Document author stored in each page's metadata.
        pages: Page selection forwarded unchanged to ``get_document``
            (its accepted form is defined by that helper, which is not part of
            this cell).
        flag (str): Header/footer handling flag forwarded to ``extract_text``.

    Returns:
        list of dict: One ``extract_text`` result per page yielded by the
        document, in iteration order.
    """
    document = get_document(file_path, pages)

    # Store only the base name in the metadata: extract_text's second argument is
    # `file_name`, and passing the full path there left this value unused.
    file_name = os.path.basename(file_path)

    return [extract_text(page, file_name, title, author, flag) for page in document]