In [None]:
!pip install pandas
!pip install fitz
!pip install re
!pip install pytesseract
!pip install PIL

In [18]:
import pandas as pd
import fitz
import re
import os
import io
import pytesseract
from PIL import Image

### Part 1 of Challenge: Find Highest Numeric Value

#### First, we will extract all text from our pdf

In [19]:
# Collect the raw text of the document, page by page, using fitz (PyMuPDF).
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Only what ``page.get_text()`` returns is collected. No OCR is performed
    in this function, so text that exists only as pixels inside embedded
    images is not recovered here.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One string holding each page's text, appended in page order.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)

#### Next, we will parse the text corpus for all numeric values. Then, we will find the maximum among these values

In [20]:
# Largest numeric value in a text corpus, returned exactly as it was written.
# Units and scaling words are ignored.
def extract_number_from_text(text):
    """Return the numerically largest number in *text*, in its original form.

    Numbers may carry thousands separators and a decimal part. Each token is
    compared by numeric value, but the token text as it appeared in *text*
    is what gets returned. On ties the earliest occurrence wins.

    Args:
        text: The text to scan.

    Returns:
        The original token of the largest number, or None when *text*
        contains no number.
    """
    candidates = []

    for token in re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', text):
        # Thousands separators must go before numeric conversion.
        digits = token.replace(',', '')
        parse = float if '.' in digits else int

        try:
            candidates.append((token, parse(digits)))
        except ValueError:
            # Tokens that cannot be converted are left out.
            continue

    if not candidates:
        return None

    best_token, _ = max(candidates, key=lambda pair: pair[1])
    return best_token

## Run the cell below to get the highest numeric value in pdf.

### In order to change the pdf, replace file path with the path to the pdf you'd like to test

In [21]:
def get_highest_value_in_pdf(pdf_path):
    """Return the largest number found in the PDF's text, as originally written.

    Extracts the document text with ``extract_text_from_pdf`` and passes it
    to ``extract_number_from_text``; units and scaling words are ignored.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        The original token of the largest number, or None if the text holds
        no number.
    """
    return extract_number_from_text(extract_text_from_pdf(pdf_path))

# Path to the PDF to analyze; edit this string to test a different document.
# NOTE(review): os.path.expanduser only rewrites a leading "~", so this
# relative path is passed through unchanged.
pdf_path = os.path.expanduser("./Data/AirForceFiscalData.pdf")

# As the last expression in the cell, the returned value is shown as the cell output.
get_highest_value_in_pdf(pdf_path)

'6,000,000'

#### The highest numeric value in this document, without considering scaling/units, is 6,000,000

### Part 2 of Challenge: Find Greatest Numeric Value (Bonus)

#### First, we extract all raw text, all table data, and all funds data. We are storing table and funds data separately, as this will help with scaling numbers later in the process

In [22]:
def bonus_extract_text(pdf_path):
    """Collect page text, table contents and unit-labelled blocks from a PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A 3-tuple ``(total_text, all_tables, all_funds)``:

        * total_text: text of every page, concatenated in page order.
        * all_tables: one ``table.extract()`` result per detected table.
        * all_funds: the complete block list of every page whose first block
          mentions "millions", "$m" or "thousands" (case-insensitive).
    """
    unit_markers = ("millions", "$m", "thousands")
    page_texts = []
    all_tables = []
    all_funds = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_texts.append(page.get_text())

            # Iterate the finder's table list explicitly rather than relying
            # on the truthiness of the finder object itself.
            all_tables.extend(
                table.extract() for table in page.find_tables().tables
            )

            blocks = page.get_text("blocks")
            if blocks:
                # The whole first block tuple is stringified, so the unit
                # marker is searched for anywhere in its repr.
                header = str(blocks[0]).strip().lower()
                if any(marker in header for marker in unit_markers):
                    all_funds.append(blocks)

    return "".join(page_texts), all_tables, all_funds

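#### Starting with the raw text, we will scale any number that is immediately followed by a unit word ('billion(s)', 'million(s)' or 'thousand(s)'). Numbers in tables or funds, whose unit appears only in a header rather than next to the value, are not picked up in this step; they are handled separately below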

In [23]:
def extract_and_scale_numbers(text):
    """Return every number in *text* that is followed by a unit word, scaled.

    A match is a number (optional ``$`` prefix, optional thousands
    separators, optional decimal part) followed, after optional whitespace,
    by 'billion', 'million' or 'thousand' in singular or plural form and in
    any letter case. Numbers written without separators, such as
    "1500 million", are matched as well.

    Args:
        text: The text to scan.

    Returns:
        A list of floats, one per match in order of appearance, each equal
        to the number multiplied by its unit. Empty if nothing matches.
    """
    multipliers = {
        'thousand': 1_000,
        'million': 1_000_000,
        'billion': 1_000_000_000,
    }
    # \d+ (not \d{1,3}) so that un-separated values of four or more digits
    # are not silently skipped; the optional plural "s" sits outside the
    # capture group so the captured unit is always a key of `multipliers`.
    pattern = re.compile(
        r'[$]?\b(\d+(?:,\d{3})*(?:\.\d+)?)\s*(billion|million|thousand)s?\b',
        re.IGNORECASE,
    )

    return [
        float(number.replace(',', '')) * multipliers[unit.lower()]
        for number, unit in pattern.findall(text)
    ]

#### Next, we will scale numeric data found in the tables of our pdf. We will do this by retrieving individual table objects, finding each table's scalar (billions, millions or thousands), and scaling all numeric data contained in that table by the appropriate scalar

In [33]:
def process_array_with_scalars_tables(tables):
    """Scale the numbers of each table by the unit named in that table.

    Every row is converted to a string and scanned. A row mentioning
    "billions", "millions"/"$m" or "thousands" (case-insensitive) sets the
    multiplier for that row and all following rows of the same table; the
    multiplier starts at 1 for each table. The tokens 2023, 2024 and 2025
    are treated as years and skipped.

    Args:
        tables: Iterable of tables, each an iterable of rows. Rows are
            stringified with ``str()``, and falsy rows are ignored.

    Returns:
        A flat list of floats: every non-year number found, multiplied by
        the multiplier in effect for its row.
    """
    number_pattern = re.compile(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b')
    # Strings, not ints: the tokens compared against this set come straight
    # from the regex, so an int set would never match anything.
    year_tokens = {'2023', '2024', '2025'}

    scaled_numbers = []

    for table in tables:
        # A unit stated in one table must not carry over into the next.
        current_scalar = 1

        for item in table:
            if not item:
                continue

            row_text = str(item)
            lowered = row_text.lower()

            # A unit mention applies from this row onwards.
            if 'billions' in lowered:
                current_scalar = 1_000_000_000
            elif 'millions' in lowered or '$m' in lowered:
                current_scalar = 1_000_000
            elif 'thousands' in lowered:
                current_scalar = 1_000

            for token in number_pattern.findall(row_text):
                digits = token.replace(',', '')
                if digits not in year_tokens:
                    scaled_numbers.append(float(digits) * current_scalar)

    return scaled_numbers

#### We will perform a similar process for funds. Since the structure of funds differs from that of tables, funds are handled slightly differently, but there is still significant overlap in the underlying logic

In [34]:
import re

def process_array_with_scalars_funds(funds):
    """Scale the numbers of each fund by the unit named in that fund.

    Each fund is an iterable of block tuples whose element at index 4 is the
    block text. A block mentioning "billions", "millions"/"$m" or
    "thousands" (case-insensitive) sets the multiplier for that block and
    all following blocks of the same fund; the multiplier starts at 1 for
    each fund. The tokens 2023, 2024 and 2025 are treated as years and
    skipped.

    Args:
        funds: Iterable of funds, each an iterable of block tuples. Items
            that are falsy or not tuples are ignored.

    Returns:
        A flat list of floats: every non-year number found, multiplied by
        the multiplier in effect for its block.
    """
    # NOTE(review): because of the lookahead and the trailing \b, a digit run
    # only matches when it is followed by ',' or '.'; bare integers are
    # skipped. This looks deliberate (keeping only formatted amounts), so the
    # pattern is left exactly as it was.
    number_pattern = re.compile(
        r'\b(?:\()?\d+(?=\d{3}|,|\.)(?:,\d{3})*(?:\.\d+)?(?:\))?\b'
    )
    # Strings, not ints: the tokens compared against this set come straight
    # from the regex, so an int set would never match anything.
    year_tokens = {'2023', '2024', '2025'}

    scaled_numbers = []

    for fund in funds:
        # A unit stated in one fund must not carry over into the next.
        current_scalar = 1

        for item in fund:
            if not item or not isinstance(item, tuple):
                continue

            # Elements 0-3 are coordinates; the text is the fifth element.
            text = item[4]
            lowered = text.lower()

            # A unit mention applies from this block onwards.
            if 'billions' in lowered:
                current_scalar = 1_000_000_000
            elif 'millions' in lowered or '$m' in lowered:
                current_scalar = 1_000_000
            elif 'thousands' in lowered:
                current_scalar = 1_000

            for token in number_pattern.findall(text):
                # The pattern can capture a surrounding parenthesis, which
                # float() would reject; the sign is not changed by this.
                digits = token.strip('()').replace(',', '')
                if digits not in year_tokens:
                    scaled_numbers.append(float(digits) * current_scalar)

    return scaled_numbers

## Run the cell below to get the greatest numeric value in pdf.

###  In order to change the pdf, replace file path with the path to the pdf you'd like to test

In [35]:
def get_greatest_number_in_pdf(pdf_path):
    """Return the greatest unit-scaled number in the PDF, comma-formatted.

    Combines three sources: numbers in the running text that are followed by
    a unit word, numbers in tables, and numbers in funds, each scaled by the
    respective helper. The maximum of all of them is returned.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        The maximum formatted with thousands separators (for example
        '35,110,000,000.0'), or None when no number was found, which mirrors
        the None result of ``get_highest_value_in_pdf``.
    """
    raw_text, tables, funds = bonus_extract_text(pdf_path)

    candidates = (
        extract_and_scale_numbers(raw_text)
        + process_array_with_scalars_tables(tables)
        + process_array_with_scalars_funds(funds)
    )

    # max() raises ValueError on an empty sequence; report "nothing found"
    # explicitly instead.
    if not candidates:
        return None

    return f"{max(candidates):,}"

# Path to the PDF to analyze; edit this string to test a different document.
pdf_path = "./Data/AirForceFiscalData.pdf"
# As the last expression in the cell, the returned string is shown as the cell output.
get_greatest_number_in_pdf(pdf_path)

'35,110,000,000.0'