<a href="https://colab.research.google.com/github/Andrew-TraverseMT/NYC_Addresses/blob/main/extract_address_from_taxbill.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pdfplumber



In [6]:
import requests
import pdfplumber
import io

In [7]:
def extract_all_text_from_url(url, timeout=30):
    """
    Extracts all selectable text from each page of a PDF accessible via a URL, handling redirects.

    Args:
        url (str): The URL pointing to the PDF (or a redirect to the PDF).
        timeout (float or tuple): Timeout in seconds handed to requests.get, so a
            stalled server cannot hang the call indefinitely. Defaults to 30.

    Returns:
        list: A list of strings, where each string contains the text from one page,
              prefixed with "Page N:".
              If a page has no text (or only whitespace), a "No selectable text found"
              message is included for that page.
              If an error occurs, a single-element list with an error message is
              returned; no exception propagates to the caller.
    """
    try:
        # Download the PDF; requests follows redirects for GET by default.
        # Without an explicit timeout requests would wait forever on a dead server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Reject anything the server does not label as a PDF
        content_type = response.headers.get('Content-Type', '')
        if 'application/pdf' not in content_type:
            return ["Error: The URL does not point to a PDF file"]

        # Parse the downloaded bytes in memory; nothing is written to disk
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            all_text = []
            for page_number, page in enumerate(pdf.pages, start=1):
                text = page.extract_text() or ''
                # Strip each line and drop blank ones
                cleaned_text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
                # Decide on the cleaned text so a whitespace-only page is reported
                # as having no text instead of producing an empty "Page N:" entry.
                if cleaned_text:
                    all_text.append(f"Page {page_number}:\n{cleaned_text}")
                else:
                    all_text.append(f"Page {page_number}: No selectable text found")
            return all_text

    except requests.RequestException as e:
        # Network-level failures: connection errors, timeouts, HTTP error statuses
        return [f"Error downloading PDF: {e}"]
    except Exception as e:
        # Anything raised while parsing the PDF is reported, not propagated
        return [f"Error extracting text: {e}"]

# Demo run against a sample statement URL
url = 'https://a836-edms.nyc.gov/dctm-rest/repositories/dofedmspts/StatementSearch?bbl=1000917502&stmtDate=20250215&stmtType=SOA'
extracted_content = extract_all_text_from_url(url)

# Show every returned entry, each followed by a 50-dash divider line
for page_content in extracted_content:
    print(page_content, '-' * 50, sep='\n')

Page 1:
80117992502150100140001NYNP
Property Tax Bill Quarterly Statement
Activity through February 15, 2025
Owner name: 111 FULTON ST CONDO
How much do I owe?
Property address: 111FULTON ST.
Outstanding charges $0.00
Borough Block Lot
1 00091 7502 New charges $0.00
Total amount due by April 1, 2025* $0.00
* To avoid interest, you must pay byApril 15.
Ways to pay:
Most common way to pay Other ways to pay
Online By Mail In Person
Go to www.nyc.gov/citypay Remove the detachable Visit a DOF business
or scan the QR code to the slip (below) and mail it center with a copy of
right with your phone. Use with your payment. this bill. See
your BBL (gray box, top left) Payment processed in www.nyc.gov/visitdof
to search for your property. 7-10 business days. for locations. Open
Most people pay in five Monday to Friday,
minutes or less. 8:30 a.m. to 4:30 p.m.
Wait times may vary.
No fees when you pay from your checking
account (e-check) or electronic wire transfer.
1400.01 -ZB -40 -4 -0 -2 -10736


In [9]:
def group_words_into_lines(words, tolerance=5):
    """
    Group words into lines based on their vertical position ('top' coordinate).

    Lines are returned in reading order (top of the page first) and the words
    inside each line are ordered left to right. pdfplumber measures 'top' as the
    distance from the top edge of the page, so reading order is ascending 'top'.

    A word joins the current line when its 'top' differs from the previously
    placed word's 'top' by less than the tolerance; otherwise it starts a new line.

    Args:
        words (list): List of word dictionaries from pdfplumber.extract_words();
            only the 'top' and 'x0' keys are read here.
        tolerance (int or float): Maximum difference in 'top' coordinates to group
            words on the same line

    Returns:
        list: List of lines, where each line is a list of word dictionaries.
              An empty input yields an empty list. The input list is not modified.
    """
    if not words:
        return []

    # Ascending 'top' walks the page from its top edge downwards
    ordered = sorted(words, key=lambda w: (w['top'], w['x0']))

    lines = []
    current_line = [ordered[0]]
    for word in ordered[1:]:
        if abs(word['top'] - current_line[-1]['top']) < tolerance:
            current_line.append(word)
        else:
            lines.append(current_line)
            current_line = [word]
    lines.append(current_line)

    # Words on one visual line can differ slightly in 'top', so the sort above
    # does not guarantee left-to-right order inside a line; fix that per line.
    return [sorted(line, key=lambda w: w['x0']) for line in lines]

def extract_mailing_address(url, keyword="ORSID Realty Corp", num_lines=3, timeout=30):
    """
    Extract a mailing address from a PDF's first page, starting with the specified keyword.

    Only words whose left edge lies in the left half of the first page are
    considered. Those words are grouped into lines with group_words_into_lines,
    and the first line containing the keyword (case-sensitive substring match)
    is returned together with the lines that follow it in the grouped list.
    This assumes group_words_into_lines yields lines in top-to-bottom order.

    Args:
        url (str): URL to the PDF file
        keyword (str): Text to search for to identify the start of the address
        num_lines (int): Number of lines to extract starting from the keyword line
        timeout (float or tuple): Timeout in seconds handed to requests.get, so a
            stalled server cannot hang the call indefinitely. Defaults to 30.

    Returns:
        str: The extracted address as a newline-separated string,
             "Mailing address not found" when no line contains the keyword,
             or an "Error..." message; no exception propagates to the caller.
    """
    try:
        # Download the PDF; without a timeout requests would wait forever on a dead server
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Reject anything the server does not label as a PDF
        if 'application/pdf' not in response.headers.get('Content-Type', ''):
            return "Error: URL does not point to a PDF file"

        # Parse the downloaded bytes in memory
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            # Report an empty document explicitly instead of an opaque IndexError message
            if not pdf.pages:
                return "Error: PDF contains no pages"

            first_page = pdf.pages[0]
            midpoint = first_page.width / 2

            # Keep only words that start in the left half of the page
            left_words = [w for w in first_page.extract_words() if w['x0'] < midpoint]

            # Rebuild text lines from the positioned words
            lines_text = [
                ' '.join(word['text'] for word in line)
                for line in group_words_into_lines(left_words)
            ]

            # Return the first keyword line plus the (num_lines - 1) lines after it
            for i, line in enumerate(lines_text):
                if keyword in line:
                    return '\n'.join(lines_text[i:i + num_lines])
            return "Mailing address not found"

    except requests.RequestException as e:
        # Network-level failures: connection errors, timeouts, HTTP error statuses
        return f"Error downloading PDF: {e}"
    except Exception as e:
        # Anything raised while parsing the PDF is reported, not propagated
        return f"Error extracting text: {e}"

# Demo run: search the sample statement for the default keyword and
# show whatever string comes back (an address or a status message)
url = 'https://a836-edms.nyc.gov/dctm-rest/repositories/dofedmspts/StatementSearch?bbl=1000917502&stmtDate=20250215&stmtType=SOA'
address = extract_mailing_address(url=url)
print(address)

Mailing address not found
