<a href="https://colab.research.google.com/github/Andrew-TraverseMT/NYC_Addresses/blob/main/extract_address_from_taxbill.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook takes records from MapPluto and looks up parcel owner mailing addresses from property tax assessments using the BBL.

In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import requests
import pdfplumber
import io
import re
import pandas as pd
from time

In [5]:

def extract_all_text_from_url(url):
    """
    Extracts all selectable text from each page of a PDF accessible via a URL, handling redirects.

    Args:
        url (str): The URL pointing to the PDF (or a redirect to the PDF).

    Returns:
        list: A list of strings, where each string contains the text from one page.
              If no text is found on a page, a message is included for that page.
              If an error occurs, a list with an error message is returned.
    """
    try:
        # Download the PDF from the URL, following redirects
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes

        # Check if the response content is a PDF
        content_type = response.headers.get('Content-Type', '')
        if 'application/pdf' not in content_type:
            return ["Error: The URL does not point to a PDF file"]

        # Open the PDF from the response content using pdfplumber
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            all_text = []
            for page_number, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text:
                    # Clean the text by removing extra whitespace and empty lines
                    cleaned_text = '\n'.join(line.strip() for line in text.split('\n') if line.strip())
                    all_text.append(f"Page {page_number}:\n{cleaned_text}")
                else:
                    all_text.append(f"Page {page_number}: No selectable text found")
            return all_text

    except requests.RequestException as e:
        return [f"Error downloading PDF: {e}"]
    except Exception as e:
        return [f"Error extracting text: {e}"]

# Example usage with your URL
url = 'https://a836-edms.nyc.gov/dctm-rest/repositories/dofedmspts/StatementSearch?bbl=1000917502&stmtDate=20250215&stmtType=SOA'
extracted_content = extract_all_text_from_url(url)

# Print the extracted content for each page
for page_content in extracted_content:
    print(page_content)
    print('-' * 50)  # Separator between pages

Page 1:
80117992502150100140001NYNP
Property Tax Bill Quarterly Statement
Activity through February 15, 2025
Owner name: 111 FULTON ST CONDO
How much do I owe?
Property address: 111FULTON ST.
Outstanding charges $0.00
Borough Block Lot
1 00091 7502 New charges $0.00
Total amount due by April 1, 2025* $0.00
* To avoid interest, you must pay byApril 15.
Ways to pay:
Most common way to pay Other ways to pay
Online By Mail In Person
Go to www.nyc.gov/citypay Remove the detachable Visit a DOF business
or scan the QR code to the slip (below) and mail it center with a copy of
right with your phone. Use with your payment. this bill. See
your BBL (gray box, top left) Payment processed in www.nyc.gov/visitdof
to search for your property. 7-10 business days. for locations. Open
Most people pay in five Monday to Friday,
minutes or less. 8:30 a.m. to 4:30 p.m.
Wait times may vary.
No fees when you pay from your checking
account (e-check) or electronic wire transfer.
1400.01 -ZB -40 -4 -0 -2 -10736


In [11]:
# Read the data into a DataFrame
df = pd.read_csv("/content/MapPluto_Subset_02202025.csv")

# Display results
print(df[['BoroCode', 'Block', 'Lot', 'BBL']])

bbl_list = df['BBL'].to_list()

     BoroCode  Block   Lot         BBL
0           1     67    23  1000670023
1           1     77    24  1000770024
2           1     78     4  1000780004
3           1     91  7502  1000917502
4           1    173  7502  1001737502
..        ...    ...   ...         ...
292         5   7071    25  5070710025
293         5   7465     1  5074650001
294         5   7614    20  5076140020
295         5   7626    10  5076260010
296         5   7632     6  5076320006

[297 rows x 4 columns]


In [12]:
def download_and_extract_text(url, max_retries=3, initial_delay=2):
    """
    Download a PDF from a URL and extract text lines from the first page, with retries on connection errors.

    Args:
        url (str): The URL of the PDF to download.
        max_retries (int): Maximum number of retry attempts (default: 3).
        initial_delay (int): Initial delay in seconds before retrying (default: 2).

    Returns:
        list: Extracted text lines if successful, or an error message string if failed.
    """
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            # Attempt to download the PDF
            response = requests.get(url)
            response.raise_for_status()  # Raises an exception for HTTP errors (e.g., 404, 500)

            # Verify the content is a PDF
            if 'application/pdf' not in response.headers.get('Content-Type', ''):
                raise ValueError("URL does not point to a PDF file")

            # Extract text from the first page of the PDF
            pdf_file = io.BytesIO(response.content)
            with pdfplumber.open(pdf_file) as pdf:
                first_page = pdf.pages[0]
                text = first_page.extract_text()
                if not text:
                    raise ValueError("No text extracted from the PDF")
                return text.split('\n')

        except requests.ConnectionError as e:
            # Handle connection errors (e.g., IncompleteRead)
            if attempt < max_retries - 1:  # If not the last attempt
                print(f"Connection error: {e}. Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2  # Exponential backoff: double the delay each retry
            else:
                # All retries exhausted
                return f"Error downloading PDF after {max_retries} attempts: {e}"

        except requests.RequestException as e:
            # Handle other request errors (e.g., timeouts, HTTP errors) without retrying
            return f"Error downloading PDF: {e}"

        except Exception as e:
            # Handle errors during PDF processing (e.g., invalid PDF)
            return f"Error extracting text: {e}"

def extract_address(lines):
    """Extract the mailing address starting after the line with two '#' symbols."""
    hash_line_index = -1
    for i, line in enumerate(lines):
        if line.count('#') >= 2 and re.match(r'^#.*#$', line.strip()):
            hash_line_index = i
            break

    if hash_line_index == -1 or hash_line_index + 1 >= len(lines):
        return "Address not found: No line with two '#' symbols or insufficient lines follow"

    address_lines = []
    start_index = hash_line_index + 1

    if start_index < len(lines):
        line = lines[start_index].replace("Make checks payable & mail payment to:", "").strip()
        address_lines.append(line)

    if start_index + 1 < len(lines):
        line = lines[start_index + 1].replace("NYC Department of Finance", "").strip()
        address_lines.append(line)

    if start_index + 2 < len(lines):
        line = lines[start_index + 2].strip()
        address_lines.append(line)

    if start_index + 4 < len(lines):
        line = lines[start_index + 4].replace("Binghamton NY 13902-5536", "").strip()
        address_lines.append(line)

    return '\n'.join(address_lines)

# Dictionary to store results
results = {}

# Base URL template
url_template = 'https://a836-edms.nyc.gov/dctm-rest/repositories/dofedmspts/StatementSearch?bbl={}&stmtDate=20250215&stmtType=SOA'

In [13]:
# Loop through each BBL and process it
for bbl in bbl_list:
    try:
        url = url_template.format(bbl)
        text_lines = download_and_extract_text(url)
        if isinstance(text_lines, list):
            address = extract_address(text_lines)
            results[bbl] = address
        else:
            results[bbl] = text_lines  # Store error message from download_and_extract_text
    except Exception as e:
        results[bbl] = f"Error: {str(e)}"

# Print the results
for bbl, address in results.items():
    print(f"BBL: {bbl}\nAddress:\n{address}\n{'-'*40}")

BBL: 1000670023
Address:
MAIDEN & NASSAU LLC
C/O FEDERAL RESERVE BANK OF NE
33LIBERTY ST.
NEW YORKNY 10045-1003
----------------------------------------
BBL: 1000770024
Address:
33 BRE INC.
THURCON PROPERTIES, LTD, MANA
49 W.32ND ST.FL. 2
NEW YORKNY 10001-3811
----------------------------------------
BBL: 1000780004
Address:
ARC NYC123WILLIAM, LLC
123WILLIAM ST.
NEW YORKNY 10038-3804

----------------------------------------
BBL: 1000917502
Address:
111 FULTON ST. CONDO
ORSID REALTY CORP.
156 W.56TH ST.FL. 6
NEW YORKNY 10019-3911
----------------------------------------
BBL: 1001737502
Address:
THE WORTH BUILDING CONDO
C/O JORDAN COOPER & ASSOCIATES
1085071ST AVE.APT. BB
FOREST HILLSNY 11375-4523
----------------------------------------
BBL: 1001767501
Address:
Error downloading PDF: ('Connection broken: IncompleteRead(0 bytes read, 258023 more expected)', IncompleteRead(0 bytes read, 258023 more expected))
----------------------------------------
BBL: 1001877501
Address:
OWNER/AGENT
3

In [14]:
# join results with data and output a new csv

results_df = pd.DataFrame(list(results.items()), columns=['BBL', 'Address'])

df['BBL'] = df['BBL'].astype(str)
results_df['BBL'] = results_df['BBL'].astype(str)

merged_df = pd.merge(df, results_df, on='BBL', how='left')

merged_df.to_csv("MapPluto_Subset_with_Mailing_Addr.csv")