1. https://apps.marincounty.org/TaxBillOnline/?PropertyId=11217306&__ncforminfo=41lsqeI2Aiy2chb-b2Jg6Uwm-yo2S_GamBPZnrDednu1aIwmzWenA2p0lClfnI9QIvT66uxfM62nvwK7Vpf55juNEQpfLcGwmRpAfwZ4WbE%3D

2. https://apps.marincounty.org/TaxBillOnline/Bill?BillNumber=24-1096268

In [10]:
import os
import re
import gzip
import csv

import geopandas as gpd
import requests
from pyproj import Transformer

# ——— Configuration ———
GEOJSON_PATH = r'C:\Users\Dewank Mahajan\Desktop\DKM Business\LowPropTax\AccessorOffice\Marin\Parcels.geojson'
SCRAPE_DIR   = r'C:\Users\Dewank Mahajan\Desktop\DKM Business\LowPropTax\AccessorOffice\Marin\Dir'
OUTPUT_CSV   = r'C:\Users\Dewank Mahajan\Desktop\DKM Business\LowPropTax\AccessorOffice\Marin\output_tax.csv'

os.makedirs(SCRAPE_DIR, exist_ok=True)

# ——— Regexes & Transformer ———
BILL_NUMBER_REGEX    = re.compile(r'BillNumber=([\d-]+)')
FORMINFO_REGEX       = re.compile(r'name="__ncforminfo"\s+value="([^"]+)"')
AMOUNT_REGEX         = re.compile(r'Total Amount Due:\s+<span class="i16">\$([\d,\.]+)')
transformer          = Transformer.from_crs(2227, 4326, always_xy=True)

# ——— Session with browser‑like headers ———
session = requests.Session()
session.headers.update({
    'User-Agent':      'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept':          'text/html,application/xhtml+xml',
    'Referer':         'https://apps.marincounty.org/TaxBillOnline/'
})

def fetch_and_save_bill(apn, out_path):
    """Fetch the bill HTML, gzip-save it, and return a status code."""
    # 1) main page
    r1 = session.get(f'https://apps.marincounty.org/TaxBillOnline/?PropertyId={apn}')
    if r1.status_code != 200:
        return 'FetchError'
    m_bill = BILL_NUMBER_REGEX.search(r1.text)
    if not m_bill:
        return 'NoBillNumber'
    bill_number = m_bill.group(1)

    m_form = FORMINFO_REGEX.search(r1.text)
    if not m_form:
        return 'NoFormInfo'
    forminfo = m_form.group(1)

    # 2) actual bill page
    r2 = session.get(
        f'https://apps.marincounty.org/TaxBillOnline/Bill'
        f'?BillNumber={bill_number}&__ncforminfo={forminfo}'
    )
    if r2.status_code != 200:
        return 'FetchError'

    with gzip.open(out_path, 'wt', encoding='utf-8') as gz:
        gz.write(r2.text)
    return 'Fetched'

def extract_tax_from_gz(path):
    """Open gzipped HTML and pull out the tax amount."""
    with gzip.open(path, 'rt', encoding='utf-8') as gz:
        html = gz.read()
    m = AMOUNT_REGEX.search(html)
    if not m:
        return None
    return float(m.group(1).replace(',', ''))

# ——— Load GeoJSON ———
print("Loading GeoJSON…")
gdf = gpd.read_file(GEOJSON_PATH)
print(f"  → {len(gdf)} parcels loaded.")

# ——— Prepare CSV ———
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f_out:
    fieldnames = ['address', 'apn', 'longitude', 'latitude', 'tax', 'county', 'zone', 'comment']
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()

    # ——— Iterate parcels ———
    for idx, row in gdf.iterrows():
        apn = row['Prop_ID']
        print(f"[{idx+1}/{len(gdf)}] APN {apn}")

        # centroid + transform
        centroid = row.geometry.centroid
        lon, lat = transformer.transform(centroid.x, centroid.y)

        # address + commercial flag
        address = row.get('AssessorSi') or ''
        if 'UseCdDesc' in row and row['UseCdDesc'] and 'Commercial' in row['UseCdDesc']:
            address += ' (Commercial)'

        # decide whether to fetch or reuse
        gz_path = os.path.join(SCRAPE_DIR, f"{apn}.html.gz")
        if not os.path.exists(gz_path):
            status = fetch_and_save_bill(apn, gz_path)
            if status != 'Fetched':
                tax = ''
                print(f"   → {status}")
            else:
                tax_val = extract_tax_from_gz(gz_path)
                if tax_val is None:
                    tax = ''
                    status = 'ParseError'
                    print("   → ParseError")
                else:
                    tax = f"{tax_val:.2f}"
                    print(f"   → tax = {tax}")
        else:
            status = 'Cached'
            tax_val = extract_tax_from_gz(gz_path)
            if tax_val is None:
                tax = ''
                status = 'ParseError'
                print("   → Cached but ParseError")
            else:
                tax = f"{tax_val:.2f}"
                print(f"   → Cached, tax = {tax}")

        # write row with comment
        writer.writerow({
            'address': address,
            'apn': apn,
            'longitude': lon,
            'latitude': lat,
            'tax': tax,
            'county': 'MN',
            'zone': '',
            'comment': status
        })

print("Done! CSV written to:", OUTPUT_CSV)


Loading GeoJSON…
  → 96285 parcels loaded.
[1/96285] APN 024-261-01
   → NoBillNumber
[2/96285] APN 109-270-12
   → NoBillNumber
[3/96285] APN 132-163-18
   → NoBillNumber
[4/96285] APN 160-833-14
   → NoBillNumber
[5/96285] APN 058-390-16
   → NoBillNumber
[6/96285] APN 165-220-16
   → NoBillNumber
[7/96285] APN 160-832-05
   → NoBillNumber
[8/96285] APN 179-270-15
   → NoBillNumber
[9/96285] APN 175-451-26
   → NoBillNumber
[10/96285] APN 058-171-85
   → NoBillNumber
[11/96285] APN 114-171-05
   → NoBillNumber
[12/96285] APN 022-631-01
   → NoBillNumber
[13/96285] APN 191-071-14
   → NoBillNumber
[14/96285] APN 028-154-11
   → NoBillNumber
[15/96285] APN 025-012-15
   → NoBillNumber
[16/96285] APN 039-101-31
   → NoBillNumber
[17/96285] APN 006-119-11
   → NoBillNumber
[18/96285] APN 005-211-49
   → NoBillNumber
[19/96285] APN 170-130-26
   → NoBillNumber
[20/96285] APN 028-222-84
   → NoBillNumber
[21/96285] APN 165-220-05
   → NoBillNumber
[22/96285] APN 168-222-06
   → NoBillNumbe

KeyboardInterrupt: 