1. https://apps.marincounty.org/TaxBillOnline/?PropertyId=11217306&__ncforminfo=41lsqeI2Aiy2chb-b2Jg6Uwm-yo2S_GamBPZnrDednu1aIwmzWenA2p0lClfnI9QIvT66uxfM62nvwK7Vpf55juNEQpfLcGwmRpAfwZ4WbE%3D

2. https://apps.marincounty.org/TaxBillOnline/Bill?BillNumber=24-1096268

In [5]:
import os
import re
import gzip
import csv

import pandas as pd
import requests

# ——— Configuration ———
# Input parcel list; the main loop reads its 'Prop_ID' column.
CSV_PATH     = '/content/sample_data/Parcels_2922184505886182349.csv'
# Cache directory: one gzip-compressed bill page per parcel ("<Prop_ID>.html.gz").
SCRAPE_DIR   = r'C:\Users\Dewank Mahajan\Desktop\DKM Business\LowPropTax\AccessorOffice\Marin\Dir'
# Destination of the per-parcel result CSV written by the main loop.
OUTPUT_CSV   = r'C:\Users\Dewank Mahajan\Desktop\DKM Business\LowPropTax\AccessorOffice\Marin\output_tax.csv'
# NOTE(review): CSV_PATH is a POSIX-style path while SCRAPE_DIR and OUTPUT_CSV
# are Windows paths — confirm all three resolve on the machine running this.

# Create the cache directory up front so later gzip writes find it.
os.makedirs(SCRAPE_DIR, exist_ok=True)

# ——— Shared HTTP session ———
# A single Session is reused for every request so the browser-like headers
# set here accompany each call.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
session.headers['Accept-Language'] = 'en-US,en;q=0.9'
session.headers['Accept'] = 'text/html,application/xhtml+xml'
session.headers['Referer'] = 'https://apps.marincounty.org/TaxBillOnline/'

# ——— Scraping patterns (compiled once, reused for every parcel) ———
# Bill number as it appears after "BillNumber=" in the landing page.
BILL_NUMBER_REGEX = re.compile(r'BillNumber=([\d-]+)')
# Value of the hidden "__ncforminfo" input on the landing page.
FORMINFO_REGEX = re.compile(r'name="__ncforminfo"\s+value="([^"]+)"')
# Dollar figure following "Total Amount Due:" on the bill page.
AMOUNT_REGEX = re.compile(r'Total Amount Due:\s+<span class="i16">\$([\d,\.]+)')

def fetch_and_save_bill(apn, out_path, timeout=30):
    """Fetch the bill HTML for one parcel, gzip-save it, and return a status.

    Two requests go through the shared module-level ``session``: the
    property landing page (to find the bill number and the ``__ncforminfo``
    token), then the bill page itself, which is written gzip-compressed to
    ``out_path``.

    Args:
        apn: Value sent as the ``PropertyId`` query parameter.
        out_path: Destination path for the gzip-compressed bill HTML.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        One of the status strings:
        ``'Fetched'``      – bill page downloaded and saved to ``out_path``;
        ``'FetchError'``   – a request failed, timed out, or was not HTTP 200;
        ``'NoBillNumber'`` – landing page contained no ``BillNumber=`` link;
        ``'NoFormInfo'``   – landing page contained no ``__ncforminfo`` input.

    NOTE(review): the sample URL at the top of this notebook uses a numeric
    PropertyId, while the run log shows dashed APNs all ending in
    'NoBillNumber' — confirm which identifier the site expects.
    """
    from html import unescape  # local import: only this function needs it

    base_url = 'https://apps.marincounty.org/TaxBillOnline/'

    # Query values go through `params` so requests percent-encodes them;
    # a token containing '+', '/' or '=' would otherwise be mangled.
    try:
        r1 = session.get(base_url, params={'PropertyId': apn}, timeout=timeout)
    except requests.RequestException:
        # One bad connection must not abort a run over thousands of parcels.
        return 'FetchError'
    if r1.status_code != 200:
        return 'FetchError'

    m_bill = BILL_NUMBER_REGEX.search(r1.text)
    if not m_bill:
        return 'NoBillNumber'
    bill_number = m_bill.group(1)

    m_form = FORMINFO_REGEX.search(r1.text)
    if not m_form:
        return 'NoFormInfo'
    # The token is read from an HTML attribute, so decode any entities
    # before sending it back (a no-op when there are none).
    forminfo = unescape(m_form.group(1))

    try:
        r2 = session.get(
            base_url + 'Bill',
            params={'BillNumber': bill_number, '__ncforminfo': forminfo},
            timeout=timeout,
        )
    except requests.RequestException:
        return 'FetchError'
    if r2.status_code != 200:
        return 'FetchError'

    # Write to a temporary name and rename into place: callers treat the
    # existence of `out_path` as "cached", so a half-written file from an
    # interrupted run must never appear under the final name.
    tmp_path = f'{out_path}.part'
    with gzip.open(tmp_path, 'wt', encoding='utf-8') as gz:
        gz.write(r2.text)
    os.replace(tmp_path, out_path)
    return 'Fetched'

def extract_tax_from_gz(path):
    """Read a gzip-compressed bill page and return its total amount due.

    Returns the figure captured by ``AMOUNT_REGEX`` as a float (commas
    stripped), or ``None`` when the page contains no match.
    """
    with gzip.open(path, mode='rt', encoding='utf-8') as handle:
        match = AMOUNT_REGEX.search(handle.read())
    if match is None:
        return None
    amount_text = match.group(1)
    return float(amount_text.replace(',', ''))

# ——— Load CSV ———
print("Loading parcel CSV…")
df = pd.read_csv(CSV_PATH)
n_parcels = len(df)
print(f"  → {n_parcels} parcels loaded.")

# ——— Write output ———
# One output row per input parcel. The 'comment' column records how the tax
# value was obtained: 'Fetched', 'Cached', 'ParseError', or the failure
# status returned by fetch_and_save_bill.
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f_out:
    fieldnames = ['address', 'apn', 'longitude', 'latitude', 'tax', 'county', 'zone', 'comment']
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()

    # Iterate only the column that is used. iterrows() would build a Series
    # per row and may upcast values across mixed-dtype columns; enumerate
    # also keeps the progress counter positional rather than index-based.
    for position, apn in enumerate(df['Prop_ID'], start=1):
        print(f"[{position}/{n_parcels}] APN {apn}")

        # No geometry → leave blank
        lon, lat = '', ''

        # placeholder address logic (you can enhance if needed)
        address = ''

        # Reuse the saved page when present; otherwise fetch it now.
        gz_path = os.path.join(SCRAPE_DIR, f"{apn}.html.gz")
        cached = os.path.exists(gz_path)
        status = 'Cached' if cached else fetch_and_save_bill(apn, gz_path)

        tax = ''
        if status in ('Cached', 'Fetched'):
            # A page is on disk (old or freshly saved): parse the amount.
            tax_val = extract_tax_from_gz(gz_path)
            if tax_val is None:
                status = 'ParseError'
                print("   → Cached but ParseError" if cached else "   → ParseError")
            else:
                tax = f"{tax_val:.2f}"
                print(f"   → Cached, tax = {tax}" if cached else f"   → tax = {tax}")
        else:
            # Fetch failed; report the status and leave the tax blank.
            print(f"   → {status}")

        # write row
        writer.writerow({
            'address': address,
            'apn': apn,
            'longitude': lon,
            'latitude': lat,
            'tax': tax,
            'county': 'MN',
            'zone': '',
            'comment': status
        })

print("Done! CSV written to:", OUTPUT_CSV)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   → NoBillNumber
[93786/96284] APN 028-233-38
   → NoBillNumber
[93787/96284] APN 050-061-20
   → NoBillNumber
[93788/96284] APN 177-031-08
   → NoBillNumber
[93789/96284] APN 121-290-04
   → NoBillNumber
[93790/96284] APN 172-212-07
   → NoBillNumber
[93791/96284] APN 172-151-35
   → NoBillNumber
[93792/96284] APN 172-181-02
   → NoBillNumber
[93793/96284] APN 172-251-14
   → NoBillNumber
[93794/96284] APN 178-063-11
   → NoBillNumber
[93795/96284] APN 030-058-11
   → NoBillNumber
[93796/96284] APN 170-042-08
   → NoBillNumber
[93797/96284] APN 166-201-06
   → NoBillNumber
[93798/96284] APN 028-222-05
   → NoBillNumber
[93799/96284] APN 172-051-21
   → NoBillNumber
[93800/96284] APN 172-341-04
   → NoBillNumber
[93801/96284] APN 172-053-31
   → NoBillNumber
[93802/96284] APN 172-161-30
   → NoBillNumber
[93803/96284] APN 172-051-26
   → NoBillNumber
[93804/96284] APN 170-011-16
   → NoBillNumber
[93805/96284] APN 034-14