In [11]:
import pandas as pd
import os, gzip, time,requests

In [5]:
def _clean_address_field(value):
    """Return ``value`` as a stripped string, or ``""`` when it is missing."""
    # pandas represents empty CSV cells as NaN; str(NaN) would otherwise leak
    # the literal text "nan" into the address.
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ""
    # A numeric column that contains blanks is parsed as float, so a house
    # number of 123 arrives as 123.0; render it without the ".0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def construct_complete_address(row):
    """
    Build a complete address string using:
      - from_address_num
      - to_address_num (if different from the from_address_num)
      - street_name and street_type, and
      - analysis_neighborhood (if available)

    Parameters:
      row: any mapping-like object with a ``.get(key, default)`` method
           (for example a pandas Series or a dict).

    Returns:
      A string such as "100-110 MARKET ST, Financial District". Missing
      values (absent keys, None, NaN) are skipped, so no stray "nan",
      double spaces or leading ", " appear; if every part is missing the
      result is an empty string.
    """
    from_num = _clean_address_field(row.get("from_address_num", ""))
    to_num = _clean_address_field(row.get("to_address_num", ""))
    street = _clean_address_field(row.get("street_name", ""))
    st_type = _clean_address_field(row.get("street_type", ""))
    neighborhood = _clean_address_field(row.get("analysis_neighborhood", ""))

    # Only show a range when both ends exist and actually differ.
    if from_num and to_num and from_num != to_num:
        house_num = f"{from_num}-{to_num}"
    else:
        house_num = from_num

    # Join only the parts that are present so gaps do not leave extra spaces.
    street_line = " ".join(part for part in (house_num, street, st_type) if part)
    return ", ".join(part for part in (street_line, neighborhood) if part)

def process_parcel_data(input_csv, output_csv):
    """
    Read the parcel CSV, add a constructed address column, and write a
    10-column summary CSV.

    Parameters:
      input_csv:  path of the source CSV. It must contain a 'blklot' column
                  plus the other columns selected below; a missing column
                  raises KeyError.
      output_csv: path the summary CSV is written to (without the index).

    Returns:
      None. The result is the file written to ``output_csv``; a confirmation
      line is printed.
    """
    # Read the parcel identifier as text: letting pandas infer a numeric type
    # would drop leading zeros and can yield mixed int/str values.
    df = pd.read_csv(input_csv, dtype={"blklot": str})

    # Construct the complete address row by row. A plain list is used because
    # DataFrame.apply(axis=1) returns a DataFrame (not a Series) when the
    # frame has no rows, which breaks the column assignment.
    df["complete_address"] = [
        construct_complete_address(row) for _, row in df.iterrows()
    ]

    # Rename the APN field from 'blklot' and select the 10 key attributes.
    top10 = df.rename(columns={"blklot": "APN"})[[
        "APN",                 # Unique Assessor Parcel Number
        "complete_address",    # Constructed complete address
        "centroid_latitude",   # Geographic latitude of the centroid
        "centroid_longitude",  # Geographic longitude of the centroid
        "active",              # Active flag
        "supdist",             # Full name of Supervisorial District
        "supname",             # Name of current Supervisor
        "police_district",     # SFPD District
        "planning_district",   # Planning district
        "data_as_of"           # Timestamp of last update in the source
    ]]

    # Write the selected data to an output CSV file.
    top10.to_csv(output_csv, index=False)
    print(f"Processed data written to {output_csv}")

# Entry point for this cell: convert the raw parcel export into the summary CSV.
if __name__ == "__main__":
    # Use a raw string for Windows paths so the backslashes are not treated as escapes.
    # NOTE(review): the input path is hard-coded and has no drive letter — confirm it
    # resolves to the intended file on the machine running this notebook.
    input_csv_file = r"\SanFrancisco\Parcels___Active_and_Retired_20250412.csv"
    # The summary is written next to the notebook (current working directory).
    output_csv_file = "./processed_parcels.csv"
    process_parcel_data(input_csv_file, output_csv_file)


Processed data written to ./processed_parcels.csv


In [10]:
# Example search URL, used to check manually that the scraping endpoint works:
#
# https://sanfrancisco-ca.county-taxes.com/public/search?search_query=3995156&category=gsgx_property_tax

In [None]:
# File paths and endpoint constants.
# NOTE(review): both paths are hard-coded and have no drive letter — confirm they
# resolve to the intended locations on the machine running this notebook.
INPUT_CSV = r"\SanFrancisco\Parcels___Active_and_Retired_20250412.csv"
OUTPUT_DIR = r"\SanFrancisco\Tax"
BASE_URL = "https://sanfrancisco-ca.county-taxes.com/public/search"

# Create the output directory if it doesn't exist.
# This runs as soon as the cell is executed, not only when main() is called.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Create a persistent HTTP session for efficiency.
# process_row sends every request through this one session object.
session = requests.Session()

# Define headers to mimic a real browser; they are passed with each request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive"
}

def process_row(row):
    """
    For a given row (parcel record), check if its HTML file exists.
    If not, request the search page for the row's APN and write the
    response body as gzip-compressed UTF-8 text.

    Parameters:
      row: a record supporting ``row["blklot"]`` (for example a pandas Series).

    Returns:
      None. Progress and failures are reported with print(); network errors
      and non-200 responses are logged and skipped rather than raised, so a
      single bad parcel does not abort the whole crawl.

    Note: the output file keeps the original "<APN>.html" name even though
    its content is gzip-compressed, so files saved by earlier runs are still
    recognised and skipped.
    """
    apn = str(row["blklot"]).strip()
    # A missing parcel id shows up as NaN -> "nan"; there is nothing to query.
    if not apn or apn.lower() == "nan":
        print("-> Skipping row with missing APN.")
        return

    output_path = os.path.join(OUTPUT_DIR, f"{apn}.html")
    if os.path.exists(output_path):
        return  # Skip if the file already exists

    try:
        # Let requests build and escape the query string; the timeout stops a
        # stalled connection from hanging the whole run.
        resp = session.get(
            BASE_URL,
            params={"search_query": apn, "category": "gsgx_property_tax"},
            headers=HEADERS,
            timeout=30,
        )
        if resp.status_code == 200:
            # Write to a temporary name first and rename afterwards, so an
            # interrupted write never leaves a truncated file that later runs
            # would mistake for a completed download.
            tmp_path = output_path + ".tmp"
            # Explicit encoding: the platform default may be unable to encode
            # every character of the page.
            with gzip.open(tmp_path, "wt", encoding="utf-8") as f_out:
                f_out.write(resp.text)
            os.replace(tmp_path, output_path)
            print(f"APN {apn} processed successfully.")
        else:
            print(f"-> Failed for APN {apn} with status code: {resp.status_code}")
    except requests.RequestException as exc:
        print(f"-> Failed for APN {apn} with error: {exc}")
    finally:
        time.sleep(0.5)  # Rate limiting

def main():
    """
    Download the tax search page for every parcel listed in INPUT_CSV.

    Reads the CSV and calls process_row once per record, in file order.
    Returns None.
    """
    # Read the parcel identifier as text so leading zeros are preserved; the
    # value is used verbatim in both the query and the output file name.
    df = pd.read_csv(INPUT_CSV, dtype={"blklot": str})

    # process_row is called only for its side effects, so use a plain loop
    # instead of DataFrame.apply, which would build a throwaway result.
    for _, row in df.iterrows():
        process_row(row)

if __name__ == "__main__":
    main()
