In [1]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
from mtcpy.aws import pull_df_from_redshift_sql, pull_geotable_redshift

Info: Found credentials at: /Users/jcroff/Library/CloudStorage/Box-Box/dvutils-creds-jcroff.json


In [2]:
# set workign directory 
work_dir = Path("/Users/jcroff/Library/CloudStorage/Box-Box/DSA Projects/Spatial Analysis and Mapping/Wildfire Risk Analysis/Data Outputs")

# pull data from redshift

In [3]:
req_cols = [
    "geoid",
    # "zoning_type", - out of scope
    # "zoning_subtype", - out of scope
    "structno",
    "yearbuilt",
    "numstories",
    "numunits",
    # "numrooms", - out of scope
    # "structstyle", - available in premium but not ingested into redshift
    # "improvval", - out of scope
    # "landval", - out of scope
    # "parval", - out of scope
    "saleprice",
    "scity",
    "county",
    "szip",
    "lat",
    "lon",
    "fema_flood_zone",
    # "fema_nri_risk_rating", - available in premium but not ingested into redshift
    # "qoz", - avialable in premium but not ingested into redshift
    # "census_tract", - out of scope
    # "census_block", - out of scope
    # "sourceurl", - available in premium but not ingested into redshift
    # "recrdareano", - available in premium but not ingested into redshift
    "ll_gisacre",
    "ll_gissqft",
    "ll_bldg_footprint_sqft",
    "ll_uuid",
    "ll_stack_uuid",
    "ll_bldg_count",
    "lbcs_structure_desc",
    # "housing_affordability_index", - available in premium but not ingested into redshift
    # "population_density", - available in premium but not ingested into redshift
    # "population_growth_past_5_years", - available in premium but not ingested into redshift
    # "population_growth_next_5_years", - available in premium but not ingested into redshift
    # "housing_growth_past_5_years", - available in premium but not ingested into redshift
    # "housing_growth_next_5_years", - available in premium but not ingested into redshift
    # "median_household_income", - available in premium but not ingested into redshift
    # "transmission_line_distance", - available in premium but not ingested into redshift
]
sql_cols = ", ".join(req_cols)
sql = f"SELECT {sql_cols} FROM regrid_v25.parcel_source_tbl_v25" 
df = pull_df_from_redshift_sql(sql_statement=sql)

took 1.3119 minutes


In [4]:
# convert numeric columns to numeric
numeric_cols = [
    "structno",
    "yearbuilt",
    "numstories",
    "numunits",
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col])

In [5]:
df["numunits"].isna().mean()

# overall, .80 of the parcels is missing numunits

0.8002539159719686

In [6]:
# get a count of records where structno is equal to or greater than 1
df[df["structno"] > 0].shape[0]

159013

In [7]:
# get a count of records where ll_bldg_count is equal to or greater than 1
df[df["ll_bldg_count"] > 0].shape[0]

2168498

In [8]:
# get a count of records where numunits is not null but structno is null
df[(df["numunits"].notnull()) & (df["structno"].isnull())].shape[0]

311022

In [9]:
# get a count of records where numunits is not null but ll_bldg_count is null
df[(df["numunits"].notnull()) & (df["ll_bldg_count"].isnull())].shape[0]

0

In [10]:
# get a summary of non-null numunits by lbcs_structure_desc
grouped = df.groupby("lbcs_structure_desc")["numunits"].agg(
    count_non_null_units="count",  # non-null values
    total_number_structures="size",  # total number of rows in group
    missing_units=lambda x: x.isna().sum(),  # number of missing
    missing_pct_units=lambda x: x.isna().mean() * 100,  # percent missing
)
grouped

Unnamed: 0_level_0,count_non_null_units,total_number_structures,missing_units,missing_pct_units
lbcs_structure_desc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Air and space transportation facility,0,20,20,100.000000
Airport terminal,0,19,19,100.000000
Assembly and construction-type plants,0,216,216,100.000000
Attached units,10858,10876,18,0.165502
Automobile parking facilities,4,1532,1528,99.738903
...,...,...,...,...
Utility and other nonbuilding structures,0,1075,1075,100.000000
Warehouse discount store building,1,10,9,90.000000
Warehouse or storage facility,106,6694,6588,98.416492
Water transportation or marine related,0,9,9,100.000000


In [11]:
grouped.sort_values(by='missing_pct_units', ascending=True).reset_index().to_csv(
    work_dir / "missing_numunits_by_structure_type.csv",
    index=False
)

In [12]:
df.lbcs_structure_desc.unique()

array(['Residential buildings', 'Unclassified', 'Single-family buildings',
       'Water-supply-related facility',
       'Railroads, including monorails, etc.', 'Office or bank building',
       'Manufactured housing', 'Highways and roads',
       'Multifamily structures',
       'Utility and other nonbuilding structures',
       'Store or shop building', 'Department store building',
       'Hotels, motels, and tourist courts',
       'Sheds, farm buildings, or agricultural facilities',
       'Industrial buildings and structures',
       'Multifamily structures: Two Units',
       'Office or store building with residence on top',
       'Multifamily structures: Four Units', 'Townhouses',
       'Medical facility', 'Public assembly structures',
       'Warehouse or storage facility',
       'Light industrial structures and facilities',
       'Commercial buildings and other specialized structures',
       'School or university buildings',
       'Automobile repair and service structur

In [13]:
# look at single-family building missing numunits
df.query("lbcs_structure_desc == 'Single-family buildings'")

Unnamed: 0,geoid,structno,yearbuilt,numstories,numunits,saleprice,scity,county,szip,lat,lon,fema_flood_zone,ll_gisacre,ll_gissqft,ll_bldg_footprint_sqft,ll_uuid,ll_stack_uuid,ll_bldg_count,lbcs_structure_desc
131,06085,,2018.0,,,,Gilroy,Santa Clara,95020-4183,37.018880,-121.586218,X,0.08,3342.0,1918.0,08c3a857-86eb-4d7a-87c6-e59dc463bc46,,1,Single-family buildings
132,06085,,2018.0,,,,Gilroy,Santa Clara,95020-4183,37.018877,-121.586395,X,0.09,4111.0,2150.0,1e97c378-390b-4164-a257-dbb1c88a8b52,,1,Single-family buildings
133,06085,,2018.0,,,,Gilroy,Santa Clara,95020-4183,37.018917,-121.584843,X,0.09,3884.0,2499.0,7f5f3f7f-a5c6-4f15-9c00-6ea78c9b1875,,1,Single-family buildings
134,06085,,2018.0,,,,Gilroy,Santa Clara,95020-4183,37.018919,-121.585000,X,0.09,3886.0,2129.0,92cac6f9-addf-4cde-a65b-0bf26d352694,,1,Single-family buildings
135,06085,,2018.0,,,,Gilroy,Santa Clara,95020-4183,37.018921,-121.585157,X,0.09,3890.0,2245.0,db5df4de-4305-47a3-affc-671144920fae,,1,Single-family buildings
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2324330,06097,1.0,1927.0,1.0,1.0,185000.0,Santa Rosa,Sonoma,95407-7757,38.386830,-122.733277,X,0.49,21187.0,4819.0,c5222228-2a2f-42dd-b2ed-dc52225b98c6,,5,Single-family buildings
2324382,06097,1.0,1964.0,1.0,1.0,650000.0,Santa Rosa,Sonoma,95409-3512,38.469401,-122.662494,X,0.16,7153.0,2294.0,f1cf800b-a81d-45fe-817c-076b9c1bd0b3,,1,Single-family buildings
2324383,06097,1.0,1964.0,1.0,1.0,585000.0,Santa Rosa,Sonoma,95409-3561,38.469394,-122.662788,X,0.16,7168.0,2746.0,bb826a59-c0ef-4dc7-ad05-6178dbf5f01d,,1,Single-family buildings
2324384,06097,1.0,1963.0,1.0,1.0,600000.0,Santa Rosa,Sonoma,95409-3561,38.469345,-122.663000,X,0.17,7317.0,2913.0,9fdfaa91-f612-4bae-b795-3cfa54887ee1,,1,Single-family buildings
