In [1]:
import pandas as pd
import numpy as np
import os
import simpledbf
from datetime import datetime
# Let pandas show up to 100 rows before truncating displayed frames/series.
pd.set_option('display.max_rows', 100)

In [2]:
# ---------------------------------------------------------------------------
# Machine-specific locations.
# NOTE: every directory variable below is only defined when the USERNAME
# environment variable equals 'ywang'; for any other user they stay undefined.
# ---------------------------------------------------------------------------
if os.getenv('USERNAME') == 'ywang':
    BOX_dir = 'C:\\Users\\ywang\\Box\\Modeling and Surveys\\Urban Modeling\\Bay Area UrbanSim 1.5\\PBA50'
    GitHub_petrale_dir = 'C:\\Users\\ywang\\Documents\\GitHub\\petrale\\'
    GitHub_urbansim_dir = 'C:\\Users\\ywang\\Documents\\GitHub\\bayarea_urbansim\\data'

    # folders the inputs are read from
    dev_capacity_box_dir = os.path.join(BOX_dir, 'Policies\\Base zoning\\outputs\\capacity')
    pba50_zoningmod_dir = os.path.join(BOX_dir, 'Policies\\Zoning Modifications')
    other_inputs_dir = os.path.join(BOX_dir, 'Policies\\Base zoning\\inputs')

    # folder the result is written to (same folder as the capacity inputs)
    data_output_dir = os.path.join(BOX_dir, 'Policies\\Base zoning\\outputs\\capacity')


# Building type codes: the full list, then the residential and
# non-residential subsets ("MR" appears in both subsets).
ALLOWED_BUILDING_TYPE_CODES = [
    "HS", "HT", "HM", "OF", "HO", "SC", "IL", "IW", "IH", "RS", "RB", "MR", "MT", "ME",
]
RES_BUILDING_TYPE_CODES = ["HS", "HT", "HM", "MR"]
NONRES_BUILDING_TYPE_CODES = [
    "OF", "HO", "SC", "IL", "IW", "IH", "RS", "RB", "MR", "MT", "ME",
]

# zoning data sources; used as column-name suffixes (e.g. 'units_' + source)
data_sources = ['pba40', 'basis']

# used in calculate_capacity()
SQUARE_FEET_PER_ACRE = 43560.0
SQUARE_FEET_PER_DU = 1200.0
FEET_PER_STORY = 11.0
PARCEL_USE_EFFICIENCY = 0.5
SQUARE_FEET_PER_EMPLOYEE = 350.0
SQUARE_FEET_PER_EMPLOYEE_OFFICE = 175.0
SQUARE_FEET_PER_EMPLOYEE_INDUSTRIAL = 500.0

# run date as YYYY_MM_DD; used as the prefix of the output file name
today = datetime.now().strftime('%Y_%m_%d')

In [3]:
# Load the building table (b10.csv); parcel_id is read as a 64-bit integer.
basemap_b10_file = os.path.join(other_inputs_dir, "b10.csv")
basemap_b10 = pd.read_csv(basemap_b10_file, dtype={'parcel_id': np.int64})

# Quick look at what was loaded: row count, first rows, column dtypes.
print("Read {:,} rows from {}".format(len(basemap_b10), basemap_b10_file))
display(basemap_b10.head())
display(basemap_b10.dtypes)

# Compare the row count with the number of distinct building and parcel ids
# (a missing value, if any, counts as one distinct value).
print("Out of {:,} buildings, there are {:,} unique values of 'building_id' and {:,} unique values of 'parcel_id'".format(
    len(basemap_b10),
    basemap_b10.building_id.nunique(dropna=False),
    basemap_b10.parcel_id.nunique(dropna=False)))

Read 1,843,351 rows from C:\Users\ywang\Box\Modeling and Surveys\Urban Modeling\Bay Area UrbanSim 1.5\PBA50\Policies\Base zoning\inputs\b10.csv


Unnamed: 0,OBJECTID,building_id,parcel_id,development_type_id,improvement_value,residential_units,residential_sqft,sqft_per_unit,non_residential_sqft,building_sqft,nonres_rent_per_sqft,res_price_per_sqft,stories,year_built,redfin_sale_price,redfin_sale_year,redfin_home_type,costar_property_type,costar_rent,id
0,1,1,742974,1,0.0,1,2029,2029.42425,0,2029.42425,0.0,302.769751,1,1945,,,,,,1
1,2,2,744961,1,0.0,1,2029,2029.42425,0,2029.42425,0.0,254.429279,1,1965,,,,,,2
2,3,3,1442641,1,53262.87,1,1568,1568.0,0,1568.0,0.0,183.474166,1,1964,,,,,,3
3,4,4,190969,2,245000.0,0,0,1266.0,1595,1266.0,0.0,0.0,2,1992,340000.0,2003.0,Condo/Coop,,,4
4,5,5,308709,2,283500.0,0,0,1513.0,1513,1513.0,0.0,0.0,1,1978,442000.0,2004.0,Condo/Coop,,,5


OBJECTID                  int64
building_id               int64
parcel_id                 int64
development_type_id       int64
improvement_value       float64
residential_units         int64
residential_sqft          int64
sqft_per_unit           float64
non_residential_sqft      int64
building_sqft           float64
nonres_rent_per_sqft    float64
res_price_per_sqft      float64
stories                   int64
year_built                int64
redfin_sale_price       float64
redfin_sale_year        float64
redfin_home_type         object
costar_property_type     object
costar_rent              object
id                        int64
dtype: object

Out of 1,843,351 buildings, there are 1,843,351 unique values of 'building_id' and 1,843,292 unique values of 'parcel_id'


In [4]:
# Flag a building as "vacant" from its development_type_id: ids 0 and 15 are
# treated as vacant (1.0), every other id as not vacant (0.0).
# Lookup of the development type ids:
# https://github.com/BayAreaMetro/petrale/blob/master/incoming/dv_buildings_det_type_lu.csv
basemap_b10["building_vacant"] = basemap_b10.development_type_id.isin([0, 15]).astype(float)

In [5]:
###### Bring in parcel data (p10) and join it to the building data (b10)
###### to derive parcel-level characteristics

basemap_p10_file = os.path.join(other_inputs_dir, 'p10.csv')
basemap_p10 = pd.read_csv(
    basemap_p10_file,
    usecols=['PARCEL_ID', 'geom_id_s', 'ACRES', 'LAND_VALUE'],
    dtype={
        'PARCEL_ID': np.float64,
        'geom_id_s': str,
        'ACRES': np.float64,
        'LAND_VALUE': np.float64,
    })

# Outer-join buildings to parcels (b10.parcel_id == p10.PARCEL_ID) so that
# rows without a match on either side are kept.
basemap_b10_p10 = basemap_b10.merge(
    basemap_p10[['PARCEL_ID', 'LAND_VALUE']],
    how='outer',
    left_on='parcel_id',
    right_on='PARCEL_ID')
print("basemap_b10_p10 has {:,} rows; head():".format(len(basemap_b10_p10)))
display(basemap_b10_p10.head())
display(basemap_b10_p10.dtypes)

# How the (possibly several) buildings on one parcel collapse into one row.
# 'costar_rent' is left out because it is a string column.
parcel_level_aggregations = {
    'LAND_VALUE'          : 'max',
    'improvement_value'   : 'sum',
    'residential_units'   : 'sum',
    'residential_sqft'    : 'sum',
    'non_residential_sqft': 'sum',
    'building_sqft'       : 'sum',
    'redfin_sale_price'   : 'sum',
    'year_built'          : 'min',
    'building_id'         : 'min',
    # product of 0/1 flags: all buildings must be vacant to call the parcel vacant
    'building_vacant'     : 'prod',
}

# NOTE(review): building rows whose PARCEL_ID is NaN after the outer join are
# not part of any group here (groupby skips NaN keys by default) - confirm
# that excluding them is intended.
basemap_b10_p10_groupby_parcel = basemap_b10_p10.groupby(['PARCEL_ID']).agg(parcel_level_aggregations)



basemap_b10_p10 has 2,369,891 rows; head():


Unnamed: 0,OBJECTID,building_id,parcel_id,development_type_id,improvement_value,residential_units,residential_sqft,sqft_per_unit,non_residential_sqft,building_sqft,...,year_built,redfin_sale_price,redfin_sale_year,redfin_home_type,costar_property_type,costar_rent,id,building_vacant,PARCEL_ID,LAND_VALUE
0,1.0,1.0,742974.0,1.0,0.0,1.0,2029.0,2029.42425,0.0,2029.42425,...,1945.0,,,,,,1.0,0.0,742974.0,5706.0
1,2.0,2.0,744961.0,1.0,0.0,1.0,2029.0,2029.42425,0.0,2029.42425,...,1965.0,,,,,,2.0,0.0,744961.0,429.0
2,3.0,3.0,1442641.0,1.0,53262.87,1.0,1568.0,1568.0,0.0,1568.0,...,1964.0,,,,,,3.0,0.0,,
3,4.0,4.0,190969.0,2.0,245000.0,0.0,0.0,1266.0,1595.0,1266.0,...,1992.0,340000.0,2003.0,Condo/Coop,,,4.0,0.0,190969.0,105000.0
4,5.0,5.0,308709.0,2.0,283500.0,0.0,0.0,1513.0,1513.0,1513.0,...,1978.0,442000.0,2004.0,Condo/Coop,,,5.0,0.0,,


OBJECTID                float64
building_id             float64
parcel_id               float64
development_type_id     float64
improvement_value       float64
residential_units       float64
residential_sqft        float64
sqft_per_unit           float64
non_residential_sqft    float64
building_sqft           float64
nonres_rent_per_sqft    float64
res_price_per_sqft      float64
stories                 float64
year_built              float64
redfin_sale_price       float64
redfin_sale_year        float64
redfin_home_type         object
costar_property_type     object
costar_rent              object
id                      float64
building_vacant         float64
PARCEL_ID               float64
LAND_VALUE              float64
dtype: object

In [6]:
## Attach the per-parcel building aggregates to the parcel-level capacity table

capacity_file = os.path.join(dev_capacity_box_dir,'2020_04_29_devCapacity_allAttrs_BASIS_devType_intensity_partial.csv')

# only these capacity columns are needed downstream
capacity_columns = [
    'PARCEL_ID', 'ACRES', 'county_id', 'county_name', 'juris_zmod', 'nodev_zmod',
    'units_pba40', 'units_basis',
    'sqft_pba40', 'sqft_basis',
    'Ksqft_pba40', 'Ksqft_basis',
    'emp_pba40', 'emp_basis',
]
capacity = pd.read_csv(capacity_file, usecols=capacity_columns)

# left join: every capacity row is kept; parcels without aggregated building
# data get NaN in the building-derived columns
capacity_b10 = capacity.merge(basemap_b10_p10_groupby_parcel, on="PARCEL_ID", how="left")

print("capacity_b10 has {:,} rows; head():".format(len(capacity_b10)))
display(capacity_b10.head())
display(capacity_b10.dtypes)



capacity_b10 has 1,956,208 rows; head():


Unnamed: 0,PARCEL_ID,ACRES,nodev_zmod,units_pba40,sqft_pba40,Ksqft_pba40,emp_pba40,units_basis,sqft_basis,Ksqft_basis,...,LAND_VALUE,improvement_value,residential_units,residential_sqft,non_residential_sqft,building_sqft,redfin_sale_price,year_built,building_id,building_vacant
0,229116,3.36052,0,6.721041,0.0,0.0,0.0,6.721041,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0
1,244166,1.294423,0,3.883268,0.0,0.0,0.0,3.883268,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0
2,202378,14.993605,0,130.444362,0.0,0.0,0.0,0.0,0.0,0.0,...,6036500.0,0.0,20.0,101000.0,0.0,101000.0,1007250.0,2009.0,15681.0,0.0
3,2004420,316.247146,0,1.318751,0.0,0.0,0.0,1.318751,0.0,0.0,...,179954.0,146211.0,0.0,0.0,0.0,0.0,0.0,1965.0,17798.0,0.0
4,340332,0.621275,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0


PARCEL_ID                 int64
ACRES                   float64
nodev_zmod                int64
units_pba40             float64
sqft_pba40              float64
Ksqft_pba40             float64
emp_pba40               float64
units_basis             float64
sqft_basis              float64
Ksqft_basis             float64
emp_basis               float64
county_id                 int64
county_name              object
juris_zmod               object
LAND_VALUE              float64
improvement_value       float64
residential_units       float64
residential_sqft        float64
non_residential_sqft    float64
building_sqft           float64
redfin_sale_price       float64
year_built              float64
building_id             float64
building_vacant         float64
dtype: object

In [7]:
## Identify vacant parcels
## A parcel is vacant when ANY of the three conditions below holds.

# 1) no building id is attached to the parcel
has_no_building = capacity_b10['building_id'].isnull()

# 2) the aggregated building_vacant flag is 1.0
buildings_flagged_vacant = capacity_b10['building_vacant'] == 1.0

# 3) every value/size measure of the parcel's buildings is zero
size_and_value_columns = ['improvement_value', 'residential_units', 'residential_sqft',
                          'non_residential_sqft', 'building_sqft']
nothing_built = (capacity_b10[size_and_value_columns] == 0).all(axis=1)

capacity_b10["is_vacant"] = has_no_building | buildings_flagged_vacant | nothing_built
print("capacity_b10.is_vacant:")
display(capacity_b10["is_vacant"].value_counts())

capacity_b10.is_vacant:


False    1570070
True      386138
Name: is_vacant, dtype: int64

In [8]:
## Identify under-built parcels, once per zoning data source
## NOTE(review): the net new units (new_units) are only used to build the flag;
## they are not stored as a column - confirm whether they should be.

for data_source in data_sources:
    zoned_units = capacity_b10['units_' + data_source]
    existing_units = capacity_b10['residential_units']
    # existing non-residential floor area expressed as dwelling-unit equivalents
    nonres_as_units = capacity_b10['non_residential_sqft'] / SQUARE_FEET_PER_DU

    # units that could still be added, never below zero
    new_units = (zoned_units - existing_units - nonres_as_units).clip(lower=0)

    # additional units relative to existing units; a division by zero that
    # yields +inf is counted as a ratio of 1
    ratio = (new_units / existing_units).replace(np.inf, 1)

    # under-built: the parcel could add more than half of its existing units
    under_built_column = 'is_under_built_' + data_source
    capacity_b10[under_built_column] = ratio > 0.5
    print('under_built parcels counts - ', data_source,':\n', (capacity_b10[under_built_column].value_counts()))

under_built parcels counts -  pba40 :
 False    1436251
True      519957
Name: is_under_built_pba40, dtype: int64
under_built parcels counts -  basis :
 False    1514621
True      441587
Name: is_under_built_basis, dtype: int64


In [9]:
## Calculate the ratio of existing capacity to zoned capacity, per zoning data source
##
## Each data source gets its own pair of columns, suffixed with the source name
## (same convention as 'is_under_built_<source>'). Previously both sources were
## written to the same two unsuffixed columns, so the first source's ratios
## were silently overwritten by the last one.

for data_source in data_sources:
    # ratio of existing res units to zoned res units
    # (+inf from a zero zoned value is counted as 1; negatives are floored at 0)
    res_ratio = (capacity_b10['residential_units'] /
                 capacity_b10['units_' + data_source]).replace(np.inf, 1).clip(lower=0)
    # ratio of existing non-res sqft to zoned non-res sqft (same treatment)
    nonres_ratio = (capacity_b10['non_residential_sqft'] /
                    capacity_b10['sqft_' + data_source]).replace(np.inf, 1).clip(lower=0)

    capacity_b10['res_zoned_existing_ratio_' + data_source] = res_ratio
    capacity_b10['nonres_zoned_existing_ratio_' + data_source] = nonres_ratio

    # Legacy unsuffixed columns, kept so existing readers of the output do not
    # break: as before, they end up holding the values of the LAST data source.
    capacity_b10['res_zoned_existing_ratio'] = res_ratio
    capacity_b10['nonres_zoned_existing_ratio'] = nonres_ratio
    

In [10]:
## Classify parcels by building age (year_built is the oldest building on the
## parcel when there are several) and mark parcels with a pre-1940 building

# Labels are applied in this order; a later rule overwrites an earlier one, so
# each parcel ends up with the label of the earliest period it falls in.
# Parcels without a year_built keep the 'missing' label.
year_built = capacity_b10.year_built
capacity_b10['building_age'] = 'missing'
for age_mask, age_label in [
        (year_built >= 2000, 'after 2000'),
        (year_built <  2000, '1980-2000'),
        (year_built <  1980, '1940-1980'),
        (year_built <  1940, 'before 1940')]:
    capacity_b10.loc[age_mask, 'building_age'] = age_label

# True for parcels whose building dates from before 1940, NaN for all others
capacity_b10['has_old_building'] = np.nan
is_pre_1940 = capacity_b10.building_age == 'before 1940'
capacity_b10.loc[is_pre_1940, 'has_old_building'] = True

print("capacity_b10.building_age:")
display(capacity_b10["building_age"].value_counts())

capacity_b10.building_age:


1940-1980      885647
missing        378551
1980-2000      303551
before 1940    250662
after 2000     137797
Name: building_age, dtype: int64

In [11]:
## Calculate parcel's investment-land ratio (ILR = improvement value / land value)

# The ratio is undefined where LAND_VALUE is 0, so those parcels get NaN.
# This keeps ILR a numeric (float) column: writing the string 'n/a' into it
# made the column a mix of floats and strings, which breaks numeric operations
# on it and is rejected or warned about by newer pandas versions.
# In the CSV output these cells are now empty instead of 'n/a'; pandas.read_csv
# reads both as missing values by default.
land_value = capacity_b10['LAND_VALUE']
capacity_b10['ILR'] = (capacity_b10['improvement_value'] / land_value).where(land_value != 0)

In [12]:
# Write the combined parcel table to the output folder; the file name is
# prefixed with the run date. The DataFrame index is not written.
capacity_gross_net_file = os.path.join(data_output_dir, today+'_capacity_gross_net.csv')
capacity_b10.to_csv(capacity_gross_net_file, index=False)