In [1]:
import glob
import os

import censusdata
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

%matplotlib inline
%config InlineBackend.figure_format ='retina'

from config import *

In [2]:
# Year to process; interpolated into the input file name (co{year}a.txt)
# and into the output pickle name (dataByCbsa{year}.pkl).
year = 2021

In [3]:
# Load the building-permit file for `year` from the Building_Permits subfolder.
# os.path.join makes the path valid whether or not dataFolder ends with a
# path separator (plain string concatenation silently required a trailing one).
# skiprows=1: the first row of the file is skipped, so column names are taken
# from the second row.
data = pd.read_csv(
    os.path.join(dataFolder, 'Building_Permits', f'co{year}a.txt'),
    skiprows=1,
)

In [4]:
# Load the geography crosswalk table.
# os.path.join builds the path the same way regardless of whether dataFolder
# ends with a separator (the previous hard-coded leading "/" produced a double
# slash, or a root-anchored path when dataFolder was empty).
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load this file if it was produced by this project / a trusted source.
xwalk = pd.read_pickle(os.path.join(dataFolder, "xwalk_data_combined.pkl"))

In [5]:
# Derive the county-level code from the block-group id by dropping its last
# 7 decimal digits. The result is compared against State * 10**3 + County when
# the permit data is mapped to CBSAs.
# Integer floor division avoids the round trip through floating point.
# NOTE(review): assumes 'bgrp' is a numeric 12-digit block-group id -- confirm.
xwalk['fips'] = (xwalk['bgrp'] // 10**7).astype(int)

# One CBSA per county code: the first non-null 'cbsa' value within each group.
# Selecting the column before aggregating avoids computing first() for every
# other column of the crosswalk.
fipsToCbsa = xwalk.groupby('fips')['cbsa'].first().to_dict()  # dict is faster for lookup

In [6]:
# Combined county code: the state code shifted left by three decimal digits,
# plus the county code.
data['FIPS'] = data['County'] + 1000 * data['State']

# Look up each county's CBSA; counties with no entry in fipsToCbsa fall into
# the sentinel bucket 99999.
data['CBSA'] = data['FIPS'].map(lambda county_code: fipsToCbsa.get(county_code, 99999))

In [7]:
# The file's first row was skipped on load (skiprows=1); it holds additional
# header information, so the names shown here come from the second row.
data.columns

Index(['Date', 'State', 'County', 'Code', 'Code.1', 'Name', 'Bldgs', 'Units',
       'Value', 'Bldgs.1', 'Units.1', 'Value.1', 'Bldgs.2', 'Units.2',
       'Value.2', 'Bldgs.3', 'Units.3', 'Value.3', 'Bldgs.4', 'Units.4',
       'Value.4', 'Bldgs.5', 'Units.5', 'Value.5', 'Bldgs.6', 'Units.6',
       'Value.6', 'Bldgs.7', 'Units.7', 'Value.7', 'FIPS', 'CBSA'],
      dtype='object')

In [8]:
# Build one total per measure by adding the unsuffixed column and its
# '.1', '.2' and '.3' repeats; the '.4'-'.7' repeats are deliberately left out.
# NOTE(review): presumably the first four groups are the structure-size classes
# and '.4'-'.7' are their "reported" counterparts -- confirm against the skipped
# header row of the source file.
for total_name, stem in (
    ('Tot_Bldgs', 'Bldgs'),
    ('Tot_Units', 'Units'),
    ('Tot_Bldgs_Value', 'Value'),
):
    group_cols = [stem + suffix for suffix in ('', '.1', '.2', '.3')]
    # skipna=False keeps the NaN propagation of plain `+` addition.
    data[total_name] = data[group_cols].sum(axis=1, skipna=False)

In [9]:
# Collapse the per-county rows into one row per CBSA by summing the three
# total columns; the result is indexed by CBSA code.
dataByCbsa = data.groupby('CBSA')[['Tot_Bldgs', 'Tot_Units', 'Tot_Bldgs_Value']].sum()
dataByCbsa

Unnamed: 0_level_0,Tot_Bldgs,Tot_Units,Tot_Bldgs_Value
CBSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10100,81,96,15477554
10140,427,432,101302000
10180,557,883,150495424
10220,14,19,2627000
10300,155,155,36885648
...,...,...,...
49660,386,400,82860702
49700,747,981,212299726
49740,1118,1291,205135992
49780,21,21,5263673


In [10]:
# Persist the per-CBSA totals next to the input files, one pickle per year.
# os.path.join makes the path valid whether or not dataFolder ends with a
# path separator (plain string concatenation silently required a trailing one).
dataByCbsa.to_pickle(
    os.path.join(dataFolder, 'Building_Permits', f'dataByCbsa{year}.pkl')
)

In [11]:
# Show the 25 CBSA rows with the largest total permitted value.
# Code 99999 is the sentinel bucket (it includes counties with no crosswalk
# entry), so it can appear in this ranking alongside real CBSAs.
dataByCbsa.sort_values('Tot_Bldgs_Value', ascending=False).iloc[:25]

Unnamed: 0_level_0,Tot_Bldgs,Tot_Units,Tot_Bldgs_Value
CBSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19100,53102,78705,16651947614
26420,53639,69263,13677947459
99999,45274,53986,12447089332
38060,35559,50581,11654523919
12420,25350,50907,9096074989
12060,31960,39466,8721860957
35620,15345,56661,8691942971
31080,12429,31151,6964837989
33100,8781,25313,6684442282
42660,9806,30743,6452235075
