In [1]:
# step 0: spatially join business data to tazs using "contains" so businesses on taz boundaries don't get dropped

In [2]:
import numpy as np
import pandas as pd

In [3]:
taz_bus = pd.read_csv('./taz1454_BAbusinesses_contains.csv')
naics = pd.read_csv('./naics_recode.csv')

In [4]:
# drop one copy of businesses that were joined to two tazs because they are on the boundary
print(len(taz_bus))
taz_bus_null = taz_bus[taz_bus['JOIN_FID'] == -1]
taz_bus_notnull = taz_bus[taz_bus['JOIN_FID'] != -1]
taz_bus = taz_bus_notnull.drop_duplicates('JOIN_FID', keep='last')

323630


In [5]:
# join businesses and six categories
taz_bus['naicssix'] = taz_bus['NAICS'].astype(str).str[:6].astype(int)
taz_bus_null['naicssix'] = 0
taz_bus = taz_bus.append(taz_bus_null)
taz_bus_naics = pd.merge(taz_bus, naics, on='naicssix', how='left')
print(len(taz_bus_naics))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


323626


In [6]:
# number of tazs
len(taz_bus_naics['TAZ1454'].unique())

1454

In [7]:
# create steelhead columns and populate them with no. of employees
for i in taz_bus_naics['sixcat'].unique():
    taz_bus_naics[i] = 0
    for j in np.arange(len(taz_bus_naics)):
        if taz_bus_naics['sixcat'][j] == i:
            taz_bus_naics.loc[j,i] = taz_bus_naics['EMPNUM'][j]
taz_bus_naics.describe()

Unnamed: 0,OBJECTID_x,Join_Count,TARGET_FID,JOIN_FID,SUPERD,TAZ1454,Shape__Are,Shape__Len,diff,LOCNUM,...,naics2,remi70,FPSEMPN,OTHEMPN,HEREEMPN,RETEMPN,MWTEMPN,MISSING,AGREMPN,nan
count,323626.0,323626.0,323626.0,323626.0,323626.0,323626.0,323626.0,323626.0,323626.0,323625.0,...,323625.0,323625.0,323626.0,323626.0,323626.0,323626.0,323626.0,323626.0,323626.0,323626.0
mean,-1.0,0.999997,767.678234,161919.133969,15.355481,739.755394,0.0,0.0,2503.641518,494107200.0,...,899.231048,154.40702,2.282576,1.610371,3.86722,1.545398,1.935682,0.088664,0.042565,0.0
std,0.0,0.001758,430.524526,93529.621065,9.514729,452.548579,0.0,0.0,5519.912071,214681700.0,...,1696.402438,863.828437,36.563921,34.725106,60.360497,18.178067,54.278985,2.702688,2.884014,0.0
min,-1.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,-2735.0,16295.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-1.0,1.0,401.0,80909.25,8.0,347.0,0.0,0.0,178.0,402943900.0,...,52.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-1.0,1.0,751.0,161847.5,15.0,739.0,0.0,0.0,688.0,426607600.0,...,61.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,-1.0,1.0,1168.0,242948.75,22.0,1152.0,0.0,0.0,2253.0,644791600.0,...,81.0,56.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
max,-1.0,1.0,1453.0,323903.0,34.0,1454.0,0.0,0.0,55026.0,998913600.0,...,4849.0,6768.0,10001.0,8000.0,17383.0,4500.0,11269.0,1100.0,900.0,0.0


In [8]:
# clean up columns and summarize employment by taz
taz_bus_naics_col = taz_bus_naics[['TAZ1454', 'EMPNUM', 'AGREMPN', 'FPSEMPN', 'HEREEMPN', 'MWTEMPN', 'OTHEMPN', 'RETEMPN', 'MISSING']]
taz_bus_naics_sum = taz_bus_naics_col.groupby(['TAZ1454']).sum()
taz_bus_naics_sum['OTHEMPN'] = taz_bus_naics_sum['OTHEMPN'] + taz_bus_naics_sum['MISSING'] # fold missing into othempn
taz_bus_naics_sum = taz_bus_naics_sum.drop(['MISSING'], axis=1)
taz_bus_naics_sum.head()

Unnamed: 0_level_0,EMPNUM,AGREMPN,FPSEMPN,HEREEMPN,MWTEMPN,OTHEMPN,RETEMPN,MISSING
TAZ1454,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16674.0,10.0,13069.0,1298.0,341.0,1307.0,649.0,364.0
2,25943.0,0.0,17533.0,3281.0,1023.0,1228.0,2878.0,173.0
3,2218.0,0.0,714.0,902.0,67.0,396.0,139.0,32.0
4,18250.0,1.0,11124.0,1754.0,1834.0,2705.0,832.0,228.0
5,18613.0,0.0,3523.0,6055.0,2658.0,555.0,5822.0,133.0


In [11]:
# check that employment totals are the same at the beginning and end
print(taz_bus['EMPNUM'].sum())
print((taz_bus_naics_sum['AGREMPN'].sum()+taz_bus_naics_sum['FPSEMPN'].sum()+taz_bus_naics_sum['HEREEMPN'].sum()
      +taz_bus_naics_sum['MWTEMPN'].sum()+taz_bus_naics_sum['OTHEMPN'].sum()+taz_bus_naics_sum['RETEMPN'].sum()))

3680429.0
3680429.0


In [13]:
# check that the correct no. of tazs come out
len(taz_bus_naics_sum)

1454

In [12]:
taz_bus_naics_sum.to_csv('./2015_employment_TAZ1454.csv')