In [1]:
from random import random
from bisect import bisect
from itertools import repeat
import numpy as np
import pandas as pd
import geopandas as gpd

In [2]:
buildings_path = './buildings_assignment_input_files/buildings_CT/bldgs_ct.shp'
buildings = gpd.read_file(buildings_path)

In [3]:
buildings.GEOID10.nunique()

263

In [4]:
buildings.head()

Unnamed: 0,predicted,GEOID10,geometry
0,0,51059980300,"POLYGON ((-77.15002 38.95214, -77.14999 38.952..."
1,0,51059980300,"POLYGON ((-77.14759 38.95328, -77.14746 38.953..."
2,0,51059980300,"POLYGON ((-77.14823 38.95111, -77.14825 38.951..."
3,0,51059980300,"POLYGON ((-77.15000 38.95663, -77.15016 38.956..."
4,0,51059980300,"POLYGON ((-77.14972 38.95580, -77.14971 38.955..."


In [5]:
# Reproject the geometries once and reuse the resulting centroids for both coordinates;
# reprojecting the whole column separately for x and for y repeats the costly step.
# NOTE(review): EPSG:32610 is WGS 84 / UTM zone 10N, while the polygons previewed above
# lie near longitude -77 — confirm whether UTM zone 18N (EPSG:32618) was intended. The
# code is left on 32610 because consumers of the exported coordinates may rely on it.
building_centroids = buildings['geometry'].to_crs(epsg=32610).centroid
# Truncate to whole units of the projected CRS and store as 64-bit integers.
buildings['x_centroid'] = np.floor(building_centroids.x).astype(np.int64)
buildings['y_centroid'] = np.floor(building_centroids.y).astype(np.int64)

In [6]:
# Turn the current row index into an explicit `building_id` column.
buildings = buildings.reset_index().rename(columns={'index': 'building_id'})

In [7]:
# Keep only the columns written to the buildings input file, rename the source
# columns to their output names, and store the tract id as a 64-bit integer.
kept_columns = ['building_id', 'x_centroid', 'y_centroid', 'predicted', 'GEOID10']
buildings = (
    buildings[kept_columns]
    .rename(columns={'predicted': 'building_type', 'GEOID10': 'ct_id'})
    .astype({'ct_id': np.int64})
)

In [8]:
buildings.head()

Unnamed: 0,building_id,x_centroid,y_centroid,building_type,ct_id
0,0,4521868,5460193,0,51059980300
1,1,4521920,5460429,0,51059980300
2,2,4522053,5460153,0,51059980300
3,3,4521462,5460653,0,51059980300
4,4,4521557,5460603,0,51059980300


In [9]:
buildings.to_csv('./repast4py/input/input_buildings_ffx.csv', index=False)

In [10]:
buildings.head()

Unnamed: 0,building_id,x_centroid,y_centroid,building_type,ct_id
0,0,4521868,5460193,0,51059980300
1,1,4521920,5460429,0,51059980300
2,2,4522053,5460153,0,51059980300
3,3,4521462,5460653,0,51059980300
4,4,4521557,5460603,0,51059980300


In [11]:
buildings = buildings[['building_id','building_type','ct_id']]

##### building types: 0 - non-residential; 1 - residential

In [12]:
buildings[buildings['building_type'] == 0].shape

(9699, 3)

In [13]:
buildings.head()

Unnamed: 0,building_id,building_type,ct_id
0,0,0,51059980300
1,1,0,51059980300
2,2,0,51059980300
3,3,0,51059980300
4,4,0,51059980300


In [14]:
buildings['ct_id'].nunique()

263

In [15]:
buildings[buildings['building_type'] == 0]['ct_id'].nunique()

247

In [16]:
buildings[buildings['building_type'] == 1]['ct_id'].nunique()

260

In [17]:
# One row per distinct census tract (in order of first appearance), with both
# building counters initialised to zero.
tract_ids = buildings['ct_id'].drop_duplicates().reset_index(drop=True)
buildings_per_ct = pd.DataFrame(
    {'ct_id': tract_ids, 'res_count': 0, 'non_res_count': 0}
)

In [18]:
buildings_per_ct.head()

Unnamed: 0,ct_id,res_count,non_res_count
0,51059980300,0,0
1,51059480300,0,0
2,51059480202,0,0
3,51059421001,0,0
4,51059460501,0,0


In [19]:
# Count buildings of each type per tract, then look the counts up by tract id.
# Tracts that have no building of a given type come back as NaN from `.map`.
is_residential = buildings['building_type'] == 1
is_non_residential = buildings['building_type'] == 0
res_per_ct = buildings[is_residential].groupby('ct_id')['building_type'].count()
non_res_per_ct = buildings[is_non_residential].groupby('ct_id')['building_type'].count()
buildings_per_ct['res_count'] = buildings_per_ct['ct_id'].map(res_per_ct)
buildings_per_ct['non_res_count'] = buildings_per_ct['ct_id'].map(non_res_per_ct)

In [20]:
buildings_per_ct[buildings_per_ct['res_count'].isna()]

Unnamed: 0,ct_id,res_count,non_res_count
10,51059980200,,13.0
165,51059980100,,3.0
193,51059452802,,9.0


In [21]:
# Missing counts mean "no buildings of that type"; store both counters as integers.
buildings_per_ct = buildings_per_ct.fillna(0).astype(
    {'res_count': np.int64, 'non_res_count': np.int64}
)

In [22]:
buildings_per_ct.head()

Unnamed: 0,ct_id,res_count,non_res_count
0,51059980300,3,50
1,51059480300,1466,110
2,51059480202,62,126
3,51059421001,359,55
4,51059460501,831,72


In [23]:
buildings[buildings['ct_id'] == 51059980300].shape

(53, 3)

In [24]:
buildings[(buildings['ct_id'] == 51059980300) & (buildings['building_type'] == 0)].shape

(50, 3)

In [25]:
buildings[(buildings['ct_id'] == 51059980300) & (buildings['building_type'] == 1)].shape

(3, 3)

In [26]:
buildings_per_ct[buildings_per_ct['ct_id'] == 51059415100]

Unnamed: 0,ct_id,res_count,non_res_count
172,51059415100,1056,43


In [27]:
buildings.groupby(['ct_id','building_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,building_id
ct_id,building_type,Unnamed: 2_level_1
51059415100,0,43
51059415100,1,1056
51059415200,0,23
51059415200,1,438
51059415300,0,15
...,...,...
51600300300,1,1118
51600300400,0,115
51600300400,1,1196
51600300500,0,100


In [28]:
ct_path = './buildings_assignment_input_files/ffx_county_city_ct/ffx_county_city_ct.shp'
ct_2010 = gpd.read_file(ct_path)

In [29]:
ct_2010.columns

Index(['GEOID10', 'geometry'], dtype='object')

In [30]:
ct_2010.shape

(263, 2)

In [31]:
ct_2010['GEOID10'] = ct_2010['GEOID10'].astype(np.int64)

In [32]:
flows_path = './buildings_assignment_input_files/commuting_flows_gt.csv'
commuting_flows = pd.read_csv(flows_path)
commuting_flows.shape

(34366, 3)

In [33]:
# Keep only flows whose home and work tracts both appear in the tract layer.
known_tracts = ct_2010['GEOID10']
home_known = commuting_flows['h_geoid'].isin(known_tracts)
work_known = commuting_flows['w_geoid'].isin(known_tracts)
commuting_flows = commuting_flows[home_known & work_known]
commuting_flows.shape

(34366, 3)

In [34]:
commuting_flows.nunique()

h_geoid    263
w_geoid    263
count      199
dtype: int64

In [35]:
ct_2010.shape

(263, 2)

In [36]:
buildings_per_ct.head()

Unnamed: 0,ct_id,res_count,non_res_count
0,51059980300,3,50
1,51059480300,1466,110
2,51059480202,62,126
3,51059421001,359,55
4,51059460501,831,72


In [37]:
buildings_per_ct[buildings_per_ct['res_count'] != 0]['ct_id'].shape

(260,)

In [38]:
buildings_per_ct[buildings_per_ct['non_res_count'] != 0]['ct_id'].shape

(247,)

In [39]:
commuting_flows.shape

(34366, 3)

In [40]:
buildings_per_ct[buildings_per_ct['res_count'] == 0]['ct_id']

10     51059980200
165    51059980100
193    51059452802
Name: ct_id, dtype: int64

In [41]:
commuting_flows[commuting_flows['h_geoid'].isin
                (buildings_per_ct[buildings_per_ct['res_count'] == 0]['ct_id'])].shape

(165, 3)

In [42]:
commuting_flows[commuting_flows['h_geoid'].isin
                (buildings_per_ct[buildings_per_ct['res_count'] == 0]['ct_id'])].head()

Unnamed: 0,h_geoid,w_geoid,count
17111,51059452802,51059415100,3
17112,51059452802,51059415500,6
17113,51059452802,51059415800,1
17114,51059452802,51059420502,1
17115,51059452802,51059421001,2


In [43]:
commuting_flows[commuting_flows['h_geoid'].isin
                (buildings_per_ct[buildings_per_ct['res_count'] == 0]['ct_id'])]['count'].sum()

494

In [44]:
# Drop flows that cannot be placed in a building: the home tract needs at least one
# residential building and the work tract at least one non-residential building.
tracts_with_homes = buildings_per_ct.loc[buildings_per_ct['res_count'] != 0, 'ct_id']
tracts_with_workplaces = buildings_per_ct.loc[buildings_per_ct['non_res_count'] != 0, 'ct_id']
home_placeable = commuting_flows['h_geoid'].isin(tracts_with_homes)
work_placeable = commuting_flows['w_geoid'].isin(tracts_with_workplaces)
commuting_flows = commuting_flows[home_placeable & work_placeable]
commuting_flows.shape

(33757, 3)

In [45]:
commuting_flows.head()

Unnamed: 0,h_geoid,w_geoid,count
0,51059415100,51059415100,21
1,51059415100,51059415200,14
2,51059415100,51059415300,4
3,51059415100,51059415401,4
4,51059415100,51059415500,6


In [46]:
commuting_flows.nunique()

h_geoid    260
w_geoid    247
count      199
dtype: int64

In [47]:
commuting_flows[commuting_flows['h_geoid'].isin(commuting_flows['w_geoid'])].nunique()

h_geoid    244
w_geoid    247
count      197
dtype: int64

In [48]:
commuting_flows.head()

Unnamed: 0,h_geoid,w_geoid,count
0,51059415100,51059415100,21
1,51059415100,51059415200,14
2,51059415100,51059415300,4
3,51059415100,51059415401,4
4,51059415100,51059415500,6


In [49]:
commuting_flows.groupby('h_geoid').sum()['count']

h_geoid
51059415100     499
51059415200     511
51059415300     728
51059415401    1023
51059415402     395
               ... 
51600300100    1508
51600300200    1353
51600300300    1490
51600300400    1169
51600300500     941
Name: count, Length: 260, dtype: int64

In [50]:
buildings_per_ct.shape

(263, 3)

In [51]:
buildings_per_ct.head()

Unnamed: 0,ct_id,res_count,non_res_count
0,51059980300,3,50
1,51059480300,1466,110
2,51059480202,62,126
3,51059421001,359,55
4,51059460501,831,72


In [52]:
# Per-tract population table: the tract ids of `buildings_per_ct` plus zeroed resident
# and worker counters. The zeros are assigned through the DataFrame API instead of
# writing into `.values`, which only works while `.values` is a writable view of the
# column and fails when pandas Copy-on-Write hands back a read-only array.
pop_per_ct = buildings_per_ct[['ct_id']].assign(res_pop=0, work_pop=0)

In [53]:
pop_per_ct.head()

Unnamed: 0,ct_id,res_pop,work_pop
0,51059980300,0,0
1,51059480300,0,0
2,51059480202,0,0
3,51059421001,0,0
4,51059460501,0,0


In [54]:
commuting_flows.groupby('h_geoid').sum()['count'].head()

h_geoid
51059415100     499
51059415200     511
51059415300     728
51059415401    1023
51059415402     395
Name: count, dtype: int64

In [55]:
# Sum commuter counts by home tract (residents) and by work tract (workers), then
# attach them by tract id; tracts absent from a grouping come back as NaN.
residents_by_home_ct = commuting_flows.groupby('h_geoid')['count'].sum()
workers_by_work_ct = commuting_flows.groupby('w_geoid')['count'].sum()
pop_per_ct['res_pop'] = pop_per_ct['ct_id'].map(residents_by_home_ct)
pop_per_ct['work_pop'] = pop_per_ct['ct_id'].map(workers_by_work_ct)

In [56]:
pop_per_ct.head()

Unnamed: 0,ct_id,res_pop,work_pop
0,51059980300,46.0,22.0
1,51059480300,1255.0,1129.0
2,51059480202,1293.0,19333.0
3,51059421001,570.0,1490.0
4,51059460501,553.0,4754.0


In [57]:
pop_per_ct[pop_per_ct.isnull().any(axis=1)]

Unnamed: 0,ct_id,res_pop,work_pop
10,51059980200,,88.0
73,51059432701,709.0,
121,51059481102,1007.0,
149,51059482502,976.0,
165,51059980100,,405.0
193,51059452802,,67.0
196,51059452301,879.0,
220,51059415600,284.0,
221,51059422401,446.0,
234,51059471303,920.0,


In [58]:
# Tracts without any matching flow get a population of zero; store counts as integers.
pop_per_ct = pop_per_ct.fillna(0).astype(
    {'res_pop': np.int64, 'work_pop': np.int64}
)

In [59]:
pop_per_ct[pop_per_ct.isnull().any(axis=1)]

Unnamed: 0,ct_id,res_pop,work_pop


In [60]:
commuting_flows.groupby('h_geoid').sum()['count']

h_geoid
51059415100     499
51059415200     511
51059415300     728
51059415401    1023
51059415402     395
               ... 
51600300100    1508
51600300200    1353
51600300300    1490
51600300400    1169
51600300500     941
Name: count, Length: 260, dtype: int64

In [61]:
pop_per_ct.shape

(263, 3)

In [62]:
pop_per_ct['work_pop'].sum()

258436

In [63]:
buildings_per_ct.shape

(263, 3)

In [64]:
buildings_per_ct[buildings_per_ct['res_count'] == 0]

Unnamed: 0,ct_id,res_count,non_res_count
10,51059980200,0,13
165,51059980100,0,3
193,51059452802,0,9


In [65]:
pop_per_ct[pop_per_ct['ct_id'] == 51059452802]

Unnamed: 0,ct_id,res_pop,work_pop
193,51059452802,0,67


In [66]:
pop_per_ct['res_pop'].sum()

258436

In [67]:
pop_per_ct.head()

Unnamed: 0,ct_id,res_pop,work_pop
0,51059980300,46,22
1,51059480300,1255,1129
2,51059480202,1293,19333
3,51059421001,570,1490
4,51059460501,553,4754


In [68]:
pop_per_ct.shape

(263, 3)

In [69]:
buildings[buildings['building_type'] == 1].shape

(187535, 3)

In [70]:
# Residential buildings of one tract. Each comparison is parenthesized because `&` binds
# tighter than `==`: without the parentheses the mask was evaluated as
# `((ct_id == ...) & building_type) == 1`, which selects the intended rows only while
# building_type is limited to the values 0 and 1.
buildings[(buildings['ct_id'] == 51059980300) & (buildings['building_type'] == 1)]

Unnamed: 0,building_id,building_type,ct_id
11384,11384,1,51059980300
11395,11395,1,51059980300
11397,11397,1,51059980300


In [71]:
commuting_flows[commuting_flows['h_geoid'] == 51059980300].shape

(39, 3)

In [72]:
commuting_flows[commuting_flows['h_geoid'] == 51059980300]['count'].sum()

46

In [73]:
# Synthetic population generation.
# For every home census tract:
#   1. draw one residential building (with replacement) per resident worker;
#   2. draw each worker's work tract with probability proportional to the commuting-flow
#      counts leaving that home tract;
#   3. draw a uniformly random non-residential building inside the chosen work tract.
# `home_id[i]` and `work_id[i]` describe the same agent.
SEED = 42  # fixed seed so that Restart & Run All regenerates the same agents
rng = np.random.default_rng(SEED)

# Index the building ids by tract once, instead of re-filtering the whole `buildings`
# frame for every single agent.
res_ids_by_ct = {
    ct_id: ids.to_numpy()
    for ct_id, ids in buildings[buildings['building_type'] == 1].groupby('ct_id')['building_id']
}
non_res_ids_by_ct = {
    ct_id: ids.to_numpy()
    for ct_id, ids in buildings[buildings['building_type'] == 0].groupby('ct_id')['building_id']
}
flows_by_home_ct = {ct_id: flows for ct_id, flows in commuting_flows.groupby('h_geoid')}

home_id = []
work_id = []
for ct_id, n_agents in zip(pop_per_ct['ct_id'], pop_per_ct['res_pop']):
    if n_agents == 0:
        # No resident workers in this tract: nothing to place.
        continue

    home_id.extend(rng.choice(res_ids_by_ct[ct_id], size=n_agents, replace=True).tolist())

    work_flows = flows_by_home_ct[ct_id]
    flow_counts = work_flows['count'].to_numpy()
    # Weighted draw of work tracts. Letting the generator apply the weights avoids the
    # hand-rolled cumulative-sum + bisect lookup, which could index one past the end
    # when the rounded random draw was not below the last cumulative value.
    w_ids = rng.choice(
        work_flows['w_geoid'].to_numpy(),
        size=n_agents,
        p=flow_counts / flow_counts.sum(),
    )

    # The same number of draws as for the homes keeps both lists aligned agent by agent.
    work_id.extend(int(rng.choice(non_res_ids_by_ct[w_id])) for w_id in w_ids)

In [74]:
# Assemble the agent table; the positional row number becomes the `agent_id` column.
input_agents = pd.DataFrame({'home_id': home_id, 'work_id': work_id})
input_agents = input_agents.rename_axis('agent_id').reset_index()

In [75]:
input_agents.head()

Unnamed: 0,agent_id,home_id,work_id
0,0,11395,3292
1,1,11384,72752
2,2,11397,5318
3,3,11397,192862
4,4,11384,23504


In [76]:
input_agents.shape

(258436, 3)

In [77]:
input_agents.to_csv('./repast4py/input/input_agents_ffx.csv', index=False)