This script matches the block with the TAZ and outputs the TAZ table with corresponding information

In [61]:
# Libraries and dependencies
import numpy as np
import pandas as pd

In [64]:
# Read in the input files:
# Parse in the dataframe and skip the first row
new_york_df = pd.read_csv('new_york_size.csv', skiprows=1)
new_york_df.head()
new_york_df['Id2'] = new_york_df['Id2'].astype('int')
new_york_df['avg_hh'] = new_york_df['total hhsize']/new_york_df['total hh num']
new_york_df.head()
# Further clean the data by eliminating rows containing NaN:
new_york_df = new_york_df[~np.isnan(new_york_df).any(axis=1)]
# Calculate the percentage of different numbers of households:
for i in range(1,7):
    new_york_df['percentage_' + str(i)] = new_york_df[str(i)]/new_york_df['total hhsize']
new_york_df.head()

Unnamed: 0,Id2,1,2,3,4,5,6,7,total hhsize,total hh num,avg_hh,percentage_1,percentage_2,percentage_3,percentage_4,percentage_5,percentage_6
2,-167250853,58,86,48,72,0,20,5,289,1052,0.274715,0.200692,0.297578,0.16609,0.249135,0.0,0.069204
3,-167250852,224,262,122,13,68,0,0,689,1618,0.425834,0.325109,0.380261,0.177068,0.018868,0.098694,0.0
5,-167250843,86,63,15,58,0,0,0,222,534,0.41573,0.387387,0.283784,0.067568,0.261261,0.0,0.0
6,-167250842,313,349,177,55,22,0,0,916,2245,0.408018,0.341703,0.381004,0.193231,0.060044,0.024017,0.0
7,-167250841,442,229,59,65,16,0,0,811,1547,0.52424,0.545006,0.282367,0.07275,0.080148,0.019729,0.0


In [65]:
# Read in the TAZ data:
TAZ = pd.read_csv('TAZ_Household_size-1.csv')
# The first 318 TAZ belong to New York
new_york_TAZ = TAZ.head(318)
new_york_TAZ.head()

Unnamed: 0,TAZ,HHNUM,HHSIZE
0,1,4347.4,2.036
1,2,2215.2,2.216
2,3,0.0,0.0
3,4,722.0,1.906
4,5,3733.0,1.676


In [74]:
# Start matching the HH_SIZE for the avg_hh in the new_york_TAZ
# Construct a dict to store all the ID of the residential block and its average hh value:
block_dict = {}
for block_row in new_york_df.itertuples(index=True, name='Pandas'):
    block_dict[getattr(block_row, "avg_hh")] =  getattr(block_row, "Id2")
# print(block_dict)  
matches = {}
# Initialize a empty dataframe which to be added to new_york_TAZ:
columns = ['1','2','3','4','5', '6', '7','avg_hh']
rows = []

In [76]:
# Iterate through the TAZ table:
for taz_row in new_york_TAZ.itertuples(index = True, name = 'Pandas'):
    
    current_hh_target = getattr(taz_row, 'HHSIZE')
    
    current_taz_id = getattr(taz_row, 'TAZ')
    
    # Find the closest element in the list according to the target number
    closest_key = min(block_dict.keys(), key=lambda x:abs(x-current_hh_target))
    
    # Retrieve the value stored corresponding to the key, which is the block id:
    current_block_id = block_dict[closest_key]
    matches[current_taz_id] = current_block_id
    
    row = new_york_df[new_york_df['Id2'] == current_block_id][columns]
    rows.append(row)

    # Remove the items from block_dict since we could not use the same block_id again:
    block_dict.pop(closest_key)    


       1    2    3  4  5   6  7    avg_hh
576  475  163  103  7  0  18  0  0.559124
       1    2   3   4   5  6  7    avg_hh
356  590  424  82  26  42  0  0  0.558809
       1    2   3   4   5  6   7    avg_hh
1069  50  123  99  78  70  0  86  0.209524
       1    2  3  4  5  6  7    avg_hh
228  446  148  0  0  0  0  0  0.558796
        1   2   3   4   5  6  7    avg_hh
1046  384  73  54  45  13  0  0  0.557843
       1    2   3   4  5  6  7    avg_hh
409  369  215  54  53  0  0  0  0.557708
       1    2    3  4  5  6  7    avg_hh
635  380  317  142  0  0  0  0  0.557475
       1    2   3   4  5  6  7    avg_hh
377  350  368  58  15  0  0  0  0.557435
       1    2   3   4  5  6  7    avg_hh
681  247  149  58  40  0  0  0  0.556306
       1   2   3   4  5  6  7    avg_hh
816  157  66  39  20  0  0  0  0.556213
       1    2   3   4  5  6  7    avg_hh
584  475  427  85  73  0  0  0  0.556139
      1    2   3   4  5  6  7    avg_hh
78  594  384  49  50  0  0  0  0.556014
       1    2 

155  580  160  110  117  0  0  0  0.518221
        1    2   3  4   5  6   7    avg_hh
1110  330  323  61  0  26  0  13  0.517526
       1    2    3  4  5  6  7    avg_hh
378  270  192  131  0  0  0  0  0.517452
       1    2    3   4  5  6  7    avg_hh
265  271  177  101  41  0  0  0  0.516637
       1    2   3   4   5  6  7    avg_hh
318  305  158  80  17  18  0  0  0.516071
      1    2   3   4  5  6  7    avg_hh
672  89  101  38  10  7  0  0  0.515789
      1    2   3    4   5  6  7    avg_hh
42  556  397  64  100  15  0  0  0.515718
       1    2   3   4   5  6  7    avg_hh
683  383  315  88  54  19  0  0  0.514679
       1    2    3   4  5  6  7    avg_hh
779  277  169  152  37  0  0  0  0.513754
       1    2    3  4  5  6  7    avg_hh
159  620  530  266  0  0  0  0  0.513229
       1    2   3   4  5  6  7    avg_hh
710  230  152  23  89  0  0  0  0.512448
       1    2   3   4   5  6  7    avg_hh
719  429  215  81  27  76  0  0  0.512376
       1    2   3    4   5  6  7   avg_hh

       1    2   3   4   5  6  7    avg_hh
925  281  185  43  24  20  0  0  0.478788
       1    2    3  4  5  6  7    avg_hh
166  316  490  278  0  0  0  0  0.478376
       1    2   3   4   5  6  7    avg_hh
564  153  134  39  35  26  0  0  0.478368
        1    2    3   4  5  6  7    avg_hh
1059  297  125  103  71  0  0  0  0.478331
       1    2    3   4  5  6  7    avg_hh
558  263  160  245  33  0  0  0  0.478172
       1    2    3   4  5  6  7   avg_hh
594  239  273  134  42  0  0  0  0.47811
       1    2    3    4  5  6  7    avg_hh
684  379  166  120  117  0  0  0  0.477995
       1    2   3  4   5  6  7    avg_hh
472  258  237  46  0  65  0  0  0.477541
       1    2   3   4   5  6  7    avg_hh
671  243  232  80  55  29  0  0  0.476866
        1    2    3   4  5  6  7    avg_hh
1166  138  171  129  15  0  0  0  0.476842
      1    2    3    4  5  6  7    avg_hh
89  570  422  193  101  0  0  0  0.476473
       1    2  3   4   5  6  7    avg_hh
347  263  115  9  24  17  0  0  0.4

In [51]:
# Fill in the table:
new_york_TAZ.reset_index(drop = True, inplace = True)
