This script matches the block with the TAZ and outputs the TAZ table with corresponding information

In [134]:
# Libraries and dependencies
import numpy as np
import pandas as pd

In [135]:
# Read in the input files:
# Parse in the dataframe and skip the first row
new_york_df = pd.read_csv('new_york_size.csv', skiprows=1)
new_york_df.head()
new_york_df['Id2'] = new_york_df['Id2'].astype('int')
new_york_df['avg_hh'] = new_york_df['total hh num']/ new_york_df['total hhsize']
new_york_df.head()
# Further clean the data by eliminating rows containing NaN:
new_york_df = new_york_df[~np.isnan(new_york_df).any(axis=1)]
# Calculate the percentage of different numbers of households:
for i in range(1,7):
    new_york_df['percentage_' + str(i)] = new_york_df[str(i)]/new_york_df['total hhsize']
new_york_df.head()

Unnamed: 0,Id2,1,2,3,4,5,6,7,total hhsize,total hh num,avg_hh,percentage_1,percentage_2,percentage_3,percentage_4,percentage_5,percentage_6
2,-167250853,58,86,48,72,0,20,5,289,1052,0.274715,0.200692,0.297578,0.16609,0.249135,0.0,0.069204
3,-167250852,224,262,122,13,68,0,0,689,1618,0.425834,0.325109,0.380261,0.177068,0.018868,0.098694,0.0
5,-167250843,86,63,15,58,0,0,0,222,534,0.41573,0.387387,0.283784,0.067568,0.261261,0.0,0.0
6,-167250842,313,349,177,55,22,0,0,916,2245,0.408018,0.341703,0.381004,0.193231,0.060044,0.024017,0.0
7,-167250841,442,229,59,65,16,0,0,811,1547,0.52424,0.545006,0.282367,0.07275,0.080148,0.019729,0.0


In [136]:
# Read in the TAZ data:
TAZ = pd.read_csv('TAZ_Household_size-1.csv')
# The first 318 TAZ belong to New York
new_york_TAZ = TAZ.head(318)
new_york_TAZ.head()

Unnamed: 0,TAZ,HHNUM,HHSIZE
0,1,4347.4,2.036
1,2,2215.2,2.216
2,3,0.0,0.0
3,4,722.0,1.906
4,5,3733.0,1.676


In [137]:
# Start matching the HH_SIZE for the avg_hh in the new_york_TAZ
# Construct a dict to store all the ID of the residential block and its average hh value:
block_dict = {}
for block_row in new_york_df.itertuples(index=True, name='Pandas'):
    block_dict[getattr(block_row, "avg_hh")] =  getattr(block_row, "Id2")
# print(block_dict)  
matches = {}
# Initialize a empty dataframe which to be added to new_york_TAZ:
columns = ['1','2','3','4','5', '6', '7','avg_hh']
rows = []

In [138]:
# Iterate through the TAZ table:
for taz_row in new_york_TAZ.itertuples(index = True, name = 'Pandas'):
    
    current_hh_target = getattr(taz_row, 'HHSIZE')
    
    current_taz_id = getattr(taz_row, 'TAZ')
    
    # Find the closest element in the list according to the target number
    closest_key = min(block_dict.keys(), key=lambda x:abs(x-current_hh_target))
    
    # Retrieve the value stored corresponding to the key, which is the block id:
    current_block_id = block_dict[closest_key]
    matches[current_taz_id] = current_block_id
    
    row = new_york_df[new_york_df['Id2'] == current_block_id][columns].values.tolist()
    row = row[0]
    rows.append(row)
    

    # Remove the items from block_dict since we could not use the same block_id again:
    block_dict.pop(closest_key)    


In [144]:
# Fill in the table:
new_york_TAZ
# Combine two dataframe together

Unnamed: 0,TAZ,HHNUM,HHSIZE
0,1,4347.4,2.036
1,2,2215.2,2.216
2,3,0.0,0.000
3,4,722.0,1.906
4,5,3733.0,1.676
5,6,1732.2,1.814
6,7,2726.0,1.684
7,8,6.0,2.056
8,9,1237.8,2.236
9,10,1476.2,2.206


In [150]:
to_be_added = pd.DataFrame(rows, columns = columns)
# Combine two dataframe together:
result = pd.concat([new_york_TAZ, to_be_added], axis = 1)
writer = pd.ExcelWriter('output.xlsx')
result.to_excel(writer,'Sheet1')
writer.save()