In [169]:
# Libraries and dependencies
import numpy as np
import pandas as pd

In [170]:
# Read in the input files:
new_york_df = pd.read_csv('new_york_size.csv', skiprows=1)
bronx_df = pd.read_csv('bronx_size.csv', skiprows=1)
kings_df = pd.read_csv('kings_size.csv', skiprows=1)
queens_df = pd.read_csv('queens_size.csv', skiprows=1)
Richmond_df = pd.read_csv('Richmond_size.csv', skiprows=1)

In [171]:
# Change all the Id, i.e Id2 value, into integers:
for df in [new_york_df, queens_df, bronx_df, kings_df , Richmond_df]:
    df['Id2'] = df['Id2'].astype('int')

In [172]:
# Generate average household size for each dataframe, i.e adding a column in the dataframe:
# for dataframe in dataframe_list:
#     dataframe['avg_hh'] = dataframe['total hh num']/dataframe['total hhsize']
new_york_df['avg_hh'] = new_york_df['total hh num']/ new_york_df['total hhsize']
bronx_df['avg_hh'] = bronx_df['total hh num']/ bronx_df['total hhsize']
queens_df['avg_hh'] = queens_df['total hh num']/ queens_df['total hhsize']
kings_df['avg_hh'] = kings_df['total hh num']/kings_df['total hhsize']
Richmond_df['avg_hh'] = Richmond_df['total hh num']/ Richmond_df['total hhsize']

In [173]:
# Further clean the data by eliminating rows containing NaN:
new_york_df = new_york_df[~np.isnan(new_york_df).any(axis=1)]
bronx_df = bronx_df[~np.isnan(bronx_df).any(axis=1)]
queens_df = queens_df[~np.isnan(queens_df).any(axis=1)]
kings_df = kings_df[~np.isnan(kings_df).any(axis=1)]
Richmond_df = Richmond_df[~np.isnan(Richmond_df).any(axis=1)]

In [174]:
# Calculate the percentage for each household for each dataframe
for df in [new_york_df, queens_df, bronx_df, kings_df , Richmond_df]:
    for i in range(1,7):
        df['percentage_' + str(i)] = df[str(i)]/df['total hhsize']

In [175]:
# Read in the TAZ data:
TAZ = pd.read_csv('TAZ_Household_size-1.csv')
new_york_TAZ = TAZ.loc[0:317]
queens_TAZ = TAZ.loc[318:751]
bronx_TAZ = TAZ.loc[752:1024]
kings_TAZ = TAZ.loc[1025:1537]
Richmond_TAZ = TAZ.loc[1538:1621]

In [206]:
# The following function aims to match TAZ with corresponding block:
def matching(block_df, TAZ_input):
    # Construct a dict to store all the ID of the residential block and its average hh value:
    block_dict = {}
    for block_row in block_df.itertuples(index=True, name='Pandas'):
        block_dict[getattr(block_row, "avg_hh")] =  getattr(block_row, "Id2")  
    matches = {}
    # Initialize a empty dataframe which to be added to new_york_TAZ:
    columns = ['1','2','3','4','5', '6', '7','avg_hh']
    rows = []
    
    # Iterate through the TAZ table:
    
    for taz_row in TAZ.itertuples(index = True, name = 'Pandas'):

        current_hh_target = getattr(taz_row, 'HHSIZE')

        current_taz_id = getattr(taz_row, 'TAZ')

        # Find the closest element in the list according to the target number
        closest_key = min(block_dict.keys(), key=lambda x:abs(x-current_hh_target))
        

        # Retrieve the value stored corresponding to the key, which is the block id:
        current_block_id = block_dict[closest_key]

        matches[current_taz_id] = current_block_id

        row = block_df[block_df['Id2'] == current_block_id][columns].values.tolist()
        row = row[0]
        rows.append(row)
        # Remove the items from block_dict since we could not use the same block_id again:
        block_dict.pop(closest_key)
    
    # Construct a new dataframe to include all these rows:
    to_be_added = pd.DataFrame(rows, columns = columns)
    result = pd.concat([TAZ, to_be_added],axis = 1)
    
    return result

In [207]:
result_ny = matching(new_york_df, new_york_TAZ)
result_queens = matching(queens_df, queens_TAZ)
result_bronx = matching(bronx_df, bronx_TAZ)
result_kings = matching(kings_df, kings_TAZ)
result_Richmond = matching(Richmond_df, Richmond_TAZ)

In [208]:
result_queens

Unnamed: 0,TAZ,HHNUM,HHSIZE,1,2,3,4,5,6,7,avg_hh
0,,,,12.0,259.0,117.0,25.0,0.0,0.0,0.0,2.547215
1,,,,565.0,187.0,225.0,95.0,0.0,26.0,0.0,2.102914
2,,,,202.0,135.0,96.0,24.0,35.0,10.0,0.0,2.324701
3,,,,252.0,322.0,148.0,48.0,29.0,11.0,0.0,2.296296
4,,,,15.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000
5,,,,153.0,124.0,77.0,61.0,15.0,0.0,0.0,2.283721
6,,,,13.0,67.0,11.0,16.0,0.0,0.0,0.0,2.373832
7,,,,212.0,100.0,69.0,0.0,14.0,7.0,0.0,2.203980
8,,,,238.0,146.0,98.0,25.0,20.0,6.0,0.0,2.245779
9,,,,65.0,97.0,32.0,51.0,9.0,0.0,5.0,2.706564
