In [4]:
import numpy as np
import pandas as pd

Process: household count, household size, and individual education status

In [5]:
# Path to the NTA-level social table (an Excel workbook), relative to this notebook.
# The file name suggests the 2021 ACS 5-year release — TODO confirm provenance.
social_data = '../nta/soc_2021acs5yr_nta.xlsx'
# Load the workbook into a DataFrame (one row per geography, per the preview below).
social_df = pd.read_excel(social_data)

# Preview the first rows to check the column layout.
social_df.head()

Unnamed: 0,GeoType,NTAType,GeogName,GeoID,Borough,HH1E,HH1M,HH1C,HH1P,HH1Z,...,HHCompE,HHCompM,HHCompC,HHCompP,HHCompZ,HHIntE,HHIntM,HHIntC,HHIntP,HHIntZ
0,NTA2020,0,Greenpoint,BK0101,Brooklyn,17487,659.0,2.3,100.0,0.0,...,16441,674.0,2.5,94.0,1.5,15969,678.0,2.6,91.3,1.8
1,NTA2020,0,Williamsburg,BK0102,Brooklyn,26955,958.0,2.2,100.0,0.0,...,25075,951.0,2.3,93.0,1.2,24590,997.0,2.5,91.2,1.8
2,NTA2020,0,South Williamsburg,BK0103,Brooklyn,10613,556.0,3.2,100.0,0.0,...,5700,557.0,5.9,53.7,4.4,3890,467.0,7.3,36.7,4.0
3,NTA2020,0,East Williamsburg,BK0104,Brooklyn,22665,710.0,1.9,100.0,0.0,...,20390,728.0,2.2,90.0,1.5,19464,754.0,2.4,85.9,2.0
4,NTA2020,0,Brooklyn Heights,BK0201,Brooklyn,11857,681.0,3.5,100.0,0.0,...,11688,680.0,3.5,98.6,0.9,11423,680.0,3.6,96.3,1.5


In [6]:
# GeoID of the NTA analysed in the cells below (the Greenpoint row in the preview above).
NTA_ID = 'BK0101'

In [7]:
# Keep only the row(s) whose GeoID equals the NTA under study.
nta_social_df = social_df.loc[social_df['GeoID'] == NTA_ID]

nta_social_df.head()

Unnamed: 0,GeoType,NTAType,GeogName,GeoID,Borough,HH1E,HH1M,HH1C,HH1P,HH1Z,...,HHCompE,HHCompM,HHCompC,HHCompP,HHCompZ,HHIntE,HHIntM,HHIntC,HHIntP,HHIntZ
0,NTA2020,0,Greenpoint,BK0101,Brooklyn,17487,659.0,2.3,100.0,0.0,...,16441,674.0,2.5,94.0,1.5,15969,678.0,2.6,91.3,1.8


In [8]:
def show_columns(nta_df, start_ix=5, frequency=5):
    """Print the number of attributes in the table and return their first column names.

    Columns from position ``start_ix`` onward are treated as runs of
    ``frequency`` consecutive variants per attribute (in this notebook's table
    the suffixes are E, M, C, P, Z), so every ``frequency``-th column starting
    at ``start_ix`` names one attribute.

    Args:
        nta_df: DataFrame whose columns are inspected.
        start_ix: Position of the first attribute column; earlier columns are
            skipped.
        frequency: Number of consecutive columns belonging to each attribute.

    Returns:
        pandas Index holding the first column name of every attribute group.
    """
    attribute_list = nta_df.columns[start_ix::frequency]

    # Count the names actually returned. Floor-dividing the remaining column
    # count by `frequency` under-reports when the last group is incomplete,
    # which made the printed number disagree with the returned Index.
    print("Unique attributes: ", len(attribute_list))

    return attribute_list

# List the attribute columns of the selected NTA table (every 5th column from index 5, the defaults).
print(show_columns(nta_social_df))


Unique attributes:  437
Index(['HH1E', 'Fam1E', 'FamChU18E', 'MrdFamE', 'MrdChU18E', 'MHnSE',
       'MHnSChU18E', 'FHnSE', 'FHnSChU18E', 'NFam1E',
       ...
       'TAndTobE', 'USVrgIsE', 'WIndSubE', 'OWIndE', 'YugoSlvE', 'OthrE',
       'UnclsNRE', 'HH3E', 'HHCompE', 'HHIntE'],
      dtype='object', length=437)


In [9]:
def compute_stats(nta_df, attr, estimation_vars=['E', 'M', 'C', 'P', 'Z'], filter=True):
    """Collect the value arrays of an attribute's variant columns.

    Each variant lives in the column named ``attr`` followed by a one-letter
    suffix (e.g. ``'HH1' + 'E'``).

    Args:
        nta_df: DataFrame holding the columns.
        attr: Attribute base name (column name without its suffix).
        estimation_vars: Suffixes to fetch. Only honoured when ``filter`` is
            False.
        filter: When True (the default) the suffixes are forced to
            ``['E', 'P']`` and ``estimation_vars`` is ignored.

    Returns:
        List of NumPy arrays, one per suffix, in suffix order.
    """
    # The default path always narrows the lookup to the 'E' and 'P' columns.
    suffixes = ['E', 'P'] if filter else estimation_vars

    stats = []
    for suffix in suffixes:
        column_name = f'{attr}{suffix}'
        stats.append(nta_df[column_name].values)

    return stats

In [10]:
def merge_nta_stats(nta_df, attr_vals):
    '''Add up the estimate and percentage columns of several attributes.

    Every attribute in ``attr_vals`` is looked up with ``compute_stats`` in its
    filtered mode, and the resulting arrays are summed element-wise.

    Returns a two-item list ``[summed_estimate, summed_percentage]``. When
    ``attr_vals`` is empty both items are the plain integer 0.
    '''
    per_attribute = [compute_stats(nta_df, name, filter=True) for name in attr_vals]

    total_estimate = 0
    total_percentage = 0
    for estimate, percentage in per_attribute:
        total_estimate += estimate
        total_percentage += percentage

    return [total_estimate, total_percentage]

## Individual education status

In [11]:
# [estimate, percentage] for 'EA_P25pl'; the variable name suggests this is the
# population aged 25 and over — TODO confirm against the data dictionary.
pop_25over = compute_stats(nta_social_df, 'EA_P25pl')
print(pop_25over)

# Attainment columns grouped into three coarse education levels.
education_mapping = {
    'high_school': ['EA_LT9G', 'EA_9t12ND', 'EA_HScGrd'],
    'college_degree': ['EA_SClgND', 'EA_AscD', 'EA_BchD'],
    'graduate_degree': ['EA_GrdPfD'],
}

print("----"*15)

# Merge and print each level in mapping order, then expose the per-level names.
level_totals = {}
for level, columns in education_mapping.items():
    level_totals[level] = merge_nta_stats(nta_social_df, columns)
    print(level_totals[level])

high_school = level_totals['high_school']
college_degree = level_totals['college_degree']
graduate_degree = level_totals['graduate_degree']

[array([30752]), array([100.])]
------------------------------------------------------------
[array([6718]), array([21.9])]
[array([17468]), array([56.7])]
[array([6566]), array([21.4])]


In [12]:
# [estimate, percentage] for 'Pop3plEn'; the name `studying` suggests this is the
# population currently enrolled in school — TODO confirm against the data dictionary.
studying = compute_stats(nta_social_df, 'Pop3plEn')
print(studying)

[array([5339]), array([100.])]


In [14]:
def get_nta_education(df, nta_id, education_mapping):
    '''Summarise education status for one NTA.

    Each category in ``education_mapping`` is reported as a head count and as
    a share of the combined "studying" ('Pop3plEn') plus "no longer studying"
    ('EA_P25pl') population of that NTA.

    Args:
        df: Table with a 'GeoID' column plus the E/P columns of every
            attribute referenced here.
        nta_id: GeoID value selecting the NTA.
        education_mapping: Dict mapping a category name to the list of
            attribute base names whose estimates are summed for it.

    Returns:
        Dict ``{category: {'estimate': ..., 'percentage': ...}}``.
        'estimate' is the first row's summed estimate, clamped at 0.
        'percentage' is that estimate divided by the combined population: a
        fraction rather than a 0-100 value, returned as a NumPy array with
        one entry per matching row.

    Raises:
        IndexError: if no row of ``df`` has ``GeoID == nta_id`` and the
            mapping is non-empty.
    '''
    nta_df = df[df['GeoID'] == nta_id]

    # Build the denominator from the rows selected for `nta_id`. Reading the
    # notebook-level `nta_social_df` here would ignore the `df` / `nta_id`
    # arguments and silently reuse whichever NTA the notebook last selected.
    studying = compute_stats(nta_df, 'Pop3plEn')[0]
    nolonger_studying = compute_stats(nta_df, 'EA_P25pl')[0]
    total_educated_studying = studying + nolonger_studying

    nta_education = {}
    for key, attrs in education_mapping.items():
        estimate, _ = merge_nta_stats(nta_df, attrs)

        # Clamp at zero so a negative summed estimate never becomes a negative head count.
        count = max(0, estimate[0])
        nta_education[key] = {
            'estimate': count,
            'percentage': count / total_educated_studying,
        }

    return nta_education

In [15]:
# Same three attainment groups as before, plus a 'studying' category backed by
# 'Pop3plEn' so the categories cover both parts of the denominator used by
# get_nta_education.
education_mapping = {'high_school': ['EA_LT9G', 'EA_9t12ND', 'EA_HScGrd'],
                    'college_degree': ['EA_SClgND', 'EA_AscD', 'EA_BchD'],
                    'graduate_degree': ['EA_GrdPfD'],
                    'studying': ['Pop3plEn']
                    }

# Per-category estimate and share for NTA 'BK0101', computed from the full table.
result = get_nta_education(social_df, 'BK0101', education_mapping)

In [18]:
# Sanity check: total head count across every education category in `result`.
ans = sum(category['estimate'] for category in result.values())

ans

36091

## Household demographics

The main task is to assign individuals to households.

In [19]:
# Each call returns [estimate, percentage] arrays for the selected NTA.
# Attribute meanings are taken from the variable names — TODO confirm against the data dictionary.
total_households = compute_stats(nta_social_df, 'HH1')
family_households = compute_stats(nta_social_df, 'Fam1')
nonfamily_households = compute_stats(nta_social_df, 'NFam1')

print(total_households, family_households, nonfamily_households)

[array([17487]), array([100.])] [array([7252]), array([41.5])] [array([10235]), array([58.5])]


In [27]:
# filter=False returns all five variant columns in the order E, M, C, P, Z,
# so index 0 is the estimate. The recorded output shows NaN for P and Z here.
average_household_size = compute_stats(nta_social_df, 'AvgHHSz', filter=False)
average_family_size = compute_stats(nta_social_df, 'AvgFmSz', filter=False)

print("household size: ", average_household_size)
print("family size: ", average_family_size)

household size:  [array([2.13]), array([0.12]), array([3.5]), array([nan]), array([nan])]
family size:  [array([2.75]), array([0.29]), array([6.4]), array([nan]), array([nan])]


In [28]:
# [estimate, percentage] for 'HHPop' (population living in households, per the variable name).
population_in_households = compute_stats(nta_social_df, 'HHPop')
print("population in households: ", population_in_households)

# Approximate people in family households: family household count x average family size.
family_household_population = family_households[0] * average_family_size[0]
print("family household population: ", family_household_population)

# Everyone in households who is not accounted for above is attributed to non-family households.
# NOTE(review): if average family size counts only related members, unrelated people living
# in family households end up in this residual — confirm the definition.
non_family_population = population_in_households[0] - family_household_population
print("non-family household population: ", non_family_population)

# Implied average size of a non-family household.
non_family_household_size = non_family_population / nonfamily_households[0]
print("average size of non-family household size: ", non_family_household_size)

population in households:  [array([37213]), array([100.])]
family household population:  [19943.]
non-family household population:  [17270.]
average size of non-family household size:  [1.68734734]


## Family Households

In [37]:
# Same 'Fam1' lookup as in the household demographics cell; repeated here for this section.
family_households = compute_stats(nta_social_df, 'Fam1')
print(family_households)

# All five variants of 'AvgFmSz'; the first three printed are E, M and C.
family_size = compute_stats(nta_social_df, 'AvgFmSz', filter=False)
print(family_size[0], family_size[1], family_size[2])

# [estimate, percentage] for 'FamChU18' (family households with children under 18,
# per the variable name — TODO confirm).
family_households_children = compute_stats(nta_social_df, 'FamChU18')
print(family_households_children)

print("--------------")

[array([7252]), array([41.5])]
[2.75] [0.29] [6.4]
[array([2604]), array([14.9])]
--------------


In [34]:
# [estimate, percentage] for 'MrdFam' (married-couple family households, per the variable name).
family_households_married = compute_stats(nta_social_df, 'MrdFam')
print(family_households_married)

# [estimate, percentage] for 'MrdChU18' (the married subset with children under 18,
# per the variable name — TODO confirm).
family_households_married_children = compute_stats(nta_social_df, 'MrdChU18')
print(family_households_married_children)

[array([5571]), array([31.9])]
[array([1907]), array([10.9])]


## Non Family Households

In [24]:
# Same 'NFam1' lookup as in the household demographics cell; repeated here for this section.
nonfamily_households = compute_stats(nta_social_df, 'NFam1')

print(nonfamily_households)

[array([10235]), array([58.5])]


In [39]:
# [estimate, percentage] for 'NFamA' and 'NFamA65pl'. The variable names suggest one-person
# households and the 65+ subset of them — TODO confirm against the data dictionary.
single_person_households = compute_stats(nta_social_df, 'NFamA')
single_person_households_retired = compute_stats(nta_social_df, 'NFamA65pl')

# Non-family households that are not single-person, derived by subtracting the estimates.
multiple_nonfamily_households = nonfamily_households[0] - single_person_households[0]

print(single_person_households)
print(single_person_households_retired)
print(multiple_nonfamily_households)

[array([5535]), array([31.7])]
[array([1491]), array([8.5])]
[4700]
