In [3]:
import numpy as np
import pandas as pd

# For each agent, we want: age, gender, race, geo-location, occupation-status, household-id

# For each household, we want: income-level, lat-long, assets, expenses

In [4]:
demographic_data_file = '../nta/demo_2021acs5yr_nta.xlsx'

In [40]:
start_ix = 5
frequency = 5
unique_attrs = (demo_df.columns.shape[0] - start_ix)//frequency
print("Unique attributes: ", unique_attrs)

attribute_list = demo_df.columns[start_ix:demo_df.columns.shape[0]:frequency]

f = open('attr_file_demographic.txt', 'w+')

for item in attribute_list:
    f.write(str(item) + '\n')

f.close()

Unique attributes:  96


In [122]:
# for each nta - we want to process: age, sex and race
NTA_ID = 'BK0101' # GeoId from the census data
sample_nta_df = demo_df[demo_df['GeoID'] == NTA_ID]
sample_nta_df.head()

Unnamed: 0,GeoType,NTAType,GeogName,GeoID,Borough,Pop_1E,Pop_1M,Pop_1C,Pop_1P,Pop_1Z,...,AsnOAsnE,AsnOAsnM,AsnOAsnC,AsnOAsnP,AsnOAsnZ,Asn2plE,Asn2plM,Asn2plC,Asn2plP,Asn2plZ
0,NTA2020,0,Greenpoint,BK0101,Brooklyn,37518,1616.0,2.6,100.0,0.0,...,29,28.0,58.6,1.9,1.8,73,65.0,53.9,4.9,4.2


In [23]:
def get_nta_stats(stat_variable, estimation_vars=['E', 'M', 'C', 'P', 'Z'], filter=True):
    if filter:
        filter_vars = ['E', 'M', 'P']
        estimation_vars = filter_vars
    stats = [sample_nta_df['{}{}'.format(stat_variable, var)].values for var in estimation_vars]
    
    return stats

In [24]:
estimation_vars = ['E', 'M', 'C', 'P', 'Z'] # estimates, margins of error, coeff of variation, percents, percent margin of error

# total population stats
total_population_var = 'Pop_1'
total_population_stats = get_nta_stats(total_population_var, estimation_vars)
print(total_population_stats)

[array([37518]), array([1616.]), array([100.])]


In [30]:
agent_ages = ['PopU5', 'Pop5t9', 'Pop10t14', 'Pop15t19', 'Pop20t24', 'Pop25t29', 'Pop30t34', 'Pop35t39', 
            'Pop40t44', 'Pop45t49', 'Pop50t54', 'Pop55t59', 'Pop60t64', 'Pop65t69', 'Pop70t74', 'Pop75t79',
            'Pop80t84', 'Pop85pl', 'PopU181', 'Pop65pl1']

agents_under_18 = get_nta_stats('PopU181', estimation_vars)[0]
print("---"*20)

male_under_18 = get_nta_stats('PopU18M')
female_under_18 = get_nta_stats('PopU18F')
print("Under 18:", agents_under_18, female_under_18, male_under_18)

print("---"*20)
people_over_65 = get_nta_stats('Pop65pl2')
male_over_65 = get_nta_stats('Pop65plM')
female_over_65 = get_nta_stats('Pop65plF')

print("Over 65: ", people_over_65, male_over_65, female_over_65)

middle_age_males = 

# rougly male and female are split with 55% (Male) to 45% (Female) in middle ages

------------------------------------------------------------
Under 18: [4731] [array([2390]), array([387.]), array([50.5])] [array([2341]), array([404.]), array([49.5])]
------------------------------------------------------------
Over 65:  [array([3910]), array([442.]), array([100.])] [array([1735]), array([298.]), array([44.4])] [array([2175]), array([306.]), array([55.6])]


In [95]:
def merge_nta_stats(*args):
    ret_e, ret_p = 0, 0
    for val in args:
        e, _, p = val
        ret_e += e
        ret_p += p
    
    print(ret_e, ret_p)
    return [ret_e, ret_p]

def get_middle_age_gender(agents_stats, male_ratio=0.51):
    e, p = agents_stats
    male_e = int(male_ratio*e)
    female_e = e[0] - male_e

    male_p = male_ratio*p
    female_p = p[0] - male_p

    return {'male': [male_e, male_p], 'female': [female_e, female_p]}


agents_20t29 = merge_nta_stats(get_nta_stats('Pop20t24'), get_nta_stats('Pop25t29'))
agents_20t29_split = get_middle_age_gender(agents_20t29)
print("20 to 29: ", agents_20t29_split)

print("-------------")

agents_30t39 = merge_nta_stats(get_nta_stats('Pop30t34'), get_nta_stats('Pop35t39'))
agents_30t39_split = get_middle_age_gender(agents_30t39)
print("30 to 39: ", agents_30t39_split)

print("-------------")

agents_40t49 = merge_nta_stats(get_nta_stats('Pop40t44'), get_nta_stats('Pop45t49'))
agents_40t49_split = get_middle_age_gender(agents_40t49)
print("40 to 49: ", agents_40t49_split)

print("-------------")

agents_50t64 = merge_nta_stats(get_nta_stats('Pop50t54'), get_nta_stats('Pop55t59'), get_nta_stats('Pop60t64'))
agents_50t64_split = get_middle_age_gender(agents_50t64)
print("50 to 64: ", agents_50t64_split)

[6723] [18.]
20 to 29:  {'male': [3428, array([9.18])], 'female': [3295, array([8.82])]}
-------------
[11573] [30.9]
30 to 39:  {'male': [5902, array([15.759])], 'female': [5671, array([15.141])]}
-------------
[5724] [15.2]
40 to 49:  {'male': [2919, array([7.752])], 'female': [2805, array([7.448])]}
-------------
[4647] [12.4]
50 to 64:  {'male': [2369, array([6.324])], 'female': [2278, array([6.076])]}


In [45]:
# assign race to agents
# race: hispanic / latino, white, asian, black, nati

print("All agents: ", get_nta_stats('Pop_2'))

hispanic_agents = get_nta_stats('Hsp1')
print("Hispanic Agents: ", hispanic_agents)
print("Not Hispanic Agents: ", get_nta_stats('NHsp'))

hispanic_agents = get_nta_stats('Hsp1')
white_agents = get_nta_stats('WtNH')
black_agents = get_nta_stats('BlNH')
asian_agents = get_nta_stats('AsnNH')
native_agents = get_nta_stats('NHPINH')
american_indian_alaska_agents = get_nta_stats('AIANNH')

# print(hispanic_agents, white_agents, black_agents, asian_agents, native_agents, american_indian_alaska_agents)

ans = 0
race_mapping = {'hispanic': ['Hsp1'], 'white': ['WtNH'], 'black': ['AsnNH'], 'black': ['BlNH'], 
                'native': ['NHPINH', 'AIANNH'], 'other': ['OthNH', 'Rc2plNH'], 'asian': ['AsnNH']}

# asian, hispanic, white, black, native, other
race_list = ['Hsp1', 'WtNH', 'BlNH', 'AsnNH', 'NHPINH', 'AIANNH', 'OthNH', 'Rc2plNH']
for ix in race_list:
    val = get_nta_stats(ix)
    print(ix, val)
    ans += val[0]

print(ans)

All agents:  [array([37518]), array([1616.]), array([100.])]
Hispanic Agents:  [array([5498]), array([793.]), array([14.7])]
Not Hispanic Agents:  [array([32020]), array([1537.]), array([85.3])]
Hsp1 [array([5498]), array([793.]), array([14.7])]
WtNH [array([27273]), array([1360.]), array([72.7])]
BlNH [array([937]), array([349.]), array([2.5])]
AsnNH [array([1464]), array([308.]), array([3.9])]
NHPINH [array([0]), array([nan]), array([nan])]
AIANNH [array([22]), array([33.]), array([0.1])]
OthNH [array([210]), array([106.]), array([0.6])]
Rc2plNH [array([2114]), array([547.]), array([5.6])]
[37518]


# Compute Age, Gender and Race data for given NTA

In [153]:
def compute_stats(nta_df, attr, estimation_vars=['E', 'M', 'C', 'P', 'Z'], filter=True):
    if filter:
        filter_vars = ['E', 'P']
        estimation_vars = filter_vars
    stats = [nta_df['{}{}'.format(attr, var)].values for var in estimation_vars]
    
    return stats

def merge_nta_stats(nta_df, attr_vals):
    '''estimate and percentage'''
    args = [compute_stats(nta_df, attr, filter=True) for attr in attr_vals]
    ret_e, ret_p = 0, 0
    for val in args:
        e, p = val
        ret_e += e
        ret_p += p
    
    return [ret_e, ret_p]

def get_cumulative_stats(nta_df, attr_vals):
    total_estimate, total_percentage = merge_nta_stats(nta_df, attr_vals)
    return total_estimate, total_percentage

def get_gender_split(total_estimate, total_percentage, male_ratio):
    total_estimate = total_estimate.item()

    male_estimate = int(male_ratio*total_estimate)
    female_estimate = total_estimate - male_estimate

    male_percentage = male_ratio*total_percentage.item()
    female_percentage = total_percentage - male_percentage

    return {'male': [male_estimate, male_percentage], 'female': [female_estimate, female_percentage]}

In [154]:
def get_nta_race(df, nta_id, race_mapping):
    nta_df = df[df['GeoID'] == nta_id]

    nta_race = {}
    for key in race_mapping:
        nta_race[key] = {'percentage': 0.0, 'estimate': 0.0}

        for attr_value in race_mapping[key]:
            estimate, percentage = compute_stats(nta_df, attr_value, filter=True)

            nta_race[key]['percentage'] +=  max(0, percentage[0])
            nta_race[key]['estimate'] +=  max(0, estimate[0])

    return nta_race

In [155]:
def get_nta_age_gender(df, nta_id, age_mapping, male_ratio=0.508):
    '''estimate, percentage'''
    nta_df = df[df['GeoID'] == nta_id]

    nta_age_gender = {}
    for key in age_mapping:
        attr_vals = age_mapping[key]    
        total_estimate, total_percentage = get_cumulative_stats(nta_df, attr_vals)
        if key == 'U19':
            male_ratio = compute_stats(nta_df, 'PopU18M')[-1] / 100.0 # percentage of male < 19
        if key == '65A':
            male_ratio = compute_stats(nta_df, 'Pop65plM')[-1] / 100.0 # percentage of male > 65
        
        age_gender_stats = get_gender_split(total_estimate, total_percentage, male_ratio=male_ratio)

        nta_age_gender[key] = age_gender_stats

    return nta_age_gender

In [156]:
nta_demographics_file = '../nta/demo_2021acs5yr_nta.xlsx'

df = pd.read_excel(nta_demographics_file)
NTA_ID = 'BK0101'

In [150]:
race_mapping = {'hispanic': ['Hsp1'], 'white': ['WtNH'], 'black': ['AsnNH'], 'black': ['BlNH'], 
                'native': ['NHPINH', 'AIANNH'], 'other': ['OthNH', 'Rc2plNH'], 'asian': ['AsnNH']}

# race data
nta_race = get_nta_race(df, NTA_ID, race_mapping)

print(nta_race)

p = 0
for key in nta_race:
    p += nta_race[key]['estimate']
print(p)

{'hispanic': {'percentage': 14.7, 'estimate': 5498.0}, 'white': {'percentage': 72.7, 'estimate': 27273.0}, 'black': {'percentage': 2.5, 'estimate': 937.0}, 'native': {'percentage': 0.1, 'estimate': 22.0}, 'other': {'percentage': 6.199999999999999, 'estimate': 2324.0}, 'asian': {'percentage': 3.9, 'estimate': 1464.0}}
37518.0


In [157]:
age_mapping = {'U19': ['PopU5', 'Pop5t9', 'Pop10t14', 'Pop15t19'], '20t29': ['Pop20t24', 'Pop25t29'],  
                '30t39': ['Pop30t34', 'Pop35t39'], '40t49': ['Pop40t44', 'Pop45t49'], 
                '50t64': ['Pop50t54', 'Pop55t59', 'Pop60t64'], '65A': ['Pop65t69', 'Pop70t74', 'Pop75t79','Pop80t84', 'Pop85pl']}

# age data
nta_age_gender = get_nta_age_gender(df, NTA_ID, age_mapping)

In [158]:
nta_age_gender

{'U19': {'male': [2445, array([6.534])], 'female': [2496, array([6.666])]},
 '20t29': {'male': [3327, array([8.91])], 'female': [3396, array([9.09])]},
 '30t39': {'male': [5728, array([15.2955])],
  'female': [5845, array([15.6045])]},
 '40t49': {'male': [2833, array([7.524])], 'female': [2891, array([7.676])]},
 '50t64': {'male': [2300, array([6.138])], 'female': [2347, array([6.262])]},
 '65A': {'male': [1736, array([4.6176])], 'female': [2174, array([5.7824])]}}

In [162]:
val = 0
for key in nta_age_gender:
    val = val + nta_age_gender[key]['male'][0] + nta_age_gender[key]['female'][0]

print(val)

37518


In [144]:
NTA_ID = 'BK0101'

nta_df = df[df['GeoID'] == NTA_ID]

total_male = compute_stats(nta_df, 'Male')
total_female = compute_stats(nta_df,'Fem')

print("total: ", total_male, total_female)

male_under_18 = compute_stats(nta_df, 'PopU18M')
female_under_18 = compute_stats(nta_df, 'PopU18F')

print("total under 18: ", male_under_18, female_under_18)

male_over_65 = compute_stats(nta_df, 'Pop65plM')
female_over_65 = compute_stats(nta_df, 'Pop65plF')

print("total over 65: ", male_over_65, female_over_65)

middle_age_male_estimate = total_male[0] - male_under_18[0] - male_over_65[0]
middle_age_female_estimate = total_male[0] - female_under_18[0] - female_over_65[0]

print("Middle Age male estimates: ", middle_age_male_estimate)
print("Middle Age female estimates: ", middle_age_female_estimate)

middle_age_male_ratio = middle_age_male_estimate / (middle_age_male_estimate + middle_age_female_estimate)
print(middle_age_male_ratio)

total:  [array([18545]), array([49.4])] [array([18973]), array([50.6])]
total under 18:  [array([2341]), array([49.5])] [array([2390]), array([50.5])]
total over 65:  [array([1735]), array([44.4])] [array([2175]), array([55.6])]
Middle Age male estimates:  [14469]
Middle Age female estimates:  [13980]
[0.50859433]


In [147]:
compute_stats(nta_df, 'Pop65plM')[-1]

array([44.4])