In [14]:
import numpy as np
import pandas as pd
import json

# Source data

In [2]:
demographic_file = '../nta/demo_2021acs5yr_nta.xlsx'
economic_file = '../nta/econ_2021acs5yr_nta.xlsx'
social_file = '../nta/soc_2021acs5yr_nta.xlsx'

In [3]:
race_mapping = {'hispanic': ['Hsp1'], 'white': ['WtNH'], 'black': ['AsnNH'], 'black': ['BlNH'], 
                'native': ['NHPINH', 'AIANNH'], 'other': ['OthNH', 'Rc2plNH'], 'asian': ['AsnNH']}

age_mapping = {'U19': ['PopU5', 'Pop5t9', 'Pop10t14', 'Pop15t19'], '20t29': ['Pop20t24', 'Pop25t29'],  
                '30t39': ['Pop30t34', 'Pop35t39'], '40t49': ['Pop40t44', 'Pop45t49'], 
                '50t64': ['Pop50t54', 'Pop55t59', 'Pop60t64'], '65A': ['Pop65t69', 'Pop70t74', 'Pop75t79','Pop80t84', 'Pop85pl']}

education_mapping = {'high_school': ['EA_LT9G', 'EA_9t12ND', 'EA_HScGrd'],
                    'college_degree': ['EA_SClgND', 'EA_AscD', 'EA_BchD'],
                    'graduate_degree': ['EA_GrdPfD'],
                    'studying': ['Pop3plEn']}

employment_insurance_mapping = {'employed_insured': 'EmHIns', 'employed_uninsured': 'EmNHIns',
                                'unemployed_insured': 'UEmHIns', 'unemployed_uninsured': 'UEmNHIns',
                                'nolabor_insured': 'NLFHIns', 'nolabor_uninsured': 'NLFNHIns'}

# parsers

In [9]:
def compute_stats(nta_df, attr, estimation_vars=['E', 'M', 'C', 'P', 'Z'], filter=True):
    if filter:
        filter_vars = ['E', 'P']
        estimation_vars = filter_vars
    stats = [nta_df['{}{}'.format(attr, var)].values for var in estimation_vars]
    
    return stats

def merge_nta_stats(nta_df, attr_vals):
    '''estimate and percentage'''
    args = [compute_stats(nta_df, attr, filter=True) for attr in attr_vals]
    ret_e, ret_p = 0, 0
    for val in args:
        e, p = val
        ret_e += e
        ret_p += p
    
    return [ret_e, ret_p]

def get_gender_split(total_estimate, total_percentage, male_ratio):
    total_estimate = total_estimate.item()

    male_estimate = int(male_ratio*total_estimate)
    female_estimate = total_estimate - male_estimate

    male_percentage = (male_ratio*total_percentage.item())
    female_percentage = (total_percentage - male_percentage)

    male_probability, female_probability = male_percentage / 100, female_percentage / 100

    return_dict = {'male': [male_estimate, male_probability], 'female': [female_estimate, female_probability]}

    #return_dict = {'estimate': {'male': male_estimate, 'female': female_estimate}, 'probability': {'male': male_percentage, 'female': female_percentage}}

    return return_dict

## helpers: Age, Race, Gender, Education, Employment, Insurance, Occupation [to be done]

In [10]:
# Property 1: Race
def get_nta_race(df, nta_id, race_mapping):
    nta_df = df[df['GeoID'] == nta_id]

    nta_race = {}
    for key in race_mapping:
        nta_race[key] = {}

        estimate, percentage = merge_nta_stats(nta_df, race_mapping[key])

        nta_race[key]['estimate'] = max(0, estimate[0])
        nta_race[key]['probability'] = max(0, percentage[0])/100

    return nta_race


# Property 2,3: Age and Gender

def get_nta_age_gender(df, nta_id, age_mapping, male_ratio=0.508):
    '''estimate, percentage'''
    nta_df = df[df['GeoID'] == nta_id]

    nta_age_gender = {}
    for key in age_mapping:
        attr_vals = age_mapping[key]

        total_estimate, total_percentage = merge_nta_stats(nta_df, attr_vals)
        if key == 'U19':
            male_ratio = compute_stats(nta_df, 'PopU18M')[-1] / 100.0 # percentage of male < 19
        if key == '65A':
            male_ratio = compute_stats(nta_df, 'Pop65plM')[-1] / 100.0 # percentage of male > 65
        
        gender_split_stats = get_gender_split(total_estimate, total_percentage, male_ratio=male_ratio)

        male_stats = gender_split_stats['male']
        female_stats = gender_split_stats['female']

        key_male = key + '_male'
        key_female = key + '_female'

        nta_age_gender[key_male] = {'estimate': male_stats[0], 'probability': male_stats[1]}
        nta_age_gender[key_female] = {'estimate': female_stats[0], 'probability': female_stats[1]}

        # nta_age_gender[key] = age_gender_stats

    return nta_age_gender


# Property 4: Education Status

def get_nta_education(df, nta_id, education_mapping):
    '''Education status can be completed (>25) or studying (for <=25)'''
    nta_df = df[df['GeoID'] == nta_id]
    nta_education = {} # for agents 25 and above

    studying = compute_stats(nta_df, 'Pop3plEn')[0]
    nolonger_studying = compute_stats(nta_df, 'EA_P25pl')[0]
    total_educated_studying = studying + nolonger_studying

    for key in education_mapping: # education_mapping is for >25 yr old agents
        nta_education[key] = {}
        estimate, _ = merge_nta_stats(nta_df, education_mapping[key])

        nta_education[key]['estimate'] = max(0, estimate[0])
        nta_education[key]['probability'] = nta_education[key]['estimate'] / total_educated_studying

    return nta_education


# Property 5,6: Employment and Insurance status

def get_nta_employ_insure(df, nta_id, employ_insure_mapping):
    nta_df = df[df['GeoID']==nta_id]

    total_eligible_agents = compute_stats(nta_df, 'CNI1864_2')[0]

    nta_employ_insure = {}
    for category in employ_insure_mapping:
        estimate, _ = compute_stats(nta_df, employ_insure_mapping[category])

        nta_employ_insure[category] = {}
        nta_employ_insure[category]['estimate'] = estimate[0]
        nta_employ_insure[category]['probability'] = estimate[0] / total_eligible_agents

    return nta_employ_insure

# processing

In [42]:
# education from social file
social_df = pd.read_excel(social_file)
# age, gender, race from demographic file
demo_df = pd.read_excel(demographic_file)
# employment, insurance from economic file
econ_df = pd.read_excel(economic_file)

In [11]:
NTA_ID = 'BK0101'

In [13]:
nta_race = get_nta_race(demo_df, NTA_ID, race_mapping)
print("Total agents: ", sum([nta_race[key_ix]['estimate'] for key_ix in nta_race]), " prob: ", sum([nta_race[key_ix]['probability'] for key_ix in nta_race]))

nta_age_gender = get_nta_age_gender(demo_df, NTA_ID, age_mapping)
print("Total agents: ", sum([nta_age_gender[key_ix]['estimate'] for key_ix in nta_age_gender]), " prob: ", sum([nta_age_gender[key_ix]['probability'] for key_ix in nta_age_gender]))

nta_employ_insure = get_nta_employ_insure(econ_df, NTA_ID, employment_insurance_mapping)
print("Total employed agents: ", sum([nta_employ_insure[key_ix]['estimate'] for key_ix in nta_employ_insure]), " prob: ", sum([nta_employ_insure[key_ix]['probability'] for key_ix in nta_employ_insure]))

nta_education = get_nta_education(social_df, NTA_ID, education_mapping)
print("Total education agents: ", sum([nta_education[key_ix]['estimate'] for key_ix in nta_education]), " prob: ", sum([nta_education[key_ix]['probability'] for key_ix in nta_education]))

Total agents:  37518  prob:  1.0
Total agents:  37518  prob:  [1.001]
Total employed agents:  28760  prob:  [1.]
Total education agents:  36091  prob:  [1.]


In [19]:
# NTA_ID: {'num_agents': [], 'race_prob': [], 'age_gender_prob': [], 'education_prob': [], 'insurance_employ_prob': []}

num_agents_age = sum([nta_race[key_ix]['estimate'] for key_ix in nta_race])
num_agents_race = sum([nta_age_gender[key_ix]['estimate'] for key_ix in nta_age_gender])

assert int(num_agents_age) == int(num_agents_race)

race_prob = [nta_race[key_ix]['probability'] for key_ix in nta_race]
age_gender_prob = [nta_age_gender[key_ix]['probability'] for key_ix in nta_age_gender]
education_prob = [nta_education[key_ix]['probability'] for key_ix in nta_education]
insurance_employ_prob = [nta_employ_insure[key_ix]['probability'] for key_ix in nta_employ_insure]

nta_dict = {}

nta_dict['nta_id'] = NTA_ID
nta_dict['num_agents'] = num_agents_age

nta_dict['race_prob'] = race_prob
nta_dict['age_gender_prob'] = age_gender_prob
nta_dict['education_prob'] = education_prob
nta_dict['insurance_employ_prob'] = insurance_employ_prob

print(nta_dict)

{'nta_id': 'BK0101', 'num_agents': 37518, 'race_prob': [0.147, 0.727, 0.025, 0.0, 0.06199999999999999, 0.039], 'age_gender_prob': [array([0.06534]), array([0.06666]), array([0.0891]), array([0.0909]), array([0.152955]), array([0.156045]), array([0.07524]), array([0.07676]), array([0.06138]), array([0.06262]), array([0.046176]), array([0.057824])], 'education_prob': [array([0.18614059]), array([0.48399878]), array([0.18192901]), array([0.14793162])], 'insurance_employ_prob': [array([0.74954798]), array([0.06442976]), array([0.05027816]), array([0.01630737]), array([0.11001391]), array([0.00942281])]}


In [20]:
all_nta_ids = demo_df['GeoID']

0    BK0101
1    BK0102
2    BK0103
3    BK0104
4    BK0201
5    BK0202
6    BK0203
7    BK0204
8    BK0301
9    BK0302
Name: GeoID, dtype: object


In [21]:
all_nta_ids.shape[0]

262

In [22]:
import numpy as np

In [34]:
file_path = '../generate_data/all_nta_dict.npy'

nta_dicts = np.load(file_path, allow_pickle=True).item()

In [35]:
nta_dicts.keys()

dict_keys(['BK0101', 'BK0102', 'BK0103'])

In [41]:
sum(nta_dicts['BK0101']['insurance_employ_prob'])

1.0