This script defines two functions:

ebike_flag_binary
wfh_flag_binary

Last updated: Brian - May 1, 2022

WFH function work done in [wfh conditional probabilities and wfh function check notebook](./WFH%20Conditional%20Probabilities%20and%20WFH%20Function%20Check.ipynb)

In [27]:
import pandas as pd
import random

# Cleaned, disaggregated IPUMS data; the first CSV column becomes the row index.
ipums_df = pd.read_csv(
    "../ipums_data/disaggregated_cleaned_ipums_data.csv",
    index_col=0,
)

In [28]:
# Distinct PUMA names whose text mentions one of the listed NYC boroughs or Bergen.
# A single regex alternation selects the same rows as OR-ing one mask per fragment;
# na=False treats a missing PUMA_NAME as "no match".
bike_friendly_origins = ipums_df.loc[
    ipums_df['PUMA_NAME'].str.contains(
        "NYC-Brook|NYC-Queen|NYC-Bronx|NYC-Manh|Bergen", na=False
    ),
    'PUMA_NAME',
].unique()

In [29]:
def ebike_flag_binary(max_age,max_distance,bike_friendly_origins,male_pct,female_pct,age_dist):
    '''
    Flag each row of the module-level ipums_df as an eligible e-bike rider (1) or not (0).

    inputs
    Hard Caps:
    max_age - maximum age of e-bikers (compared with the AGE column, inclusive).
        Early research indicates 70 is a realistic cut-off
    max_distance - maximum distance traveled by e-bikers, in kilometres because it is
        compared with the DISTANCE_KM column (inclusive). 24 KM (15 miles) to start.
    bike_friendly_origins - PUMA_NAME values that count as origins with bike
        infrastructure leading into Manhattan
        ** Bronx, Queens, Brooklyn, Northern NJ

    Changeable inputs:
    male_pct & female_pct - how many, of each sex, will ride an e-bike of eligible riders? 0-100 value
        0 flags nobody of that sex, 100 flags everybody of that sex.
    age_dist - currently unused. To be determined how we can use age distributions to
        determine ridership.
        Ex) 35 year olds may be 2x more likely to ride than a 50 year old

    output:
        series (0,1), indexed like ipums_df, indicating whether each line is an
        eligible e-bike rider or not
    '''
    age_hardcap = ipums_df['AGE'] <= max_age
    dist_hardcap = ipums_df['DISTANCE_KM'] <= max_distance
    bike_infra_locs = ipums_df['PUMA_NAME'].isin(bike_friendly_origins)

    def sampled_riders(sex_code, pct):
        # One random draw per row (also for rows of the other sex), so the sequence of
        # draws is the same as a row-by-row apply over the SEX column.
        # random.random() lies in [0.0, 1.0): the strict '<' guarantees that pct=0
        # selects nobody while pct=100 still selects everyone.
        share = pct / 100
        picks = [random.random() < share and sex == sex_code for sex in ipums_df['SEX']]
        return pd.Series(picks, index=ipums_df.index, dtype=bool)

    ### Gender - males are sampled first, then females; any other SEX value is never flagged
    sex_flag = sampled_riders('M', male_pct) | sampled_riders('F', female_pct)

    ### Age - TBD if we use an age distribution or buckets
    final_series = age_hardcap & dist_hardcap & bike_infra_locs & sex_flag

    return final_series.astype(int)
    
    
    
  

In [35]:
# WFH probability lookup table; the first CSV column is the index and the
# WFH_TAG column is dropped after loading.
wfh_probs = pd.read_csv("wfh_conditional_probs.csv", index_col=0)
wfh_probs = wfh_probs.drop(columns="WFH_TAG")
def wfh_flag_binary(wfh_dampener):
    '''
    Flag each row of the module-level ipums_df as a likely work-from-home (WFH) candidate.

    Changeable inputs:
    wfh_dampener: decimal 0-1
        if 1, then just use probs as is.
        if <1, multiply probabilities by dampener to reduce WFH population
        if 0, nobody will WFH

    WFH taken as a conditional prob of income and education from Census Household Pulse Survey results last year
    (looked up in the module-level wfh_probs table by INC_TAG and EDUC_TAG).
    Rows whose industry (IND_CAT) is in the cannot-WFH list are always 0.

    side effects:
        writes/overwrites the INC_TAG and EDUC_TAG columns of ipums_df

    raises:
        ValueError if wfh_dampener is not within 0-1
        pandas MergeError if wfh_probs has more than one row per (INC_TAG, EDUC_TAG)

    output:
        series (0,1), indexed like ipums_df, indicating whether each line is a likely WFH candidate
    '''
    # Raise instead of returning a message: a returned string would be silently
    # broadcast into the caller's flag column.
    if not 0 <= wfh_dampener <= 1:
        raise ValueError(
            "Not a valid dampener: expected a value between 0 and 1, got %r" % (wfh_dampener,)
        )

    # Lower bound of each income bracket, highest first; anything below the last
    # bound (including missing income) falls into '0) 0-25K'.
    income_brackets = (
        (200000, '7) 200K+'),
        (150000, '6) 150-200K'),
        (100000, '5) 100-150K'),
        (75000, '4) 75-100K'),
        (50000, '3) 50-75K'),
        (35000, '2) 35-50K'),
        (25000, '1) 25-35K'),
    )

    def income_prob_label(inp_income):
        for lower_bound, label in income_brackets:
            if inp_income >= lower_bound:
                return label
        return '0) 0-25K'

    ipums_df['INC_TAG'] = ipums_df["TOTAL_PERSONAL_INCOME"].apply(income_prob_label)
    # 1 for a 4-year or longer college education, 0 otherwise.
    ipums_df['EDUC_TAG'] = ipums_df["EDUC_LABEL"].isin(
        ['College_4Year', 'College_5PlusYears']
    ).astype(int)

    industries_cannot_wfh = [
        "Educational Services, and Health Care and Social Assistance",
        "Arts, Entertainment, and Recreation, and Accommodation and Food Services",
        "Retail Trade",
        "Construction",
        "Other Services, Except Public Administration",
        "Transportation and Warehousing, and Utilities",
        "Manufacturing",
        "Agriculture, Forestry, Fishing, and Hunting, and Mining",
    ]
    industry_binary = (~ipums_df["IND_CAT"].isin(industries_cannot_wfh)).astype(int)

    # merge() returns a fresh 0..n-1 index. A left merge keeps exactly one output row
    # per ipums_df row, in ipums_df order, so the probabilities can be re-attached to
    # ipums_df's own index positionally. validate= fails loudly if the lookup table
    # would duplicate rows.
    tag_cols = ["INC_TAG", "EDUC_TAG"]
    matched = ipums_df[tag_cols].merge(
        right=wfh_probs, on=tag_cols, how="left", validate="many_to_one"
    )
    scaled_probs = matched["PROB WFH | INC * EDUC"].to_numpy() * wfh_dampener

    # random.random() lies in [0.0, 1.0): strict '<' means a probability of 0 never
    # fires and a probability of 1 always does. Rows without a matching probability
    # (NaN) compare False and are therefore flagged 0.
    wfh_binary = pd.Series(
        [1 if random.random() < prob else 0 for prob in scaled_probs],
        index=ipums_df.index,
        dtype="int64",
    )

    wfh_overall_binary = industry_binary & wfh_binary

    return wfh_overall_binary

In [36]:
    
########################
#### How to Run ########
########################
# E-bike flag: riders up to age 70, trips up to 24 km, starting in a bike-friendly
# origin, with 100% of eligible males and females riding.
ipums_df['FLAG_EBIKE'] = ebike_flag_binary(
    max_age=70,
    max_distance=24,
    bike_friendly_origins=bike_friendly_origins,
    male_pct=100,
    female_pct=100,
    age_dist=None,
)

# WFH flag: the dampener only scales the probabilities down overall.
# Keep at 1 for most runs.
ipums_df["FLAG_WFH"] = wfh_flag_binary(wfh_dampener=1)

In [37]:
######################
###### CHECKS ########
######################

# Work on a copy restricted to 2019, just to make the checks simpler.
check_df = ipums_df.loc[ipums_df['YEAR'] == 2019].copy()

### population by ebike flag: PERWT summed per (YEAR, FLAG_EBIKE)
check_df.groupby(['YEAR', 'FLAG_EBIKE'])[['PERWT']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,PERWT
YEAR,FLAG_EBIKE,Unnamed: 2_level_1
2019,0,982351.0
2019,1,1818979.0


In [38]:
### population by wfh flag: PERWT summed per (YEAR, FLAG_WFH)
check_df.groupby(['YEAR', 'FLAG_WFH'])[['PERWT']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,PERWT
YEAR,FLAG_WFH,Unnamed: 2_level_1
2019,0,2122387.0
2019,1,678943.0


In [39]:
### education by wfh flag
# Summed PERWT per education level (rows) and WFH flag (columns).
wfh_by_edu = (
    check_df.groupby(['YEAR', "FLAG_WFH", 'EDUC_LABEL'])
    .agg({"PERWT": "sum"})
    .reset_index()
    .pivot_table(index='EDUC_LABEL', columns='FLAG_WFH', values='PERWT')
)
# Share of each education level by WFH flag (each row sums to 1). Kept under its own
# name so the result is not thrown away; display it instead of wfh_by_edu to see shares.
wfh_share_by_edu = wfh_by_edu.divide(wfh_by_edu.sum(axis=1), axis=0)
wfh_by_edu

FLAG_WFH,0,1
EDUC_LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1
College_1Year,203085.0,53317.0
College_2Year,118632.0,30394.0
College_4Year,714684.0,299741.0
College_5PlusYears,504730.0,187152.0
Grade0_4,6198.0,1349.0
Grade10,18843.0,2433.0
Grade11,26618.0,2880.0
Grade9,18730.0,2603.0
Grades12,444347.0,91362.0
Grades5_8,41818.0,5036.0


In [40]:
### cross-tabs of the 2 flags
# Summed PERWT with FLAG_EBIKE as rows and FLAG_WFH as columns.
ct = (
    check_df.groupby(['YEAR', "FLAG_WFH", 'FLAG_EBIKE'])
    .agg({"PERWT": "sum"})
    .reset_index()
    .pivot_table(index='FLAG_EBIKE', columns='FLAG_WFH', values='PERWT')
)

# Displayed as a tuple: each cell as a share of the grand total, a separator, then the counts.
"pcts overall", ct / ct.sum().sum(), "--"*50, 'gross counts', ct

('pcts overall',
 FLAG_WFH           0         1
 FLAG_EBIKE                    
 0           0.257673  0.093000
 1           0.499963  0.149364,
 '----------------------------------------------------------------------------------------------------',
 'gross counts',
 FLAG_WFH            0         1
 FLAG_EBIKE                     
 0            721826.0  260525.0
 1           1400561.0  418418.0)