In [1]:
'''
in this script there are two functions

ebike_flag_binary
wfh_flag_binary

Brian - April 24, 2022

'''

'\nin this script there are two functions\n\nebike_flag_binary\nwfh_flag_binary\n\nBrian - April 24, 2022\n\n'

In [2]:
import pandas as pd
import random

ipums_df = pd.read_csv("../ipums_data/disaggregated_cleaned_ipums_data.csv",index_col=0)

In [3]:
bike_friendly_origins = ipums_df[(ipums_df['PUMA_NAME'].str.contains("NYC-Brook"))|
        (ipums_df['PUMA_NAME'].str.contains("NYC-Queen"))|
        (ipums_df['PUMA_NAME'].str.contains("NYC-Bronx"))|
        (ipums_df['PUMA_NAME'].str.contains("NYC-Manh"))|
        (ipums_df['PUMA_NAME'].str.contains("Bergen"))]['PUMA_NAME'].unique()

In [4]:
def ebike_flag_binary(max_age,max_distance,bike_friendly_origins,male_pct,female_pct,age_dist):
    '''
    inputs
    Hard Caps:
    max_age - maximum age of e-bikers. Early research indicates 70 is a realistic cut-off
    max_distance - maximum distance traveled by e-bikers. 15 miles (24 KM) to start.
    bike_friendly_origins - what origin points have bike infrastructure leading into Manhattan?
        ** Bronx, Queens, Brooklyn, Northern NJ
        

    Changable inputs:
    male_pct & female_pct - how many, of each sex, will ride an e-bike of eligible riders? 0-100 value
    age_dist - to be determined how we can use age distributions to determine ridership. 
        Ex) 35 year olds may be 2x more likely to ride than a 50 year old
        
    output:
        series (0,1) indicating whether each line is an eligible e-bike rider or not
    '''
    
    
    age_hardcap = ipums_df['AGE']<=max_age
    dist_hardcap = ipums_df['DISTANCE_KM_TOCBD']<=max_distance
    bike_infra_locs = ipums_df['PUMA_NAME'].isin(bike_friendly_origins)
    
    ### Gender - 
    male_sex_flag = ipums_df['SEX'].apply(lambda x: True if random.random() <= male_pct/100 and x=='M' else False)
    female_sex_flag = ipums_df['SEX'].apply(lambda x: True if random.random() <= female_pct/100 and x=='F' else False)
    sex_flag = male_sex_flag|female_sex_flag

    
    ### Age - TBD if we use an age distribution or buckets
    final_series = age_hardcap&dist_hardcap&bike_infra_locs&sex_flag
    
    return final_series.astype(int)
    
    
    
  

In [5]:
def wfh_flag_binary(overall_wfh_pct):
    '''  
    Changable inputs:
    overall_wfh_pct - NOT USING CURRENTLY - using demographic-based percentages
    
    Given that most of these stats were taken from when everybody had to WFH
    We can take the future assumed overall wfh pct and multiply it by that to get the presumed 
    future WFH demographic distribution
    
        
    output:
        series (0,1) indicating whether each line is a likely WFH candidate
    '''   
    
    if overall_wfh_pct < 0 or overall_wfh_pct > 1:
        return "Not a valid overall wfh pct"
    
    education_prob = ipums_df['EDUC_LABEL'].apply(lambda x: .61 if x in ['College_4Year','College_5PlusYears'] 
                                                      else .19) #* overall_wfh_pct 
    #Those with a bachelorâ€™s degree or higher were more than three times 
    # as likely as those with a high school education 
    # or GED only to have an adult in their household substitute in-person work for telework (Figure 2)
    # 61.7% compared to 19.1%.
    #https://www.census.gov/library/stories/2021/03/working-from-home-during-the-pandemic.html 
    
    def income_prob_tagger(inp_income):
        if inp_income >= 200000:
            return .73
        elif inp_income >= 150000:
            return .675
        elif inp_income >= 100000:
            return .575
        elif inp_income >= 75000:
            return .441
        elif inp_income >= 50000:
            return .321
        elif inp_income >= 35000:
            return .231
        elif inp_income >= 25000:
            return .175
        else:
            return .127
    
    income_prob = ipums_df['TOTAL_PERSONAL_INCOME'].apply(income_prob_tagger) #* overall_wfh_pct 

    #https://www.census.gov/content/dam/Census/library/stories/2021/03/working-from-home-during-the-pandemic-figure-1.jpg
    # 200k+ : 73%
    # 150k-199,999 : 67.5%
    # 100k-149,999 : 57.5%
    # 75k-99,999 : 44.1%
    # 50k-74,999 : 32.1%
    # 35k-49,999 : 23.1%
    # 25k-34,999 : 17.5%
    # <=25K : 12.7%  
    
    education_binary = education_prob.apply(lambda x: 1 if random.random() <= x else 0) 
    income_binary = income_prob.apply(lambda x: 1 if random.random() <= x else 0)
#     wfh_overall_binary = income_prob.apply(lambda x: 1 if random.random() <= overall_wfh_pct else 0)
       
    final_series = education_binary&income_binary
    
    return final_series.astype(int)
    
    

In [6]:
    
########################    
#### How to Run ########
########################
ipums_df['FLAG_EBIKE']=ebike_flag_binary(max_age=70
                                        ,max_distance = 24
                                        ,male_pct = 100
                                        ,female_pct = 100
                                        ,age_dist = None
                                        ,bike_friendly_origins=bike_friendly_origins
                                        )

ipums_df["FLAG_WFH"]=wfh_flag_binary(overall_wfh_pct=0.2) #no longer using overall pct - just demographics

In [7]:
######################    
###### CHECKS ########
######################

check_df = ipums_df[ipums_df['YEAR']==2019].copy() #just to make checks simpler
### population by ebike flag
check_df.groupby(by=['YEAR','FLAG_EBIKE']).agg({"PERWT":"sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,PERWT
YEAR,FLAG_EBIKE,Unnamed: 2_level_1
2019,0,905095.0
2019,1,1889160.0


In [8]:
### population by wfh flag
check_df.groupby(by=['YEAR','FLAG_WFH']).agg({"PERWT":"sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,PERWT
YEAR,FLAG_WFH,Unnamed: 2_level_1
2019,0,2264206.0
2019,1,530049.0


In [9]:
### education by wfh flag
wfh_by_edu = check_df.groupby(['YEAR',"FLAG_WFH",'EDUC_LABEL']).agg({"PERWT":"sum"}).reset_index()\
.pivot_table(index='EDUC_LABEL',columns='FLAG_WFH',values='PERWT')
wfh_by_edu.divide(wfh_by_edu.sum(axis=1),axis=0)
wfh_by_edu

FLAG_WFH,0,1
EDUC_LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1
College_1Year,240459.0,15748.0
College_2Year,140010.0,8972.0
College_4Year,747255.0,262669.0
College_5PlusYears,475825.0,214659.0
Grade0_4,7547.0,
Grade10,19939.0,1337.0
Grade11,28779.0,719.0
Grade9,20800.0,533.0
Grades12,511952.0,22820.0
Grades5_8,44760.0,2094.0


In [17]:
### cross-tabs of the 2 flags
ct = check_df.groupby(['YEAR',"FLAG_WFH",'FLAG_EBIKE']).agg({"PERWT":"sum"})\
.reset_index().pivot_table(index='FLAG_EBIKE',columns='FLAG_WFH',values='PERWT')

"pcts overall",ct / ct.sum().sum(),"--"*50,'gross counts', ct

('pcts overall',
 FLAG_WFH           0         1
 FLAG_EBIKE                    
 0           0.254177  0.069736
 1           0.556131  0.119956,
 '----------------------------------------------------------------------------------------------------',
 'gross counts',
 FLAG_WFH            0         1
 FLAG_EBIKE                     
 0            710234.0  194861.0
 1           1553972.0  335188.0)