In [1]:
import pandas as pd
import random

# Seed the stdlib RNG once, up front: the *_flag_binary functions below draw
# from `random.random()`, so without a fixed seed the flag counts change on
# every Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Load the cleaned, disaggregated IPUMS extract; the first CSV column is used as the index.
# NOTE(review): the frame still shows an 'Unnamed: 0' column after index_col=0, so the
# CSV presumably carries a second saved index column - confirm and drop it upstream.
ipums_df = pd.read_csv("../ipums_data/disaggregated_cleaned_ipums_data.csv",index_col=0)

In [2]:
# Preview the first five rows to sanity-check the load.
ipums_df.head(n=5)

Unnamed: 0,YEAR,PERWT,HOME_STATEFIP,HOME_PUMA,PUMA_NAME,SEX,AGE,HRS_WK_DAILY,TOTAL_PERSONAL_INCOME,MODE_TRANSP_TO_WORK,...,PUMAKEY_HOME,PUMAKEY_WORK,DISTANCE_KM,COGNITIVE_DIFFICULTY,AMBULATORY_DIFFICULTY,IND_LIVING_DIFFICULTY,SELFCARE_DIFFICULTY,VISION_OR_HEARING_DIFFICULTY,VISION_DIFFICULTY,HEARING_DIFFICULTY
0,2018,19.0,9,500,Litchfield County,F,48,10.0,718000,"Auto, truck, or van",...,09_00500,36_03800,170.50491,0,0,0,0,0,0,0
1,2018,62.0,9,500,Litchfield County,M,46,10.0,718000,"Auto, truck, or van",...,09_00500,36_03800,170.50491,0,0,0,0,0,0,0
2,2018,123.0,9,500,Litchfield County,M,47,9.0,400000,"Auto, truck, or van",...,09_00500,36_03800,170.50491,0,0,0,0,0,0,0
3,2018,57.0,9,500,Litchfield County,F,45,8.0,300000,"Auto, truck, or van",...,09_00500,36_03800,170.50491,0,0,0,0,0,0,0
4,2018,33.0,9,500,Litchfield County,M,66,8.0,80000,"Auto, truck, or van",...,09_00500,36_03800,170.50491,0,0,0,0,0,0,0


In [None]:
'''
    Auto inputs
    Hard Caps:
    max_age - maximum age of drivers. Research from Kaiser Permanente and retirement age indicates 75 is a realistic cut-off
    min_distance - minimum distance traveled by drivers, does not make sense to drive for under 1 mile (2km)
    min_income - minimum income of drivers. Current cutoff is set at the NY poverty line
    cognitive_diff - if the individual has cognitive difficulties, they would not have a drivers license
    ambulatory_diff - if the individual has walking difficulties, they would not have a drivers license
    ind_living_diff - if the individual has difficulties living independently, they would not have a drivers license
    selfcare_diff - if the individual has difficulties taking care of themselves, they would not have a drivers license
    vision_diff - if the individual has vision difficulties, they would not have a drivers license
    vehicle_available - if the individual does not have a car, they cannot drive to work

    Changeable inputs:
    male_pct & female_pct - percentage (0-100) of otherwise-eligible people of each sex who will drive a car
    age_dist - to be determined how we can use age distributions to determine ridership. 
        Ex) 35 year olds may be 2x more likely to ride than a 50 year old
    home_owner_pct - percentage (0-100) of people who would charge at home; 80% of EV owners charge at home (USDOE)
        
    output:
        series (0,1) indicating whether each line is an eligible driver or not
    '''

In [10]:
def auto_flag_binary(max_age,min_distance,min_income,male_pct,female_pct,age_dist, home_owner_pct):
    """Flag every row of the global ``ipums_df`` as an eligible auto driver (1) or not (0).

    A row is flagged 1 only when it passes every hard cap AND survives the
    random sex draw AND survives the random home-owner draw.

    Hard caps:
        max_age      - 'AGE' must be <= max_age
        min_distance - 'DISTANCE_KM' must be >= min_distance
        min_income   - 'TOTAL_PERSONAL_INCOME' must be >= min_income
        the cognitive / ambulatory / independent-living / self-care / vision
        difficulty columns must all be <= 0, and 'VEHICLE_AVAILABLE' must equal 1

    Random draws (one ``random.random()`` call per row per draw):
        male_pct / female_pct - 0-100; a row with SEX 'M' (resp. 'F') is kept
            when its draw is <= pct/100
        home_owner_pct - 0-100; a row is kept when its draw is >= pct/100 and
            'HOMEOWNER_LABEL' is 'Own'
        age_dist - accepted but not used yet

    Returns:
        Series of 0/1 integers aligned with ``ipums_df``.

    NOTE(review): rows whose HOMEOWNER_LABEL is not 'Own' are never flagged -
    confirm that excluding non-owners entirely is intended.
    """
    people = ipums_df

    # Deterministic filters, kept in the order they are AND-ed together.
    masks = [
        people['AGE'] <= max_age,
        people['DISTANCE_KM'] >= min_distance,
        people['TOTAL_PERSONAL_INCOME'] >= min_income,  # poverty line in NY 2019
        people['COGNITIVE_DIFFICULTY'] <= 0,
        people['AMBULATORY_DIFFICULTY'] <= 0,
        people['IND_LIVING_DIFFICULTY'] <= 0,
        people['SELFCARE_DIFFICULTY'] <= 0,
        people['VISION_DIFFICULTY'] <= 0,
        people['VEHICLE_AVAILABLE'] == 1,
    ]

    def _sex_draw(label, pct):
        # The random number is drawn first for every row, whatever its sex.
        def _keep(value):
            if random.random() <= pct/100 and value == label:
                return True
            return False
        return people['SEX'].apply(_keep)

    # Draw order matters for reproducibility: males, then females, then home owners.
    males_kept = _sex_draw('M', male_pct)
    females_kept = _sex_draw('F', female_pct)
    masks.append(males_kept | females_kept)

    def _home_keep(label):
        # Kept when the draw lands at or above the charge-at-home share.
        if random.random() >= home_owner_pct/100 and label == 'Own':
            return True
        return False

    masks.append(people['HOMEOWNER_LABEL'].apply(_home_keep))

    # Age - TBD if we use an age distribution or buckets
    eligible = masks[0]
    for mask in masks[1:]:
        eligible = eligible & mask

    return eligible.astype(int)

In [11]:
# Score every row for auto eligibility and store the 0/1 result as a new column.
ipums_df['FLAG_AUTO'] = auto_flag_binary(
    max_age=70,
    min_distance=2,
    min_income=32626,
    male_pct=100,
    female_pct=100,
    age_dist=None,
    # % of people who would charge at home, so we're only interested in the people who do not charge
    home_owner_pct=80,
)

In [12]:
# Sum of PERWT for flagged (1) versus unflagged (0) rows.
ipums_df.groupby(['FLAG_AUTO'])[['PERWT']].sum()

Unnamed: 0_level_0,PERWT
FLAG_AUTO,Unnamed: 1_level_1
0,5416007.0
1,150722.0


In [None]:
'''
    Motorcycle inputs
    Hard Caps:
    max_age - maximum age of drivers. Research from Kaiser Permanente and retirement age indicates 75 is a realistic cut-off
    max_distance - maximum distance traveled by riders; an average motorcycle tank only lasts about 150 miles
    min_income - minimum income of drivers. Current cutoff is set at the NY poverty line
    cognitive_diff - if the individual has cognitive difficulties, they would not have a drivers license
    ambulatory_diff - if the individual has walking difficulties, they would not have a drivers license
    ind_living_diff - if the individual has difficulties living independently, they would not have a drivers license
    selfcare_diff - if the individual has difficulties taking care of themselves, they would not have a drivers license
    vision_diff - if the individual has vision difficulties, they would not have a drivers license

    Changeable inputs:
    male_pct & female_pct - percentage (0-100) of otherwise-eligible people of each sex who will ride a motorcycle
    age_dist - to be determined how we can use age distributions to determine ridership. 
        Ex) 35 year olds may be 2x more likely to ride than a 50 year old
        
    output:
        series (0,1) indicating whether each line is an eligible driver or not
    '''

In [21]:
def motorcycle_flag_binary(max_age,max_distance,min_income,male_pct,female_pct,age_dist):
    """Flag every row of the global ``ipums_df`` as an eligible motorcycle rider (1) or not (0).

    Hard caps (a row must pass all of them):
        max_age      - 'AGE' must be <= max_age
        max_distance - 'DISTANCE_KM' must be <= max_distance
        min_income   - 'TOTAL_PERSONAL_INCOME' must be >= min_income
        the cognitive / ambulatory / independent-living / self-care / vision
        difficulty columns must all be <= 0

    Random draws (one ``random.random()`` call per row per draw):
        male_pct / female_pct - 0-100; a row with SEX 'M' (resp. 'F') is kept
            when its draw is <= pct/100
        age_dist - accepted but not used yet

    Returns:
        Series of 0/1 integers aligned with ``ipums_df``.
    """
    age_hardcap = ipums_df['AGE'] <= max_age
    dist_hardcap = ipums_df['DISTANCE_KM'] <= max_distance
    # Use the caller's threshold; this was previously hard-coded to 32626,
    # which silently ignored the min_income argument.
    income_hardcap = ipums_df['TOTAL_PERSONAL_INCOME'] >= min_income
    cog_diff_hardcap = ipums_df['COGNITIVE_DIFFICULTY'] <= 0
    amb_diff_hardcap = ipums_df['AMBULATORY_DIFFICULTY'] <= 0
    ind_living_diff_hardcap = ipums_df['IND_LIVING_DIFFICULTY'] <= 0
    selfcare_diff_hardcap = ipums_df['SELFCARE_DIFFICULTY'] <= 0
    vision_diff_hardcap = ipums_df['VISION_DIFFICULTY'] <= 0

    ### Gender - each row gets one random draw per sex; it is kept when the draw
    ### falls within that sex's percentage and the row is of that sex.
    male_sex_flag = ipums_df['SEX'].apply(lambda x: random.random() <= male_pct/100 and x == 'M')
    female_sex_flag = ipums_df['SEX'].apply(lambda x: random.random() <= female_pct/100 and x == 'F')
    sex_flag = male_sex_flag | female_sex_flag

    ### Age - TBD if we use an age distribution or buckets
    final_series = (
        age_hardcap
        & dist_hardcap
        & income_hardcap
        & cog_diff_hardcap
        & amb_diff_hardcap
        & ind_living_diff_hardcap
        & selfcare_diff_hardcap
        & vision_diff_hardcap
        & sex_flag
    )

    return final_series.astype(int)

In [22]:
# Score every row for motorcycle eligibility and store the 0/1 result as a new column.
ipums_df['FLAG_MOTORCYCLE'] = motorcycle_flag_binary(
    max_age=70,
    max_distance=300,
    min_income=32626,
    male_pct=100,
    female_pct=100,
    age_dist=None,
)

In [23]:
# Sum of PERWT for flagged (1) versus unflagged (0) rows.
ipums_df.groupby(['FLAG_MOTORCYCLE'])[['PERWT']].sum()

Unnamed: 0_level_0,PERWT
FLAG_MOTORCYCLE,Unnamed: 1_level_1
0,2449849.0
1,5872196.0


In [None]:
'''
    Taxicab inputs
    Hard Caps:
    max_distance - max distance of taxicab ride (~15 miles or 30 km)
    min_income - minimum income of riders. Current cutoff is set at the NY poverty line

    Changeable inputs:
    male_pct & female_pct - percentage (0-100) of otherwise-eligible people of each sex who will take a taxi
    age_dist - to be determined how we can use age distributions to determine ridership. 
        Ex) 35 year olds may be 2x more likely to ride than a 50 year old
        
    output:
        series (0,1) indicating whether each line is an eligible rider or not
    '''

In [26]:
def taxicab_flag_binary(max_distance,min_income, male_pct,female_pct,age_dist):
    """Flag every row of the global ``ipums_df`` as an eligible taxicab rider (1) or not (0).

    Hard caps:
        max_distance - 'DISTANCE_KM' must be <= max_distance
        min_income   - 'TOTAL_PERSONAL_INCOME' must be >= min_income

    Random draws (one ``random.random()`` call per row per draw):
        male_pct / female_pct - 0-100; a row with SEX 'M' (resp. 'F') is kept
            when its draw is <= pct/100
        age_dist - accepted but not used yet

    Returns:
        Series of 0/1 integers aligned with ``ipums_df``.
    """
    riders = ipums_df

    within_range = riders['DISTANCE_KM'] <= max_distance
    can_afford = riders['TOTAL_PERSONAL_INCOME'] >= min_income  # poverty line in NY 2019

    def _sex_draw(label, pct):
        # The random number is drawn first for every row, whatever its sex.
        def _keep(value):
            if random.random() <= pct/100 and value == label:
                return True
            return False
        return riders['SEX'].apply(_keep)

    # Draw order matters for reproducibility: all males first, then all females.
    males_kept = _sex_draw('M', male_pct)
    females_kept = _sex_draw('F', female_pct)
    either_sex = males_kept | females_kept

    # Age - TBD if we use an age distribution or buckets
    eligible = within_range & can_afford & either_sex
    return eligible.astype(int)

In [27]:
# Score every row for taxicab eligibility and store the 0/1 result as a new column.
ipums_df['FLAG_TAXICAB'] = taxicab_flag_binary(
    max_distance=30,
    min_income=32626,
    male_pct=100,
    female_pct=100,
    age_dist=None,
)

In [28]:
# Sum of PERWT for flagged (1) versus unflagged (0) rows.
ipums_df.groupby(['FLAG_TAXICAB'])[['PERWT']].sum()

Unnamed: 0_level_0,PERWT
FLAG_TAXICAB,Unnamed: 1_level_1
0,3592678.0
1,4729367.0
