# Transit Equity/ Bus Routes (MBTA only)

In [8]:
# imports
import pandas as pd
import geopandas as gpd
import requests
import numpy as np
import math
from shapely.ops import nearest_points

## Step 1: Data collection
Read [this report](http://www.wrrb.org/wp-content/uploads/2019/05/WRRB-FareFree-Transit-Report.pdf) to understand the issue. Collect data - create a spreadsheet of all the different bus stops in Massachusetts including MBTA, Regional Transit Authorities, and City/Town buses.

**Make sure datasets --- 'mbtabus/', 'CENSUS2010_BLK_BG_TRCT_SHP/' and 'census.csv' are under './data/'**

In [9]:
# Collect data for bus stops 
def load_bus_stop():
    """
    Data description link: https://docs.digital.mass.gov/dataset/massgis-data-mbta-bus-routes-and-stops
    """
    file = gpd.read_file("./data/stops_routes/mbtabus/MBTABUSSTOPS_PT.shp")
    return file


def load_census_tract():
    """
    Data description link: https://docs.digital.mass.gov/dataset/massgis-data-datalayers-2010-us-census
    """
    file = gpd.read_file("./data/census_tract/CENSUS2010_BLK_BG_TRCT_SHP/CENSUS2010TRACTS_POLY.shp")
    return file

def load_census_data():
    file = gpd.read_file("./data/census_tract/census/ACSDP5Y2018.DP03_data_with_overlays_2020-11-12T115259.csv")
    return file

# Collect Census Data:
def get_median_hh_income():
    '''
        Returns Pandas DataFrame representation Median Household Income Estimate by Census Tract for MA.
        American Community Survey (ACS) 2018 Census data used.
        Specific table: ACS 2018 5-year detailed table "B19013_001E"
    '''
    URL = "https://api.census.gov/data/2018/acs/acs5?get=B19013_001E&for=tract:*&in=state:25"

    response = requests.get(url = URL)
    data = response.json()
    
    median_income_df = pd.DataFrame(data[1:len(data)-1], columns = data[0])
    
    return median_income_df

def load_median_hh_income():
    '''
        *** USE THIS FUNCTION TO LOAD INCOME DATA FROM LOCAL ***
        Returns Pandas DataFrame representation Median Household Income Estimate by Census Tract for MA.
        American Community Survey (ACS) 2018 Census data used.
        Specific table: ACS 2018 5-year detailed table "B19013_001E"
    '''
    median_income_df = pd.read_csv("./data/census_tract/census/census.csv")
    return median_income_df

In [10]:
busstop_gdf = load_bus_stop()
census_tract_gdf = load_census_tract()
median_income_for_tract_df = load_median_hh_income()
census_data_df = load_census_data()

# Collect Census Data:
# res = get_median_hh_income()
# res.to_csv()

In [11]:
# Removed functions linking Tract Shapes to Income; instead, link income data in census_data_df with
# shapes in census_tract_gdf

# census_data_df has GEOIDs pre-pended with an arbitrary 9-character US code
# removing those to match on just the GEOID
census_data_df['GEO_ID'] = census_data_df['GEO_ID'].apply(lambda x: x[9:])
check = pd.merge(census_data_df, census_tract_gdf, how='outer', left_on='GEO_ID', right_on='GEOID10')
check[check['TRACTCE10'].isnull()]

Unnamed: 0,GEO_ID,NAME,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,AREA_ACRES,POP100_RE,HU100_RE,LOGPL94171,LOGSF1,LOGACS0610,LOGSF1C,SHAPE_AREA,SHAPE_LEN,geometry_y
0,,Geographic Area Name,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,Estimate!!EMPLOYMENT STATUS!!Population 16 yea...,Margin of Error!!EMPLOYMENT STATUS!!Population...,Percent Estimate!!EMPLOYMENT STATUS!!Populatio...,Percent Margin of Error!!EMPLOYMENT STATUS!!Po...,...,,,,,,,,,,
57,25001990000.0,"Census Tract 9900, Barnstable County, Massachu...",0,12,0,(X),0,12,-,**,...,,,,,,,,,,
222,25005990000.0,"Census Tract 9900, Bristol County, Massachusetts",0,12,0,(X),0,12,-,**,...,,,,,,,,,,
227,25007990000.0,"Census Tract 9900, Dukes County, Massachusetts",0,12,0,(X),0,12,-,**,...,,,,,,,,,,
390,25009990100.0,"Census Tract 9901, Essex County, Massachusetts",0,12,0,(X),0,12,-,**,...,,,,,,,,,,
871,25019990000.0,"Census Tract 9900, Nantucket County, Massachus...",0,12,0,(X),0,12,-,**,...,,,,,,,,,,
1102,25023990003.0,"Census Tract 9900.03, Plymouth County, Massach...",0,12,0,(X),0,12,-,**,...,,,,,,,,,,


In [12]:
busstop_gdf

Unnamed: 0,STOP_ID,STOP_NAME,TOWN,TOWN_ID,geometry
0,3077,Gallivan Blvd @ opp Marsh St,BOSTON,35,POINT (237120.669 892643.408)
1,841,Lagrange St @ Virgil Rd,BOSTON,35,POINT (227915.195 892644.017)
2,446,Norfolk St @ Nelson St,BOSTON,35,POINT (234385.661 892644.944)
3,847,Lagrange St opp Virgil St,BOSTON,35,POINT (227912.601 892650.156)
4,3079,Adams St @ Minot St,BOSTON,35,POINT (236644.812 892651.990)
...,...,...,...,...,...
7805,9097,Grove St @ Lebanon St,MELROSE,178,POINT (236229.381 911541.866)
7806,5911,Grove St @ Lebanon St,MELROSE,178,POINT (236236.036 911542.538)
7807,5975,Wyoming Ave opp Cleveland St,MELROSE,178,POINT (234977.387 911544.999)
7808,15976,Wyoming Ave @ Cleveland St,MELROSE,178,POINT (234971.098 911547.184)


In [13]:
census_tract_gdf

Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,GEOID10,NAME10,NAMELSAD10,MTFCC10,ALAND10,AWATER10,INTPTLAT10,...,AREA_ACRES,POP100_RE,HU100_RE,LOGPL94171,LOGSF1,LOGACS0610,LOGSF1C,SHAPE_AREA,SHAPE_LEN,geometry
0,25,021,418003,25021418003,4180.03,Census Tract 4180.03,G5020,1705668.0,2936.0,+42.2350240,...,422.1740,2481,1350,0141872,123764,0003690,0123764,1.708484e+06,6274.185034,"POLYGON ((240678.759 886748.073, 240607.995 88..."
1,25,021,417701,25021417701,4177.01,Census Tract 4177.01,G5020,1543651.0,12275.0,+42.2523398,...,384.4502,5417,2983,0141838,123730,0003683,0123730,1.555821e+06,6229.137913,"POLYGON ((240969.666 890122.806, 240985.156 89..."
2,25,021,417702,25021417702,4177.02,Census Tract 4177.02,G5020,1685529.0,317947.0,+42.2582818,...,416.4918,2765,1109,0141843,123735,0003684,0123735,1.685489e+06,8542.498286,"POLYGON ((242328.221 890545.050, 242276.888 89..."
3,25,021,418102,25021418102,4181.02,Census Tract 4181.02,G5020,771203.0,2371.0,+42.2470410,...,191.1397,3377,1676,0141882,123774,0003693,0123774,7.735182e+05,4540.296101,"POLYGON ((240372.666 888331.030, 240393.057 88..."
4,25,021,418004,25021418004,4180.04,Census Tract 4180.04,G5020,1316466.0,0.0,+42.2383744,...,325.2812,4280,1987,0141874,123766,0003691,0123766,1.316372e+06,5854.497366,"POLYGON ((240687.299 888298.408, 240700.449 88..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,25,027,709501,25027709501,7095.01,Census Tract 7095.01,G5020,5484847.0,0.0,+42.5343643,...,1355.2791,2149,809,0189917,171462,0004065,0171462,5.484642e+06,9858.734830,"POLYGON ((182231.394 921599.665, 182405.418 92..."
1468,25,027,709502,25027709502,7095.02,Census Tract 7095.02,G5020,6089034.0,1287.0,+42.5573788,...,1504.8969,5670,2488,0189919,171464,0004066,0171464,6.090126e+06,11906.751110,"POLYGON ((181045.532 923444.745, 181902.923 92..."
1469,25,027,730801,25027730801,7308.01,Census Tract 7308.01,G5020,3158638.0,7980.0,+42.2816214,...,782.4305,3665,1557,0190133,171678,0004114,0171678,3.166396e+06,9559.778037,"POLYGON ((174482.678 891951.535, 174407.978 89..."
1470,25,027,710800,25027710800,7108,Census Tract 7108,G5020,2337125.0,0.0,+42.5913348,...,577.4996,4539,1940,0189967,171512,0004077,0171512,2.337067e+06,9574.497510,"POLYGON ((176303.717 926403.339, 176284.512 92..."


In [14]:
median_income_for_tract_df

Unnamed: 0.1,Unnamed: 0,B19013_001E,state,county,tract
0,0,132750,25,25,60501
1,1,12759,25,25,61101
2,2,84083,25,25,70101
3,3,28851,25,25,70402
4,4,52676,25,25,71101
...,...,...,...,...,...
1472,1472,69750,25,5,613902
1473,1473,52741,25,5,630101
1474,1474,58362,25,5,630102
1475,1475,34692,25,5,640901


In [None]:
# Find abnormal values and correct them with its neighbors' average incomes
# census_tract_gdf.plot()
# busstop_gdf.plot()

# while True:
#     abnormal_tracts = ['980300', '980700', '981000', '981201', '981202', '981502', '981600', '981700']
    
#     for tract in abnormal_tracts:
#         polygon = census_tract_gdf[census_tract_gdf['TRACTCE10'] == tract].geometry
#         print(polygon)
#         neighbors = census_tract_gdf[census_tract_gdf.geometry.touches(polygon)].TRACTCE10.tolist()
#         print(len(neighbors))
#     break

Then traverse through bus stops to indentify which tracts they are in.

In [45]:
# Changede references in census_tract_gdf from TRACTCE10 to GEOID10

def tract_for_stop(busstop_gdf, census_tract_gdf):
    """
    This function takes in busstop and tract data in geoDataFrame.
    Returns a dictionary in {stop_id: tract_id} format
    """
    stopid_tract_dict = {}
    
    points = busstop_gdf.geometry
    polygons = census_tract_gdf.geometry
    
    stop_ids = busstop_gdf['STOP_ID']
    # CHANGE NUMBER ONE
    tract_ids = census_tract_gdf['GEOID10']
    
    for i in range(len(busstop_gdf)):
        stopid = stop_ids[i]
        point = points[i]
        for j in range(len(census_tract_gdf)):
            tractid = tract_ids[j]
            polygon = polygons[j]
            if point.within(polygon):
                stopid_tract_dict[stopid] = tractid
                break;
    
    return stopid_tract_dict

In [46]:
stopid_tract_dict = tract_for_stop(busstop_gdf, census_tract_gdf)

Then add a column 'TRACT_ID' in bus stop data

In [47]:
busstop_gdf['GEO_ID'] = stopid_tract_dict.values()
busstop_gdf

Unnamed: 0,STOP_ID,STOP_NAME,TOWN,TOWN_ID,geometry,GEO_ID
0,3077,Gallivan Blvd @ opp Marsh St,BOSTON,35,POINT (237120.669 892643.408),25025100700
1,841,Lagrange St @ Virgil Rd,BOSTON,35,POINT (227915.195 892644.017),25025130200
2,446,Norfolk St @ Nelson St,BOSTON,35,POINT (234385.661 892644.944),25025100300
3,847,Lagrange St opp Virgil St,BOSTON,35,POINT (227912.601 892650.156),25025130200
4,3079,Adams St @ Minot St,BOSTON,35,POINT (236644.812 892651.990),25025100800
...,...,...,...,...,...,...
7805,9097,Grove St @ Lebanon St,MELROSE,178,POINT (236229.381 911541.866),25017336300
7806,5911,Grove St @ Lebanon St,MELROSE,178,POINT (236236.036 911542.538),25017336300
7807,5975,Wyoming Ave opp Cleveland St,MELROSE,178,POINT (234977.387 911544.999),25017336402
7808,15976,Wyoming Ave @ Cleveland St,MELROSE,178,POINT (234971.098 911547.184),25017336402


Then assign median income for each stop by 'TRACT_ID'

In [15]:
# merged_gdf is the intersection of census_data_df and census_tract_gdf on their GEOID's
# census_tract_gdf (the gdf containing tract geometries) is missing some tracts in census_data_df, but these
# don't have any bus stops in them so we can ignore them

merged_gdf = pd.merge(census_data_df, census_tract_gdf, how='inner', left_on='GEO_ID', right_on='GEOID10')
merged_gdf

Unnamed: 0,GEO_ID,NAME,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,AREA_ACRES,POP100_RE,HU100_RE,LOGPL94171,LOGSF1,LOGACS0610,LOGSF1C,SHAPE_AREA,SHAPE_LEN,geometry_y
0,25001010100,"Census Tract 101, Barnstable County, Massachus...",2829,65,2829,(X),1781,142,63.0,5.0,...,6327.4075,2942,4494,0015150,13422,0002740,0013422,2.560621e+07,46334.519974,"MULTIPOLYGON (((308489.445 871020.643, 308542...."
1,25001010206,"Census Tract 102.06, Barnstable County, Massac...",3129,220,3129,(X),1723,198,55.1,6.1,...,12989.7775,2750,4305,0015156,13428,0002741,0013428,5.256797e+07,81419.705965,"MULTIPOLYGON (((325461.284 856725.933, 325518...."
2,25001010208,"Census Tract 102.08, Barnstable County, Massac...",1122,224,1122,(X),638,194,56.9,10.3,...,13843.3306,2003,3077,0015160,13432,0002742,0013432,5.602220e+07,48367.069196,"POLYGON ((324602.042 858479.130, 323895.294 85..."
3,25001010304,"Census Tract 103.04, Barnstable County, Massac...",2282,238,2282,(X),1141,214,50.0,7.2,...,4742.8976,2421,2706,0015164,13436,0002743,0013436,1.919390e+07,68689.291139,"MULTIPOLYGON (((328644.258 846623.078, 328660...."
4,25001010306,"Census Tract 103.06, Barnstable County, Massac...",2195,222,2195,(X),1157,200,52.7,7.0,...,4428.3132,2535,3254,0015168,13440,0002744,0013440,1.792082e+07,19147.555020,"POLYGON ((328644.258 846623.078, 328562.662 84..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,25027760100,"Census Tract 7601, Worcester County, Massachus...",2840,102,2840,(X),1878,131,66.1,4.7,...,10623.8098,3390,1493,0190546,172093,0004213,0172093,4.299320e+07,32625.081217,"POLYGON ((152698.506 887012.714, 152695.388 88..."
1468,25027761100,"Census Tract 7611, Worcester County, Massachus...",4249,198,4249,(X),2793,286,65.7,6.8,...,17694.7230,5135,2211,0190550,172097,0004214,0172097,7.160829e+07,34469.208131,"POLYGON ((146528.911 881420.064, 146720.113 88..."
1469,25027761200,"Census Tract 7612, Worcester County, Massachus...",4270,295,4270,(X),2648,281,62.0,5.0,...,6002.2942,5780,2486,0190555,172102,0004215,0172102,2.429052e+07,25335.083060,"POLYGON ((195918.967 890629.420, 195881.671 89..."
1470,25027761300,"Census Tract 7613, Worcester County, Massachus...",2738,177,2738,(X),2022,160,73.8,4.3,...,2639.3459,3059,1313,0190559,172106,0004216,0172106,1.068110e+07,17545.353232,"POLYGON ((186581.687 888399.211, 186579.944 88..."


In [16]:
# clean merged_gdf null values which are represented as '-' instead of -66666666 as in Rishab's original
# API call

merged_gdf['DP03_0062E'] = merged_gdf['DP03_0062E'].replace('-', '-1')
merged_gdf['DP03_0062E'] = merged_gdf['DP03_0062E'].replace('2,500-', '2500')
merged_gdf['DP03_0062E'] = merged_gdf['DP03_0062E'].replace('250,000+', '250000')
merged_gdf[merged_gdf['DP03_0062E'].str.contains('-1')]
#merged_gdf

Unnamed: 0,GEO_ID,NAME,DP03_0001E,DP03_0001M,DP03_0001PE,DP03_0001PM,DP03_0002E,DP03_0002M,DP03_0002PE,DP03_0002PM,...,AREA_ACRES,POP100_RE,HU100_RE,LOGPL94171,LOGSF1,LOGACS0610,LOGSF1C,SHAPE_AREA,SHAPE_LEN,geometry_y
518,25015820802,"Census Tract 8208.02, Hampshire County, Massac...",1190,232,1190,(X),655,158,55.0,7.3,...,173.2352,1158,1,96253,75462,3262,75462,701060.7,3795.705773,"POLYGON ((114918.848 898058.419, 114889.829 89..."
860,25017980000,"Census Tract 9800, Middlesex County, Massachus...",0,12,0,(X),0,12,-,**,...,953.4351,0,0,103244,107673,3604,107673,3858430.0,8278.744458,"POLYGON ((202514.834 905791.361, 202515.576 90..."
1065,25023525300,"Census Tract 5253, Plymouth County, Massachusetts",1648,399,1648,(X),0,12,0.0,2.1,...,1814.0235,2063,6,159344,137642,3810,137642,7341122.0,13506.119501,"POLYGON ((246896.153 856272.516, 246890.573 85..."
1286,25025980101,"Census Tract 9801.01, Suffolk County, Massachu...",405,506,405,(X),180,274,44.4,27.6,...,819.4016,535,0,175650,148640,4032,148640,3316014.0,27244.387422,"MULTIPOLYGON (((244755.253 897977.825, 244763...."
1287,25025980300,"Census Tract 9803, Suffolk County, Massachusetts",369,257,369,(X),82,99,22.2,19.8,...,519.7562,338,2,175652,148642,4033,148642,2103387.0,6459.788458,"POLYGON ((233623.183 895566.486, 233646.040 89..."
1288,25025980700,"Census Tract 9807, Suffolk County, Massachusetts",0,12,0,(X),0,12,-,**,...,726.6817,6,5,175654,148644,4034,148644,2940788.0,12432.467943,"POLYGON ((229642.444 891696.458, 229639.371 89..."
1289,25025981000,"Census Tract 9810, Suffolk County, Massachusetts",0,12,0,(X),0,12,-,**,...,279.4592,22,5,175656,148646,4035,148646,1130936.0,6061.142554,"POLYGON ((231525.406 894745.243, 231566.793 89..."
1291,25025981201,"Census Tract 9812.01, Suffolk County, Massachu...",0,12,0,(X),0,12,-,**,...,194.3706,0,0,175663,148653,4037,148653,786593.0,13619.120683,"MULTIPOLYGON (((239663.440 898808.995, 239696...."
1292,25025981202,"Census Tract 9812.02, Suffolk County, Massachu...",215,39,215,(X),215,39,100.0,15.0,...,394.0731,207,0,175665,148655,4038,148655,1594764.0,15892.417628,"MULTIPOLYGON (((237803.665 899433.890, 237814...."
1294,25025981501,"Census Tract 9815.01, Suffolk County, Massachu...",0,12,0,(X),0,12,-,**,...,226.2459,12,0,175670,148660,4040,148660,915588.3,27384.144523,"MULTIPOLYGON (((230658.615 902572.582, 230696...."


In [None]:
# output merged_gdf to shapefile, keeping only columns that we need (GEO_ID, DP03_0062E, and some pop stuff)


Then add a column 'income' in bus stop data

Save the bus stop data into a new csv file

## Step 2: Income level assignment
Assign an income level to each stop based on the census tract data

**No need to run blocks above!!**
Here we use the income group standard according to [Pew Research](http://www.pewsocialtrends.org/2015/12/09/the-american-middle-class-is-losing-ground/), which shows as follows:

| LEVEL | INCOME GROUP | INCOME/\$ |
| :- | :- | -: |
| 0 | Lowest income | 31,000 or less|
| 1 | Lower-middle income | 31,000 - 42,000 |
| 2 | Middle-income | 42,000 - 126,000 |
| 3 | Upper-middle income | 126,000 - 188,000 |
| 4 | Higher-income | 188,000 or more |

In [None]:
import pandas as pd
# read result csv generated from step one
busstop_df = pd.read_csv("./output/stops_with_income.csv")
busstop_df

In [None]:
income_level = []

incomes = busstop_df['income']
for income in incomes:
    if income<=0:
        income_level.append(-1)
    elif 0<income < 31000:
        income_level.append(0)
    elif 31000 <= income < 42000:
        income_level.append(1)
    elif 42000 <= income < 126000:
        income_level.append(2)
    elif 126000 <= income < 188000:
        income_level.append(3)
    elif 188000 <= income:
        income_level.append(4)

In [None]:
busstop_df['income_level'] = income_level
busstop_df

In [None]:
# Save to csv file
busstop_df.to_csv("./output/stops_with_incomeLevel.csv", index_label=False)

In [None]:
# show stops whose income are unknown
busstop_df[busstop_df.income<0]

## Step 3&4: Ridership & Revenue for each stop
Calculate annual revenue for each stop
1. Find fare for each route
| route_type    | fare | fare (reduced) |
|---------------|------|----------------|
| Local Bus     | 1.7  | 0.85           |
| Inner Express | 4.25 | 2.10           |
| Outer Express | 5.25 | 2.60           |
2. Connect routes for each stop
3. Collect ridership for each route per stop
3. Calculate annual renevue for each stop, note: reduced fare, monthly pass

In [None]:
import pandas as pd
import geopandas as gpd

### 1. Find fare for each route

In [None]:
# route info
routes_df = pd.read_csv('./data/fare&ridership/routes.csv')
routes_df

In [None]:
route_fare_class = {'Local Bus': 1.7, 'Inner Express': 4.25, 'Outer Express': 5.25, 'Free':0}
route_reduced_fare_class = {'Local Bus': 0.85, 'Inner Express': 2.1, 'Outer Express': 2.6, 'Free':0}

# add a column of fare for each route
routes_df['fare'] = float('nan')
routes_df['reduced_fare'] = float('nan')
for idx, row in routes_df.iterrows():
    this_fare_class = row['route_fare_class']
    if this_fare_class not in route_fare_class:
        continue
    else:
        routes_df.at[idx, 'fare'] = route_fare_class[this_fare_class]
        routes_df.at[idx, 'reduced_fare'] = route_reduced_fare_class[this_fare_class]

routes_df.to_csv('./output/routes_with_fare.csv', index_label=False)

### 2. Connect routes for each stop

In [126]:
#stops_df = pd.read_csv('./output/stops_with_incomeLevel.csv')
stops_df = pd.read_csv('./output/stop_with_weighed_level.csv')
stops_df

Unnamed: 0,STOP_ID,geometry,impacted_tractid,proportion,income,income_level
0,3077,"POLYGON ((237925.3409028954 892643.4076999985,...","25021416400,25025100800,25025100601,2502510070...","0.01,0.17,0.13,0.99,0.29",86562,2
1,841,"POLYGON ((228719.867402897 892644.0174999982, ...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2
2,446,"POLYGON ((235190.3329028941 892644.9441, 23518...","25025100500,25025100400,25025100300,2502510110...","0.17,0.18,1.00,0.41,0.06,0.24,0.02,1.00,0.37",46390,2
3,847,"POLYGON ((228717.2732028968 892650.1563000008,...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2
4,3079,"POLYGON ((237449.4844028956 892651.9899999984,...","25021416400,25025100800,25025100601,2502510050...","0.00,0.51,0.32,0.01,0.03,0.67,0.06",83532,2
...,...,...,...,...,...,...
7805,9097,"POLYGON ((237034.0528028987 911541.8663000017,...","25017336402,25017336401,25017336300,2501733620...","0.17,0.18,0.26,0.11,0.01",99903,2
7806,5911,"POLYGON ((237040.7078028999 911542.5381000005,...","25017336402,25017336401,25017336300,2501733620...","0.17,0.17,0.27,0.12,0.01",101026,2
7807,5975,"POLYGON ((235782.0594028986 911544.9987000003,...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08",80781,2
7808,15976,"POLYGON ((235775.7703028999 911547.1840000004,...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08",80781,2


In [123]:
# line_and_stop.csv is not a part of the repo because the file is too large to push
ridership_df = pd.read_csv('./data/line_and_stop.csv', low_memory=False)
ridership_df

Unnamed: 0,FID,season,route_id,route_variant,direction_id,trip_start_time,day_type_id,day_type_name,stop_name,stop_id,stop_sequence,boardings,alightings,load_,sample_size
0,1,Fall 2016,1,1-_-0,0,04:37:00,day_type_01,weekday,WASHINGTON ST OPP RUGGLES ST,1,2,0.4,0.3,7.8,13
1,2,Fall 2016,1,1-_-0,0,04:37:00,day_type_01,weekday,ALBANY ST OPP RANDALL ST,10003,5,0.1,0.0,9.4,13
2,3,Fall 2016,1,1-_-0,0,04:37:00,day_type_01,weekday,MASSACHUSETTS AVE @ SIDNEY ST,101,19,0.0,0.4,10.4,13
3,4,Fall 2016,1,1-_-0,0,04:37:00,day_type_01,weekday,MASSACHUSETTS AVE @ PROSPECT,102,20,0.6,2.5,8.5,13
4,5,Fall 2016,1,1-_-0,0,04:37:00,day_type_01,weekday,MASSACHUSETTS AVE @ BIGELOW S,104,21,0.1,0.2,8.4,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451727,3451728,Fall 2019,99,99-8-1,1,22:30:00,day_type_02,saturday,HIGHLAND AVE @ PINE ST,5062,9,0.0,0.0,0.0,5
3451728,3451729,Fall 2019,99,99-8-1,1,22:30:00,day_type_02,saturday,CLIFTON ST @ DEXTER ST,5066,15,0.0,0.0,0.0,5
3451729,3451730,Fall 2019,99,99-8-1,1,22:30:00,day_type_02,saturday,CLIFTON ST @ CEDAR ST,5067,16,0.0,0.0,0.0,5
3451730,3451731,Fall 2019,99,99-8-1,1,22:30:00,day_type_02,saturday,SUMMER ST @ CLIFTON ST,5068,17,0.0,0.0,0.0,5


In [None]:
stops_df['route_ids'] = ""
for idx, row in stops_df.iterrows():
    stop_id = row['STOP_ID']
    route_ids = list(set(ridership_df[ridership_df.stop_id == int(stop_id)].route_id.tolist()))
    stops_df.at[idx, 'route_ids'] = ','.join(route_ids)
stops_df.to_csv('./output/stops.csv', index_label=False)
stops_df

### 3. Collect ridership for each route per stop

In [None]:
routes_df = pd.read_csv('output/routes_with_fare.csv')
routes_df

In [None]:
ridership_df = pd.read_csv('./data/fare&ridership/Line,_and_Stop.csv', low_memory=False)
# load only rows for Fall 2019
ridership_df = ridership_df[ridership_df.season == 'Fall 2019']
ridership_df

In [None]:
stops_df = pd.read_csv('./output/stops.csv', low_memory=False)
stops_df.info()

In [None]:
# ridership_df[(ridership_df.route_id=='99') & (ridership_df.stop_id==5327) & (ridership_df.day_type_name=='weekday')]
stops_df['ridership'] = ""
for idx, row in stops_df.iterrows():
    print('{}/{}'.format(idx, len(stops_df)))
    stop_id = int(row['STOP_ID'])
    route_ids = str(row['route_ids']).split(',')
    riderships = []
    for route_id in route_ids:
        weekday_ons = sum(ridership_df[(ridership_df.route_id==route_id) & (ridership_df.stop_id==stop_id) & (ridership_df.day_type_name=='weekday')].boardings) * 5
        saturday_ons = sum(ridership_df[(ridership_df.route_id==route_id) & (ridership_df.stop_id==stop_id) & (ridership_df.day_type_name=='saturday')].boardings)
        sunday_ons = sum(ridership_df[(ridership_df.route_id==route_id) & (ridership_df.stop_id==stop_id) & (ridership_df.day_type_name=='sunday')].boardings)
        week_ons = weekday_ons+saturday_ons+sunday_ons
        year_ons = week_ons * 52
        riderships.append("{:.1f}".format(year_ons))
    stops_df.at[idx, 'ridership'] =  ','.join(riderships)
stops_df

In [None]:
stops_df.to_csv('./output/stops.csv', index_label=False)

### 4. Calculate annual renevue for each stop, note: reduced fare, monthly pass
Here I assume the revenue composition (payment method) is fixed for each route, and that a rider uses a monthly pass twice every weekday, and that a month has 22 weekdays.

In [None]:
routes_df = pd.read_csv('./output/routes_with_fare.csv')
routes_df

In [None]:
stops_df['revenues'] = ""

for idx, row in stops_df.iterrows():    
    route_ids = str(row['route_ids']).split(',')
    riderships = [float(x) for x in str(row['ridership']).split(',')]
    assert len(route_ids)==len(riderships)
    
    revenues = []
    for i in range(len(riderships)):
        route_id = route_ids[i]
        ridership = riderships[i]
        
        route_info = routes_df[routes_df.route_id == route_id]
        if len(route_info)==0:
            # no route info, assume it as local bus
            fare = 1.7
            fare_reduced = 0.85
        else:
            fare = float(route_info.fare)
            fare_reduced = float(route_info.reduced_fare)
        
        # monthly pass - 70%
        ridership_monthlyPass = ridership * 0.7
        revenue_0 = ridership_monthlyPass * 0.17 * (30/(22*2))
        revenue_1 = ridership_monthlyPass * 0.1 * (55/(22*2))
        revenue_2 = ridership_monthlyPass * 0.69 * (90/(22*2))
        revenue_3 = ridership_monthlyPass * 0.04 * (90/(22*2))
        revenue_monthlyPass = revenue_0 + revenue_1 + revenue_2 + revenue_3
        
        # pay-per-ride - 22%
        ridership_payPerRide = ridership * 0.22
        revenue_4 = ridership_payPerRide * 0.03 * 1.7
        revenue_5 = ridership_payPerRide * 0.16 * fare_reduced
        revenue_6 = ridership_payPerRide * 0.79 * fare
        revenue_payPerRide = revenue_4 + revenue_5 + revenue_6
        
        # others, ignore
        # sum up
        revenues.append("{:.1f}".format(revenue_monthlyPass+revenue_payPerRide))
    stops_df.at[idx, 'revenues'] = ','.join(revenues)

stops_df

In [None]:
# Add revenues up for each stop
stops_df['revenue_annual'] = ""
for idx, row in stops_df.iterrows():
    revenues = list(map(float, str(row['revenues']).split(',')))
    stops_df.at[idx, 'revenue_annual'] = sum(revenues)
stops_df

In [None]:
stops_df.to_csv('./output/stops.csv', index_label=False)

## Step 5: Identify which bus routes, stops, or zones would have the most positive effect on low income riders if free. Identify which towns would be impacted?
1. Draw a 0.5 mile radius circle around each stop, store as a column
2. Calculate percentage of the interception between the circle and tracts
3. Calculate weighed median income for people that the stop impacts
4. Calculate weighed number of people impacted by each stop

In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, Point

### 0. Data loading & Preprocessing

In [2]:
# gdf_tract_geo = gpd.read_file('./data/census_tract/tl_2019_25_tract/tl_2019_25_tract.shp')
# gdf_tract_geo.head()
gdf_tract_geo = gpd.read_file('./data/census_tract/CENSUS2010_BLK_BG_TRCT_SHP/CENSUS2010TRACTS_POLY.shp')
gdf_tract_geo.head()

Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,GEOID10,NAME10,NAMELSAD10,MTFCC10,ALAND10,AWATER10,INTPTLAT10,...,AREA_ACRES,POP100_RE,HU100_RE,LOGPL94171,LOGSF1,LOGACS0610,LOGSF1C,SHAPE_AREA,SHAPE_LEN,geometry
0,25,21,418003,25021418003,4180.03,Census Tract 4180.03,G5020,1705668.0,2936.0,42.235024,...,422.174,2481,1350,141872,123764,3690,123764,1708484.0,6274.185034,"POLYGON ((240678.759 886748.073, 240607.995 88..."
1,25,21,417701,25021417701,4177.01,Census Tract 4177.01,G5020,1543651.0,12275.0,42.2523398,...,384.4502,5417,2983,141838,123730,3683,123730,1555821.0,6229.137913,"POLYGON ((240969.666 890122.806, 240985.156 89..."
2,25,21,417702,25021417702,4177.02,Census Tract 4177.02,G5020,1685529.0,317947.0,42.2582818,...,416.4918,2765,1109,141843,123735,3684,123735,1685489.0,8542.498286,"POLYGON ((242328.221 890545.050, 242276.888 89..."
3,25,21,418102,25021418102,4181.02,Census Tract 4181.02,G5020,771203.0,2371.0,42.247041,...,191.1397,3377,1676,141882,123774,3693,123774,773518.2,4540.296101,"POLYGON ((240372.666 888331.030, 240393.057 88..."
4,25,21,418004,25021418004,4180.04,Census Tract 4180.04,G5020,1316466.0,0.0,42.2383744,...,325.2812,4280,1987,141874,123766,3691,123766,1316372.0,5854.497366,"POLYGON ((240687.299 888298.408, 240700.449 88..."


In [3]:
# load stop info
gdf_stop = gpd.read_file('./data/stops_routes/mbtabus/MBTABUSSTOPS_PT.shp')
gdf_stop.head()

Unnamed: 0,STOP_ID,STOP_NAME,TOWN,TOWN_ID,geometry
0,3077,Gallivan Blvd @ opp Marsh St,BOSTON,35,POINT (237120.669 892643.408)
1,841,Lagrange St @ Virgil Rd,BOSTON,35,POINT (227915.195 892644.017)
2,446,Norfolk St @ Nelson St,BOSTON,35,POINT (234385.661 892644.944)
3,847,Lagrange St opp Virgil St,BOSTON,35,POINT (227912.601 892650.156)
4,3079,Adams St @ Minot St,BOSTON,35,POINT (236644.812 892651.990)


### 1. Draw a 0.5 mile radius circle around each stop, store as a column

In [4]:
# This shows the units
gdf_tract_geo.crs

<Projected CRS: EPSG:26986>
Name: NAD83 / Massachusetts Mainland
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: USA - Massachusetts - SPCS - mainland
- bounds: (-73.5, 41.46, -69.86, 42.89)
Coordinate Operation:
- name: SPCS83 Massachusetts Mainland zone (meters)
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [5]:
def mile2meter(mile):
    conversion_factor = 0.62137119
    return mile / conversion_factor * 1000

def computeArea(radius):
    from math import pi
    return pi * radius * radius

In [6]:
gdf_stop_circle = gpd.GeoDataFrame()
gdf_stop_circle['STOP_ID'] = gdf_stop['STOP_ID']

radius = mile2meter(0.5)
gdf_stop_circle['geometry'] = gdf_stop['geometry'].buffer(radius)
gdf_stop_circle = gdf_stop_circle.set_crs('EPSG:26986')
gdf_stop_circle = gdf_stop_circle.set_geometry('geometry')
gdf_stop_circle.to_file('./output/radius.shp', driver='ESRI Shapefile')

gdf_stop_circle.head()

Unnamed: 0,STOP_ID,geometry
0,3077,"POLYGON ((237925.341 892643.408, 237921.466 89..."
1,841,"POLYGON ((228719.867 892644.017, 228715.993 89..."
2,446,"POLYGON ((235190.333 892644.944, 235186.458 89..."
3,847,"POLYGON ((228717.273 892650.156, 228713.398 89..."
4,3079,"POLYGON ((237449.484 892651.990, 237445.610 89..."


### 2. Calculate percentage of the interception between the circle and tracts
- Find overlapping tracts
- Calculate overlapping area
- Calculate proportion
    - for median income, $proportion_{ij}=\frac{overlapping\_area_{ij}}{area_{circle}}$ for tract $i$ intercepted with circle around stop $j$.
    - for population, $proportion_{ij}=\frac{overlapping\_area_{ij}}{area_{j}}$ for tract $i$ intercepted with circle around stop $j$.

In [8]:
circle_area = computeArea(mile2meter(0.5))
circle_area

2034171.9197277634

In [13]:
import warnings
# ignore warnings
warnings.filterwarnings(action='ignore')

# Changed all references from TRACTCE10 to GEOID10
gdf_stop_circle['impacted_geoid'] = ""
gdf_stop_circle['proportion_2tract'] = ""
gdf_stop_circle['proportion_2circle'] = ""
for idx, row in gdf_stop_circle.iterrows():
    # tract_ids for overlapping
    overlaps = gdf_tract_geo[gdf_tract_geo['geometry'].overlaps(row['geometry'])|gdf_tract_geo['geometry'].covers(row['geometry'])|gdf_tract_geo['geometry'].within(row['geometry'])]['GEOID10'].tolist()
    # area for each overlapping
    proportion_2tract = []
    proportion_2circle = []
    for y in overlaps:
        tract_area = sum(gdf_tract_geo.loc[gdf_tract_geo['GEOID10']==y,]['SHAPE_AREA'].tolist())
        gpd_interceptions = gpd.overlay(gdf_tract_geo.loc[gdf_tract_geo.GEOID10==y,], gdf_stop_circle.loc[gdf_stop_circle.STOP_ID==row.STOP_ID,], how='intersection')
        area = sum([row.geometry.area for idx, row in gpd_interceptions.iterrows()])
        proportion_2tract.append(float(area/tract_area))
        proportion_2circle.append(float(area/circle_area))
    
    # raise error if no tracts 
    if len(overlaps)==0: 
        raise ValueError()
    
    # add columns
    gdf_stop_circle.at[idx, 'impacted_geoid'] = ','.join(overlaps)
    gdf_stop_circle.at[idx, 'proportion_2tract'] = ','.join(["{:.2f}".format(x) for x in proportion_2tract])
    gdf_stop_circle.at[idx, 'proportion_2circle'] = ','.join(["{:.2f}".format(x) for x in proportion_2circle])
gdf_stop_circle

# to ask Jun Li about his math for assigning income level to stop. This code takes the proportion of each tract
# that the buffer passes through; for income, we should use the proportion of the circle that passes through
# each tract.

# but, it does work for population which is cool

  and should_run_async(code)


Unnamed: 0,STOP_ID,geometry,impacted_tractid,proportion_2tract,proportion_2circle
0,3077,"POLYGON ((237925.341 892643.408, 237921.466 89...","25021416400,25025100800,25025100601,2502510070...","0.01,0.17,0.13,0.99,0.29","0.03,0.15,0.04,0.59,0.13"
1,841,"POLYGON ((228719.867 892644.017, 228715.993 89...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02","0.09,0.19,0.42,0.28,0.02"
2,446,"POLYGON ((235190.333 892644.944, 235186.458 89...","25025100500,25025100400,25025100300,2502510110...","0.17,0.18,1.00,0.41,0.06,0.24,0.02,1.00,0.37","0.07,0.06,0.24,0.11,0.01,0.14,0.01,0.17,0.19"
3,847,"POLYGON ((228717.273 892650.156, 228713.398 89...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02","0.09,0.19,0.42,0.29,0.02"
4,3079,"POLYGON ((237449.484 892651.990, 237445.610 89...","25021416400,25025100800,25025100601,2502510050...","0.00,0.51,0.32,0.01,0.03,0.67,0.06","0.01,0.44,0.09,0.00,0.01,0.40,0.03"
...,...,...,...,...,...
7805,9097,"POLYGON ((237034.053 911541.866, 237030.178 91...","25017336402,25017336401,25017336300,2501733620...","0.17,0.18,0.26,0.11,0.01","0.10,0.22,0.53,0.14,0.01"
7806,5911,"POLYGON ((237040.708 911542.538, 237036.833 91...","25017336402,25017336401,25017336300,2501733620...","0.17,0.17,0.27,0.12,0.01","0.09,0.21,0.54,0.14,0.01"
7807,5975,"POLYGON ((235782.059 911544.999, 235778.185 91...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08","0.15,0.52,0.24,0.01,0.08"
7808,15976,"POLYGON ((235775.770 911547.184, 235771.896 91...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08","0.16,0.52,0.24,0.01,0.08"


In [16]:
gdf_stop_circle.to_csv('./output/stop_radius.csv', index_label=False)

## 3. Calculate weighed median income for people that the stop impacts

In [7]:
gdf_income_tract = gpd.read_file('./output/radius.shp')
# df_income_tract = merged_gdf
gdf_income_tract.head()

Unnamed: 0,STOP_ID,geometry
0,3077,"POLYGON ((237925.341 892643.408, 237921.466 89..."
1,841,"POLYGON ((228719.867 892644.017, 228715.993 89..."
2,446,"POLYGON ((235190.333 892644.944, 235186.458 89..."
3,847,"POLYGON ((228717.273 892650.156, 228713.398 89..."
4,3079,"POLYGON ((237449.484 892651.990, 237445.610 89..."


In [18]:
df_stop_circle = pd.read_csv('./output/stop_radius.csv')
df_stop_circle.head()

Unnamed: 0,STOP_ID,geometry,impacted_geoid,proportion_2tract,proportion_2circle
0,3077,"POLYGON ((237925.3409028954 892643.4076999985,...","25021416400,25025100800,25025100601,2502510070...","0.01,0.17,0.13,0.99,0.29","0.03,0.15,0.04,0.59,0.13"
1,841,"POLYGON ((228719.867402897 892644.0174999982, ...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02","0.09,0.19,0.42,0.28,0.02"
2,446,"POLYGON ((235190.3329028941 892644.9441, 23518...","25025100500,25025100400,25025100300,2502510110...","0.17,0.18,1.00,0.41,0.06,0.24,0.02,1.00,0.37","0.07,0.06,0.24,0.11,0.01,0.14,0.01,0.17,0.19"
3,847,"POLYGON ((228717.2732028968 892650.1563000008,...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02","0.09,0.19,0.42,0.29,0.02"
4,3079,"POLYGON ((237449.4844028956 892651.9899999984,...","25021416400,25025100800,25025100601,2502510050...","0.00,0.51,0.32,0.01,0.03,0.67,0.06","0.01,0.44,0.09,0.00,0.01,0.40,0.03"


In [100]:
# again, changed references from TRACTCE10 to GEO_ID
df_stop_circle['income'] = -1
for idx, row in df_stop_circle.iterrows():
    tracts = str(row['impacted_tractid']).split(',')
    proportions = list(map(float, str(row['proportion']).split(',')))
    incomes = []
    for i in range(len(tracts)):
        tract = tracts[i]
        income = int(df_income_tract[df_income_tract['GEO_ID']==tract].DP03_0062E.values[0])
        if income<0:
            incomes.append(0)
            proportions[i]==0
        else:
            incomes.append(income)
    
    assert len(tracts)==len(incomes)==len(proportions)
    weighed_income = (sum([incomes[i]*proportions[i] for i in range(len(tracts))]))/sum(proportions)

    df_stop_circle.at[idx, 'income'] = weighed_income
df_stop_circle

Unnamed: 0,STOP_ID,geometry,impacted_tractid,proportion,income
0,3077,"POLYGON ((237925.3409028954 892643.4076999985,...","25021416400,25025100800,25025100601,2502510070...","0.01,0.17,0.13,0.99,0.29",86562
1,841,"POLYGON ((228719.867402897 892644.0174999982, ...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359
2,446,"POLYGON ((235190.3329028941 892644.9441, 23518...","25025100500,25025100400,25025100300,2502510110...","0.17,0.18,1.00,0.41,0.06,0.24,0.02,1.00,0.37",46390
3,847,"POLYGON ((228717.2732028968 892650.1563000008,...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359
4,3079,"POLYGON ((237449.4844028956 892651.9899999984,...","25021416400,25025100800,25025100601,2502510050...","0.00,0.51,0.32,0.01,0.03,0.67,0.06",83532
...,...,...,...,...,...
7805,9097,"POLYGON ((237034.0528028987 911541.8663000017,...","25017336402,25017336401,25017336300,2501733620...","0.17,0.18,0.26,0.11,0.01",99903
7806,5911,"POLYGON ((237040.7078028999 911542.5381000005,...","25017336402,25017336401,25017336300,2501733620...","0.17,0.17,0.27,0.12,0.01",101026
7807,5975,"POLYGON ((235782.0594028986 911544.9987000003,...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08",80781
7808,15976,"POLYGON ((235775.7703028999 911547.1840000004,...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08",80781


In [103]:
# assign income level for each stop

income_level = []
incomes = df_stop_circle['income']
for income in incomes:
    if income<=0:
        income_level.append(-1)
    elif 0 < income < 31000:
        income_level.append(0)
    elif 31000 <= income < 42000:
        income_level.append(1)
    elif 42000 <= income < 126000:
        income_level.append(2)
    elif 126000 <= income < 188000:
        income_level.append(3)
    elif 188000 <= income:
        income_level.append(4)
df_stop_circle['income_level'] = income_level

In [104]:
df_stop_circle.to_csv('./output/stop_with_weighed_level.csv', index_label=False)

## 4. Calculate weighed number of people impacted by each stop

In [112]:
df_tract_pop = pd.read_csv('./output/tracts_with_population.csv')
df_tract_pop.drop(columns='Unnamed: 0', inplace=True)
df_tract_pop.dtypes

GEO_ID              int64
public_transport    int64
walking             int64
other               int64
total_employed      int64
impacted_pop        int64
dtype: object

In [106]:
df_stop_circle = pd.read_csv('./output/stop_with_weighed_level.csv')
df_stop_circle.head()

Unnamed: 0,STOP_ID,geometry,impacted_tractid,proportion,income,income_level
0,3077,"POLYGON ((237925.3409028954 892643.4076999985,...","25021416400,25025100800,25025100601,2502510070...","0.01,0.17,0.13,0.99,0.29",86562,2
1,841,"POLYGON ((228719.867402897 892644.0174999982, ...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2
2,446,"POLYGON ((235190.3329028941 892644.9441, 23518...","25025100500,25025100400,25025100300,2502510110...","0.17,0.18,1.00,0.41,0.06,0.24,0.02,1.00,0.37",46390,2
3,847,"POLYGON ((228717.2732028968 892650.1563000008,...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2
4,3079,"POLYGON ((237449.4844028956 892651.9899999984,...","25021416400,25025100800,25025100601,2502510050...","0.00,0.51,0.32,0.01,0.03,0.67,0.06",83532,2


In [114]:
# changed references from tract_id to GEO_ID
# also adjusted dataset used to create df_tract_pop to use GEO_ID instead of tract_ID
df_stop_circle['impacted_pop'] = 0
for idx, row in df_stop_circle.iterrows():
    tracts = str(row['impacted_tractid']).split(',')
    proportions = list(map(float, str(row['proportion']).split(',')))
    impacted_pops = []
    for tract in tracts:
        impacted_pop = int(df_tract_pop[df_tract_pop.GEO_ID==int(tract)].impacted_pop.values[0])
        impacted_pops.append(impacted_pop)
    weighed_pop = sum([impacted_pops[i]*proportions[i] for i in range(len(tracts))])

    df_stop_circle.at[idx, 'impacted_pop'] = weighed_pop
df_stop_circle

Unnamed: 0,STOP_ID,geometry,impacted_tractid,proportion,income,income_level,impacted_pop
0,3077,"POLYGON ((237925.3409028954 892643.4076999985,...","25021416400,25025100800,25025100601,2502510070...","0.01,0.17,0.13,0.99,0.29",86562,2,1046
1,841,"POLYGON ((228719.867402897 892644.0174999982, ...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2,632
2,446,"POLYGON ((235190.3329028941 892644.9441, 23518...","25025100500,25025100400,25025100300,2502510110...","0.17,0.18,1.00,0.41,0.06,0.24,0.02,1.00,0.37",46390,2,3056
3,847,"POLYGON ((228717.2732028968 892650.1563000008,...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2,632
4,3079,"POLYGON ((237449.4844028956 892651.9899999984,...","25021416400,25025100800,25025100601,2502510050...","0.00,0.51,0.32,0.01,0.03,0.67,0.06",83532,2,1647
...,...,...,...,...,...,...,...
7805,9097,"POLYGON ((237034.0528028987 911541.8663000017,...","25017336402,25017336401,25017336300,2501733620...","0.17,0.18,0.26,0.11,0.01",99903,2,639
7806,5911,"POLYGON ((237040.7078028999 911542.5381000005,...","25017336402,25017336401,25017336300,2501733620...","0.17,0.17,0.27,0.12,0.01",101026,2,644
7807,5975,"POLYGON ((235782.0594028986 911544.9987000003,...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08",80781,2,1204
7808,15976,"POLYGON ((235775.7703028999 911547.1840000004,...","25017337102,25017336402,25017336401,2501733630...","0.05,0.93,0.20,0.00,0.08",80781,2,1204


In [117]:
gdf_stop = gpd.read_file('./data/stops_routes/mbtabus/MBTABUSSTOPS_PT.shp')
df_stop_circle['location'] = gdf_stop['geometry']

cols = df_stop_circle.columns.tolist()
cols = cols[:1] + cols[7:] + cols[1:7]

df_stop_circle = df_stop_circle[cols]

df_stop_circle.to_csv('./output/stops_weighed.csv', index_label=False)

In [118]:
df_stop_circle.head()

Unnamed: 0,STOP_ID,location,geometry,impacted_tractid,proportion,income,income_level,impacted_pop
0,3077,POINT (237120.669 892643.408),"POLYGON ((237925.3409028954 892643.4076999985,...","25021416400,25025100800,25025100601,2502510070...","0.01,0.17,0.13,0.99,0.29",86562,2,1046
1,841,POINT (227915.195 892644.017),"POLYGON ((228719.867402897 892644.0174999982, ...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2,632
2,446,POINT (234385.661 892644.944),"POLYGON ((235190.3329028941 892644.9441, 23518...","25025100500,25025100400,25025100300,2502510110...","0.17,0.18,1.00,0.41,0.06,0.24,0.02,1.00,0.37",46390,2,3056
3,847,POINT (227912.601 892650.156),"POLYGON ((228717.2732028968 892650.1563000008,...","25025130402,25025130300,25025130200,2502513010...","0.08,0.26,0.63,0.13,0.02",107359,2,632
4,3079,POINT (236644.812 892651.990),"POLYGON ((237449.4844028956 892651.9899999984,...","25021416400,25025100800,25025100601,2502510050...","0.00,0.51,0.32,0.01,0.03,0.67,0.06",83532,2,1647


In [122]:
# save as shapefile
gdf = gpd.GeoDataFrame(df_stop_circle, geometry='location')
gdf.drop(columns='geometry', inplace=True)
gdf.to_file('./output/stops.shp', driver='ESRI Shapefile')