In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas.tools import overlay
from shapely.geometry import Point, Polygon, LineString
import matplotlib.pyplot as plt
import fiona
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" may also hide deprecation or CRS-related
# warnings raised by pandas/geopandas later in the notebook - consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings("ignore")

#### Read in Datasets and Files

* walkshedbuffer holds the Metro station walksheds with overlapping boundaries
* all_boundaries holds all boundaries for census blocks in MD, DC and VA
* gdb_MD_BG, gdb_DC_BG, gdb_VA_BG are the block group geodataframes for MD, DC, VA
* ACS_income_MD, ACS_income_DC, ACS_income_VA are the ACS incomes for MD, DC, VA

In [2]:
# Metro station walksheds (the file name indicates neighbouring walksheds may overlap).
walkshedbuffer = gpd.read_file("../../Data/walksheds_overlapping.zip")
# Boundary layer covering MD, DC and VA.
all_boundaries = gpd.read_file("../../Data/MD-DC-VA Boundaries.zip")

# Block-group layer from each state's geodatabase, read in MD, DC, VA order.
gdb_MD_BG, gdb_DC_BG, gdb_VA_BG = (
    gpd.read_file(gdb_path, driver='FileGDB', layer='Block_Group')
    for gdb_path in (
        "../../Data/tlgdb_2021_a_24_md.gdb",
        "../../Data/tlgdb_2021_a_11_dc.gdb",
        "../../Data/tlgdb_2021_a_51_va.gdb",
    )
)

# ACS block-group tables for the same three states, again in MD, DC, VA order.
# NOTE(review): the layer is selected by position (15); presumably this is the
# income table, since B19xxx columns are used downstream - confirm against the
# geodatabase layer list.
ACS_income_MD, ACS_income_DC, ACS_income_VA = (
    gpd.read_file(acs_path, driver='FileGDB', layer=15)
    for acs_path in (
        "../../Data/ACS_2020_5YR_BG_24_MARYLAND.gdb",
        "../../Data/ACS_2020_5YR_BG_11_DISTRICT_OF_COLUMBIA.gdb",
        "../../Data/ACS_2020_5YR_BG_51_VIRGINIA.gdb",
    )
)


#### Preprocessing


* Remove "15000US" from values in the GEOID column of the ACS dataframes
* Merge the ACS data from all three locations to the block groups based on the GEOID column
* Merging creates two geometry columns. Make the geometry_y column (from the block groups) the main geometry column and remove the geometry column from the ACS data

In [3]:
# The ACS tables prefix every GEOID with "15000US"; remove it so the values
# can be matched against the block-group GEOID column in the merge that follows.
for acs_table in (ACS_income_MD, ACS_income_DC, ACS_income_VA):
    acs_table['GEOID'] = acs_table['GEOID'].str.replace('15000US', '')

In [4]:
# Join each state's ACS table to its block-group layer on GEOID.
# how='outer' keeps GEOIDs that are present in only one of the two inputs.
# Both inputs bring a geometry column, so the result carries geometry_x (ACS
# side) and geometry_y (block-group side).
# Careful with naming: the merged frames use a capital "I" (ACS_Income_*),
# the raw ACS tables a lowercase "i" (ACS_income_*).
ACS_Income_MD = pd.merge(left=ACS_income_MD, right=gdb_MD_BG, how='outer', on='GEOID')
ACS_Income_DC = pd.merge(left=ACS_income_DC, right=gdb_DC_BG, how='outer', on='GEOID')
ACS_Income_VA = pd.merge(left=ACS_income_VA, right=gdb_VA_BG, how='outer', on='GEOID')

In [5]:
# Promote the block-group geometry (geometry_y) to the column named
# "geometry". The rename is done in place on each merged frame.
for merged_frame in (ACS_Income_VA, ACS_Income_DC, ACS_Income_MD):
    merged_frame.rename(columns={'geometry_y': 'geometry'}, inplace=True)

# Discard the geometry column that came from the ACS side of the merge.
ACS_Income_VA = ACS_Income_VA.drop(columns=['geometry_x'])
ACS_Income_DC = ACS_Income_DC.drop(columns=['geometry_x'])
ACS_Income_MD = ACS_Income_MD.drop(columns=['geometry_x'])

In [6]:
# Preview the first two rows of the merged Maryland frame to check that the
# ACS columns, block-group attributes and a single geometry column are present.
ACS_Income_MD.head(2)

Unnamed: 0,GEOID,B19001e1,B19001m1,B19001e2,B19001m2,B19001e3,B19001m3,B19001e4,B19001m4,B19001e5,...,B19313He1,B19313Hm1,B19313Ie1,B19313Im1,NAMELSAD,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,240010001001,370.0,104.0,3.0,7.0,0.0,13.0,19.0,22.0,54.0,...,20738500.0,7778983.0,,,Block Group 1,114609102.0,2207428.0,39.6704707,-78.3878895,"MULTIPOLYGON (((-78.49625 39.60436, -78.49612 ..."
1,240010001002,793.0,140.0,63.0,43.0,71.0,61.0,10.0,16.0,0.0,...,49793100.0,12217270.0,,,Block Group 2,215159992.0,508136.0,39.6796402,-78.5461225,"MULTIPOLYGON (((-78.66648 39.64913, -78.66643 ..."


* Merge the income variables from all three locations together
* Change the CRS of both the walksheds and the combined income variable
* Intersect the incomes with the walksheds
* Remove extra characters from the station names

In [7]:
# Combine the merged ACS frames for MD, DC and VA into a single table.
# No `on=` is passed, so pandas joins on every column name the two frames
# share; how='outer' keeps the rows of both inputs.
ACS_Income1 = pd.merge(left=ACS_Income_MD, right=ACS_Income_DC, how='outer')
ACS_income = pd.merge(left=ACS_Income1, right=ACS_Income_VA, how='outer')


In [8]:
# Bring the walksheds and the combined income layer into the same CRS
# before they are overlaid.
target_crs = 'EPSG:4326'
walkshedbuffer = walkshedbuffer.to_crs(target_crs)
ACS_income = ACS_income.to_crs(target_crs)

In [9]:
# Intersect the walksheds with the block groups. Each output row pairs one
# walkshed with one block group it overlaps, so a block group can appear
# under several stations (as the repeated GEOID in the preview shows).
intersect_income_stations = overlay(walkshedbuffer, ACS_income, how="intersection")
intersect_income_stations.head(2)

Unnamed: 0,Name_1,Acres,Shape_Leng,Shape_Area,StnCode,GEOID,B19001e1,B19001m1,B19001e2,B19001m2,...,B19313He1,B19313Hm1,B19313Ie1,B19313Im1,NAMELSAD,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,ADDISON ROAD-SEAT PLEASANT : 0 - 2640,246.285276,16545.122053,10728140.0,,240338027001,305.0,96.0,25.0,26.0,...,1182800.0,1487037.0,29514300.0,42442256.0,Block Group 1,477273.0,0.0,38.8870128,-76.9107079,"POLYGON ((-76.90465 38.88732, -76.90452 38.887..."
1,CAPITOL HEIGHTS : 0 - 2640,358.049668,17662.828735,15596580.0,,240338027001,305.0,96.0,25.0,26.0,...,1182800.0,1487037.0,29514300.0,42442256.0,Block Group 1,477273.0,0.0,38.8870128,-76.9107079,"POLYGON ((-76.90467 38.88733, -76.90556 38.886..."


In [10]:
# Walkshed names look like "<STATION> : 0 - 2640"; strip the distance-band
# suffix so only the station name remains.
# regex=False makes both replacements literal on every pandas version: the
# second suffix contains a ".", which would act as a "match any character"
# wildcard wherever str.replace defaults to regex=True.
intersect_income_stations['Name_1'] = (
    intersect_income_stations['Name_1']
    .str.replace(' : 0 - 2640', '', regex=False)
    .str.replace(' : 0 - 22.4525758392805', '', regex=False)
)

* Rename the B19013e1 column to median household income
* Group intersect_income_stations by the station names and summarize to get the median of the median household income
* Export the new variable

In [11]:
# Give the ACS column B19013e1 a readable name; the rename is applied in
# place on the intersected frame.
intersect_income_stations.rename(
    columns={'B19013e1': "Median household income"},
    inplace=True,
)

In [12]:
# One row per station: the median of "Median household income" across all
# block groups that intersect that station's walkshed.
# NOTE(review): this median is unweighted - every intersecting block group
# counts equally, however little of it falls inside the walkshed. Confirm
# that this is the intended aggregation.
sum_income_stations = (
    intersect_income_stations
    .groupby("Name_1")["Median household income"]
    .median()
    .reset_index()
)
sum_income_stations.head(2)

Unnamed: 0,Name_1,Median household income
0,ADDISON ROAD-SEAT PLEASANT,77702.0
1,ANACOSTIA,36323.0


In [13]:
# Write the per-station medians to an Excel workbook.
# The output folder is created first so the cell also succeeds on a fresh
# checkout where "output/" does not exist yet (to_excel does not create it).
Path("output").mkdir(parents=True, exist_ok=True)
# NOTE(review): the sheet name starts with a space and index=True writes the
# 0..n-1 row index as an extra column; both are kept unchanged in case
# downstream readers depend on them - confirm whether they are intentional.
sum_income_stations.to_excel(
    "output/proportional_walkshed_household_income_updated.xlsx",
    sheet_name=' Medianhouseholdincome',
    index=True,
)
