This notebook loads Census block group shapefiles for selected states (Virginia, Washington DC, and Maryland), spatially joins them with a GeoJSON file identifying 1/4 square mile areas around transit stations, and then joins the result with American Community Survey (ACS) data on median home values. The same steps can also be used to join ACS data on rents and other demographics.

# **Load Census Block Group Shapefiles**

In [None]:
import zipfile
import os

# Block-group archives to unpack, keyed by the state each one covers
_archives_by_state = {
    'Virginia': 'tl_rd22_51_bg.zip',
    'Washington DC': 'tl_rd22_11_bg.zip',
    'Maryland': 'tl_rd22_24_bg.zip',
}
zip_files = list(_archives_by_state.values())

# Root folder that receives one sub-folder per archive
extract_dir_base = '/mnt/data/shapefiles/'

# Function to unzip files
def unzip_shapefiles(zip_files, extract_dir_base):
    """Extract each zip archive into its own sub-folder of ``extract_dir_base``.

    Args:
        zip_files: Iterable of paths to zip archives.
        extract_dir_base: Directory under which one folder per archive is
            created; it is created (including parents) when missing.

    Raises:
        FileNotFoundError: If an archive path does not exist.
        zipfile.BadZipFile: If a file is not a valid zip archive.
    """
    os.makedirs(extract_dir_base, exist_ok=True)

    for zip_path in zip_files:
        # Name the target folder after the archive, dropping only the trailing
        # ".zip" extension. A plain str.replace would also remove ".zip"
        # occurring elsewhere in the file name.
        archive_name = os.path.basename(zip_path)
        stem, extension = os.path.splitext(archive_name)
        folder_name = stem if extension.lower() == '.zip' else archive_name

        extract_path = os.path.join(extract_dir_base, folder_name)
        os.makedirs(extract_path, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print(f"Files extracted to: {extract_path}")

# Run the extraction for every archive in zip_files
unzip_shapefiles(zip_files=zip_files, extract_dir_base=extract_dir_base)


Files extracted to: /mnt/data/shapefiles/tl_rd22_51_bg
Files extracted to: /mnt/data/shapefiles/tl_rd22_11_bg
Files extracted to: /mnt/data/shapefiles/tl_rd22_24_bg


In [None]:
import geopandas as gpd
import pandas as pd
import os

# Folders holding the unzipped shapefiles, keyed by the state each one covers
_extracted_dir_by_state = {
    'Virginia': '/mnt/data/shapefiles/tl_rd22_51_bg',
    'Washington DC': '/mnt/data/shapefiles/tl_rd22_11_bg',
    'Maryland': '/mnt/data/shapefiles/tl_rd22_24_bg',
}
extracted_paths = list(_extracted_dir_by_state.values())

# Function to read shapefiles into GeoDataFrames and concatenate them
def concatenate_shapefiles(paths):
    """Read the shapefile in each directory and stack them into one GeoDataFrame.

    Args:
        paths: Iterable of directories, each expected to contain one ``.shp``
            file together with its associated files.

    Returns:
        geopandas.GeoDataFrame: Rows of all shapefiles with a fresh 0..n-1
        index. The CRS of the inputs is set explicitly on the result.

    Raises:
        FileNotFoundError: If a directory contains no ``.shp`` file.
        ValueError: If ``paths`` is empty.
    """
    gdfs = []
    for path in paths:
        # Sort so the chosen file does not depend on os.listdir() ordering
        shp_names = sorted(name for name in os.listdir(path) if name.endswith(".shp"))
        if not shp_names:
            # A silently skipped state would yield an incomplete layer downstream
            raise FileNotFoundError(f"No .shp file found in {path}")
        # We assume there's only one .shp file in each directory
        gdfs.append(gpd.read_file(os.path.join(path, shp_names[0])))

    if not gdfs:
        raise ValueError("No shapefile directories were provided")

    # Use the first known CRS as the common one and align any layer that differs
    target_crs = next((gdf.crs for gdf in gdfs if gdf.crs is not None), None)
    if target_crs is not None:
        gdfs = [
            gdf.to_crs(target_crs)
            if gdf.crs is not None and gdf.crs != target_crs
            else gdf
            for gdf in gdfs
        ]

    # Pass the CRS explicitly so it is set on the result (and on the file
    # written from it) regardless of how pd.concat handles it
    return gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=target_crs)

# Destination of the combined block-group shapefile
consolidated_shapefile_path = '/mnt/data/shapefiles/consolidated_bg.shp'

# Stack the per-state block groups into a single GeoDataFrame
consolidated_gdf = concatenate_shapefiles(paths=extracted_paths)

# Write the combined layer to disk as a new shapefile
consolidated_gdf.to_file(consolidated_shapefile_path)


## **Join Consolidated Shapefiles with Transit Station Data**

In [None]:
import geopandas as gpd

# Assuming you've uploaded the 'buffered_stations.geojson' to your Colab environment
stations_geojson_path = '/content/buffered_stations (1).geojson'
# Assuming you've uploaded the 'consolidated_bg' shapefile components to your Colab environment
consolidated_shapefile_path = '/content/consolidated_bg.shp'

# Load the GeoJSON and shapefile into GeoDataFrames
stations_gdf = gpd.read_file(stations_geojson_path)
block_groups_gdf = gpd.read_file(consolidated_shapefile_path)

# Check the CRS for both GeoDataFrames
print("Stations CRS: ", stations_gdf.crs)
print("Block Groups CRS: ", block_groups_gdf.crs)

# If the block groups arrive without a CRS (e.g. the .prj file was not
# uploaded), declare the CRS the data was published in. Census TIGER/Line
# shapefiles use NAD83 (EPSG:4269); copying the stations' CRS instead would
# mislabel the coordinates rather than describe them.
if block_groups_gdf.crs is None:
    block_groups_gdf = block_groups_gdf.set_crs("EPSG:4269")

# If the CRS are different, reproject the block groups to match the stations
if stations_gdf.crs is not None and block_groups_gdf.crs != stations_gdf.crs:
    block_groups_gdf = block_groups_gdf.to_crs(stations_gdf.crs)

# Perform the spatial join: one row per (station buffer, intersecting block group) pair
# NOTE(review): the saved output of this cell shows a station located in
# Hillsboro matched to a District of Columbia block group, which suggests a
# misplaced geometry in the stations file -- verify the input before relying on it.
joined_gdf = gpd.sjoin(stations_gdf, block_groups_gdf, how="inner", predicate='intersects')

# Display the first few rows of the joined GeoDataFrame
joined_gdf.head()


Stations CRS:  EPSG:4326
Block Groups CRS:  None


Unnamed: 0,RecordID,NTD ID,Agency Name,Reporter Type,Reporting Module,Primary Mode Served,Facility ID,Facility Type,Facility Name,City,...,TRACTCE,BLKGRPCE,GEOID,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
31,63,8,Tri-County Metropolitan Transportation Distric...,Full Reporter,Urban,Light Rail,17086,Exclusive Platform Station,Tuality Hospital/SE 8th Ave MAX Station,Hillsboro,...,10700,1,110010107001,Block Group 1,G5030,S,638412,0,38.9027892,-77.0421055
403,502,30030,Washington Metropolitan Area Transit Authority,Full Reporter,Urban,Heavy Rail,8759,Underground Fixed Guideway Station,Farragut North,Washington,...,10700,1,110010107001,Block Group 1,G5030,S,638412,0,38.9027892,-77.0421055
419,518,30030,Washington Metropolitan Area Transit Authority,Full Reporter,Urban,Heavy Rail,8775,Underground Fixed Guideway Station,Farragut West,Washington,...,10700,1,110010107001,Block Group 1,G5030,S,638412,0,38.9027892,-77.0421055
420,519,30030,Washington Metropolitan Area Transit Authority,Full Reporter,Urban,Heavy Rail,8776,Underground Fixed Guideway Station,Foggy Bottom-GWU,Washington,...,10700,1,110010107001,Block Group 1,G5030,S,638412,0,38.9027892,-77.0421055
31,63,8,Tri-County Metropolitan Transportation Distric...,Full Reporter,Urban,Light Rail,17086,Exclusive Platform Station,Tuality Hospital/SE 8th Ave MAX Station,Hillsboro,...,10100,1,110010101001,Block Group 1,G5030,S,185492,0,38.9039457,-77.034953


In [None]:
# Save the join result; the row index is written too and shows up as
# 'Unnamed: 0' when the file is read back
joined_gdf.to_csv('joined_gdf.csv', index=True)

In [None]:
from google.colab import files

In [None]:
# Hand the saved CSV to the Colab `files` helper for download
files.download('joined_gdf.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Load ACS Demographic Data and Join with the Transit Station Shapefile**

In [None]:
import pandas as pd

# Census identifiers are codes, not quantities: read them as text so leading
# zeros survive (parsed as integers, COUNTYFP '001' becomes 1 and TRACTCE
# '000501' becomes 501, and GEOIDs of states with a leading-zero code break).
metro_housing_df = pd.read_csv('Metrohousingblockgroups.csv', dtype={'GEOID': str})
joined_gdf_df = pd.read_csv(
    'joined_gdf.csv',
    dtype={
        'STATEFP': str,
        'COUNTYFP': str,
        'TRACTCE': str,
        'BLKGRPCE': str,
        'GEOID': str,
    },
)


In [None]:
# List the column labels of both tables, one table per line block
print(metro_housing_df.columns, joined_gdf_df.columns, sep='\n')


Index(['FIPS', 'GEOID', 'NAME', 'B25077_001E', 'B25077_001M'], dtype='object')
Index(['Unnamed: 0', 'RecordID', 'NTD ID', 'Agency Name', 'Reporter Type',
       'Reporting Module', 'Primary Mode Served', 'Facility ID',
       'Facility Type', 'Facility Name', 'City', 'State', 'ZIP Code',
       'Latitude', 'Longitude', 'Administrative/Maintenance Facility Flag',
       'Passenger/Parking Facility Flag', 'Square Feet',
       'Section of a Larger Facility', 'Year Built or Reconstructed as New',
       'Percent Agency Capital Responsibility', 'Cross Agency Facility Flag',
       'Condition Assessment Date', 'Condition Assessment',
       'Separate Asset Flag', 'Final Address', 'geometry', 'index_right',
       'STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE', 'GEOID', 'NAMELSAD',
       'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON'],
      dtype='object')


In [None]:
# Compare the dtype of the join key in both tables
print(metro_housing_df['GEOID'].dtype, joined_gdf_df['GEOID'].dtype, sep='\n')


int64
int64


In [None]:
# Cast the join key to text in both tables so the merge compares like with like
metro_housing_df = metro_housing_df.astype({'GEOID': str})
joined_gdf_df = joined_gdf_df.astype({'GEOID': str})


In [None]:
# Keep only block groups present in both the ACS table and the station join
merged_df = metro_housing_df.merge(joined_gdf_df, on='GEOID', how='inner')


In [None]:
# Preview the first rows of the merged table
merged_df.head()

Unnamed: 0.1,FIPS,GEOID,NAME,B25077_001E,B25077_001M,Unnamed: 0,RecordID,NTD ID,Agency Name,Reporter Type,...,COUNTYFP,TRACTCE,BLKGRPCE,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
0,1500000US110010005011,110010005011,"Block Group 1, Census Tract 5.01, District of ...",735000,731309,443,542,30030,Washington Metropolitan Area Transit Authority,Full Reporter,...,1,501,1,Block Group 1,G5030,S,623056,23184,38.929191,-77.05006
1,1500000US110010005012,110010005012,"Block Group 2, Census Tract 5.01, District of ...",-,**,443,542,30030,Washington Metropolitan Area Transit Authority,Full Reporter,...,1,501,2,Block Group 2,G5030,S,37731,0,38.924493,-77.056919
2,1500000US110010005013,110010005013,"Block Group 3, Census Tract 5.01, District of ...",-,**,443,542,30030,Washington Metropolitan Area Transit Authority,Full Reporter,...,1,501,3,Block Group 3,G5030,S,279462,6319,38.922548,-77.053884
3,1500000US110010005021,110010005021,"Block Group 1, Census Tract 5.02, District of ...",1594900,66939,443,542,30030,Washington Metropolitan Area Transit Authority,Full Reporter,...,1,502,1,Block Group 1,G5030,S,405442,0,38.927602,-77.060131
4,1500000US110010005022,110010005022,"Block Group 2, Census Tract 5.02, District of ...",682200,229369,443,542,30030,Washington Metropolitan Area Transit Authority,Full Reporter,...,1,502,2,Block Group 2,G5030,S,176065,0,38.930088,-77.058253


In [None]:
# Save the merged table. The index is only a 0..n-1 row counter produced by
# the merge, so it is left out; written by default it comes back as an extra
# 'Unnamed: 0' column whenever the file is read again.
merged_df.to_csv('merged.csv', index=False)

In [None]:
# Hand the merged CSV to the Colab `files` helper for download
# (`files` comes from the earlier `from google.colab import files` cell)
files.download('merged.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>