In [None]:
# --------------------------------------------------------------------------------------------------
# Cell: Download Massachusetts ZCTA shapefile
#
# This cell downloads the full US ZIP Code Tabulation Area (ZCTA) shapefile for 2025 from the
# US Census Bureau TIGER/Line repository, extracts it locally, and prepares it for further analysis.
#
# Steps performed:
# 1. Define the URL for the TIGER/Line 2025 US ZCTA shapefile ZIP.
# 2. Specify local file paths:
#    - zip_path: where the downloaded ZIP file will be temporarily saved.
#    - extract_dir: directory where the shapefile will be extracted.
# 3. Download the ZIP file using requests and save it locally.
# 4. Extract all contents of the ZIP to the specified extraction directory.
# 5. Print confirmation and list the extracted folder.
# 6. Delete the ZIP file after extraction to save disk space, keeping only the shapefile.
#
# Note:
# - This downloads the full US ZCTAs; filtering for Massachusetts will be done later
#   using the ZCTA codes or a spatial join with the Massachusetts state polygon.
# --------------------------------------------------------------------------------------------------

import os
import zipfile

import requests

# URL for the full US ZCTA 2025 shapefile from Census TIGER/Line
url = "https://www2.census.gov/geo/tiger/TIGER2025/ZCTA520/tl_2025_us_zcta520.zip"

# Local paths
base_dir = "../data/external"
zip_path = f"{base_dir}/tl_2025_us_zcta520.zip"
extract_dir = f"{base_dir}/tl_2025_us_zcta520"

# Download the ZIP
print("Downloading TIGER/Line ZCTA shapefile...")
r = requests.get(url)
r.raise_for_status()
with open(zip_path, "wb") as f:
    f.write(r.content)

print("Extracting shapefile...")
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)
print(f"ZCTA shapefile extracted to folder: {extract_dir}")

os.remove(zip_path)

Downloading TIGER/Line ZCTA shapefile...
Extracting shapefile...
ZCTA shapefile extracted to folder: ../data/external/tl_2025_us_zcta520


In [None]:
# --------------------------------------------------------------------------------------------------
# Cell: Download US States shapefile (TIGER/Line 2025)
#
# This cell downloads the US States shapefile for 2025 from the US Census Bureau TIGER/Line repository,
# extracts it locally, and prepares it for use in spatial operations such as joining with ZCTA polygons.
#
# Steps performed:
# 1. Define the URL for the TIGER/Line 2025 US States shapefile ZIP file.
# 2. Specify local file paths:
#    - zip_path: location to temporarily save the downloaded ZIP.
#    - extract_dir: directory where the shapefile will be extracted.
# 3. Download the ZIP file using requests and save it to the specified path.
# 4. Extract all files from the ZIP into the extraction directory.
# 5. Delete the ZIP file after extraction to conserve disk space.
# 6. Print confirmation and list the extracted files to verify contents.
#
# Note:
# - The extracted shapefile can be used to filter or spatially join ZCTAs for a specific state
#   (e.g., Massachusetts) by using a spatial join in GeoPandas.
# --------------------------------------------------------------------------------------------------


# URL for 2025 TIGER/Line US States shapefile
url = "https://www2.census.gov/geo/tiger/TIGER2025/STATE/tl_2025_us_state.zip"

# Local paths
zip_path = f"{base_dir}/tl_2025_us_state.zip"
extract_dir = f"{base_dir}/tl_2025_us_state"

# Download the ZIP
print("Downloading US States shapefile...")
r = requests.get(url)
r.raise_for_status()
with open(zip_path, "wb") as f:
    f.write(r.content)

# Extract the ZIP
print("Extracting shapefile...")
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)

os.remove(zip_path)

# Check extracted files
print(f"US States shapefile extracted to folder: {extract_dir}")
print("Files:", os.listdir(extract_dir))

Downloading US States shapefile...
Extracting shapefile...
US States shapefile extracted to folder: ../data/external/tl_2025_us_state
Files: ['tl_2025_us_state.dbf', 'tl_2025_us_state.shp.ea.iso.xml', 'tl_2025_us_state.cpg', 'tl_2025_us_state.shp', 'tl_2025_us_state.shx', 'tl_2025_us_state.shp.iso.xml', 'tl_2025_us_state.prj']


| Field        | Description |
|--------------|-------------|
| ZCTA5CE20    | The 5-digit ZIP Code Tabulation Area code. |
| GEOID20      | Unique identifier for the ZCTA, usually identical to ZCTA5CE20. |
| GEOIDFQ20    | Fully-qualified GEOID, sometimes including state or other prefix information. |
| CLASSFP20    | Class code describing the type of ZCTA (typically “ZCTA5”). |
| MTFCC20      | MAF/TIGER Feature Class Code, categorizing the geographic feature. |
| FUNCSTAT20   | Functional status code (e.g., active or inactive ZCTA). |
| ALAND20      | Land area of the ZCTA in square meters. |
| AWATER20     | Water area of the ZCTA in square meters. |
| INTPTLAT20   | Latitude of the ZCTA’s internal point (centroid). |
| INTPTLON20   | Longitude of the ZCTA’s internal point (centroid). |
| geometry     | Polygon geometry defining the ZCTA boundaries. |


In [10]:
# Print all columns (fields) in the shapefile
import os

import geopandas as gpd

extract_dir = "../data/external/tl_2025_us_zcta520"
shp_path = os.path.join(extract_dir, "tl_2025_us_zcta520.shp")

zcta = gpd.read_file(shp_path)

# Print all column names
print("Columns in the shapefile:")
print(list(zcta.columns))

# Optionally, inspect the first few rows
print("\nSample rows:")
print(zcta.head())

Columns in the shapefile:
['ZCTA5CE20', 'GEOID20', 'GEOIDFQ20', 'CLASSFP20', 'MTFCC20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'geometry']

Sample rows:
  ZCTA5CE20 GEOID20       GEOIDFQ20 CLASSFP20 MTFCC20 FUNCSTAT20    ALAND20  \
0     81137   81137  860Z200US81137        B5   G6350          S  668834096   
1     80823   80823  860Z200US80823        B5   G6350          S  731738462   
2     81237   81237  860Z200US81237        B5   G6350          S  139553440   
3     81251   81251  860Z200US81251        B5   G6350          S  360441690   
4     81435   81435  860Z200US81435        B5   G6350          S  494557780   

   AWATER20   INTPTLAT20    INTPTLON20  \
0  16883364  +37.0519107  -107.6136014   
1   2846899  +38.6634637  -103.4191862   
2     99778  +38.6163123  -106.6212538   
3   9653596  +39.1425479  -106.4375793   
4   1109655  +37.8687513  -107.9251168   

                                            geometry  
0  POLYGON ((-107.85404 37.08409, -107

In [11]:
# --------------------------------------------------------------------------------------------------
# Cell: Extract Massachusetts ZCTAs via spatial join
#
# This cell performs a spatial join between the full US ZCTA shapefile and the US States shapefile
# to extract only the ZIP Code Tabulation Areas (ZCTAs) that intersect Massachusetts.
# The resulting shapefile is saved in a separate folder for future use.
# --------------------------------------------------------------------------------------------------

import os

import geopandas as gpd

# Paths
base_dir = "../data/external"
zcta_shp = os.path.join(base_dir, "tl_2025_us_zcta520", "tl_2025_us_zcta520.shp")
states_shp = os.path.join(base_dir, "tl_2025_us_state", "tl_2025_us_state.shp")
ma_shp_dir = os.path.join(base_dir, "tl_2025_ma_zcta520")
ma_shp_path = os.path.join(ma_shp_dir, "tl_2025_ma_zcta520.shp")

# Create output directory
os.makedirs(ma_shp_dir, exist_ok=True)

# Load shapefiles
print("Loading ZCTA shapefile...")
zcta = gpd.read_file(zcta_shp)

print("Loading US States shapefile...")
states = gpd.read_file(states_shp)

# Filter Massachusetts
ma = states[states["STUSPS"] == "MA"]

# Ensure CRS match
if zcta.crs != ma.crs:
    ma = ma.to_crs(zcta.crs)

# Spatial join: keep ZCTAs that intersect Massachusetts
print("Performing spatial join to extract MA ZCTAs...")
ma_zcta = gpd.sjoin(zcta, ma, how="inner", predicate="intersects")

# Keep only original ZCTA columns
ma_zcta = ma_zcta[zcta.columns].copy()
ma_zcta.reset_index(drop=True, inplace=True)

# Save MA ZCTA shapefile
ma_zcta.to_file(ma_shp_path)
print(f"Massachusetts ZCTA shapefile saved to: {ma_shp_path}")

# Optional: delete original directories to save space (confirm MA shapefile first!)
# shutil.rmtree(os.path.join(base_dir, "tl_2025_us_zcta520"))
# shutil.rmtree(os.path.join(base_dir, "tl_2025_us_state"))
# print("Deleted original US shapefile directories to save space.")

Loading ZCTA shapefile...
Loading US States shapefile...
Performing spatial join to extract MA ZCTAs...
Massachusetts ZCTA shapefile saved to: ../data/external/tl_2025_ma_zcta520/tl_2025_ma_zcta520.shp


In [12]:
# Print all columns (fields) in the shapefile
import os

import geopandas as gpd

extract_dir = "../data/external/tl_2025_ma_zcta520"
shp_path = os.path.join(extract_dir, "tl_2025_ma_zcta520.shp")

zcta = gpd.read_file(shp_path)

# Print all column names
print("Columns in the shapefile:")
print(list(zcta.columns))

# Optionally, inspect the first few rows
print("\nSample rows:")
print(zcta.head())

Columns in the shapefile:
['ZCTA5CE20', 'GEOID20', 'GEOIDFQ20', 'CLASSFP20', 'MTFCC20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'geometry']

Sample rows:
  ZCTA5CE20 GEOID20       GEOIDFQ20 CLASSFP20 MTFCC20 FUNCSTAT20   ALAND20  \
0     03076   03076  860Z200US03076        B5   G6350          S  68233744   
1     03060   03060  860Z200US03060        B5   G6350          S  16987365   
2     03811   03811  860Z200US03811        B5   G6350          S  28803697   
3     03827   03827  860Z200US03827        B5   G6350          S  46254776   
4     03071   03071  860Z200US03071        B5   G6350          S  84692651   

   AWATER20   INTPTLAT20    INTPTLON20  \
0   1394546  +42.7309925  -071.3370749   
1    847514  +42.7410867  -071.4582661   
2    471773  +42.8391129  -071.1671916   
3    209977  +42.9036773  -070.9955497   
4    829429  +42.7462567  -071.8744336   

                                            geometry  
0  POLYGON ((-71.38696 42.69894, -71.38632 4