#  Download GEO shapefiles

- Create a dataset of MA Zipcode by GEO Location

In [None]:
# Steps performed:
# 1. Define the URL for the TIGER/Line 2025 US ZCTA shapefile ZIP.
# 2. Specify local file paths:
#    - zip_path: where the downloaded ZIP file will be temporarily saved.
#    - extract_dir: directory where the shapefile will be extracted.
# 3. Download the ZIP file using requests and save it locally.
# 4. Extract all contents of the ZIP to the specified extraction directory.
# 5. Print confirmation and list the extracted folder.
# 6. Delete the ZIP file after extraction to save disk space, keeping only the shapefile.
#

import os
import zipfile
import requests


def download_and_extract_tiger(input_url, output_path, base_url="https://www2.census.gov/geo/tiger/TIGER2025"):
    """
    Download and extract a TIGER/Line ZIP file.

    Parameters
    ----------
    input_url : str
        Path after TIGER2025 (e.g. "ZCTA520/tl_2025_us_zcta520.zip")

    output_path : str
        Directory where extracted files will be placed

    base_url : str
        Base Census TIGER URL (default: TIGER2025)
    """

    # Build full URL
    full_url = f"{base_url}/{input_url}"

    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Local zip path
    zip_name = os.path.basename(input_url)
    zip_path = os.path.join(output_path, zip_name)

    print(f"Downloading: {full_url}")

    # Download
    r = requests.get(full_url, stream=True)
    r.raise_for_status()

    with open(zip_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

    print("Download complete.")

    # Extract
    print("Extracting...")
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(output_path)

    print(f"Extracted to: {output_path}")

    # Remove zip
    os.remove(zip_path)
    print("ZIP file removed.")




In [None]:
# --------------------------------------------------------------------------------------------------
# Cell: Download Massachusetts ZCTA shapefile
#
# This cell downloads the full US ZIP Code Tabulation Area (ZCTA) shapefile for 2025 from the
# US Census Bureau TIGER/Line repository, extracts it locally, and prepares it for further analysis.
# --------------------------------------------------------------------------------------------------


download_and_extract_tiger(
    input_url="ZCTA520/tl_2025_us_zcta520.zip",
    output_path="../data/external/tl_2025_us_zcta520"
)

Downloading: https://www2.census.gov/geo/tiger/TIGER2025/ZCTA520/tl_2025_us_zcta520.zip
Download complete.
Extracting...
Extracted to: ../data/external/tl_2025_us_zcta520
ZIP file removed.


In [None]:
# --------------------------------------------------------------------------------------------------
# Cell: Download US States shapefile (TIGER/Line 2025)
#
# This cell downloads the US States shapefile for 2025 from the US Census Bureau TIGER/Line repository,
# extracts it locally, and prepares it for use in spatial operations such as joining with ZCTA polygons.
# --------------------------------------------------------------------------------------------------

download_and_extract_tiger(
    input_url="/STATE/tl_2025_us_state.zip",
    output_path="../data/external/tl_2025_us_state"
)


Downloading: https://www2.census.gov/geo/tiger/TIGER2025//STATE/tl_2025_us_state.zip
Download complete.
Extracting...
Extracted to: ../data/external/tl_2025_us_state
ZIP file removed.


In [33]:

download_and_extract_tiger(
    input_url="/PLACE/tl_2025_25_place.zip",
    output_path="../data/external/tl_2025_ma_place"
)


Downloading: https://www2.census.gov/geo/tiger/TIGER2025//PLACE/tl_2025_25_place.zip
Download complete.
Extracting...
Extracted to: ../data/external/tl_2025_ma_place
ZIP file removed.


In [38]:
# --------------------------------------------------------------------------------------------------
# Cell: Extract Massachusetts ZCTAs via spatial join
#
# This cell performs a spatial join between the full US ZCTA shapefile and the US States shapefile
# to extract only the ZIP Code Tabulation Areas (ZCTAs) that intersect Massachusetts.
# The resulting shapefile is saved in a separate folder for future use.
# --------------------------------------------------------------------------------------------------

import os
import shutil
import geopandas as gpd

# Paths
base_dir = "../data/external"
zcta_shp = os.path.join(base_dir, "tl_2025_us_zcta520", "tl_2025_us_zcta520.shp")
states_shp = os.path.join(base_dir, "tl_2025_us_state", "tl_2025_us_state.shp")
ma_shp_dir = os.path.join(base_dir, "tl_2025_ma_zcta520")
ma_shp_path = os.path.join(ma_shp_dir, "tl_2025_ma_zcta520.shp")

# Create output directory
os.makedirs(ma_shp_dir, exist_ok=True)

# Load shapefiles
print("Loading ZCTA shapefile...")
zcta = gpd.read_file(zcta_shp)

print("Loading US States shapefile...")
states = gpd.read_file(states_shp)

# Filter Massachusetts
ma = states[states["STUSPS"] == "MA"]

# Ensure CRS match
if zcta.crs != ma.crs:
    ma = ma.to_crs(zcta.crs)

# Spatial join: keep ZCTAs that intersect Massachusetts
print("Performing spatial join to extract MA ZCTAs...")
ma_zcta = gpd.sjoin(zcta, ma, how="inner", predicate="intersects")

# Keep only original ZCTA columns
ma_zcta = ma_zcta[zcta.columns].copy()
ma_zcta.reset_index(drop=True, inplace=True)

# Save MA ZCTA shapefile
ma_zcta.to_file(ma_shp_path)
print(f"Massachusetts ZCTA shapefile saved to: {ma_shp_path}")

# Optional: delete original directories to save space (confirm MA shapefile first!)
shutil.rmtree(os.path.join(base_dir, "tl_2025_us_zcta520"))
shutil.rmtree(os.path.join(base_dir, "tl_2025_us_state"))
print("Deleted original US shapefile directories to save space.")

Loading ZCTA shapefile...
Loading US States shapefile...
Performing spatial join to extract MA ZCTAs...
Massachusetts ZCTA shapefile saved to: ../data/external/tl_2025_ma_zcta520/tl_2025_ma_zcta520.shp
Deleted original US shapefile directories to save space.


| Field        | Description |
|--------------|-------------|
| ZCTA5CE20    | The 5-digit ZIP Code Tabulation Area code. |
| GEOID20      | Unique identifier for the ZCTA, usually identical to ZCTA5CE20. |
| GEOIDFQ20    | Fully-qualified GEOID, sometimes including state or other prefix information. |
| CLASSFP20    | Class code describing the type of ZCTA (typically “ZCTA5”). |
| MTFCC20      | MAF/TIGER Feature Class Code, categorizing the geographic feature. |
| FUNCSTAT20   | Functional status code (e.g., active or inactive ZCTA). |
| ALAND20      | Land area of the ZCTA in square meters. |
| AWATER20     | Water area of the ZCTA in square meters. |
| INTPTLAT20   | Latitude of the ZCTA’s internal point (centroid). |
| INTPTLON20   | Longitude of the ZCTA’s internal point (centroid). |
| geometry     | Polygon geometry defining the ZCTA boundaries. |


In [40]:
# # Pull Massachusetts from the US States
# # --------------------------------------------------------------------------------------------------

# import geopandas as gpd
# extract_dir = "../data/external/tl_2025_us_state"
# shp_path = os.path.join(extract_dir, "tl_2025_us_state.shp")

# us_states = gpd.read_file(shp_path)

# print(len(us_states))
# # Filter for Massachusetts
# ma = us_states[us_states['STUSPS'] == 'MA']

# ma

In [41]:

place_shp = "../data/external/tl_2025_ma_place/tl_2025_25_place.shp"
places_gdf = gpd.read_file(place_shp)

# Filter Worcester
worcester = places_gdf[places_gdf['NAME'].str.upper() == 'WORCESTER']
print(worcester.geometry)

18    POLYGON ((-71.88404 42.28125, -71.88387 42.281...
Name: geometry, dtype: geometry


In [43]:
# # Print all columns (fields) in the shapefile
# import os

# import geopandas as gpd

# extract_dir = "../data/external/tl_2025_us_zcta520"
# shp_path = os.path.join(extract_dir, "tl_2025_us_zcta520.shp")

# zcta = gpd.read_file(shp_path)

# # Print all column names
# print("Columns in the shapefile:")
# print(list(zcta.columns))

# # Optionally, inspect the first few rows
# print("\nSample rows:")
# print(zcta.head())

In [44]:
# Print all columns (fields) in the shapefile
import os

import geopandas as gpd

extract_dir = "../data/external/tl_2025_ma_zcta520"
shp_path = os.path.join(extract_dir, "tl_2025_ma_zcta520.shp")

zcta = gpd.read_file(shp_path)

# Print all column names
print("Columns in the shapefile:")
print(list(zcta.columns))

# Optionally, inspect the first few rows
print("\nSample rows:")
print(zcta.head())

Columns in the shapefile:
['ZCTA5CE20', 'GEOID20', 'GEOIDFQ20', 'CLASSFP20', 'MTFCC20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'geometry']

Sample rows:
  ZCTA5CE20 GEOID20       GEOIDFQ20 CLASSFP20 MTFCC20 FUNCSTAT20   ALAND20  \
0     03076   03076  860Z200US03076        B5   G6350          S  68233744   
1     03060   03060  860Z200US03060        B5   G6350          S  16987365   
2     03811   03811  860Z200US03811        B5   G6350          S  28803697   
3     03827   03827  860Z200US03827        B5   G6350          S  46254776   
4     03071   03071  860Z200US03071        B5   G6350          S  84692651   

   AWATER20   INTPTLAT20    INTPTLON20  \
0   1394546  +42.7309925  -071.3370749   
1    847514  +42.7410867  -071.4582661   
2    471773  +42.8391129  -071.1671916   
3    209977  +42.9036773  -070.9955497   
4    829429  +42.7462567  -071.8744336   

                                            geometry  
0  POLYGON ((-71.38696 42.69894, -71.38632 4

In [46]:
import geopandas as gpd

# Load ZCTA shapefile
zcta_shp = "../data/external/tl_2025_ma_zcta520/tl_2025_ma_zcta520.shp"
zcta_gdf = gpd.read_file(zcta_shp)

# Ensure ZIPs are strings
zcta_gdf["ZCTA5CE20"] = zcta_gdf["ZCTA5CE20"].astype(str)

# Filter ZIPs starting with "016"
zips_016 = (
    zcta_gdf[zcta_gdf["ZCTA5CE20"].str.startswith("016")]
    ["ZCTA5CE20"]
    .drop_duplicates()
    .sort_values()
    .tolist()
)
zips_016

['01602',
 '01603',
 '01604',
 '01605',
 '01606',
 '01607',
 '01608',
 '01609',
 '01610',
 '01611',
 '01612']