# Extract and Collect TIGER/ACS Files

The TIGER geodatabase files already include income data from the 2021 ACS 5-year estimates, which substantially reduces the amount of compute needed in this pipeline. Each state and territory has its own file. The data is pulled at Census Tract granularity, together with the associated geometry.

In [0]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("Geometric_Dataset_Processing")
    .getOrCreate()
)

In [0]:
import os
import zipfile

# Where the downloaded .gdb.zip archives live
source_dir = "/Volumes/tabular/dataexpert/freshoats_capstone/ACS_Tract_Zipped/"

# Where the unpacked geodatabase folders should end up
target_dir = "/Volumes/tabular/dataexpert/freshoats_capstone/ACS_Geo_Unzipped/"

# Create the destination up front; a no-op when it already exists
os.makedirs(target_dir, exist_ok=True)

# Unpack every archive found in the source directory
for file_name in os.listdir(source_dir):
    # Skip anything that is not a .zip archive
    if not file_name.endswith(".zip"):
        continue

    zip_path = os.path.join(source_dir, file_name)

    # extractall() writes every member of the archive under target_dir
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(target_dir)

    print(f"Extracted {file_name} to {target_dir}")

In [0]:
pip install geopandas pyogrio fiona


In [0]:
# Restart the Python process after the pip install above
# (presumably so the freshly installed packages become importable)
dbutils.library.restartPython()

In [0]:
import geopandas as gpd
import pyogrio

# Path to a single state's geodatabase (Alabama), used to explore the file layout
gdb_path = "/Volumes/tabular/dataexpert/freshoats_capstone/ACS_Geo_Unzipped/ACS_2021_5YR_TRACT_01_ALABAMA.gdb/"

# List all layers in the geodatabase so we know which ones can be loaded
layers = pyogrio.list_layers(gdb_path)
print("Available layers:", layers)

In [0]:
# Load the spatial layer; it carries the same name as the .gdb folder itself
spatial_layer = "ACS_2021_5YR_TRACT_01_ALABAMA"
gdf = gpd.read_file(gdb_path, layer=spatial_layer)

# Show the first few rows of the GeoDataFrame
print(gdf.head())

# Check which columns the spatial layer provides
print(gdf.columns)

# Check the GEOID format
print(gdf.GEOID)

In [0]:
# Select the needed columns from the spatial data: the join key (GEOID_Data),
# the state/county/tract codes, the INTPTLAT/INTPTLON columns and the geometry
spatial_gdf = gdf[['GEOID_Data', 'STATEFP', 'COUNTYFP', 'TRACTCE', 'INTPTLAT', 'INTPTLON', 'geometry']]


In [0]:
# Preview the trimmed-down spatial columns
spatial_gdf.head()

In [0]:
# Preview the full spatial layer for comparison
gdf.head()

In [0]:
# Load the attribute layer that holds the income tables
attribute_layer = "X19_INCOME"
attribute_gdf = gpd.read_file(gdb_path, layer=attribute_layer)

# Show the first few rows of the attribute table
print(attribute_gdf.head())

# Check the columns
print(attribute_gdf.columns)

# Display the last five rows as the cell output
attribute_gdf.tail(5)

Notice there are 3045 columns here. They break the information down by different demographics that I'm not interested in. I just want the median income of everyone living in the tract, so we need to use the metadata to find the column that is not broken down by race.

The column titles are completely unusable at the moment, and most of the columns are likely unnecessary. To get a readable title for each column, I need to extract that information from the tract metadata layer.

In [0]:
# Load the metadata layer, which is searched below (via its Full_Name column)
# to find readable descriptions of the coded attribute columns
metadata_layer = "TRACT_METADATA_2021"
metadata_gdf = gpd.read_file(gdb_path, layer=metadata_layer)

# Display the first few rows of the metadata layer
print(metadata_gdf.head())

# Check the columns in the metadata layer
print(metadata_gdf.columns)

In [0]:
# Preview the metadata as a rendered table
metadata_gdf.head()

In [0]:
# Human-readable description of every coded column
full_names = metadata_gdf["Full_Name"]

# Case-insensitive keyword searches; rows with a missing description never match
median_columns = metadata_gdf[full_names.str.contains("median", case=False, na=False)]
per_capita_columns = metadata_gdf[full_names.str.contains("per capita", case=False, na=False)]
mean_columns = metadata_gdf[full_names.str.contains("mean", case=False, na=False)]

# Print each result set under its own heading
for heading, matches in (
    ("Median Columns:", median_columns),
    ("\nPer Capita Columns:", per_capita_columns),
    ("\nMean Columns:", mean_columns),
):
    print(heading)
    print(matches)

In [0]:
import pandas as pd
# Do not truncate long cell values, so the full descriptions are readable
pd.set_option("display.max_colwidth", None)

# Rows whose description mentions "Median Household Income" ...
is_median_household_income = metadata_gdf["Full_Name"].str.contains(
    "Median Household Income", case=False, na=False
)
# ... and rows whose description mentions a race/ethnicity breakdown
mentions_race = metadata_gdf["Full_Name"].str.contains(
    "race|ethnicity|hispanic|white|black|asian|native", case=False, na=False
)

# Keep the median household income rows that are not split by race
overall_median_income = metadata_gdf[is_median_household_income & ~mentions_race]

# Display the filtered rows
print(overall_median_income)

I've identified the columns to use:
Estimate Column: **B19013e1**
Description: "Median Household Income in the Past 12 Months (in 2021 Inflation-Adjusted Dollars): Households -- (Estimate)"
This column provides the overall median household income for all households.
Margin of Error Column: **B19013m1**
Description: "Median Household Income in the Past 12 Months (in 2021 Inflation-Adjusted Dollars): Households -- (Margin of Error)"
This column provides the margin of error for the overall median household income estimate.

I'm going to select these columns and then rename them to be more meaningful.

In [0]:
# Select the join key plus the median household income estimate (B19013e1)
# and its margin of error (B19013m1) from the income table
filtered_attribute_gdf = attribute_gdf[["GEOID", "B19013e1", "B19013m1"]]

# Show the filtered income table
filtered_attribute_gdf.head()

In [0]:
# Rename the coded columns to meaningful names.
# Note: this is a pandas/GeoPandas frame (loaded with gpd.read_file), not a PySpark DataFrame.
filtered_attribute_gdf = filtered_attribute_gdf.rename(columns={"B19013e1": "median_income", "B19013m1": "median_income_margin"})

# Preview the renamed columns
filtered_attribute_gdf.head()

In [0]:
# Merge the full spatial layer with the filtered X19_INCOME columns.
# The key is called GEOID_Data on the spatial side and GEOID on the income side;
# no `how` is passed, so merge's default join type applies.
merged_gdf = gdf.merge(filtered_attribute_gdf, left_on="GEOID_Data", right_on="GEOID")

# Verify the columns of the merged GeoDataFrame
merged_gdf.columns

In [0]:
# Print the merged frame to eyeball the joined rows
print(merged_gdf)

In [0]:
# Reduce the merged frame to the tract identifier, the income columns, the geometry
# and the state/tract codes plus INTPTLAT/INTPTLON
final_gdf = merged_gdf[["GEOID_Data", "median_income", "median_income_margin", "geometry", "STATEFP", "TRACTCE", "INTPTLAT", "INTPTLON"]]



In [0]:
# Write the single-state result to the shared Parquet file (without the index)
final_gdf.to_parquet("/Volumes/tabular/dataexpert/freshoats_capstone/Geo_Census.parquet", engine="pyarrow", index=False)


In [0]:
# Read the file back to confirm the write worked. GeoPandas is used so the geometry
# column is decoded into geometries; plain pandas would return it as raw WKB bytes.
df = gpd.read_parquet("/Volumes/tabular/dataexpert/freshoats_capstone/Geo_Census.parquet")

/Volumes/tabular/dataexpert/freshoats_capstone/ACS_Geo_Unzipped/

That worked - now we need to iterate through the rest of the files, perform the same transformation for all states and territories, and append the results to the Geo_Census.parquet file.

In [0]:
pip install geopandas pyarrow pyogrio fiona


In [0]:
# Restart the Python process after the pip install above
# (presumably so the freshly installed packages become importable)
dbutils.library.restartPython()

In [0]:
import os
import pandas as pd
import geopandas as gpd

# Input folder containing the unzipped per-state .gdb folders
input_folder = "/Volumes/tabular/dataexpert/freshoats_capstone/ACS_Geo_Unzipped/"

# Output Parquet file that accumulates the rows of every processed state
output_file = "/Volumes/tabular/dataexpert/freshoats_capstone/Geo_Census.parquet"



In [0]:
def process_gdb(gdb_path, output_file):
    """Extract tract geometry and median household income from one geodatabase.

    Reads the spatial layer (named after the ``.gdb`` folder) and the
    ``X19_INCOME`` table from ``gdb_path``, joins them on the tract
    identifier and appends the result to the Parquet file at
    ``output_file``, creating the file when it does not exist yet.

    Args:
        gdb_path: Path to a ``*.gdb`` folder; a trailing slash is tolerated.
        output_file: Path of the Parquet file that accumulates all states.

    Returns:
        None. Any failure is caught and reported with ``print`` so a caller
        looping over many geodatabases keeps going with the next one.
    """
    try:
        # Extract the state name from the folder name. normpath strips a
        # trailing separator, which would otherwise make basename return "".
        state_name = os.path.basename(os.path.normpath(gdb_path))
        if state_name.endswith(".gdb"):
            state_name = state_name[: -len(".gdb")]
        print(f"Processing: {state_name}")

        # Step 1a: Load the spatial layer
        spatial_layer = state_name  # The folder name is the spatial layer name
        gdf = gpd.read_file(gdb_path, layer=spatial_layer)

        # Step 1b: Select the needed columns from spatial data
        spatial_gdf = gdf[['GEOID_Data', 'STATEFP', 'COUNTYFP', 'TRACTCE', 'INTPTLAT', 'INTPTLON', 'geometry']]

        # Step 2: Load the attribute layer (e.g., income data)
        attribute_layer = "X19_INCOME"
        attribute_gdf = gpd.read_file(gdb_path, layer=attribute_layer)

        # Step 3: Select relevant columns from the income table
        filtered_attribute_gdf = attribute_gdf[["GEOID", "B19013e1", "B19013m1"]]

        # Step 4: Rename columns
        filtered_attribute_gdf = filtered_attribute_gdf.rename(columns={
            "B19013e1": "median_income",
            "B19013m1": "median_income_margin"
        })

        # Step 5: Merge the spatial layer and the X19_INCOME layer
        merged_gdf = spatial_gdf.merge(filtered_attribute_gdf, left_on="GEOID_Data", right_on="GEOID")

        # Step 6: Handle appending to the Parquet file
        if os.path.exists(output_file):
            # Read the existing Parquet file
            existing_gdf = gpd.read_parquet(output_file)

            # Concatenate the new data with the existing data
            combined_gdf = pd.concat([existing_gdf, merged_gdf], ignore_index=True)

            # The output may already hold this state's tracts (e.g. from an
            # earlier run); keep only the newest row per tract so a re-run
            # replaces rows instead of duplicating them.
            combined_gdf = combined_gdf.drop_duplicates(subset="GEOID_Data", keep="last")

            # Overwrite the Parquet file with the combined data
            combined_gdf.to_parquet(output_file, engine="pyarrow", index=False)
        else:
            # Save as a new Parquet file
            merged_gdf.to_parquet(output_file, engine="pyarrow", index=False)

        print(f"Successfully processed and appended: {state_name}")

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller continue
        print(f"Error processing {gdb_path}: {e}")

In [0]:
# Run the per-state pipeline for every geodatabase in the input directory
for folder_name in os.listdir(input_folder):
    folder_path = os.path.join(input_folder, folder_name)

    # Anything that is not a *.gdb entry is ignored
    if not folder_name.endswith(".gdb"):
        continue

    process_gdb(folder_path, output_file)