# Making Food Desert Dataset

### Mount Google Drive

In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

target_folder = "207 Machine Learning: Final Project"

# Walk the Drive tree and take the first directory that directly contains the project folder.
found_path = next(
    (
        os.path.join(parent, target_folder)
        for parent, subdirs, _ in os.walk("/content/drive/MyDrive")
        if target_folder in subdirs
    ),
    None,
)

# Make the project folder the working directory so the relative data paths below resolve.
if not found_path:
    print(f"Folder '{target_folder}' not found.")
else:
    os.chdir(found_path)
    print("Changed working directory to:", os.getcwd())

# 1. Download the Following 2 Data Sources
- USDA Food Desert: https://www.ers.usda.gov/data-products/food-access-research-atlas/download-the-data
- Census Tract Shape Files : https://www2.census.gov/geo/tiger/TIGER2020/TRACT/


### Download all shape files of Census Tracts and read them into a dataframe

In [None]:
import os
import zipfile
import requests
import geopandas as gpd
import pandas as pd

def view_missing_values(df):
    """Display a per-column missing-value report for ``df`` and print its shape.

    For every column the report lists the number of nulls, the fraction of
    rows that are null, and the column dtype. pandas' row/column display
    limits are lifted while the report is shown so no column is elided.

    Args:
        df (pd.DataFrame): Frame to summarise.
    """
    n_rows = df.shape[0]
    null_counts = df.isnull().sum()

    report = pd.DataFrame({
        'Missing Value Count': null_counts,
        'Proportion of Missing Values': null_counts / n_rows,
        'Data Type': df.dtypes
    })

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(report)
        print("Shape of the data frame", df.shape)

def download_tracts(states, base_url, output_dir, verify=True, timeout=60):
    """Download and unzip one census-tract shapefile archive per state.

    For each ``fips -> state_name`` entry the archive
    ``{base_url}tl_2020_{fips}_tract.zip`` is saved under
    ``output_dir/state_name/`` and extracted next to it. Work that is already
    on disk is skipped: the download when the ``.zip`` exists, the extraction
    when the ``.shp`` exists.

    Args:
        states (dict): Maps a FIPS code string to the state name used as the
            sub-directory name.
        base_url (str): URL prefix the archive file name is appended to.
        output_dir (str): Directory that receives one sub-directory per state.
        verify (bool | str): Passed to ``requests.get``; TLS certificate
            verification is on by default. Pass ``False`` only as an explicit,
            temporary workaround for a broken local certificate store.
        timeout (float): Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        zipfile.BadZipFile: If an archive on disk is not a valid zip file.
    """
    os.makedirs(output_dir, exist_ok=True)

    for fips, state_name in states.items():
        filename = f"tl_2020_{fips}_tract"
        zip_url = f"{base_url}{filename}.zip"
        state_dir = os.path.join(output_dir, state_name)
        os.makedirs(state_dir, exist_ok=True)

        zip_path = os.path.join(state_dir, f"{filename}.zip")
        shp_path = os.path.join(state_dir, f"{filename}.shp")

        # Download only if zip file doesn't exist
        if not os.path.exists(zip_path):
            print(f"Downloading {state_name} shapefile...")
            r = requests.get(zip_url, verify=verify, timeout=timeout)
            # Without this check an HTML error page would be saved as the
            # ".zip" and every later run would skip the download and fail.
            r.raise_for_status()

            # Write to a side file and rename, so an interrupted write never
            # leaves a truncated archive at zip_path.
            partial_path = zip_path + ".part"
            try:
                with open(partial_path, "wb") as f:
                    f.write(r.content)
                os.replace(partial_path, zip_path)
            finally:
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        # Extract only if shapefile doesn't exist
        if not os.path.exists(shp_path):
            print(f"Extracting shapefile for {state_name}")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(state_dir)

        print(f"{state_name} shapefile is ready.")

def load_and_process_tracts(states, data_dir, projected_crs="EPSG:5070"):
    """Load every state's tract shapefile and attach centroid coordinates.

    Reads ``data_dir/Shape_Files/<state_name>/tl_2020_<fips>_tract.shp`` for
    each entry of ``states``, concatenates the results, and adds ``centroid``,
    ``latitude`` and ``longitude`` columns.

    Args:
        states (dict): Maps a FIPS code string to the state name used as the
            shapefile sub-directory.
        data_dir (str): Directory that contains the ``Shape_Files`` folder.
        projected_crs: CRS the geometries are temporarily projected to while
            the centroid is computed. Only used when the loaded data is in a
            geographic (degree-based) CRS.

    Returns:
        geopandas.GeoDataFrame: Columns ``STATEFP``, ``GEOID``, ``geometry``,
        ``centroid``, ``latitude``, ``longitude``; centroid coordinates are
        expressed in the CRS of the loaded shapefiles.

    Raises:
        ValueError: If ``states`` is empty.
    """
    shape_dir = os.path.join(data_dir, "Shape_Files")

    gdfs = [
        gpd.read_file(os.path.join(shape_dir, state_name, f"tl_2020_{fips}_tract.shp"))
        for fips, state_name in states.items()
    ]
    if not gdfs:
        raise ValueError("`states` is empty; there are no shapefiles to load.")

    # Combine all into one GeoDataFrame
    all_tracts = pd.concat(gdfs, ignore_index=True)
    print(f"Total census tracts loaded: {len(all_tracts)}")

    # Taking a centroid directly on degree coordinates makes geopandas warn
    # that the result is likely incorrect. Project first, take the centroid
    # in planar coordinates, then transform the points back.
    source_crs = all_tracts.crs
    if source_crs is not None and source_crs.is_geographic:
        centroids = (
            all_tracts.geometry.to_crs(projected_crs).centroid.to_crs(source_crs)
        )
    else:
        centroids = all_tracts.geometry.centroid

    all_tracts["centroid"] = centroids
    all_tracts["latitude"] = centroids.y
    all_tracts["longitude"] = centroids.x

    # Keep relevant columns
    all_tracts = all_tracts[['STATEFP', 'GEOID', 'geometry', 'centroid', 'latitude', 'longitude']]

    return all_tracts



### Load Census Tracts into a GeoDataFrame

In [None]:
# FIPS state code -> state name for the 50 states plus the District of Columbia.
# The name doubles as the per-state sub-directory for the shapefiles.
states = dict([
    ("01", "Alabama"), ("02", "Alaska"), ("04", "Arizona"),
    ("05", "Arkansas"), ("06", "California"), ("08", "Colorado"),
    ("09", "Connecticut"), ("10", "Delaware"), ("11", "District of Columbia"),
    ("12", "Florida"), ("13", "Georgia"), ("15", "Hawaii"),
    ("16", "Idaho"), ("17", "Illinois"), ("18", "Indiana"),
    ("19", "Iowa"), ("20", "Kansas"), ("21", "Kentucky"),
    ("22", "Louisiana"), ("23", "Maine"), ("24", "Maryland"),
    ("25", "Massachusetts"), ("26", "Michigan"), ("27", "Minnesota"),
    ("28", "Mississippi"), ("29", "Missouri"), ("30", "Montana"),
    ("31", "Nebraska"), ("32", "Nevada"), ("33", "New Hampshire"),
    ("34", "New Jersey"), ("35", "New Mexico"), ("36", "New York"),
    ("37", "North Carolina"), ("38", "North Dakota"), ("39", "Ohio"),
    ("40", "Oklahoma"), ("41", "Oregon"), ("42", "Pennsylvania"),
    ("44", "Rhode Island"), ("45", "South Carolina"), ("46", "South Dakota"),
    ("47", "Tennessee"), ("48", "Texas"), ("49", "Utah"),
    ("50", "Vermont"), ("51", "Virginia"), ("53", "Washington"),
    ("54", "West Virginia"), ("55", "Wisconsin"), ("56", "Wyoming"),
])

# Paths are relative to the working directory chosen in the Drive-mount cell.
data_dir = "./Data (1)"
base_url = "https://www2.census.gov/geo/tiger/TIGER2020/TRACT/"
output_dir = os.path.join(data_dir, "Shape_Files")


In [None]:
# Fetch and extract the tract shapefile of every state in `states` into `output_dir`
# (files already on disk are skipped).
download_tracts(states, base_url, output_dir)

In [None]:
# Combine the per-state shapefiles into one frame with centroid latitude/longitude columns.
all_tracts = load_and_process_tracts(states, data_dir)
all_tracts.head()

Total census tracts loaded: 84414



  all_tracts["centroid"] = all_tracts.geometry.centroid


Unnamed: 0,STATEFP,GEOID,geometry,centroid,latitude,longitude
0,1,1045020801,"POLYGON ((-85.70064 31.3851, -85.70043 31.3853...",POINT (-85.6616 31.39714),31.397144,-85.661596
1,1,1017954200,"POLYGON ((-85.31549 32.8058, -85.31354 32.8088...",POINT (-85.26768 32.84273),32.842726,-85.267677
2,1,1017953800,"POLYGON ((-85.59345 33.00012, -85.59333 33.000...",POINT (-85.39957 33.03866),33.038655,-85.399571
3,1,1017954300,"POLYGON ((-85.38077 32.78301, -85.37985 32.782...",POINT (-85.27874 32.77708),32.777078,-85.278743
4,1,1017954700,"POLYGON ((-85.22896 32.74543, -85.22894 32.745...",POINT (-85.17226 32.77063),32.770629,-85.172263


# 2. USDA Food Access Research Atlas Data

### Distribution of Food Desert Tracts across USA

In [None]:
# Load the USDA Food Access Research Atlas for all states, reading tract IDs as strings.
atlas_path = os.path.join(data_dir, "Food Access Research Atlas.csv")
centroids = pd.read_csv(atlas_path, encoding="latin1", dtype={"CensusTract": str})

# Normalise tract IDs to 11-character strings: strip a trailing ".0", then left-pad with zeros.
tract_ids = centroids["CensusTract"].str.replace(r"\.0+$", "", regex=True)
centroids["CensusTract"] = tract_ids.str.zfill(11)

# Per state: tract count, number of LILA tracts, and LILA share (highest share first).
state_summary = (
    centroids.groupby('State')
    .agg(
        total_tracts=('CensusTract', 'count'),
        lilatracts_count=('LILATracts_1And10', 'sum'),
    )
    .assign(lilatracts_proportion=lambda s: s['lilatracts_count'] / s['total_tracts'])
    .reset_index()
    .sort_values(by='lilatracts_proportion', ascending=False)
)

display(state_summary)
view_missing_values(centroids)

Unnamed: 0,State,total_tracts,lilatracts_count,lilatracts_proportion
24,Mississippi,659,208,0.31563
31,New Mexico,498,126,0.253012
3,Arkansas,686,171,0.249271
18,Louisiana,1129,258,0.228521
0,Alabama,1178,267,0.226655
10,Georgia,1957,441,0.225345
40,South Carolina,1091,218,0.199817
1,Alaska,167,33,0.197605
43,Texas,5238,1022,0.195113
14,Indiana,1507,291,0.193099


Unnamed: 0,Missing Value Count,Proportion of Missing Values,Data Type
CensusTract,0,0.0,object
State,0,0.0,object
County,0,0.0,object
Urban,0,0.0,int64
Pop2010,0,0.0,int64
OHU2010,0,0.0,int64
GroupQuartersFlag,0,0.0,int64
NUMGQTRS,25,0.000345,float64
PCTGQTRS,25,0.000345,float64
LILATracts_1And10,0,0.0,int64


Shape of the data frame (72531, 147)


### Feature Selection

In [None]:
# Atlas columns retained for modelling.
# Left out on purpose: the half-mile tract measures (`LATracts_half`, `la*half*`),
# the 20-mile measures (`LA1and20`, `LATracts20`, `la*20*`), the
# `LAPOP05_10` / `LAPOP1_20` / `LALOWI05_10` / `LALOWI1_20` counts, and the
# kids / seniors / race / ethnicity low-access breakdowns at 1 and 10 miles.
columns_to_keep = [
    # Tract identity and basic characteristics
    "CensusTract",
    "State",
    "County",
    "Urban",
    "Pop2010",
    "OHU2010",
    "GroupQuartersFlag",
    "NUMGQTRS",
    "PCTGQTRS",
    # Low-income / low-access flags (LILATracts_1And10 is the image label)
    "LILATracts_1And10",
    "LILATracts_halfAnd10",
    "LILATracts_1And20",
    "LILATracts_Vehicle",
    "HUNVFlag",
    "LowIncomeTracts",
    "PovertyRate",
    "MedianFamilyIncome",
    "LA1and10",
    "LAhalfand10",
    "LATracts1",
    "LATracts10",
    "LATractsVehicle_20",
    "LAPOP1_10",
    "LALOWI1_10",
    # Low-access counts and shares at 1 mile
    "lapop1",
    "lapop1share",
    "lalowi1",
    "lalowi1share",
    "lahunv1",
    "lahunv1share",
    "lasnap1",
    "lasnap1share",
    # Low-access counts and shares at 10 miles
    "lapop10",
    "lapop10share",
    "lalowi10",
    "lalowi10share",
    "lahunv10",
    "lahunv10share",
    "lasnap10",
    "lasnap10share",
    # Tract-level totals
    "TractLOWI",
    "TractKids",
    "TractSeniors",
    "TractWhite",
    "TractBlack",
    "TractAsian",
    "TractNHOPI",
    "TractAIAN",
    "TractOMultir",
    "TractHispanic",
    "TractHUNV",
    "TractSNAP",
]

# Select and drop incomplete rows in one non-inplace step: calling
# dropna(inplace=True) on a column slice triggers SettingWithCopyWarning.
# NOTE(review): this keeps only rows that are complete in every selected column,
# which shrinks the atlas from 72,531 to 7,707 tracts in the recorded outputs.
# Presumably some of the 1-mile / 10-mile columns are null for most tracts —
# confirm this filter is intended before modelling.
centroids = centroids[columns_to_keep].dropna()
view_missing_values(centroids)

Unnamed: 0,Missing Value Count,Proportion of Missing Values,Data Type
CensusTract,0,0.0,object
State,0,0.0,object
County,0,0.0,object
Urban,0,0.0,int64
Pop2010,0,0.0,int64
OHU2010,0,0.0,int64
GroupQuartersFlag,0,0.0,int64
NUMGQTRS,0,0.0,float64
PCTGQTRS,0,0.0,float64
LILATracts_1And10,0,0.0,int64


Shape of the data frame (7707, 52)


In [None]:
# Number of tracts per state in the current `centroids` frame.
centroids.State.value_counts()

In [None]:
# Inner-join the atlas rows to the tract shapes on the tract ID; the geometry,
# centroid and coordinate columns come along from `all_tracts`.
# NOTE(review): the atlas columns are 2010-based (e.g. Pop2010) while the shapes
# come from TIGER 2020, so tracts whose IDs differ between the two vintages are
# presumably what this inner join drops — confirm.
merged = pd.merge(
    centroids,
    all_tracts,
    how="inner",
    left_on="CensusTract",
    right_on="GEOID",
)
print(f"merged dataset is a {type(merged)}and there are {len(merged)} rows.")

merged dataset is a <class 'pandas.core.frame.DataFrame'>and there are 6267 rows.


In [None]:
# States listed in the `states` lookup that have no rows in `centroids`.
missing_states = set(states.values()).difference(centroids.State.unique())
print("States not present in centroids:", missing_states)

States not present in centroids: {'District of Columbia', 'Delaware'}


# Get Google Maps images
- Each image file is named `{IsFoodDesert}_{Tract_ID}.jpg`
    - `IsFoodDesert` is taken from the column `LILATracts_1And10` in the USDA data
    - This way, when we train the CNN model, the label can be read directly from the image file name

In [None]:
import os
import requests
from io import BytesIO
from time import sleep

def download_static_map_images(df, dest_dir, api_key, timeout=30,
                               min_image_bytes=10000, skip_existing=False):
    """
    Download satellite images from Google Static Maps API and store file paths in the DataFrame.

    One image is requested per row, centred on the row's latitude/longitude,
    and saved as ``dest_dir/All/<State>/<label>_<CensusTract>.jpg`` where
    ``label`` is ``int(LILATracts_1And10)``.

    Args:
        df (pd.DataFrame): DataFrame with columns 'latitude', 'longitude', 'State', 'LILATracts_1And10', and 'CensusTract'
        dest_dir (str): Base directory to save images
        api_key (str): Google Maps Static API key
        timeout (float): Seconds to wait for each HTTP response, so a stalled
            connection cannot hang the whole run.
        min_image_bytes (int): Responses smaller than this are treated as
            "no imagery" and are not saved.
        skip_existing (bool): When True, a file already present at the target
            path is reused without calling the API, which makes an interrupted
            run resumable.

    Returns:
        pd.DataFrame: Copy of the DataFrame with an added 'ImagePath' column
        (the saved file path, or None when no image was saved for the row).
    """
    all_dir = os.path.join(dest_dir, "All")
    os.makedirs(all_dir, exist_ok=True)  # creates dest_dir as well

    image_paths = []

    for _, row in df.iterrows():
        lat = row["latitude"]
        lon = row["longitude"]
        label = int(row["LILATracts_1And10"])
        tract_id = str(row["CensusTract"])

        state_dir = os.path.join(all_dir, row["State"])
        os.makedirs(state_dir, exist_ok=True)
        filename = f"{label}_{tract_id}.jpg"
        filepath = os.path.join(state_dir, filename)

        if skip_existing and os.path.exists(filepath):
            # No request was made, so there is nothing to rate-limit.
            image_paths.append(filepath)
            continue

        url = (
            f"https://maps.googleapis.com/maps/api/staticmap?"
            f"center={lat},{lon}&zoom=16&size=400x400&maptype=satellite&key={api_key}"
        )

        saved_path = None
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed for {tract_id}: Status {response.status_code}")
            elif len(response.content) < min_image_bytes:
                # A very small payload is taken to be the "no imagery" placeholder.
                print(f"No imagery for {tract_id}, skipping.")
            else:
                with open(filepath, 'wb') as f:
                    f.write(response.content)
                print(f"Saved: {filename}")
                saved_path = filepath
        except (requests.RequestException, OSError) as e:
            # The message may embed the request URL, which contains the API
            # key; redact it before it reaches the notebook output.
            print(f"Error for {tract_id}: {str(e).replace(api_key, '***')}")
        image_paths.append(saved_path)

        sleep(0.1)  # avoid hitting API rate limits (applies after every request)

    df = df.copy()
    df["ImagePath"] = image_paths
    return df


In [None]:
def remove_no_imagery(df, size_threshold_kb=10, drop_missing=False):
    """
    Deletes images below a size threshold and removes corresponding rows from the DataFrame.

    Rows whose 'ImagePath' is not a path (None, NaN) or points to a file that
    does not exist are left untouched unless ``drop_missing`` is True. If a
    file cannot be deleted, the failure is printed and its row is kept.

    Args:
        df (pd.DataFrame): DataFrame with an 'ImagePath' column.
        size_threshold_kb (int): Minimum acceptable file size in kilobytes.
        drop_missing (bool): Also remove rows that have no image file on disk.

    Returns:
        pd.DataFrame: New DataFrame with the affected rows removed and the
        index reset; the input frame is not modified.
    """
    indices_to_drop = []

    for idx, path in df["ImagePath"].items():
        # NaN is truthy and would make os.path.exists raise TypeError, so
        # check the type explicitly (NaN appears after a CSV round-trip).
        has_file = (
            isinstance(path, (str, os.PathLike))
            and bool(path)
            and os.path.exists(path)
        )
        if not has_file:
            if drop_missing:
                indices_to_drop.append(idx)
            continue

        file_size_kb = os.path.getsize(path) / 1024
        if file_size_kb >= size_threshold_kb:
            continue

        try:
            os.remove(path)
        except OSError as e:
            print(f"Failed to delete {path}: {e}")
            continue
        print(f"Deleted: {path} (size: {file_size_kb:.2f} KB)")
        indices_to_drop.append(idx)

    return df.drop(index=indices_to_drop).reset_index(drop=True)


In [None]:
from getpass import getpass

# Never store the key in the notebook: read it from the environment, or prompt
# for it without echoing. A key that was ever committed to the notebook must
# be treated as leaked and revoked in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY") or getpass("Google Maps Static API key: ")
DIR_PATH = os.path.join(data_dir, "Pictures")

output_df = download_static_map_images(merged, DIR_PATH, API_KEY)


In [None]:
# Clean the frame returned by the download step. `merged` has no 'ImagePath'
# column, so passing it here would raise KeyError.
output_df = remove_no_imagery(output_df, size_threshold_kb=10)

In [None]:
# Output data for EDA
# Directory and file name must be joined into a single path: to_csv takes the
# destination as its first argument only.
output_df.to_csv(os.path.join(data_dir, "Regional_combined_centroids_YL.csv"), index=False)

In [None]:
# Inspect one of the tracts that the next cell removes by hand. `output_df` is used
# because `modi` is only defined in the following cell, so this cell would raise
# NameError on a fresh Restart & Run All.
output_df[output_df["CensusTract"] == '22075050100']

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractHispanic,TractHUNV,TractSNAP,STATEFP,GEOID,geometry,centroid,latitude,longitude,ImagePath
2458,22075050100,Louisiana,Plaquemines Parish,0,1943,702,0,0.0,0.0,1,...,56.0,87.0,104.0,22,22075050100,"MULTIPOLYGON (((-89.119513 29.508513999999998,...",POINT (-89.48219844615771 29.43552780652579),29.435528,-89.482198,./Data (1)/Pictures/All/Louisiana/1_2207505010...


In [None]:
csv_path = os.path.join(data_dir, "Regional_combined_centroids_YL.csv")

# Read with the default UTF-8 encoding: to_csv writes UTF-8 by default, so
# decoding the file as latin1 would corrupt any non-ASCII text and the
# corruption would then be saved back below. The ID columns are read as
# strings so codes such as '02...' keep their leading zero.
modi = pd.read_csv(
    csv_path,
    dtype={"CensusTract": str, "GEOID": str, "STATEFP": str}
)

# Tracts excluded by hand.
# NOTE(review): the reason for excluding these tracts is not recorded in the notebook — document it.
tract_ids_to_remove = ['55029100100',
'02164000100',
'39123050100',
'02110000600',
'26041970100',
'02122000600',
'26089970300',
'02290000300',
'06037599100',
'02013000100',
'28047003600',
'02150000100',
'22075050100',
'32031940200']

# The cleaned frame overwrites the file it was read from.
cleaned_df = modi[~modi["CensusTract"].isin(tract_ids_to_remove)].reset_index(drop=True)
cleaned_df.to_csv(csv_path, index=False)