# Making Food Desert Dataset

### Mount Google Drive

In [None]:
import os
from google.colab import drive
# Attach Google Drive to the Colab runtime so the shared project folder is reachable.
drive.mount('/content/drive')

target_folder = "207 Machine Learning: Final Project"

# Walk MyDrive lazily and stop at the first directory that directly contains
# the project folder; found_path stays None when the walk finds nothing.
found_path = next(
    (
        os.path.join(parent, target_folder)
        for parent, subdirs, _ in os.walk("/content/drive/MyDrive")
        if target_folder in subdirs
    ),
    None,
)

# Every later cell uses paths relative to the project folder, so make it the
# working directory (or report that it could not be located).
if not found_path:
    print(f"Folder '{target_folder}' not found.")
else:
    os.chdir(found_path)
    print("Changed working directory to:", os.getcwd())

Mounted at /content/drive
Changed working directory to: /content/drive/.shortcut-targets-by-id/1MnqFmQYij3NcmwhbYUVc-9h4Dp6eLuMH/207 Machine Learning: Final Project


# Download the Following 2 Data Sources
1. USDA Food Desert: https://www.ers.usda.gov/data-products/food-access-research-atlas/download-the-data

2. Census Tract Shape Files : https://www2.census.gov/geo/tiger/TIGER2020/TRACT/



In [None]:
import os
import zipfile
import requests
import geopandas as gpd
import pandas as pd

# FIPS state code -> state name for the eight states covered by this dataset.
states = {
    "01": "Alabama",
    "05": "Arkansas",
    "13": "Georgia",
    "21": "Kentucky",
    "22": "Louisiana",
    "28": "Mississippi",
    "45": "South Carolina",
    "47": "Tennessee"
}

base_url = "https://www2.census.gov/geo/tiger/TIGER2020/TRACT/"
output_dir = r"./Data/Shape_Files"
os.makedirs(output_dir, exist_ok=True)

# Projected CRS used only for the centroid computation (EPSG:5070, an
# equal-area projection for the contiguous US). Centroids computed directly on
# longitude/latitude degrees are geometrically inaccurate and make geopandas
# emit a "geographic CRS" warning.
CENTROID_CRS_EPSG = 5070

gdfs = []  # one GeoDataFrame per state, concatenated below

for fips, state_name in states.items():
    filename = f"tl_2020_{fips}_tract"
    zip_url = f"{base_url}{filename}.zip"
    state_dir = os.path.join(output_dir, state_name)
    os.makedirs(state_dir, exist_ok=True)
    zip_path = os.path.join(state_dir, f"{filename}.zip")

    # The downloaded archive doubles as a cache: it is fetched only once.
    if not os.path.exists(zip_path):
        print(f"Downloading {state_name} shapefile...")
        # TLS verification is left on (the requests default). A failed request
        # raises before anything is written, so an HTTP error page can never
        # be cached on disk under the ".zip" name.
        r = requests.get(zip_url, timeout=120)
        r.raise_for_status()
        with open(zip_path, "wb") as f:
            f.write(r.content)

    # Extract only when the .shp is not already present next to the archive.
    shp_path = os.path.join(state_dir, f"{filename}.shp")
    if not os.path.exists(shp_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(state_dir)

    print(f"Reading shapefile for {state_name}")
    gdf = gpd.read_file(shp_path)
    gdfs.append(gdf)

# Combine all states into one GeoDataFrame.
all_tracts = pd.concat(gdfs, ignore_index=True)
print(f"Total census tracts loaded: {len(all_tracts)}")

# Compute centroids in the projected CRS, then convert the points back to the
# shapefiles' own CRS so latitude/longitude are expressed in degrees.
tract_centroids = (
    all_tracts.geometry
    .to_crs(epsg=CENTROID_CRS_EPSG)
    .centroid
    .to_crs(all_tracts.crs)
)
all_tracts["centroid"] = tract_centroids
all_tracts["latitude"] = tract_centroids.y
all_tracts["longitude"] = tract_centroids.x



Reading shapefile for Alabama
Reading shapefile for Arkansas
Reading shapefile for Georgia
Reading shapefile for Kentucky
Reading shapefile for Louisiana
Reading shapefile for Mississippi
Reading shapefile for South Carolina
Reading shapefile for Tennessee
Total census tracts loaded: 11652



  all_tracts["centroid"] = all_tracts.geometry.centroid


### Visual Inspection

In [None]:
# Spot-check the centroid columns for a single state. "47" is Tennessee's FIPS
# code in the `states` mapping defined in the shapefile download cell; that
# mapping is still in scope, so it is not redefined here.
all_tracts.loc[
    all_tracts["STATEFP"] == "47",
    ["STATEFP", "centroid", "latitude", "longitude"],
]



Unnamed: 0,STATEFP,centroid,latitude,longitude
9951,47,POINT (-85.48725 35.74885),35.748851,-85.487255
9952,47,POINT (-85.43613 35.67082),35.670825,-85.436131
9953,47,POINT (-86.55757 35.65175),35.651749,-86.557566
9954,47,POINT (-86.57928 35.58469),35.584694,-86.579280
9955,47,POINT (-83.84228 35.99948),35.999477,-83.842277
...,...,...,...,...
11647,47,POINT (-82.77152 36.56591),36.565910,-82.771519
11648,47,POINT (-82.64134 36.57679),36.576789,-82.641339
11649,47,POINT (-82.65025 36.54469),36.544687,-82.650252
11650,47,POINT (-82.75849 36.5277),36.527699,-82.758494


# Merge the Centroids with USDA Data

In [None]:
# Load the USDA Food Access Research Atlas table. CensusTract is read as a
# string (not a number) because it is matched against tract GEOIDs as text.
centroids = pd.read_csv(
    r"./Data/Food Access Research Atlas.csv",
    encoding="latin1",
    dtype={"CensusTract": str}
)

# Keep only the states listed in `states`. The explicit copy makes this an
# independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
centroids = centroids[centroids["State"].isin(states.values())].copy()
centroids


Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.00,0,...,221.0,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,...,214.0,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.00,0,...,439.0,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.00,0,...,904.0,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,...,1126.0,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61182,47189030800,Tennessee,Wilson County,0,5787,2189,0,16.0,0.28,0,...,790.0,5389.0,208.0,21.0,2.0,23.0,144.0,141.0,17.0,397.0
61183,47189030901,Tennessee,Wilson County,0,9572,3642,0,0.0,0.00,0,...,771.0,8433.0,499.0,319.0,1.0,46.0,274.0,263.0,135.0,308.0
61184,47189030903,Tennessee,Wilson County,0,6081,2321,0,0.0,0.00,0,...,908.0,5596.0,340.0,38.0,1.0,24.0,82.0,70.0,12.0,204.0
61185,47189030904,Tennessee,Wilson County,0,3523,1347,0,0.0,0.00,0,...,445.0,3350.0,89.0,7.0,1.0,13.0,63.0,52.0,87.0,39.0


In [None]:
# Normalise tract IDs to 11-character strings so they line up with GEOID:
# drop any trailing ".0" and left-pad with zeros. `.assign` builds a new frame
# instead of writing into the filtered `centroids` frame in place, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
centroids = centroids.assign(
    CensusTract=(
        centroids["CensusTract"]
        .str.replace(r"\.0+$", "", regex=True)  # drop any trailing .0
        .str.zfill(11)                          # pad to 11 digits
    )
)

# Inner join: only tracts present in both tables survive. The left operand is
# a plain DataFrame, so the result is a pandas DataFrame (not a GeoDataFrame);
# the geometry and centroid columns are carried along as ordinary columns.
# NOTE(review): the recorded run kept 7121 rows out of 11652 loaded tracts —
# confirm that both sources use the same census-tract vintage.
merged = centroids.merge(all_tracts, left_on="CensusTract", right_on="GEOID", how="inner")
print(f"merged dataset is a {type(merged)} and there are {len(merged)} rows.")


merged dataset is a <class 'pandas.core.frame.DataFrame'>and there are 7121 rows.


In [None]:
# Latitude values for the Kentucky rows of the merged table.
merged.loc[merged["State"] == "Kentucky", "latitude"]

Unnamed: 0,latitude
2728,37.238336
2729,37.174825
2730,37.131255
2731,37.059959
2732,37.120540
...,...
3639,38.029163
3640,38.045754
3641,37.954142
3642,38.102577


# Get Google Maps images
- Each image file is named `{IsFoodDesert}_{Tract_ID}.jpg`.
    - `IsFoodDesert` is the value of the `LILATracts_1And10` column in the USDA Food Access Research Atlas data.
    - This way, when we build the CNN model, the label can be read directly from the image file name.

In [None]:
import requests
import os
from time import sleep
import pandas as pd

# The Maps Static API key is read from the environment so that it is never
# committed with the notebook. Set GOOGLE_MAPS_API_KEY before running this cell.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this cell."
    )

STATIC_MAP_URL = "https://maps.googleapis.com/maps/api/staticmap"

# Make every directory an image may be written to: one per state, plus the
# shared "All" folder that is the current download target.
states_to_download = ["Arkansas", "Alabama"]
DIR_PATH = "Data/Pictures"
for state in states_to_download:
    os.makedirs(os.path.join(DIR_PATH, state), exist_ok=True)
os.makedirs(os.path.join(DIR_PATH, "All"), exist_ok=True)

# ImagePath is filled in row by row; rows without a saved image keep None.
if "ImagePath" not in merged.columns:
    merged["ImagePath"] = None

df = merged[merged["State"].isin(states_to_download)]

for index, row in df.iterrows():
    lat = row["latitude"]
    lon = row["longitude"]
    state = row["State"]
    label = int(row["LILATracts_1And10"])  # label prefix of the file name
    tract_id = str(row["CensusTract"])

    # NOTE(review): the Static Maps API documents PNG as its default image
    # format, so these ".jpg"-named files may contain PNG data unless a
    # "format" parameter is added — verify before relying on the extension.
    params = {
        "center": f"{lat},{lon}",
        "zoom": 16,
        "size": "400x400",
        "maptype": "satellite",
        "key": API_KEY,
    }
    filename = f"{label}_{tract_id}.jpg"

    # Switch comments to download different targets
    # filepath = os.path.join(DIR_PATH, state, filename)
    filepath = os.path.join(DIR_PATH, "All", filename)

    # Images already on disk are reused, so re-running the cell does not
    # spend API quota on tracts that were downloaded earlier.
    if os.path.exists(filepath):
        merged.loc[index, "ImagePath"] = filepath
        continue

    try:
        response = requests.get(STATIC_MAP_URL, params=params, timeout=30)
        if response.status_code == 200:
            with open(filepath, 'wb') as f:
                f.write(response.content)
            # Record the path on this row only, and only after a successful save.
            merged.loc[index, "ImagePath"] = filepath
            print(f"Saved: {filename}")
        else:
            print(f"Failed for {tract_id}: Status {response.status_code}")
    except (requests.RequestException, OSError) as e:
        # Best effort: report the failure and move on to the next tract.
        print(f"Error for {tract_id}: {e}")

    sleep(0.1)  # brief pause between consecutive API requests


Saved: 0_13001950100.jpg
Saved: 1_13001950400.jpg
Saved: 0_13001950500.jpg
Saved: 1_13003960100.jpg
Saved: 0_13003960200.jpg
Saved: 0_13003960300.jpg
Saved: 0_13005970100.jpg
Saved: 0_13005970201.jpg
Saved: 0_13005970202.jpg
Saved: 1_13007960100.jpg
Saved: 1_13007960200.jpg
Saved: 0_13009970400.jpg
Saved: 1_13009970600.jpg
Saved: 1_13009970701.jpg
Saved: 1_13009970702.jpg
Saved: 0_13011970100.jpg
Saved: 0_13011970200.jpg
Saved: 0_13011970300.jpg
Saved: 0_13011970400.jpg
Saved: 0_13013180103.jpg
Saved: 0_13013180104.jpg
Saved: 0_13013180105.jpg
Saved: 1_13013180106.jpg
Saved: 0_13013180107.jpg
Saved: 0_13013180108.jpg
Saved: 1_13013180203.jpg
Saved: 0_13013180204.jpg
Saved: 1_13013180205.jpg
Saved: 0_13013180206.jpg
Saved: 0_13013180301.jpg
Saved: 1_13013180302.jpg
Saved: 0_13013180303.jpg
Saved: 0_13013180401.jpg
Saved: 1_13013180402.jpg
Saved: 0_13013180501.jpg
Saved: 0_13013180502.jpg
Saved: 0_13013180503.jpg
Saved: 0_13015960101.jpg
Saved: 1_13015960801.jpg
Saved: 1_13015960802.jpg


In [None]:
# Output data for EDA.
# The path uses a forward slash on purpose: in "Data\regional_..." the "\r" is
# parsed as a carriage-return escape, which corrupts the file name.
merged.to_csv("Data/regional_combined_centroids_YL.csv", index=False)