## Environment Setup
Making sure that the environment is working as expected


In [3]:
# Install required packages
!pip install geopandas osmnx rasterio scikit-learn xgboost matplotlib seaborn rasterstats censusdata

Collecting osmnx
  Downloading osmnx-2.0.6-py3-none-any.whl.metadata (4.9 kB)
Collecting rasterio
  Downloading rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting rasterstats
  Downloading rasterstats-0.20.0-py3-none-any.whl.metadata (4.2 kB)
Collecting censusdata
  Downloading CensusData-1.15.post1.tar.gz (26.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting fiona (from rasterstats)
  Downloading fiona-1.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

## Testing Data Access
Making all the ethical checks and ensuring that I have access to the data

In [7]:
# -----------------------------
# Import libraries
# -----------------------------

import geopandas as gpd
import osmnx as ox
import rasterio
import sklearn
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import rasterstats
import censusdata
import os
import requests
import pandas as pd

In [9]:
os.getcwd()

'/content'

In [10]:
# -----------------------------
# Setup directories
# -----------------------------
current_dir = os.getcwd()  # notebooks folder
throwaway_dir = os.path.join(os.path.dirname(current_dir), "throwaway-data")
os.makedirs(throwaway_dir, exist_ok=True)
print(f"Throwaway data folder: {throwaway_dir}")

Throwaway data folder: /throwaway-data


In [15]:
# -----------------------------
# ACS 2022 Data Test
# -----------------------------
print("\n--- ACS 2022 Test ---")

try:
    acs_data = censusdata.download(
        'acs5', 2022,
        censusdata.censusgeo([('state', '51'), ('county', '107')]),
        ['B19013_001E']  # Median household income
    )
    print("ACS sample data:")
    print(acs_data.head())
    acs_status = "✅ Accessible"
except Exception as e:
    print("ACS access failed:", e)
    acs_status = "⚠ Network/Access failed"
    print("Alternative verification: go to https://www.census.gov/data/developers/data-sets/acs-5year.html and confirm public access.")


--- ACS 2022 Test ---
ACS access failed: HTTPSConnectionPool(host='api.census.gov', port=443): Max retries exceeded with url: /data/2022/acs/acs5?get=NAME,B19013_001E&for=county:107&in=state:51 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x79534900a630>: Failed to establish a new connection: [Errno 101] Network is unreachable'))
Alternative verification: go to https://www.census.gov/data/developers/data-sets/acs-5year.html and confirm public access.


In [16]:
# -----------------------------
# OpenStreetMap Test (Overpass API)
# -----------------------------
print("\n--- OpenStreetMap Test ---")

try:
    tags = {'amenity': True}  # Example: amenities
    gdf = ox.geometries.geometries_from_place(
        "Montgomery County, Virginia, USA",
        tags
    )
    print("OSM sample data:")
    print(gdf.head())
    osm_status = "✅ Accessible"
except Exception as e:
    print("OSM access failed:", e)
    osm_status = "⚠ Access failed"
    print("Alternative verification: go to https://www.openstreetmap.org, search for Montgomery County, VA, and confirm data is available. License: ODbL.")

# Plot OSM points if available
if osm_status.startswith("✅"):
    if 'geometry' in gdf.columns:
        gdf.plot(figsize=(6,6), markersize=10, alpha=0.5)
        plt.title("OSM Amenities Preview")
        plt.show()


--- OpenStreetMap Test ---
OSM access failed: module 'osmnx' has no attribute 'geometries'
Alternative verification: go to https://www.openstreetmap.org, search for Montgomery County, VA, and confirm data is available. License: ODbL.


In [17]:
# -----------------------------
# WorldPop Auto-Download
# -----------------------------
print("\n--- WorldPop Test ---")

worldpop_url = "https://data.worldpop.org/GIS/Population/USA/USA_ppp_2020_constrained.tif"
worldpop_path = os.path.join(throwaway_dir, "worldpop_va.tif")

try:
    if not os.path.exists(worldpop_path):
        print("Downloading WorldPop raster (may take a few minutes)...")
        r = requests.get(worldpop_url, stream=True)
        with open(worldpop_path, "wb") as f:
            for chunk in r.iter_content(8192):
                f.write(chunk)
        print("Download complete.")
    else:
        print("WorldPop raster already exists, skipping download.")

    # Open raster
    with rasterio.open(worldpop_path) as src:
        pop_data = src.read(1)
        print("WorldPop raster shape:", pop_data.shape)
        print("Population max value:", pop_data.max())
        pop_status = "✅ Accessible"

        # Quick raster preview
        plt.figure(figsize=(6,6))
        plt.imshow(pop_data, cmap='viridis')
        plt.colorbar(label="Population")
        plt.title("WorldPop Preview")
        plt.show()
except Exception as e:
    print("WorldPop access failed:", e)
    pop_status = "⚠ Access failed"
    print("Alternative verification: go to https://www.worldpop.org and confirm Virginia 2020 population raster is available. License: Open Access.")



--- WorldPop Test ---
WorldPop raster already exists, skipping download.
WorldPop access failed: '/throwaway-data/worldpop_va.tif' not recognized as being in a supported file format.
Alternative verification: go to https://www.worldpop.org and confirm Virginia 2020 population raster is available. License: Open Access.


In [18]:
# -----------------------------
# Summary Table
# -----------------------------
summary = pd.DataFrame({
    'Data Source': ['ACS 2022', 'OpenStreetMap', 'WorldPop'],
    'Access Status': [acs_status, osm_status, pop_status],
    'License / Access': [
        'Public domain',
        'ODbL',
        'Open access'
    ]
})

print("\nData Access Summary:")
display(summary)


Data Access Summary:


Unnamed: 0,Data Source,Access Status,License / Access
0,ACS 2022,⚠ Network/Access failed,Public domain
1,OpenStreetMap,⚠ Access failed,ODbL
2,WorldPop,⚠ Access failed,Open access
