In [None]:
# Import necessary libraries
import pandas as pd                # For working with data tables
import geopandas as gpd            # For working with geographic data
from shapely.geometry import Point # For creating point locations on the map
import leafmap                     # For displaying interactive maps
import os
import urllib.request
from urllib.parse import urlparse
import zipfile
import time
import zipfile

# Record when the notebook run began; the final timing cell subtracts this
# value from its own time.time() reading to report the total duration.
start_time = time.time()

# Interactive map shared by every later cell, initially centered on the
# United States at zoom level 4.
m = leafmap.Map(
    center=[37.8, -96.9],
    zoom=4,
)

In [None]:
# Directory that holds both the downloaded archive and its extracted files
unzip_dir = './extracted_files'
os.makedirs(unzip_dir, exist_ok=True)

# URL of the shapefile archive
url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip"

# Path component of the URL (everything after the host name)
path = urlparse(url).path

# Archive name without its ".zip" extension
filename = os.path.splitext(os.path.basename(path))[0]

# Local path where the zip file is stored
zip_path = os.path.join(unzip_dir, f"{filename}.zip")

# Download only when the archive is not already on disk, so re-running the
# notebook does not fetch the same file again. The data is first written to a
# ".part" file and renamed once the transfer has finished; an interrupted
# download therefore never leaves a truncated archive at zip_path.
if not os.path.exists(zip_path):
    partial_zip_path = f"{zip_path}.part"
    urllib.request.urlretrieve(url, partial_zip_path)
    os.replace(partial_zip_path, zip_path)

In [None]:
# Extract every member of the downloaded archive into unzip_dir
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=unzip_dir)

In [None]:
# Step 1: Read the university directory CSV, keeping only the identifier,
# name, address and coordinate columns listed below
csv_path_university = "./UniversityData/UniversityDirectory2023.csv"
columns_to_use_university = [
    "UNITID",
    "INSTNM",
    "ADDR",
    "CITY",
    "STABBR",
    "LONGITUD",
    "LATITUDE",
]
df_university = pd.read_csv(
    csv_path_university,
    usecols=columns_to_use_university,
    encoding='ISO-8859-1',
)
print(df_university.head())

In [None]:
# Step 2: Read the awarded-degrees CSV; only the institution id (UNITID) and
# the CTOTALT column are needed
csv_path_awarded_degrees = "./UniversityData/UniversityAwardedDegrees2023.csv"
df_awarded_degrees = pd.read_csv(
    csv_path_awarded_degrees,
    usecols=["UNITID", "CTOTALT"],
    encoding='ISO-8859-1',
)
df_awarded_degrees.head()

In [None]:

# Step 3: Inner-join the directory with the awarded-degrees table on UNITID,
# keeping only institutions that appear in both
df_merged = df_university.merge(
    df_awarded_degrees,
    on="UNITID",
    how="inner",
)
df_merged.head()

In [None]:
# Collapse the merged table to one row per institution.
# Group on UNITID (the key used for the merge) together with INSTNM rather
# than on the name alone: two different institutions that happen to share a
# name would otherwise be combined into a single row and given only the
# first one's coordinates.
df_merged = df_merged.groupby(["UNITID", "INSTNM"]).agg({
    'CTOTALT': 'sum',  # Sum of degrees awarded
    'LATITUDE': 'first',  # You can also use 'mean' or 'median', if preferred
    'LONGITUD': 'first'  # Similarly, 'mean' or 'median' can be used
}).reset_index()
df_merged.head()

In [None]:

# Step 4: Build a point geometry for every institution from its coordinates
def make_point(row):
    """Return a Point for one row, with LONGITUD as x and LATITUDE as y."""
    x_coord, y_coord = row['LONGITUD'], row['LATITUDE']
    return Point(x_coord, y_coord)


# Apply the helper row by row and store the result in a new 'geometry' column
df_merged['geometry'] = df_merged.apply(make_point, axis='columns')
df_merged.head()

In [None]:

# Step 5: Convert the merged DataFrame into a GeoDataFrame, using the
# 'geometry' column as the geometry and declaring its CRS as EPSG:4326
gdf_university = gpd.GeoDataFrame(df_merged, geometry='geometry', crs="EPSG:4326")

# Add the university point layer to the map
m.add_gdf(gdf_university, 
          layer_name="All Universities",
          visible=False  # THIS CAN TAKE A LONG TIME TO DRAW!
         )

# Preview the GeoDataFrame. Jupyter renders only the value of a cell's final
# expression, so the preview must come last in the cell to be shown.
gdf_university.head()

In [None]:

# Step 6: Load the US states shapefile that the unzip step extracted.
# The path is built from unzip_dir and filename (both set in the download
# cell) instead of repeating them as a literal, so it follows the download
# URL if that is changed. This assumes the archive contains a .shp named
# after the zip file, as it does for the current URL.
shapefile_path = os.path.join(unzip_dir, f"{filename}.shp")
gdf_states = gpd.read_file(shapefile_path)

# Add the plain state boundaries to the map and display it. The degree
# totals are joined and symbolized in a later cell.
m.add_gdf(gdf_states, layer_name="States")
m

In [None]:

# Put the state polygons in the same CRS as the university points before
# joining. gdf_university was created as EPSG:4326, while the states' CRS
# comes from the shapefile; when the two already match, the reprojection
# leaves the coordinates unchanged. Done before the timer starts so the
# measurement still covers only the join itself.
states_for_join = gdf_states.to_crs(gdf_university.crs)

# Start the timer for the spatial join
spatial_join_start_time = time.time()

# Step 7: Perform a spatial join between the university data and the state
# polygons, keeping each university that lies within a state.
# `predicate` is the keyword that names the spatial relationship; the older
# `op` spelling was deprecated and is no longer accepted by current geopandas.
gdf_joined = gpd.sjoin(gdf_university, states_for_join, how="inner", predicate="within")

# Calculate and print the time taken for the spatial join
spatial_join_end_time = time.time()
spatial_join_duration = spatial_join_end_time - spatial_join_start_time
print(f"Time taken for spatial join: {spatial_join_duration:.2f} seconds")


In [None]:
# Step 8: Total the degrees awarded per state abbreviation (STUSPS) and turn
# the result back into a two-column table
degrees_by_state = gdf_joined.groupby("STUSPS")["CTOTALT"].sum()
gdf_state_degrees = degrees_by_state.reset_index()

# Step 9: Attach the totals to the state boundaries. A left merge keeps every
# state row, including states with no matching total.
gdf_states = gdf_states.merge(gdf_state_degrees, on="STUSPS", how="left")
print(gdf_states.head())

In [None]:

# Step 10: Give the columns reader-friendly names (the shapefile's NAME column
# holds the full state name) and keep only those columns plus the geometry
display_names = {
    "NAME": "State Name",
    "STUSPS": "State Abbreviation",
    "CTOTALT": "Total degrees awarded",
}
gdf_states = gdf_states.rename(columns=display_names)[
    [*display_names.values(), "geometry"]
]

# Add the states with their degree totals to the map, symbolized by the
# 'Total degrees awarded' column
m.add_gdf(
    gdf_states,
    layer_name="States with Degrees",
    color_by="Total degrees awarded",
    color_scale="YlOrRd",
)

# Label each state with its 'Total degrees awarded' value
m.add_labels(
    gdf_states,
    column="Total degrees awarded",
    label_font_size=12,
    label_color="black",
    label_offset=[0, 0],
    layer_name="Degree Labels",
)

# Show the map
m

In [None]:
# Report the size of the merged university table: rows, columns, and their
# product (the number of cells)
num_rows = len(df_merged.index)
num_columns = len(df_merged.columns)
num_cells = num_rows * num_columns

print(f"Total number of rows: {num_rows:,}")
print(f"Total number of columns: {num_columns:,}")
print(f"Total number of cells: {num_cells:,}")

In [None]:
# Add up the per-state totals held in gdf_states
state_totals = gdf_states["Total degrees awarded"]
total_degrees_awarded = state_totals.sum()

# Print the result with thousands separators for readability
print(f"Total degrees awarded across all states: {total_degrees_awarded:,}")


In [None]:
# Stop the overall timer: elapsed time since start_time was recorded in the
# first cell
end_time = time.time()
total_duration = end_time - start_time

# Report the total run time, rounded to two decimal places
total_time_message = f"Total operation time: {total_duration:.2f} seconds"
print(total_time_message)

In [None]:
# Reproject the state totals to EPSG:4326 before exporting
gdf_states = gdf_states.to_crs("EPSG:4326")

# Write the reprojected GeoDataFrame to a GeoJSON file in the working directory
geojson_path = "states_degrees_awarded.geojson"
gdf_states.to_file(geojson_path, driver="GeoJSON")


In [None]:
# Manually upload your GeoJSON to ArcGIS Online: https://www.arcgis.com/home/content.html

In [None]:
# Import that layer into a Map: https://www.arcgis.com/apps/mapviewer/index.html
# Make sure to SAVE!

In [None]:
# Can you repeat this exercise using US counties?!
# https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip