In [17]:
# Import necessary libraries
import pandas as pd                # For working with data tables
import geopandas as gpd            # For working with geographic data
from shapely.geometry import Point # For creating point locations on the map
import leafmap                     # For displaying interactive maps
import os
import urllib.request
import zipfile
import time
import zipfile

# Step 1: Start the timer for the entire operation
start_time = time.time()

# Create a map centered on the United States, with a zoom level of 4
m = leafmap.Map(center=[37.8, -96.9], zoom=4)

In [2]:
# Create the directory if it doesn't exist
unzip_dir = './extracted_files'
os.makedirs(unzip_dir, exist_ok=True)

# URL of the shapefile
url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip"
# Path where you want to save the zip file
zip_path = os.path.join(unzip_dir, "cb_2018_us_state_500k.zip")

# Download the shapefile zip
urllib.request.urlretrieve(url, zip_path)

('./extracted_files/cb_2018_us_state_500k.zip',
 <http.client.HTTPMessage at 0x7f466b59e660>)

In [3]:
# Unzip the downloaded shapefile
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_dir)

In [4]:
# Step 1: Load the UniversityDirectory2023 CSV
csv_path_university = "./UniversityData/UniversityDirectory2023.csv"
columns_to_use_university = ["UNITID", "INSTNM", "ADDR", "CITY", "STABBR", "LONGITUD", "LATITUDE"]
df_university = pd.read_csv(csv_path_university, usecols=columns_to_use_university, encoding='ISO-8859-1')
print(df_university.head())

   UNITID                               INSTNM  \
0  100654             Alabama A & M University   
1  100663  University of Alabama at Birmingham   
2  100690                   Amridge University   
3  100706  University of Alabama in Huntsville   
4  100724             Alabama State University   

                             ADDR        CITY STABBR   LONGITUD   LATITUDE  
0            4900 Meridian Street      Normal     AL -86.568502  34.783368  
1  Administration Bldg Suite 1070  Birmingham     AL -86.799345  33.505697  
2                  1200 Taylor Rd  Montgomery     AL -86.174010  32.362609  
3                 301 Sparkman Dr  Huntsville     AL -86.640449  34.724557  
4            915 S Jackson Street  Montgomery     AL -86.295677  32.364317  


In [5]:
# Step 2: Load the UniversityAwardedDegrees2023 CSV
csv_path_awarded_degrees = "./UniversityData/UniversityAwardedDegrees2023.csv"
df_awarded_degrees = pd.read_csv(csv_path_awarded_degrees, usecols=["UNITID", "CTOTALT"], encoding='ISO-8859-1')
df_awarded_degrees.head()

Unnamed: 0,UNITID,CTOTALT
0,100654,18
1,100654,8
2,100654,6
3,100654,2
4,100654,2


In [6]:

# Step 3: Merge the two DataFrames on UNITID
df_merged = pd.merge(df_university, df_awarded_degrees, on="UNITID", how="inner")
df_merged.head()

Unnamed: 0,UNITID,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,CTOTALT
0,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,18
1,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,8
2,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,6
3,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,2
4,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,2


In [7]:

# Step 4: Convert LATITUDE and LONGITUD to geometry points
def make_point(row):
    return Point(row['LONGITUD'], row['LATITUDE'])
df_merged['geometry'] = df_merged.apply(make_point, axis=1)
df_merged.head()

Unnamed: 0,UNITID,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,CTOTALT,geometry
0,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,18,POINT (-86.568502 34.783368)
1,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,8,POINT (-86.568502 34.783368)
2,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,6,POINT (-86.568502 34.783368)
3,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,2,POINT (-86.568502 34.783368)
4,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,2,POINT (-86.568502 34.783368)


In [8]:

# Step 5: Convert the merged DataFrame into a GeoDataFrame
gdf_university = gpd.GeoDataFrame(df_merged, geometry='geometry', crs="EPSG:4326")
gdf_university.head()
#m.add_gdf(gdf_university, 
#          layer_name="All Universities"
#         )

Unnamed: 0,UNITID,INSTNM,ADDR,CITY,STABBR,LONGITUD,LATITUDE,CTOTALT,geometry
0,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,18,POINT (-86.56850 34.78337)
1,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,8,POINT (-86.56850 34.78337)
2,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,6,POINT (-86.56850 34.78337)
3,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,2,POINT (-86.56850 34.78337)
4,100654,Alabama A & M University,4900 Meridian Street,Normal,AL,-86.568502,34.783368,2,POINT (-86.56850 34.78337)


In [9]:

# Step 6: Load the Shapefile for US States from the correct directory
shapefile_path = "./extracted_files/cb_2018_us_state_500k.shp"
gdf_states = gpd.read_file(shapefile_path)
gdf_states.head()

# Add the states with the degree totals to the map, symbolizing by the 'Total degrees awarded' column
m.add_gdf(gdf_states, 
          layer_name="States"
         )
m

Map(center=[37.8, -96.9], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_ou…

In [10]:

# Step 2: Start the timer for the spatial join
spatial_join_start_time = time.time()

# Step 7: Perform a spatial join between the university data and the state shapefile
gdf_joined = gpd.sjoin(gdf_university, gdf_states, how="inner", op="within")

# Step 3: Calculate and print the time taken for the spatial join
spatial_join_end_time = time.time()
spatial_join_duration = spatial_join_end_time - spatial_join_start_time
print(f"Time taken for spatial join: {spatial_join_duration:.2f} seconds")


Time taken for spatial join: 0.36 seconds


In [11]:
# Step 8: Aggregate the joined data by state and sum the degrees awarded
gdf_state_degrees = gdf_joined.groupby("STUSPS")["CTOTALT"].sum().reset_index()

# Step 9: Merge the aggregated data back into the state boundaries GeoDataFrame
gdf_states = gdf_states.merge(gdf_state_degrees, left_on="STUSPS", right_on="STUSPS", how="left")
print(gdf_states.head())

  STATEFP   STATENS     AFFGEOID GEOID STUSPS            NAME LSAD  \
0      28  01779790  0400000US28    28     MS     Mississippi   00   
1      37  01027616  0400000US37    37     NC  North Carolina   00   
2      40  01102857  0400000US40    40     OK        Oklahoma   00   
3      51  01779803  0400000US51    51     VA        Virginia   00   
4      54  01779805  0400000US54    54     WV   West Virginia   00   

          ALAND       AWATER  \
0  121533519481   3926919758   
1  125923656064  13466071395   
2  177662925723   3374587997   
3  102257717110   8528531774   
4   62266474513    489028543   

                                            geometry  CTOTALT  
0  MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ...    93814  
1  MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ...   317868  
2  POLYGON ((-103.00257 36.52659, -103.00219 36.6...   119328  
3  MULTIPOLYGON (((-75.74241 37.80835, -75.74151 ...   285978  
4  POLYGON ((-82.64320 38.16909, -82.64300 38.169...    75184  


In [12]:

# Step 10: Use the 'Name' from the shapefile for the full state names and clean up the DataFrame
gdf_states["State Name"] = gdf_states["NAME"]
gdf_states["State Abbreviation"] = gdf_states["STUSPS"]
gdf_states["Total degrees awarded"] = gdf_states["CTOTALT"]

# Drop unnecessary columns and keep the relevant ones
gdf_states = gdf_states[["State Name", "State Abbreviation", "Total degrees awarded", "geometry"]]

# Add the states with the degree totals to the map, symbolizing by the 'Total degrees awarded' column
m.add_gdf(gdf_states, 
          layer_name="States with Degrees",
          color_by="Total degrees awarded",
          color_scale="YlOrRd",
         )

# Drop unnecessary columns and keep the relevant ones
#gdf_states = gdf_states[["State Name", "State Abbreviation", "Total degrees awarded"]]

m.add_labels(
    gdf_states,
    column="Total degrees awarded",
    label_font_size=12,
    label_color="black",
    label_offset=[0, 0],
    layer_name="Degree Labels"
)

# Show the map
m

Map(bottom=1883.0, center=[37.8, -96.9], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_t…

In [13]:
num_rows, num_columns = df_merged.shape
num_cells = num_rows * num_columns

print(f"Total number of rows: {num_rows:,}")
print(f"Total number of columns: {num_columns:,}")
print(f"Total number of cells: {num_cells:,}")

Total number of rows: 303,292
Total number of columns: 9
Total number of cells: 2,729,628


In [14]:
# Calculate the sum of all degrees awarded across all universities
total_degrees_awarded = gdf_states["Total degrees awarded"].sum()

# Print the result with commas for readability
print(f"Total degrees awarded across all states: {total_degrees_awarded:,}")


Total degrees awarded across all states: 10,793,422


In [15]:
# Step 4: Complete the full operation
end_time = time.time()
total_duration = end_time - start_time

# Step 5: Print the total operation time and format with commas
print(f"Total operation time: {total_duration:.2f} seconds")

Total operation time: 160.11 seconds


In [21]:
# Step 1: Reproject the GeoDataFrame to EPSG:4326
gdf_states = gdf_states.to_crs(epsg=4326)

# Step 2: Export the GeoDataFrame to GeoJSON without the CRS field
gdf_states.to_file("states_degrees_awarded.geojson", driver="GeoJSON")


In [18]:
# Manually upload your GeoJSON to ArcGIS Online: https://www.arcgis.com/home/content.html

Zipped shapefile created: states_degrees_awarded.zip


In [None]:
# Import that layer into a Map: https://www.arcgis.com/apps/mapviewer/index.html
# Make sure to SAVE!

In [None]:
# Can you repeat thsi exercise using US counties?!
# https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip