In [4]:
import pandas as pd
import folium
import time
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import branca

# Load Data
data = pd.read_csv("neighborhood_school_enrollment.csv")

# Geopy setup
geolocator = Nominatim(user_agent="pittsburgh_geo_script")
geocode_cache = {}

def geocode_location(location, retries=3):
    """Geocode using OpenStreetMap with caching and error handling."""
    if not location:
        return None, None

    if location in geocode_cache:
        return geocode_cache[location]

    for attempt in range(retries):
        try:
            time.sleep(1)  # Respect OSM's rate limit
            location_obj = geolocator.geocode(f"{location}, Pittsburgh, PA")
            if location_obj:
                lat = location_obj.latitude
                lon = location_obj.longitude
                geocode_cache[location] = (lat, lon)
                return lat, lon
        except Exception as e:
            print(f"Error geocoding {location} (attempt {attempt + 1}): {e}")
    return None, None

# Create Map
pittsburgh_map = folium.Map(location=[40.4406, -79.9959], zoom_start=12)

# Geocode neighborhoods and schools
neighborhood_coords = {}
school_coords = {}

for _, row in data.iterrows():
    # Neighborhood
    if row['neighborhood'] not in neighborhood_coords:
        lat, lon = geocode_location(row['neighborhood'])
        if lat and lon:
            neighborhood_coords[row['neighborhood']] = (lat, lon)

    # School
    if row['school'] not in school_coords:
        lat, lon = geocode_location(row['school'])
        if lat and lon:
            school_coords[row['school']] = (lat, lon)

def get_color_by_distance(distance):
    if distance <= 2:
        return 'green'
    elif distance <= 5:
        return 'blue'
    elif distance <= 10:
        return 'yellow'
    elif distance <= 15:
        return 'orange'
    else:
        return 'red'

data['enrolled_students'] = pd.to_numeric(data['enrolled_students'], errors='coerce')

# Draw lines between neighborhoods and schools
for _, row in data.iterrows():
    n_coords = neighborhood_coords.get(row['neighborhood'])
    s_coords = school_coords.get(row['school'])

    if n_coords and s_coords:
        distance_km = geodesic(n_coords, s_coords).kilometers
        color = get_color_by_distance(distance_km)

        folium.PolyLine(
            locations=[n_coords, s_coords],
            color=color, weight=3, opacity=0.7,
            tooltip=f"Distance: {distance_km:.2f} km"
        ).add_to(pittsburgh_map)

# Add markers
for _, row in data.iterrows():
    neighborhood = str(row["neighborhood"]) if pd.notna(row["neighborhood"]) else None
    school = str(row["school"]) if pd.notna(row["school"]) else None

    if neighborhood:
        lat, lon = neighborhood_coords.get(neighborhood, (None, None))
        if lat and lon:
            folium.Marker(
                location=[lat, lon],
                popup=f"Neighborhood: {neighborhood}",
                icon=folium.Icon(color='blue', icon='info-sign')
            ).add_to(pittsburgh_map)

    if school:
        lat, lon = school_coords.get(school, (None, None))
        if lat and lon:
            folium.Marker(
                location=[lat, lon],
                popup=f"School: {school}",
                icon=folium.Icon(color='green', icon='info-sign')
            ).add_to(pittsburgh_map)

# Legend
distance_legend_html = """
    <div style="position: fixed; 
                bottom: 50px; left: 50px; width: 250px; height: 150px; 
                background-color: white; border: 2px solid grey; 
                z-index: 9999; font-size: 10px; font-weight: bold; padding: 10px;">
                <div><b>Marker Legend</b></div>
        <div><i style="background-color:blue; width: 20px; height: 20px; display: inline-block;"></i>  Blue Marker: Neighborhood </div>
        <div><i style="background-color:green; width: 20px; height: 20px; display: inline-block;"></i> Green Marker: Pittsburgh Public Schools</div>
        <br>
        <div><b>Distance Legend</b></div>
        <div><i style="background-color:green; width: 20px; height: 5px; display: inline-block;"></i> ≤ 2 km (Short)</div>
        <div><i style="background-color:blue; width: 20px; height: 5px; display: inline-block;"></i> 2 - 5 km (Moderate)</div>
        <div><i style="background-color:yellow; width: 20px; height: 5px; display: inline-block;"></i> 5 - 10 km (Long)</div>
        <div><i style="background-color:orange; width: 20px; height: 5px; display: inline-block;"></i> 10 - 15 km (Very Long)</div>
        <div><i style="background-color:red; width: 20px; height: 5px; display: inline-block;"></i> 15+ km (Extreme)</div>
    </div>
"""
pittsburgh_map.get_root().html.add_child(folium.Element(distance_legend_html))

# Save Map
pittsburgh_map.save("neighborhood_school_distance_map.html")


Error geocoding PITTSBURGH OBAMA IB 6-12 (attempt 2): HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=PITTSBURGH+OBAMA+IB+6-12%2C+Pittsburgh%2C+PA&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))


In [2]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [19]:
from geopy.distance import geodesic

data = pd.read_csv("neighborhood_school_enrollment.csv")

# Function to calculate distance
def compute_distance(row):
    # Get coordinates from the dictionaries
    n_coords = neighborhood_coords.get(row['neighborhood'])
    s_coords = school_coords.get(row['school'])
    
    # Check if both coordinates are valid (not None)
    if n_coords and s_coords:
        # Calculate distance using geodesic if both coordinates exist
        return geodesic(n_coords, s_coords).km
    else:
        # If coordinates are missing, return None
        return None

# Apply it to DataFrame and create 'distance_km' column
data['distance_km'] = data.apply(compute_distance, axis=1)

# Verify that the 'distance_km' column exists and check for any None values
print(data[['neighborhood', 'school', 'distance_km']].head())


# Total number of records
total_records = len(data)

# How many students attend their neighborhood school
same_school_count = data[data['neighborhood'] == data['school']].shape[1]
percent_same_school = (same_school_count / total_records) * 100

# Distance stats
avg_distance = data['distance_km'].mean()
max_distance = data['distance_km'].max()
over_15km = data[data['distance_km'] > 15].shape[0]
over_8km = data[data['distance_km'] > 8].shape[0]
percent_over_15km = (over_15km / total_records) * 100
percent_over_8km = (over_8km / total_records) * 100

# Top 3 most enrolled schools
top_schools = data.groupby('school')['enrolled_students'].sum().sort_values(ascending=False).head(5)

# Total number of records
total_records = len(data)


print(f"Total records: {total_records}")
print(f"Students attending their neighborhood school: {same_school_count} ({percent_same_school:.2f}%)")
print(f"Average travel distance: {avg_distance:.2f} km")
print(f"Max travel distance: {max_distance:.2f} km")
print(f"Students traveling more than 15 km: {over_15km} ({percent_over_15km:.2f}%)")
print(f"Students traveling more than 8 km: {over_8km} ({percent_over_8km:.2f}%)")
print("\nTop 5 most enrolled schools:")
print(top_schools)

       neighborhood                          school  distance_km
0  Allegheny Center                           Other     9.036639
1  Allegheny Center          PITTSBURGH KING PreK-8          NaN
2         Arlington     PITTSBURGH ARLINGTON PreK-8          NaN
3         Arlington                           Other     9.278018
4         Arlington  PITTSBURGH CARRICK HIGH SCHOOL          NaN
Total records: 500
Students attending their neighborhood school: 4 (0.80%)
Average travel distance: 7.40 km
Max travel distance: 18.42 km
Students traveling more than 15 km: 13 (2.60%)
Students traveling more than 8 km: 67 (13.40%)

Top 5 most enrolled schools:
school
Other                                5422
PITTSBURGH ALLDERDICE HIGH SCHOOL    1262
PITTSBURGH BRASHEAR HIGH SCHOOL      1032
PITTSBURGH COLFAX K-8                 806
PITTSBURGH OBAMA IB 6-12              689
Name: enrolled_students, dtype: int64
