# Modeling Bus Crowdedness
This notebook simulates and visualizes bus stop crowdedness using synthetic data.

## 1.0 Imports

In [1]:
import os
import sys
sys.path.append(os.path.abspath("../src"))

import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from utils.io_utils import save_model, load_model
from IPython.display import IFrame


In [2]:
# Define the bounding box for downtown Denver (latitude and longitude)
downtown_lat_min = 39.73
downtown_lat_max = 39.76
downtown_lon_min = -104.99
downtown_lon_max = -104.95

## 2.0 Read CSVs

In [3]:
DATA_PATH = "../data/google_transit"
GTFS_PATH = DATA_PATH

stops = pd.read_csv(os.path.join(GTFS_PATH, "stops.txt"))
stop_times = pd.read_csv(os.path.join(GTFS_PATH, "stop_times.txt"))
trips = pd.read_csv(os.path.join(GTFS_PATH, "trips.txt"))
routes = pd.read_csv(os.path.join(GTFS_PATH, "routes.txt"))

## 3.0 Merging

In [4]:
merged = stop_times.merge(trips, on="trip_id")
merged = merged.merge(routes, on="route_id")
merged = merged.merge(stops, on="stop_id")

## 4.0 Feature creation

In [5]:
merged = merged[[
    'stop_id', 'stop_name', 'stop_lat', 'stop_lon',
    'route_id', 'route_type', 'arrival_time', 'departure_time'
]].dropna()

### 4.1 Convert time strings to hour values

In [6]:
def extract_hour(time_str):
    try:
        hour = int(time_str.split(":")[0])
        return hour if hour < 24 else 23
    except:
        return 0

merged['hour'] = merged['arrival_time'].apply(extract_hour)
merged['crowdedness'] = merged['stop_name'].apply(lambda name: np.random.uniform(0.3, 1.0) if 'Station' in name else np.random.uniform(0.0, 0.7))

### 4.2 Normalize crowdedness

In [7]:
scaler = MinMaxScaler()
merged['crowdedness'] = scaler.fit_transform(merged[['crowdedness']])

## 5.0 Modeling

In [8]:
features = merged[['hour', 'route_type']]
target = merged['crowdedness']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

### 5.1 Save model

In [9]:
save_model(model, 'core/crowdedness_model.pkl')

✅ Model saved to core/crowdedness_model.pkl


### 5.2 Evaluate

In [10]:
predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse)
IFrame('crowdedness_map.html', width='100%', height=600)

RMSE: 0.21050729488000505


## 6.0 Map Visualization

In [14]:
merged['predicted_crowdedness'] = model.predict(features)

m = folium.Map(location=[39.7392, -104.9903], zoom_start=11)
cluster = MarkerCluster().add_to(m)

for _, row in merged.iterrows():
    color = 'green' if row['predicted_crowdedness'] < 0.3 else 'orange' if row['predicted_crowdedness'] < 0.6 else 'red'
    folium.CircleMarker(
        location=[row['stop_lat'], row['stop_lon']],
        radius=5,
        popup=f"{row['stop_name']} - Score: {row['predicted_crowdedness']:.2f}",
        color=color,
        fill=True,
        fill_opacity=0.7
    ).add_to(cluster)

m.save("crowdedness_prediction_map.html")

In [None]:
# Assuming 'stops_df' is loaded from a CSV or other data source
# Example loading of the data (you should replace this with your actual data loading logic)
stops_df = pd.read_csv(os.path.join(DATA_PATH, "stops.txt"))

# Now filter the data for the downtown area
lat_min, lat_max = 39.72, 39.75
lon_min, lon_max = -104.99, -104.95

filtered_stops = stops_df[(stops_df['stop_lat'] >= lat_min) & (stops_df['stop_lat'] <= lat_max) &
                            (stops_df['stop_lon'] >= lon_min) & (stops_df['stop_lon'] <= lon_max)]

# Create and display the map as before
downtown_map = folium.Map(location=[(lat_min + lat_max) / 2, (lon_min + lon_max) / 2], zoom_start=14)
marker_cluster = MarkerCluster().add_to(downtown_map)

for _, row in filtered_stops.iterrows():
    folium.Marker(location=[row['stop_lat'], row['stop_lon']], popup=row['stop_name']).add_to(marker_cluster)

# Save the map to an HTML file
downtown_map.save('downtown_map.html')

## 7.0 Display IFrame (Jupyter only)

In [None]:
from IPython.display import IFrame
IFrame("crowdedness_prediction_map.html", width='100%', height=600)

## Summary
- We created synthetic crowdedness scores due to lack of real ridership data.
- A random forest model was trained to predict crowdedness.
- A heatmap shows relative crowdedness across Denver bus stops.
