In [14]:
import geopandas as gpd
import pandas as pd
import numpy as np

print("üìç Step 1: Loading the GeoDataFrame...")

# Load previously saved sample
gdf = gpd.read_file("../outputs/merged_sample.geojson")

print("‚úÖ Loaded GeoDataFrame")
print("Total rows:", len(gdf))
print(gdf.head())


üìç Step 1: Loading the GeoDataFrame...
‚úÖ Loaded GeoDataFrame
Total rows: 1000000
   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2015-01-15 19:05:39   2015-01-15 19:23:42                1   
1         1  2015-01-10 20:33:38   2015-01-10 20:53:28                1   
2         1  2015-01-10 20:33:38   2015-01-10 20:43:41                1   
3         1  2015-01-10 20:33:39   2015-01-10 20:35:31                1   
4         1  2015-01-10 20:33:39   2015-01-10 20:52:58                1   

   trip_distance  pickup_longitude  pickup_latitude  RateCodeID  \
0           1.59        -73.993896        40.750111           1   
1           3.30        -74.001648        40.724243           1   
2           1.80        -73.963341        40.802788           1   
3           0.50        -74.009087        40.713818           1   
4           3.00        -73.971176        40.762428           1   

  store_and_fwd_flag  dropoff_longitude  dropoff_latitude  pa

In [15]:
print("\nüìç Step 2: Converting CRS to meters...")

gdf_m = gdf.to_crs(epsg=3857)

# Extract X/Y coordinates as float columns
gdf_m["x"] = gdf_m.geometry.x
gdf_m["y"] = gdf_m.geometry.y

print("‚úÖ CRS converted to EPSG:3857 (meters)")



üìç Step 2: Converting CRS to meters...
‚úÖ CRS converted to EPSG:3857 (meters)


In [16]:
print("\nüìç Step 3: Sampling 200,000 points for DBSCAN...")

sample_size = 200000
gdf_sample = gdf_m.sample(sample_size, random_state=42)

coords = gdf_sample[["x", "y"]].values

print("‚úÖ Sampling complete. Sample size:", len(gdf_sample))



üìç Step 3: Sampling 200,000 points for DBSCAN...
‚úÖ Sampling complete. Sample size: 200000


In [17]:
from sklearn.cluster import DBSCAN

print("\nüìç Step 4: Running DBSCAN clustering...")

# eps = radius in meters (60m)
dbscan = DBSCAN(eps=60, min_samples=50, n_jobs=-1)

gdf_sample["cluster"] = dbscan.fit_predict(coords)

print("‚úÖ DBSCAN finished")
print("Clusters found:", len(set(gdf_sample['cluster'])) - (1 if -1 in gdf_sample['cluster'].unique() else 0))
print("Noise points:", sum(gdf_sample['cluster'] == -1))



üìç Step 4: Running DBSCAN clustering...
‚úÖ DBSCAN finished
Clusters found: 149
Noise points: 24649


In [18]:
print("\nüìç Step X: Creating trip_duration column...")

# Convert to datetime (if not already converted)
gdf_sample["tpep_pickup_datetime"] = pd.to_datetime(gdf_sample["tpep_pickup_datetime"])
gdf_sample["tpep_dropoff_datetime"] = pd.to_datetime(gdf_sample["tpep_dropoff_datetime"])

# Create trip duration in minutes
gdf_sample["trip_duration"] = (
    gdf_sample["tpep_dropoff_datetime"] - gdf_sample["tpep_pickup_datetime"]
).dt.total_seconds() / 60

print("‚úÖ trip_duration column added successfully.")
print("Sample:", gdf_sample["trip_duration"].head())



üìç Step X: Creating trip_duration column...
‚úÖ trip_duration column added successfully.
Sample: 987231     3.866667
79954      7.100000
567130    14.533333
500891    13.383333
55399      6.300000
Name: trip_duration, dtype: float64


In [19]:
print("\nüìç Step 5: Calculating hotspot metrics...")

cluster_stats = (
    gdf_sample[gdf_sample["cluster"] != -1]
    .groupby("cluster")
    .agg(
        points=("cluster", "count"),
        avg_trip_duration=("trip_duration", "mean"),
    )
    .sort_values("points", ascending=False)
)

print("‚úÖ Hotspot metrics calculated")
cluster_stats.head()



üìç Step 5: Calculating hotspot metrics...
‚úÖ Hotspot metrics calculated


Unnamed: 0_level_0,points,avg_trip_duration
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,120540,12.004185
18,5680,12.299082
11,3675,12.182776
20,3345,13.363837
16,2309,13.315952


In [20]:
print("\nüìç Step 6: Saving clustered geospatial data for dashboard...")

# Keep only necessary columns
columns_to_keep = [
    "pickup_latitude",
    "pickup_longitude",
    "trip_duration",
    "cluster",
    "geometry"
]

gdf_out = gdf_sample[columns_to_keep]

output_path = "../outputs/clustered_sample.geojson"
gdf_out.to_file(output_path, driver="GeoJSON")

print(f"‚úÖ Clustered data saved successfully to: {output_path}")



üìç Step 6: Saving clustered geospatial data for dashboard...
‚úÖ Clustered data saved successfully to: ../outputs/clustered_sample.geojson


In [21]:
print("\nüìç Step 7: Saving clustered dataset for dashboard...")

# Save clustered sample as GeoJSON ‚Üí for map visualization
output_geojson = "../outputs/dbscan_clusters_sample.geojson"
gdf_sample.to_file(output_geojson, driver="GeoJSON")
print(f"‚úÖ GeoJSON saved: {output_geojson}")

# Save metrics as CSV ‚Üí for charts & stats in dashboard
output_metrics = "../outputs/dbscan_cluster_metrics.csv"
cluster_stats.to_csv(output_metrics)
print(f"üìä Cluster metrics saved: {output_metrics}")

print("\nüéâ Step 7 complete: Clustered dataset and metrics are stored!")



üìç Step 7: Saving clustered dataset for dashboard...
‚úÖ GeoJSON saved: ../outputs/dbscan_clusters_sample.geojson
üìä Cluster metrics saved: ../outputs/dbscan_cluster_metrics.csv

üéâ Step 7 complete: Clustered dataset and metrics are stored!


In [22]:
print("\nüìç Step 8: Exporting hotspot summary report...")

# Save the hotspot metrics you computed (cluster_stats)

csv_path = "../outputs/hotspot_summary.csv"
json_path = "../outputs/hotspot_summary.json"

# Save CSV
cluster_stats.to_csv(csv_path)
print(f"üìÑ CSV saved to: {csv_path}")

# Save JSON
cluster_stats.to_json(json_path, orient="index")
print(f"üóÇÔ∏è JSON saved to: {json_path}")

print("‚úÖ Hotspot summary export completed successfully.")



üìç Step 8: Exporting hotspot summary report...
üìÑ CSV saved to: ../outputs/hotspot_summary.csv
üóÇÔ∏è JSON saved to: ../outputs/hotspot_summary.json
‚úÖ Hotspot summary export completed successfully.


In [None]:
print("\nüìç Step 9: Visualizing clusters on an OpenStreetMap basemap...")

import matplotlib.pyplot as plt
import contextily as ctx

# Ensure contextily uses a local tile cache (important in restricted networks)
ctx.set_cache_dir("./tile_cache")

# --- 1. Reproject to Web Mercator (required by OSM tiles) ---
gdf_merc = gdf_sample.to_crs(epsg=3857)

# --- 2. Create plot ---
fig, ax = plt.subplots(figsize=(12, 12))

gdf_merc.plot(
    ax=ax,
    column="cluster",
    cmap="tab20",
    markersize=2,
    alpha=0.9,
    legend=False
)

# --- 3. Add OSM basemap ---
try:
    ctx.add_basemap(
        ax,
        source=ctx.providers.OpenStreetMap.Mapnik,
        zoom=12
    )
except Exception as e:
    print("\n‚ö†Ô∏è Basemap could not be loaded. Error:")
    print(e)
    print("Plotting points without basemap instead.")

# --- 4. Formatting ---
ax.set_title("DBSCAN Clusters on OpenStreetMap (200k Sample)", fontsize=16)
ax.set_axis_off()

plt.show()
