NYC Crime Data

In [None]:
from pathlib import Path

import pandas as pd

# Download the NYC crime CSV export from data.cityofnewyork.us and persist it
# as Parquet so later cells can reload it without hitting the network again.
url = "https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD"
crime_path = "./data/raw/spatial/nyc_crime.parquet"

# to_parquet does not create missing parent folders; create them up front so
# the write cannot fail after the (slow) download has already completed.
Path(crime_path).parent.mkdir(parents=True, exist_ok=True)

# low_memory=False parses each column in one pass, which avoids the mixed-type
# object columns that chunked inference can produce and that the Parquet
# writer may reject. NOTE(review): this trades memory for type consistency.
crime_data = pd.read_csv(url, low_memory=False)
crime_data.to_parquet(crime_path)  # optimized (columnar) format

OpenStreetMap (OSM) data

In [None]:
from pathlib import Path

import osmnx as ox

# Geocode the place name to a GeoDataFrame and store it as a GeoPackage.
map_path = "./data/raw/spatial/nyc_map.gpkg"

# Make sure the target folder exists so this cell also works when it is run
# on its own, before any other cell has written into ./data/raw/spatial.
Path(map_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): confirm that this query resolves to the city boundary rather
# than the state of New York; the geocoder picks its best match for the text.
city = ox.geocode_to_gdf("New York, USA")
city.to_file(map_path, driver="GPKG")

Cleaning and spatial join

In [None]:
# Scripts/preprocess_spatial.py
from pathlib import Path

import geopandas as gpd
import pandas as pd  # imported here so the cell/script does not rely on an earlier cell

# 1. Load the raw inputs
crime = pd.read_parquet("./data/raw/spatial/nyc_crime.parquet")
nyc_map = gpd.read_file("./data/raw/spatial/nyc_map.gpkg")

# 2. Keep only the essential columns.
# .copy() yields an independent frame, so adding a column below does not
# trigger SettingWithCopyWarning on a slice of `crime`.
crime_clean = crime[["CMPLNT_FR_DT", "CMPLNT_FR_TM", "Latitude", "Longitude", "OFNS_DESC"]].copy()

# 3. Datetime conversion.
# errors="coerce" turns unparseable or out-of-range date/time strings into NaT
# instead of aborting the whole conversion on the first bad record.
crime_clean["datetime"] = pd.to_datetime(
    crime_clean["CMPLNT_FR_DT"] + " " + crime_clean["CMPLNT_FR_TM"],
    errors="coerce",
)

# 4. Spatial join (attach each record to the OSM area it falls in).
# Records without coordinates cannot be turned into valid points, so they are
# left out of the spatial output; `crime_clean` itself keeps every row.
crime_located = crime_clean.dropna(subset=["Latitude", "Longitude"])

# Assumes Latitude/Longitude are WGS84 decimal degrees (EPSG:4326) — TODO confirm.
# Declaring the CRS avoids the CRS-mismatch warning raised by sjoin when one
# side has no CRS at all.
crime_points = gpd.GeoDataFrame(
    crime_located,
    geometry=gpd.points_from_xy(crime_located.Longitude, crime_located.Latitude),
    crs="EPSG:4326",
)

# Bring the boundary layer into the same CRS as the points before joining;
# a layer without CRS metadata is used as-is because it cannot be reprojected.
zones = nyc_map.to_crs(crime_points.crs) if nyc_map.crs is not None else nyc_map
gdf = crime_points.sjoin(zones, how="left")  # spatial join

# 5. Final export
features_path = "./data/processed/spatial_features.gpkg"
Path(features_path).parent.mkdir(parents=True, exist_ok=True)
gdf.to_file(features_path)

Exploration

In [None]:
import folium
from folium.plugins import HeatMap

# 1. Temporal distribution: number of records per hour of day.
# Rows whose timestamp is missing (NaT) have no hour; dropping them first keeps
# the hour labels as integers (0-23) instead of floats.
hour_of_day = crime_clean["datetime"].dt.hour.dropna().astype(int)
ax = hour_of_day.groupby(hour_of_day).size().plot(kind="bar")
ax.set(
    title="Crime records by hour of day",
    xlabel="Hour of day",
    ylabel="Number of records",
)

# 2. Hotspot map
nyc_coords = [40.7128, -74.0060]
m = folium.Map(location=nyc_coords, zoom_start=11)

# HeatMap rejects data containing NaN, so keep only rows with both coordinates.
# NOTE(review): every remaining point is embedded in the HTML file; consider
# sampling if the dataset is large.
heat_points = crime_clean[["Latitude", "Longitude"]].dropna().values
HeatMap(data=heat_points).add_to(m)
m.save("hotspots.html")