In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from IPython.display import display

## Carga datasets

In [2]:
df_crimes = pd.read_csv("../datasets/Crimes_Chicago_2024.csv")
df_police_stations = pd.read_csv("../datasets/Police_Stations_20251005.csv")

In [3]:
df_crimes.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,13709672,JJ101940,12/31/2024 11:58:00 PM,014XX E 68TH ST,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,...,5,43,14,1186817.0,1860189.0,2024,05/17/2025 03:40:52 PM,41.77147,-87.590742,POINT (-87.59074212 41.771470188)
1,13707925,JJ100089,12/31/2024 11:56:00 PM,047XX S DR MARTIN LUTHER KING JR DR,1365,CRIMINAL TRESPASS,TO RESIDENCE,APARTMENT,True,True,...,3,38,26,1179661.0,1873623.0,2024,05/17/2025 03:40:52 PM,41.808501,-87.616563,POINT (-87.616562762 41.808500903)
2,13708038,JJ100035,12/31/2024 11:55:00 PM,077XX S CICERO AVE,498,BATTERY,"AGG. DOMESTIC BATTERY - HANDS, FISTS, FEET, SE...",HOTEL / MOTEL,False,True,...,18,70,04B,1145740.0,1853048.0,2024,05/17/2025 03:40:52 PM,41.752749,-87.741498,POINT (-87.741497836 41.752748627)
3,13709164,JJ101392,12/31/2024 11:53:00 PM,066XX S GREENWOOD AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,20,42,14,1184362.0,1861188.0,2024,05/17/2025 03:40:52 PM,41.774269,-87.59971,POINT (-87.599709962 41.774269351)
4,13707823,JJ100020,12/31/2024 11:50:00 PM,012XX N MENARD AVE,460,BATTERY,SIMPLE,SIDEWALK,False,False,...,29,25,08B,1137458.0,1907694.0,2024,05/17/2025 03:40:52 PM,41.902858,-87.770537,POINT (-87.770536741 41.902858242)


El dataset “Crimes – 2024” del portal de datos de la Ciudad de Chicago contiene registros de delitos ocurridos en el año 2024, incluyendo tipo de delito, ubicación geográfica, fecha y hora, usados para análisis de seguridad pública.

In [5]:
df_police_stations.head()

Unnamed: 0,DISTRICT,DISTRICT NAME,ADDRESS,CITY,STATE,ZIP,WEBSITE,PHONE,FAX,TTY,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,Headquarters,Headquarters,3510 S Michigan Ave,Chicago,IL,60653,http://home.chicagopolice.org,,,,1177731.401,1881697.404,41.830702,-87.623395,"(41.8307016873, -87.6233953459)"
1,18,Near North,1160 N Larrabee St,Chicago,IL,60610,http://home.chicagopolice.org/community/distri...,312-742-5870,312-742-5771,312-742-5773,1172080.029,1908086.527,41.903242,-87.643352,"(41.9032416531, -87.6433521393)"
2,19,Town Hall,850 W Addison St,Chicago,IL,60613,http://home.chicagopolice.org/community/distri...,312-744-8320,312-744-4481,312-744-8011,1169730.744,1924160.317,41.9474,-87.651512,"(41.9474004564, -87.651512018)"
3,20,Lincoln,5400 N Lincoln Ave,Chicago,IL,60625,http://home.chicagopolice.org/community/distri...,312-742-8714,312-742-8803,312-742-8841,1158399.146,1935788.826,41.97955,-87.692845,"(41.9795495131, -87.6928445094)"
4,22,Morgan Park,1900 W Monterey Ave,Chicago,IL,60643,http://home.chicagopolice.org/community/distri...,312-745-0710,312-745-0814,312-745-0569,1165825.476,1830851.333,41.691435,-87.66852,"(41.6914347795, -87.6685203937)"


El dataset “Police Stations” del portal de datos de la Ciudad de Chicago contiene información sobre las estaciones de policía de la ciudad — ubicaciones, direcciones, zonas de cobertura y otros detalles relacionados con cada comisaría.

In [6]:
df_police_stations = df_police_stations.drop(["PHONE", "FAX", "TTY"], axis=1)

## Calculo de distancias

Se propone añadir una nueva columna al dataset crimes que indique la distancia entre el crimen y la estación de policía más cercana.

Dada la gran cantidad de datos, se opta por eliminar aquellos registros que presenten datos faltantes en latitud y longitud.

In [22]:
# Ver tipos de columnas lat/long
print("dtypes en crimes (lat/lon):", df_crimes[['Latitude','Longitude']].dtypes)
print("dtypes en estaciones (lat/lon):", df_police_stations[['LATITUDE','LONGITUDE']].dtypes)

# Limpiar / preparar latitudes y longitudes

# En crímenes: eliminar filas sin latitud o longitud válidas
df_crimes_dist = df_crimes.dropna(subset=["Latitude","Longitude"])
df_crimes_dist["Latitude"] = pd.to_numeric(df_crimes_dist["Latitude"], errors="coerce")
df_crimes_dist["Longitude"] = pd.to_numeric(df_crimes_dist["Longitude"], errors="coerce")
df_crimes_dist = df_crimes_dist.dropna(subset=["Latitude","Longitude"])

# En estaciones similar
df_police_stations_dist = df_police_stations.dropna(subset=["LATITUDE","LONGITUDE"])
df_police_stations_dist["Latitude"] = pd.to_numeric(df_police_stations_dist["LATITUDE"], errors="coerce")
df_police_stations_dist["Longitude"] = pd.to_numeric(df_police_stations_dist["LONGITUDE"], errors="coerce")
df_police_stations_dist = df_police_stations_dist.dropna(subset=["LATITUDE","LONGITUDE"])

# Convertir a GeoDataFrames con geometría Point

gdf_crimes = gpd.GeoDataFrame(
    df_crimes_dist,
    geometry = df_crimes_dist.apply(lambda r: Point(r["Longitude"], r["Latitude"]), axis=1),
    crs = "EPSG:4326"   # lat/lon estándar
)

gdf_police_stations_dist = gpd.GeoDataFrame(
    df_police_stations_dist,
    geometry = df_police_stations_dist.apply(lambda r: Point(r["Longitude"], r["Latitude"]), axis=1),
    crs = "EPSG:4326"
)

# Elegir un CRS proyectado en metros apropiado para la zona de Chicago

gdf_crimes_proj = gdf_crimes.to_crs(epsg=26971)
gdf_police_stations_proj = gdf_police_stations_dist.to_crs(epsg=26971)

# Calcular la estación más cercana y la distancia

# Usamos sjoin_nearest de GeoPandas
gdf_joined = gpd.sjoin_nearest(
    gdf_crimes_proj,
    gdf_police_stations_proj,
    how="left",
    distance_col="dist_to_station"
)

print("Columnas tras join:", gdf_joined.columns)

dtypes en crimes (lat/lon): Latitude     float64
Longitude    float64
dtype: object
dtypes en estaciones (lat/lon): LATITUDE     float64
LONGITUDE    float64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_crimes_dist["Latitude"] = pd.to_numeric(df_crimes_dist["Latitude"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_crimes_dist["Longitude"] = pd.to_numeric(df_crimes_dist["Longitude"], errors="coerce")


Columnas tras join: Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude_left', 'Longitude_left',
       'Location', 'geometry', 'index_right', 'DISTRICT', 'DISTRICT NAME',
       'ADDRESS', 'CITY', 'STATE', 'ZIP', 'WEBSITE', 'X COORDINATE',
       'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION', 'Latitude_right',
       'Longitude_right', 'dist_to_station'],
      dtype='object')


In [8]:
# Vemos algunas filas resultado
print("Resultado — muestras con estación más cercana y distancia:")
display(gdf_joined.head())


Resultado — muestras con estación más cercana y distancia:


Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,ZIP,WEBSITE,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION,Latitude_right,Longitude_right,dist_to_station
0,13709672,JJ101940,12/31/2024 11:58:00 PM,014XX E 68TH ST,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,...,60637,http://home.chicagopolice.org/community/distri...,1182739.183,1858317.732,41.766431,-87.605748,"(41.7664308925, -87.6057478606)",41.766431,-87.605748,1367.540845
1,13707925,JJ100089,12/31/2024 11:56:00 PM,047XX S DR MARTIN LUTHER KING JR DR,1365,CRIMINAL TRESPASS,TO RESIDENCE,APARTMENT,True,True,...,60609,http://home.chicagopolice.org/community/distri...,1175864.837,1871153.753,41.801811,-87.63056,"(41.8018110912, -87.6305601801)",41.801811,-87.63056,1380.313916
2,13708038,JJ100035,12/31/2024 11:55:00 PM,077XX S CICERO AVE,498,BATTERY,"AGG. DOMESTIC BATTERY - HANDS, FISTS, FEET, SE...",HOTEL / MOTEL,False,True,...,60629,http://home.chicagopolice.org/community/distri...,1154575.242,1862672.049,41.778987,-87.708864,"(41.778987189, -87.7088638153)",41.778987,-87.708864,3982.098345
3,13709164,JJ101392,12/31/2024 11:53:00 PM,066XX S GREENWOOD AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,60637,http://home.chicagopolice.org/community/distri...,1182739.183,1858317.732,41.766431,-87.605748,"(41.7664308925, -87.6057478606)",41.766431,-87.605748,1005.009383
4,13707823,JJ100020,12/31/2024 11:50:00 PM,012XX N MENARD AVE,460,BATTERY,SIMPLE,SIDEWALK,False,False,...,60639,http://home.chicagopolice.org/community/distri...,1138770.871,1913442.439,41.918609,-87.765574,"(41.9186088912, -87.765574479)",41.918609,-87.765574,1797.243049


## Conversion de columnas

In [9]:
gdf_joined.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude_left', 'Longitude_left',
       'Location', 'geometry', 'index_right', 'DISTRICT', 'DISTRICT NAME',
       'ADDRESS', 'CITY', 'STATE', 'ZIP', 'WEBSITE', 'X COORDINATE',
       'Y COORDINATE', 'LATITUDE', 'LONGITUDE', 'LOCATION', 'Latitude_right',
       'Longitude_right', 'dist_to_station'],
      dtype='object')

Eliminamos las columnas "ID" y "Case Number" por ser identificadores únicos de cada registro.
También eliminamos las columnas referentes a la estación de policía más cercana ya que solo nos interesa conservar el dato de la distancia al crimen.

In [10]:
# Seleccionar/renombrar columnas de interés

cols_to_keep = [
    "Date", "IUCR", "Primary Type",
    "Location Description",
    "Arrest", "Domestic", "Beat",
    "District", "Ward", "Community Area",
    "FBI Code", "X Coordinate",	"Y Coordinate",	"Latitude_left",	"Longitude_right"	,
    "DISTRICT", "DISTRICT NAME", "dist_to_station"
]
gdf_selected = gdf_joined[cols_to_keep].copy()

# Ver algunas filas resultado
print("Resultado — muestras con estación más cercana y distancia:")
display(gdf_selected.head())

Resultado — muestras con estación más cercana y distancia:


Unnamed: 0,Date,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Latitude_left,Longitude_right,DISTRICT,DISTRICT NAME,dist_to_station
0,12/31/2024 11:58:00 PM,1310,CRIMINAL DAMAGE,APARTMENT,False,False,332,3,5,43,14,1186817.0,1860189.0,41.77147,-87.605748,3,Grand Crossing,1367.540845
1,12/31/2024 11:56:00 PM,1365,CRIMINAL TRESPASS,APARTMENT,True,True,223,2,3,38,26,1179661.0,1873623.0,41.808501,-87.63056,2,Wentworth,1380.313916
2,12/31/2024 11:55:00 PM,498,BATTERY,HOTEL / MOTEL,False,True,834,8,18,70,04B,1145740.0,1853048.0,41.752749,-87.708864,8,Chicago Lawn,3982.098345
3,12/31/2024 11:53:00 PM,1320,CRIMINAL DAMAGE,STREET,False,False,321,3,20,42,14,1184362.0,1861188.0,41.774269,-87.605748,3,Grand Crossing,1005.009383
4,12/31/2024 11:50:00 PM,460,BATTERY,SIDEWALK,False,False,2531,25,29,25,08B,1137458.0,1907694.0,41.902858,-87.765574,25,Grand Central,1797.243049


In [11]:
# Renombramos columnas para mayor claridad
gdf_selected["Latitude"] = gdf_selected["Latitude_left"]
gdf_selected["Longitude"] = gdf_selected["Longitude_right"]
gdf_selected["Crime District"] = gdf_selected["District"]
gdf_selected["Nearest Police Station District"] = gdf_selected["DISTRICT"]
gdf_selected["Nearest Police Station District Name"] = gdf_selected["DISTRICT NAME"]
gdf_selected["Distance Crime To Police Station"] = gdf_selected["dist_to_station"]
gdf_selected = gdf_selected.drop(["District", "DISTRICT", "DISTRICT NAME", "dist_to_station", "Longitude_right", "Latitude_left"], axis=1)
gdf_selected.head()

Unnamed: 0,Date,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Latitude,Longitude,Crime District,Nearest Police Station District,Nearest Police Station District Name,Distance Crime To Police Station
0,12/31/2024 11:58:00 PM,1310,CRIMINAL DAMAGE,APARTMENT,False,False,332,5,43,14,1186817.0,1860189.0,41.77147,-87.605748,3,3,Grand Crossing,1367.540845
1,12/31/2024 11:56:00 PM,1365,CRIMINAL TRESPASS,APARTMENT,True,True,223,3,38,26,1179661.0,1873623.0,41.808501,-87.63056,2,2,Wentworth,1380.313916
2,12/31/2024 11:55:00 PM,498,BATTERY,HOTEL / MOTEL,False,True,834,18,70,04B,1145740.0,1853048.0,41.752749,-87.708864,8,8,Chicago Lawn,3982.098345
3,12/31/2024 11:53:00 PM,1320,CRIMINAL DAMAGE,STREET,False,False,321,20,42,14,1184362.0,1861188.0,41.774269,-87.605748,3,3,Grand Crossing,1005.009383
4,12/31/2024 11:50:00 PM,460,BATTERY,SIDEWALK,False,False,2531,29,25,08B,1137458.0,1907694.0,41.902858,-87.765574,25,25,Grand Central,1797.243049


In [12]:
# Convertimos columna date a formato datetime
gdf_selected["Date"] = pd.to_datetime(gdf_selected["Date"])
print(f"Columnas del dataframe:\n{gdf_selected.dtypes}\n")

  gdf_selected["Date"] = pd.to_datetime(gdf_selected["Date"])


Columnas del dataframe:
Date                                    datetime64[ns]
IUCR                                            object
Primary Type                                    object
Location Description                            object
Arrest                                            bool
Domestic                                          bool
Beat                                             int64
Ward                                             int64
Community Area                                   int64
FBI Code                                        object
X Coordinate                                   float64
Y Coordinate                                   float64
Latitude                                       float64
Longitude                                      float64
Crime District                                   int64
Nearest Police Station District                 object
Nearest Police Station District Name            object
Distance Crime To Police Station         

Se propone generar 3 nuevas variables categoricas a partir de la variable "Date":
1. Season, que representara la estación del año en que ocurrió el crimen.
2. Day, que representara el día de la semana en que ocurrió el crimen.
3. Day Time, que representara la franja horaria del día en que ocurrió el crimen.

In [13]:
# Metodo auxiliar para determinar la temporada del año
def get_year_season(month_number: int) -> str:
  if month_number in [12, 1, 2]:
      return "Winter"
  elif month_number in [3, 4, 5]:
      return "Spring"
  elif month_number in [6, 7, 8]:
      return "Summer"
  elif month_number in [9, 10, 11]:
      return "Autumn"
  else:
      return "Invalid Month Number"

In [14]:
# Metodo auxiliar para determinar la etapa del día
def get_day_stage_x4(hour_number: int) -> str:
  if hour_number in [0, 1, 2, 3, 4, 5]:
      return "Early Morning"
  elif hour_number in [6, 7, 8, 9, 10, 11]:
      return "Morning"
  elif hour_number in [12, 13, 14, 15, 16, 17]:
      return "Afternoon"
  elif hour_number in [18, 19, 20, 21, 22, 23]:
      return "Night"
  else:
      return "Invalid Hour Number"

In [None]:
# Cáculamos los valores de las nuevas columnas
crime_day_array = gdf_selected['Date'].dt.day_name().values
crime_hour_array = gdf_selected['Date'].dt.hour.apply(get_day_stage_x4).values
crime_season_array = gdf_selected['Date'].dt.month.apply(get_year_season).values

In [17]:
# Agregamos las nuevas columnas
gdf_selected["Season"] = crime_season_array
gdf_selected["Day"] = crime_day_array
gdf_selected["Day Time"] = crime_hour_array

In [18]:
# Eliminamos las columnas que ya no serán de utilidad
gdf_selected = gdf_selected.drop("Date", axis=1)
gdf_selected.head(10)

Unnamed: 0,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Latitude,Longitude,Crime District,Nearest Police Station District,Nearest Police Station District Name,Distance Crime To Police Station,Season,Day,Day Time
0,1310,CRIMINAL DAMAGE,APARTMENT,False,False,332,5,43,14,1186817.0,1860189.0,41.77147,-87.605748,3,3,Grand Crossing,1367.540845,Winter,Tuesday,Night
1,1365,CRIMINAL TRESPASS,APARTMENT,True,True,223,3,38,26,1179661.0,1873623.0,41.808501,-87.63056,2,2,Wentworth,1380.313916,Winter,Tuesday,Night
2,498,BATTERY,HOTEL / MOTEL,False,True,834,18,70,04B,1145740.0,1853048.0,41.752749,-87.708864,8,8,Chicago Lawn,3982.098345,Winter,Tuesday,Night
3,1320,CRIMINAL DAMAGE,STREET,False,False,321,20,42,14,1184362.0,1861188.0,41.774269,-87.605748,3,3,Grand Crossing,1005.009383,Winter,Tuesday,Night
4,460,BATTERY,SIDEWALK,False,False,2531,29,25,08B,1137458.0,1907694.0,41.902858,-87.765574,25,25,Grand Central,1797.243049,Winter,Tuesday,Night
5,486,BATTERY,STREET,False,True,1234,25,31,08B,1162508.0,1890389.0,41.854884,-87.656973,12,12,Near West,2038.55108,Winter,Tuesday,Night
6,486,BATTERY,APARTMENT,False,True,532,9,53,08B,1178352.0,1827293.0,41.681396,-87.604506,5,5,Calumet,1973.441733,Winter,Tuesday,Night
7,1310,CRIMINAL DAMAGE,APARTMENT,False,False,1223,27,28,14,1164081.0,1901067.0,41.884152,-87.656973,12,12,Near West,2699.539255,Winter,Tuesday,Night
8,1345,CRIMINAL DAMAGE,CTA BUS,False,False,1211,36,24,14,1156857.0,1905197.0,41.895635,-87.705488,12,11,Harrison,2501.887282,Winter,Tuesday,Night
9,910,MOTOR VEHICLE THEFT,STREET,False,False,1223,27,28,07,1161090.0,1899804.0,41.880749,-87.705488,12,11,Harrison,1957.029737,Winter,Tuesday,Night


In [19]:
# Vemos como quedó en nuevo dataset
gdf_selected.describe(include='object')

Unnamed: 0,IUCR,Primary Type,Location Description,FBI Code,Nearest Police Station District,Nearest Police Station District Name,Season,Day,Day Time
count,257543,257543,256494,257543,257543,257543,257543,257543,257543
unique,340,31,128,26,23,23,4,7,4
top,486,THEFT,STREET,6,3,Grand Crossing,Summer,Monday,Afternoon
freq,20318,60295,69378,61067,23944,23944,70104,37892,80290


In [20]:
gdf_selected.describe()

Unnamed: 0,Beat,Ward,Community Area,X Coordinate,Y Coordinate,Latitude,Longitude,Crime District,Distance Crime To Police Station
count,257543.0,257543.0,257543.0,257543.0,257543.0,257543.0,257543.0,257543.0,257543.0
mean,1157.066812,23.184874,36.182789,1165281.0,1887737.0,41.847533,-87.668926,11.340821,1998.301436
std,709.355188,13.950006,21.606731,16160.6,31538.09,0.086723,0.05132,7.08824,1213.914195
min,111.0,1.0,1.0,1092647.0,1813897.0,41.64459,-87.7682,1.0,12.262025
25%,533.0,10.0,22.0,1154006.0,1860557.0,41.772692,-87.705488,5.0,1219.496607
50%,1034.0,23.0,32.0,1167128.0,1894416.0,41.866091,-87.656973,10.0,1852.873793
75%,1732.0,34.0,53.0,1176638.0,1910460.0,41.910074,-87.63056,17.0,2509.868556
max,2535.0,50.0,77.0,1205119.0,1951493.0,42.022548,-87.568349,31.0,14166.02497


In [21]:
# Salvamos en un archivo
gdf_selected.to_csv("../datasets/chicago_crimes_and_stations_2024_processed.csv", index=False)