# Data Acquisition/Mining

NYPD 911 Calls for Service

In [1]:
# import libraries
import os

import numpy as np
import pandas as pd
import seaborn as sns

This dataset documents entries into the NYPD 911 system, ICAD. The data is collected from the ICAD system, which call takers and dispatchers use to communicate with callers and the NYPD. Each record represents one entry into the system.

The data includes entries generated by members of the public as well as self-initiated entries by NYPD Members of Service. The data can be used to analyze the issues the NYPD responds to.

Can we forecast the demand for NYPD responses to 911 calls?

## Current Data

In [6]:
# Download the current NYPD calls-for-service extract from NYC Open Data.
# $limit equals the dataset's row count, so everything arrives in one request.
current_calls_url = "https://data.cityofnewyork.us/resource/n2zq-pubd.csv?$limit=3527523"

# Columns that hold dates/timestamps and should be parsed to datetime64.
current_ts_cols = ["create_date", "incident_date", "add_ts",
                   "disp_ts", "arrivd_ts", "closng_ts"]

nypd_calls = pd.read_csv(current_calls_url, parse_dates=current_ts_cols)
nypd_calls.head()

Unnamed: 0,cad_evnt_id,create_date,incident_date,incident_time,nypd_pct_cd,boro_nm,patrl_boro_nm,geo_cd_x,geo_cd_y,radio_code,typ_desc,cip_jobs,add_ts,disp_ts,arrivd_ts,closng_ts,latitude,longitude
0,108642792,2025-01-01,2024-12-31,23:22:32,116.0,(null),(null),1056625,180527,53S,VEHICLE ACCIDENT: SPECIAL CONDITION,Non CIP,2025-01-01 00:44:37,2025-01-01 00:45:19,2025-01-01 01:21:05,2025-01-01 01:21:07,40.661894,-73.739133
1,108643213,2025-01-01,2024-12-31,23:58:23,52.0,BRONX,PATROL BORO BRONX,1013953,255677,54I2,AMBULANCE CASE: INJURY/OUTSIDE,Non CIP,2025-01-01 00:17:38,2025-01-01 00:19:56,2025-01-01 00:20:50,2025-01-01 00:39:14,40.868407,-73.892608
2,108643225,2025-01-01,2025-01-01,00:00:08,84.0,BROOKLYN,PATROL BORO BKLYN NORTH,987351,193828,75S,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,Non CIP,2025-01-01 00:00:08,2025-01-01 00:00:08,2025-01-01 00:00:08,2025-01-01 00:42:04,40.698698,-73.988818
3,108643226,2025-01-01,2025-01-01,00:00:43,70.0,BROOKLYN,PATROL BORO BKLYN SOUTH,993152,168419,50G2,DISORDERLY: GROUP/OUTSIDE,Non CIP,2025-01-01 00:00:44,2025-01-01 00:38:50,2025-01-01 01:35:33,2025-01-01 02:06:57,40.628952,-73.967931
4,108643227,2025-01-01,2025-01-01,00:00:49,13.0,MANHATTAN,PATROL BORO MAN SOUTH,987653,210944,10H1,INVESTIGATE/POSSIBLE CRIME: CALLS FOR HELP/INSIDE,Non CIP,2025-01-01 00:00:49,2025-01-01 00:13:25,NaT,2025-01-01 00:19:14,40.745677,-73.98772


In [5]:
# Summarize column dtypes and memory usage of the downloaded frame
nypd_calls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3527523 entries, 0 to 3527522
Data columns (total 18 columns):
 #   Column         Dtype         
---  ------         -----         
 0   cad_evnt_id    int64         
 1   create_date    datetime64[ns]
 2   incident_date  datetime64[ns]
 3   incident_time  object        
 4   nypd_pct_cd    float64       
 5   boro_nm        object        
 6   patrl_boro_nm  object        
 7   geo_cd_x       int64         
 8   geo_cd_y       int64         
 9   radio_code     object        
 10  typ_desc       object        
 11  cip_jobs       object        
 12  add_ts         datetime64[ns]
 13  disp_ts        datetime64[ns]
 14  arrivd_ts      datetime64[ns]
 15  closng_ts      datetime64[ns]
 16  latitude       float64       
 17  longitude      float64       
dtypes: datetime64[ns](6), float64(3), int64(3), object(6)
memory usage: 484.4+ MB


In [37]:
# Show the kernel's working directory; the output paths used below are rooted here.
# (`os` comes from the imports cell at the top of the notebook.)
os.getcwd()

'/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698'

In [8]:
# Persist the raw download so later cells can reload it without calling the API.
# It is written under Data/, the folder the "Read in current data" cell loads
# df_current.csv from. The index is still written (no index=False), so the file
# carries an "Unnamed: 0" column when read back.
data_dir = '/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data'
os.makedirs(data_dir, exist_ok=True)
nypd_calls.to_csv(f'{data_dir}/df_current.csv')

## Historic Data

In [53]:
# Link to filtered data: a URL-encoded SoQL $query against dataset d6zx-ckhd.
# Decoded, the query is:
#   SELECT cad_evnt_id, create_date, incident_date, incident_time, nypd_pct_cd,
#          boro_nm, patrl_boro_nm, typ_desc, cip_jobs, disp_ts, arrivd_ts,
#          closng_ts, location
#   WHERE incident_date BETWEEN "2022-01-01T00:00:00" :: floating_timestamp
#                           AND "2025-01-01T00:00:00" :: floating_timestamp
# The query carries no LIMIT/OFFSET of its own.
link = "https://data.cityofnewyork.us/resource/d6zx-ckhd.csv?$query=SELECT%0A%20%20%60cad_evnt_id%60%2C%0A%20%20%60create_date%60%2C%0A%20%20%60incident_date%60%2C%0A%20%20%60incident_time%60%2C%0A%20%20%60nypd_pct_cd%60%2C%0A%20%20%60boro_nm%60%2C%0A%20%20%60patrl_boro_nm%60%2C%0A%20%20%60typ_desc%60%2C%0A%20%20%60cip_jobs%60%2C%0A%20%20%60disp_ts%60%2C%0A%20%20%60arrivd_ts%60%2C%0A%20%20%60closng_ts%60%2C%0A%20%20%60location%60%0AWHERE%0A%20%20%60incident_date%60%0A%20%20%20%20BETWEEN%20%222022-01-01T00%3A00%3A00%22%20%3A%3A%20floating_timestamp%0A%20%20%20%20AND%20%222025-01-01T00%3A00%3A00%22%20%3A%3A%20floating_timestamp"

In [57]:
# Get data using filtered link
offset = 0
nypd_calls_historic = pd.DataFrame()
for i in range(0,3):
    df = pd.read_csv(f"https://data.cityofnewyork.us/resource/d6zx-ckhd.csv?$limit=10000000&$offset={offset}", 
                         parse_dates = ["create_date", "incident_date",
                                        "disp_ts", "arrivd_ts", "closng_ts"])
    offset += 10000000
    df.to_csv(f'/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/df{i}.csv')

In [17]:
# Preview the first rows of the historic frame
nypd_calls_historic.head()

Unnamed: 0,objectid,cad_evnt_id,create_date,incident_date,incident_time,nypd_pct_cd,boro_nm,patrl_boro_nm,typ_desc,cip_jobs,disp_ts,arrivd_ts,location,closng_ts
0,,82294758,2022-01-01,2022-01-01,12:07:24,1,MANHATTAN,PATROL BORO MAN SOUTH,VISIBILITY PATROL: DIRECTED,Non CIP,2022-01-01 12:07:24,NaT,POINT (-74.00165729799994 40.71967628600004),2022-01-01T12:46:24.000
1,,82291751,2022-01-01,2022-01-01,08:46:02,1,MANHATTAN,PATROL BORO MAN SOUTH,TRAIN RUN/MOBILE ORDER MAINTENANCE SWEEP,Non CIP,2022-01-01 08:46:02,NaT,POINT (-74.00687374599994 40.71945925700004),2022-01-01T08:55:29.000
2,,82294358,2022-01-01,2022-01-01,11:37:24,14,MANHATTAN,PATROL BORO MAN SOUTH,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,Non CIP,2022-01-01 11:37:24,NaT,POINT (-73.97654816699998 40.75273446700004),2022-01-01T12:07:40.000
3,,82293068,2022-01-01,2022-01-01,10:10:06,41,BRONX,PATROL BORO BRONX,AMBULANCE CASE: SERIOUS/INSIDE,Non CIP,2022-01-01 10:32:13,NaT,POINT (-73.90188580799997 40.81903532000007),2022-01-01T10:45:35.000
4,,82296771,2022-01-01,2022-01-01,14:23:07,46,BRONX,PATROL BORO BRONX,SEE COMPLAINANT: OTHER/INSIDE,Non CIP,2022-01-01 15:47:44,NaT,POINT (-73.90989931399997 40.85099855500005),2022-01-01T16:45:04.000


In [25]:
# Save the historic frame to CSV; index=False is not passed, so the row index
# is written as the first (unnamed) column.
nypd_calls_historic.to_csv('/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/historic_data22_24.csv')

In [3]:
# Try with filtered columns
# Same dataset (d6zx-ckhd), now using separate SoQL parameters:
#   $select - keeps 13 columns (ids, dates/timestamps, precinct, borough, type, location)
#   $where  - incident_date BETWEEN "2022-01-01T00:00:00" AND "2025-01-01T00:00:00"
#   $limit  - at most 10,000,000 rows per request
# The URL intentionally ends in "&$": the download loop appends "offset=<n>" to page.

link = "https://data.cityofnewyork.us/resource/d6zx-ckhd.csv?$select=cad_evnt_id,create_date,incident_date,incident_time,nypd_pct_cd,boro_nm,patrl_boro_nm,typ_desc,cip_jobs,disp_ts,arrivd_ts,closng_ts,location&$where=incident_date%20BETWEEN%20%222022-01-01T00:00:00%22%20AND%20%222025-01-01T00:00:00%22&$limit=10000000&$"


In [7]:
# Get data using the filtered link, paging with $offset (`link` ends in "&$",
# so appending "offset=<n>" completes the parameter).
# Chunks go under Data/, where the "Combine all df" cells read df0/df1/df2 back.
# NOTE(review): Socrata recommends an explicit $order (e.g. :id) when paging so
# pages neither overlap nor skip rows — consider adding it to `link`.
page_size = 10000000  # must match the $limit embedded in `link`
data_dir = '/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data'
os.makedirs(data_dir, exist_ok=True)

offset = 0
for i in range(3):
    df = pd.read_csv(f'{link}offset={offset}',
                     parse_dates=["create_date", "incident_date",
                                  "disp_ts", "arrivd_ts", "closng_ts"])
    offset += page_size
    df.to_csv(f'{data_dir}/df{i}.csv', index=False)
    print(f"df{i} completed")

df0 completed
df1 completed
df2 completed


In [None]:
# Combine all df

# Reload the first downloaded chunk from disk.
# closng_ts is not listed in parse_dates, so it is not date-parsed here.
df0_path = '/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data/df0.csv'
df0_date_cols = ["create_date", "incident_date", "incident_time",
                 "disp_ts", "arrivd_ts"]
historic_data_df0 = pd.read_csv(df0_path, parse_dates=df0_date_cols)
historic_data_df0.info()

In [None]:
# Reload the second downloaded chunk (df1) from disk.
# closng_ts is not listed in parse_dates, so it is not date-parsed here.
df1_path = '/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data/df1.csv'
df1_date_cols = ["create_date", "incident_date", "incident_time",
                 "disp_ts", "arrivd_ts"]
historic_data_df1 = pd.read_csv(df1_path, parse_dates=df1_date_cols)
historic_data_df1.info()

In [None]:
# Reload the third downloaded chunk (df2) from disk.
# closng_ts is not listed in parse_dates, so it is not date-parsed here.
df2_path = '/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data/df2.csv'
df2_date_cols = ["create_date", "incident_date", "incident_time",
                 "disp_ts", "arrivd_ts"]
historic_data_df2 = pd.read_csv(df2_path, parse_dates=df2_date_cols)
historic_data_df2.info()

In [None]:
# Stack the three historic chunks into one frame with a fresh 0..n-1 index

historic_chunks = [historic_data_df0, historic_data_df1, historic_data_df2]
all_historic_data = pd.concat(historic_chunks, ignore_index=True)
all_historic_data.info()

In [None]:
# Save historic data.
# index=False is not passed, so the row index is written as the first
# (unnamed) column of the CSV.
all_historic_data.to_csv('/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data/df_historic_22_24.csv')

In [None]:
# Read in current data, limited to the columns it shares with the historic frame

# Start from the historic schema, then swap "location" for the raw
# latitude/longitude pair, which the current file stores instead and from
# which a location column is rebuilt later.
cols_to_use = [*all_historic_data.columns, "latitude", "longitude"]
cols_to_use.remove("location")

current_date_cols = ["create_date", "incident_date", "incident_time",
                     "disp_ts", "arrivd_ts"]
current_data = pd.read_csv(
    "/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data/df_current.csv",
    usecols=cols_to_use,
    parse_dates=current_date_cols,
)

In [None]:
# Drop the saved index column if it was loaded. errors="ignore" keeps this cell
# re-runnable and covers the case where usecols never pulled "Unnamed: 0" in
# (a plain drop raises KeyError in both situations).
current_data = current_data.drop(columns="Unnamed: 0", errors="ignore")

# Cast latitude/longitude to strings so the next cell can concatenate them
# into a location column (missing values become the text "nan").
current_data = current_data.astype({"latitude": "str", "longitude": "str"})

current_data.info()

In [None]:
# Create new column location from longitude and latitude.
# Historic rows store location as WKT text, "POINT (<lon> <lat>)", so the same
# form is built here to keep the merged column in a single format.
lon_text = current_data["longitude"].astype("str")
lat_text = current_data["latitude"].astype("str")

# A missing coordinate shows up as the text "nan" after the string cast;
# leave location empty (NaN) for those rows instead of writing "POINT (nan nan)".
coord_missing = lon_text.eq("nan") | lat_text.eq("nan")
current_data["location"] = ("POINT (" + lon_text + " " + lat_text + ")").mask(coord_missing)

# View first few rows
current_data.head()

In [None]:
# Append the current rows to the historic rows, then discard the raw
# latitude/longitude helper columns (they only exist on the current rows).

frames_to_merge = [all_historic_data, current_data]
all_data = (
    pd.concat(frames_to_merge, ignore_index=True)
    .drop(columns=["latitude", "longitude"])
)

# Write the combined dataset without the row index, then report its schema
all_data.to_csv('/Users/dirkhartog/Desktop/CUNY_MSDS/DATA_698/Data/df_combined.csv', index=False)

all_data.info()