In [1]:
# Query GPU 0 through NVML and print its memory figures. This runs before the
# GPU libraries further down are imported, so the numbers are read at that point.
# NOTE(review): nvmlInit() has no matching nvmlShutdown() in this cell.
import pynvml
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
# Each figure is divided by 1024**3 and printed with two decimals under a "GB" label.
print(f'Total memory: {info.total / 1024**3:.2f} GB')
print(f'Free memory: {info.free / 1024**3:.2f} GB')
print(f'Used memory: {info.used / 1024**3:.2f} GB')
# GPU dataframe / array stack used by the rest of the notebook.
import dask_cudf
import cupy as cp
import rmm
import cudf
import gc
# cudf.pandas.install() is called before `import pandas` below; presumably this
# ordering is what makes the pandas import GPU-accelerated - TODO confirm against
# the cudf.pandas documentation before reordering any of these lines.
import cudf.pandas
cudf.pandas.install()
import pandas as pd
import numpy as np
# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns

Total memory: 8.00 GB
Free memory: 7.60 GB
Used memory: 0.40 GB


In [2]:
# Lazily read the NYPD arrests CSV as a dask_cudf DataFrame, split into
# partitions using a "100MB" blocksize. Nothing is loaded until .compute().
arrest_data = dask_cudf.read_csv(
    'NYPD_Arrest_Data__Year_to_Date__20240529.csv',
    blocksize="100MB",
)


In [3]:
# The row count of a dask dataframe is lazy; build it first, then materialise it.
lazy_row_count = arrest_data.shape[0]
rows = lazy_row_count.compute()
print(rows)

63621


In [4]:
# Display the column names available in the raw arrests data; the next cell
# keeps only a subset of them.
arrest_data.columns


Index(['ARREST_KEY', 'ARREST_DATE', 'PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC',
       'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT',
       'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'X_COORD_CD',
       'Y_COORD_CD', 'Latitude', 'Longitude', 'New Georeferenced Column'],
      dtype='object')

In [5]:
# Keep only the identifier, date, precinct/borough and coordinate columns.
columns_to_keep = [
    "ARREST_KEY",
    "ARREST_DATE",
    "ARREST_PRECINCT",
    "ARREST_BORO",
    "Latitude",
    "Longitude",
]
arrest_data = arrest_data[columns_to_keep]

In [6]:
# Count missing values per column: build the lazy reduction, then materialise
# it once and print the result.
null_counts = arrest_data.isnull().sum()
null_counts_computed = null_counts.compute()
print(null_counts_computed)

ARREST_KEY         0
ARREST_DATE        0
ARREST_PRECINCT    0
ARREST_BORO        0
Latitude           0
Longitude          0
dtype: int64


In [7]:
# For every retained column, print how many rows share each distinct value.
# Each iteration triggers its own .compute().
for col in arrest_data.columns:
    result = arrest_data.groupby(col).size().compute()
    print(f"Group sizes for column {col}:\n{result}\n")

Group sizes for column ARREST_KEY:
ARREST_KEY
279912480    1
279912484    1
279912485    1
279912486    1
279912487    1
            ..
284565909    1
284565910    1
284567343    1
284565634    1
284565633    1
Length: 63621, dtype: int64

Group sizes for column ARREST_DATE:
ARREST_DATE
03/05/2024    787
03/06/2024    883
03/07/2024    917
03/08/2024    742
03/09/2024    562
             ... 
01/29/2024    633
01/30/2024    718
01/31/2024    851
02/01/2024    919
01/25/2024    743
Length: 91, dtype: int64

Group sizes for column ARREST_PRECINCT:
ARREST_PRECINCT
26     404
28     461
30     446
32     715
33     538
      ... 
71     712
72     814
73    1356
75    2078
76     339
Length: 77, dtype: int64

Group sizes for column ARREST_BORO:
ARREST_BORO
B    14221
K    17402
M    15324
Q    13782
S     2892
dtype: int64

Group sizes for column Latitude:
Latitude
40.592918    1
40.592920    1
40.592943    2
40.592981    1
40.592985    2
            ..
40.883928    7
40.883977    1
40.884

In [8]:
# Latitude range check: print the maximum first, then the minimum (same output
# order as before). Each extreme now has its own accurately named variable; the
# minimum was previously stored in a variable called `max_values`.
lat_max = arrest_data["Latitude"].max().compute()
print(lat_max)

# A minimum of 0.0 here means at least one row carries a 0.0 latitude rather
# than a real coordinate.
lat_min = arrest_data["Latitude"].min().compute()
print(lat_min)

40.911235999999995
0.0


In [9]:
# Longitude range check: print the maximum first, then the minimum (same output
# order as before). Each extreme now has its own accurately named variable; the
# minimum was previously stored in a variable called `max_values`.
# A maximum of 0.0 here means at least one row carries a 0.0 longitude rather
# than a real coordinate.
long_max = arrest_data["Longitude"].max().compute()
print(long_max)

long_min = arrest_data["Longitude"].min().compute()
print(long_min)

0.0
-74.25271141323829


1. **Location 1:**  
   - Latitude: 40.916  
   - Longitude: -73.919  

2. **Location 2:**  
   - Latitude: 40.492  
   - Longitude: -74.254  

3. **Location 3:**  
   - Latitude: 40.540  
   - Longitude: -74.248  

4. **Location 4:**  
   - Latitude: 40.752471  
   - Longitude: -73.698  

**NYC Boundaries:**  
- Latitude: 40.492 < x < 40.916  
- Longitude: -74.254 < x < -73.698  


Points that fall outside these bounds, or that sit at (0, 0), need their coordinates imputed.


In [13]:


# Bounding box used to flag coordinates that fall outside the expected area.
lat_lower_bound = 40.492
lat_upper_bound = 40.916
long_lower_bound = -74.254
long_upper_bound = -73.698

# Build one boolean mask per axis, then combine them: a row is flagged when
# either its latitude or its longitude lies strictly outside the box.
lat_col = arrest_data['Latitude']
long_col = arrest_data['Longitude']
lat_out_of_range = (lat_col < lat_lower_bound) | (lat_col > lat_upper_bound)
long_out_of_range = (long_col < long_lower_bound) | (long_col > long_upper_bound)

filtered_data = arrest_data[lat_out_of_range | long_out_of_range]

# Materialise the flagged rows so they can be counted and inspected.
filtered_data_computed = filtered_data.compute()

bound_rows = len(filtered_data_computed)
print(bound_rows)
filtered_data_computed

2


Unnamed: 0,ARREST_KEY,ARREST_DATE,ARREST_PRECINCT,ARREST_BORO,Latitude,Longitude
40573,282350930,02/17/2024,25,M,0.0,0.0
41238,282953053,02/28/2024,1,M,0.0,0.0


In [15]:
# Overwrite the coordinates of every flagged row with the replacement point.
# Whole-column assignment is used instead of hard-coded row positions 0 and 1,
# so this works for any number of flagged rows (including none) instead of
# raising IndexError or silently skipping rows when the count is not two.
# NOTE: filtered_data_computed is a computed copy, so this does not modify
# arrest_data itself.
imputed_latitude = 40.75593384590866
imputed_longitude = -73.98319081657343

filtered_data_computed['Latitude'] = imputed_latitude
filtered_data_computed['Longitude'] = imputed_longitude

In [25]:
# Impute invalid coordinates on the dataframe that actually gets saved.
# The invalid points are 0.0 rather than NaN (the null counts are all zero),
# so fillna() alone never replaces them. Flag rows with the same bounding-box
# test used for filtered_data (bounds come from that cell) and replace both
# coordinates of every flagged row.
imputed_latitude = 40.75593384590866
imputed_longitude = -73.98319081657343

# Build the mask before either column is reassigned so that both replacements
# are driven by the original coordinates.
out_of_bounds = (
    (arrest_data['Latitude'] < lat_lower_bound) |
    (arrest_data['Latitude'] > lat_upper_bound) |
    (arrest_data['Longitude'] < long_lower_bound) |
    (arrest_data['Longitude'] > long_upper_bound)
)

# mask() swaps in the replacement where the condition is True; the trailing
# fillna() keeps the original handling of any missing values.
arrest_data['Latitude'] = (
    arrest_data['Latitude']
    .mask(out_of_bounds, imputed_latitude)
    .fillna(imputed_latitude)
)
arrest_data['Longitude'] = (
    arrest_data['Longitude']
    .mask(out_of_bounds, imputed_longitude)
    .fillna(imputed_longitude)
)


In [17]:
# Display the flagged rows again to check their coordinates after imputation.
# This is the computed copy, not arrest_data.
filtered_data_computed

Unnamed: 0,ARREST_KEY,ARREST_DATE,ARREST_PRECINCT,ARREST_BORO,Latitude,Longitude
40573,282350930,02/17/2024,25,M,40.755934,-73.983191
41238,282953053,02/28/2024,1,M,40.755934,-73.983191


In [26]:
# Write the dataframe out as a single CSV file instead of one file per partition.
# NOTE(review): the target directory is assumed to exist already - confirm.
arrest_data.to_csv(
    'saved_csvs/arrests.csv',
    single_file=True,
)

['/home/eamonn-walsh/Documents/Summer-Project/saved_csvs/arrests.csv']