In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the arrest CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("../../datasets/csv_datasets/NYPD_Arrest_2023.csv")

In [3]:
# Preview the first rows (head() defaults to 5) to see which columns are available.
df.head()

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude
0,261209118,01/01/2023,109,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200501,F,K,77,0,45-64,F,BLACK,999335,186085,40.677426,-73.945615
1,262984267,02/03/2023,515,"CONTROLLED SUBSTANCE,SALE 3",117.0,DANGEROUS DRUGS,PL 2203901,F,K,73,0,25-44,M,BLACK,1009318,178259,40.655923,-73.90965
2,263664549,02/15/2023,105,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,K,62,0,25-44,M,WHITE,982272,158771,40.602468,-74.00712
3,261345231,01/04/2023,105,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,M,32,0,25-44,M,BLACK,999899,238684,40.821797,-73.943457
4,263536618,02/13/2023,109,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 12005WX,F,K,71,0,25-44,M,BLACK,1001437,183080,40.669175,-73.938042


In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170095 entries, 0 to 170094
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ARREST_KEY         170095 non-null  int64  
 1   ARREST_DATE        170095 non-null  object 
 2   PD_CD              170095 non-null  int64  
 3   PD_DESC            170095 non-null  object 
 4   KY_CD              170082 non-null  float64
 5   OFNS_DESC          170095 non-null  object 
 6   LAW_CODE           170095 non-null  object 
 7   LAW_CAT_CD         168838 non-null  object 
 8   ARREST_BORO        170095 non-null  object 
 9   ARREST_PRECINCT    170095 non-null  int64  
 10  JURISDICTION_CODE  170095 non-null  int64  
 11  AGE_GROUP          170095 non-null  object 
 12  PERP_SEX           170095 non-null  object 
 13  PERP_RACE          170095 non-null  object 
 14  X_COORD_CD         170095 non-null  int64  
 15  Y_COORD_CD         170095 non-null  int64  
 16  La

## Extract the required sub-dataframes

The arrest key is unique, so it can be used as the id.

In [5]:
# Confirm ARREST_KEY contains no duplicates before relying on it as the row id.
df['ARREST_KEY'].is_unique

True

### Crime types

The `OFNS_DESC` column provides a brief description of the offense committed.

In [6]:
# Count how often each offense description occurs (most frequent first).
df['OFNS_DESC'].value_counts()

OFNS_DESC
ASSAULT 3 & RELATED OFFENSES      25200
PETIT LARCENY                     18025
FELONY ASSAULT                    16030
DANGEROUS DRUGS                   11526
MISCELLANEOUS PENAL LAW           11132
                                  ...  
DISRUPTION OF A RELIGIOUS SERV        2
HOMICIDE-NEGLIGENT-VEHICLE            2
FELONY SEX CRIMES                     2
UNLAWFUL POSS. WEAP. ON SCHOOL        1
ADMINISTRATIVE CODES                  1
Name: count, Length: 64, dtype: int64

In [7]:
# Build the crime-type lookup table: one row per distinct OFNS_DESC value.
# Ids are assigned 0..n-1 in value_counts() order (most frequent first).
#
# The frame is assembled directly from the index of the counts instead of
# chaining value_counts().reset_index().reset_index().drop(columns='count'):
# that chain depends on the counts column being named 'count', which is only
# the case on pandas >= 2.0, and it needed a redundant .copy() on the column.
ofns_counts = df['OFNS_DESC'].value_counts()
crime_types = pd.DataFrame({
    'id': range(len(ofns_counts)),
    'description': ofns_counts.index,
})

crime_types

Unnamed: 0,id,description
0,0,ASSAULT 3 & RELATED OFFENSES
1,1,PETIT LARCENY
2,2,FELONY ASSAULT
3,3,DANGEROUS DRUGS
4,4,MISCELLANEOUS PENAL LAW
...,...,...
59,59,DISRUPTION OF A RELIGIOUS SERV
60,60,HOMICIDE-NEGLIGENT-VEHICLE
61,61,FELONY SEX CRIMES
62,62,UNLAWFUL POSS. WEAP. ON SCHOOL


### Crimes

In [8]:
# Keep only the columns needed for the crimes table, as an independent copy
# so later edits to `crimes` never touch `df`.
crimes = df[['ARREST_KEY', 'ARREST_DATE', 'OFNS_DESC', 'Latitude', 'Longitude']].copy()

# Rename the first three columns to the output schema; Latitude and Longitude
# keep their original names.
crimes.columns = ['id', 'arrest_date', 'crime_type', 'Latitude', 'Longitude']

crimes

Unnamed: 0,id,arrest_date,crime_type,Latitude,Longitude
0,261209118,01/01/2023,FELONY ASSAULT,40.677426,-73.945615
1,262984267,02/03/2023,DANGEROUS DRUGS,40.655923,-73.909650
2,263664549,02/15/2023,FELONY ASSAULT,40.602468,-74.007120
3,261345231,01/04/2023,FELONY ASSAULT,40.821797,-73.943457
4,263536618,02/13/2023,FELONY ASSAULT,40.669175,-73.938042
...,...,...,...,...,...
170090,270955727,07/07/2023,ROBBERY,40.869067,-73.880600
170091,273117887,08/18/2023,SEX CRIMES,40.887314,-73.847272
170092,274301006,09/13/2023,OTHER TRAFFIC INFRACTION,40.610087,-74.116666
170093,269912233,06/15/2023,POSSESSION OF STOLEN PROPERTY,40.600204,-74.002812


#### Add foreign keys to crimes

In [9]:
# Replace each offense description in crimes['crime_type'] with the id of the
# matching row in crime_types.
#
# The lookup Series (description -> id) is built once and applied with a
# vectorised map; running crime_types.query(...) inside a lambda executed one
# DataFrame query per row of `crimes`.
crime_type_ids = crime_types.set_index('description')['id']
mapped_ids = crimes['crime_type'].map(crime_type_ids)

# Validate before overwriting the column: unmatched values come back as NaN,
# which is also what happens if this cell is re-run after the column already
# holds ids. Failing here leaves `crimes` untouched.
assert mapped_ids.notna().all(), (
    "crime_type contains values with no matching description in crime_types "
    "(was this cell already run?)"
)

crimes['crime_type'] = mapped_ids

In [10]:
# Display the frame again to check that crime_type now holds ids rather than descriptions.
crimes

Unnamed: 0,id,arrest_date,crime_type,Latitude,Longitude
0,261209118,01/01/2023,2,40.677426,-73.945615
1,262984267,02/03/2023,3,40.655923,-73.909650
2,263664549,02/15/2023,2,40.602468,-74.007120
3,261345231,01/04/2023,2,40.821797,-73.943457
4,263536618,02/13/2023,2,40.669175,-73.938042
...,...,...,...,...,...
170090,270955727,07/07/2023,7,40.869067,-73.880600
170091,273117887,08/18/2023,13,40.887314,-73.847272
170092,274301006,09/13/2023,19,40.610087,-74.116666
170093,269912233,06/15/2023,16,40.600204,-74.002812


### Write the data to the final CSV files

In [11]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
out_dir = Path('../out')
out_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row index out of the files; both frames
# already carry their own explicit `id` column.
crime_types.to_csv(out_dir / 'crime_types.csv', index=False)
crimes.to_csv(out_dir / 'crimes.csv', index=False)