In [1]:
#-------------------------------
# Dependencies and Setup
#-------------------------------

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st



#-------------------------------
# Source CSV locations (relative paths under Resources/)
#-------------------------------

arrest_data, borough_zip_data, mental_fac_data = (
    "Resources/NYPD_Arrest_Data_2023.csv",
    "Resources/nyc_zip_borough_neighborhoods.csv",
    "Resources/NYS-Mental-Facilities.csv",
)


#-------------------------------
# Load each CSV into its own dataframe
# (read order: arrests, zip/borough lookup, mental facilities)
#-------------------------------

arrest_df, zip_df, menta_fac_df = map(
    pd.read_csv,
    (arrest_data, borough_zip_data, mental_fac_data),
)


In [14]:
#-------------------------------
# List column names for mental facility dataframe
# NOTE: most labels carry a leading space (and '  Program Address 1'
# carries two), so any later column selection has to reproduce that
# whitespace exactly.
#-------------------------------

menta_fac_df.columns

Index(['index', 'Row Created Date Time', ' Sponsor Name', ' Sponsor Code',
       ' Agency Name', ' Agency Code', 'Facility Name', ' Facility Code',
       ' Program Name', ' Program Code', ' Populations Served',
       ' Agency Phone', ' Program Phone', '  Program Address 1',
       ' Program Address 2', ' Program City', ' Program State', ' Program Zip',
       ' Operating Certificate Required?', ' Program Tier',
       ' Operating Certificate Duration', ' Program County', ' Program Region',
       ' Program Type Description', ' Program Category Description',
       ' Program Subcategory Description', 'Location'],
      dtype='object')

In [10]:
#-------------------------------
# Stage 1 - column selection: keep only the columns listed below
# (labels are spelled with their leading whitespace, exactly as they
# appear in the dataframe)
#-------------------------------

reduced_mf_df = menta_fac_df.loc[:, [
    "index",
    "Row Created Date Time",
    " Sponsor Name",
    " Agency Name",
    "Facility Name",
    " Facility Code",
    " Program Name",
    " Populations Served",
    " Agency Phone",
    " Program Phone",
    "  Program Address 1",
    " Program Address 2",
    " Program City",
    " Program State",
    " Program Zip",
    " Operating Certificate Required?",
    " Program Tier",
    " Operating Certificate Duration",
    " Program County",
    " Program Region",
    " Program Type Description",
    " Program Category Description",
    " Program Subcategory Description",
    "Location",
]]


#-------------------------------
# Stage 2 - de-duplication: one row per ' Facility Code', keeping the
# first row encountered for each code
#-------------------------------

clean_mf_df = reduced_mf_df.drop_duplicates(subset=" Facility Code")


#-------------------------------
# Stage 3 - completeness: keep only rows whose ' Program Zip' is present
#-------------------------------

super_clean_mf_df = clean_mf_df[clean_mf_df[" Program Zip"].notna()]


#-------------------------------
# Preview the first five rows of the cleaned dataframe
#-------------------------------

super_clean_mf_df.head(5)



Unnamed: 0,index,Row Created Date Time,Sponsor Name,Agency Name,Facility Name,Facility Code,Program Name,Populations Served,Agency Phone,Program Phone,...,Program Zip,Operating Certificate Required?,Program Tier,Operating Certificate Duration,Program County,Program Region,Program Type Description,Program Category Description,Program Subcategory Description,Location
177,177,12/19/2017 02:08:00 PM,St. Dominic's Home,St. Dominic's Home,St. Dominic's Home,7159,Bronx Family Support Services - Children & Family,Children Adults,(845)359-3400,(718)295-9112,...,10548,N,,,Bronx,New York City,Family Support Services - Children & Family,Support,General Support,"One Fordham Plaza\r\nBronx, NY 10548\r\n"
251,251,12/19/2017 02:08:00 PM,Cayuga Counseling Services Inc.,Cayuga Counseling Services Inc.,Cayuga Counseling Services Inc.,7734,Residential Family Peer Support Partners Program,Children Adolescents,(315)253-9795,(315)253-9795,...,13021,N,,,Cayuga,Central New York,Family Support Services - Children & Family,Support,General Support,
252,252,12/19/2017 02:08:00 PM,Central New York Psychiatric Center,Central New York Psychiatric Center,Central New York Psychiatric Center,43,Auburn OMH Satellite Unit,Adults,(315)765-3600,(315)253-9382,...,13024,N,,,Cayuga,Central New York,Prison-based Forensic Mental Health Units,Support,Forensics,"135 W State Street\r\nAuburn, NY 13024\r\n"
272,272,12/19/2017 02:08:00 PM,TLC Health Network,TLC Health Network,TLC Health Network,8797,TLC Inpatient Mental Health Unit,Adults,(716)951-7035,(716)951-7239,...,14081-9716,Y,,36.0,Chautauqua,Western New York,Inpatient Psychiatric Unit of a General Hospital,Inpatient,Inpatient Psychiatric Unit of a General Hospital,"845 Routes\r\nIrving, NY 14081-9716\r\n"
313,313,12/19/2017 02:08:00 PM,National Alliance for the Mentally Ill of Cham...,National Alliance for the Mentally Ill of Cha,National Alliance for the Mentally Ill of Cha,7497,OUTREACH,Adults,(518)561-2685x4,(518)561-2685,...,12903,N,,,Clinton,Central New York,Outreach,Support,General Support,"304 New\r\nPlattsburgh, NY 12903\r\n"


In [11]:
#-------------------------------
# List column names for arrests dataframe
# (reference for the column subset picked in the arrest cleaning step)
#-------------------------------

arrest_df.columns



Index(['ARREST_KEY', 'ARREST_DATE', 'PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC',
       'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT',
       'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'X_COORD_CD',
       'Y_COORD_CD', 'Latitude', 'Longitude'],
      dtype='object')

In [13]:
#-------------------------------
# Stage 1 - column selection: keep the arrest key, borough, perpetrator
# demographics and location columns; save as a new dataframe
#-------------------------------

reduced_arrest_df = arrest_df.loc[:, [
    "ARREST_KEY",
    "ARREST_BORO",
    "AGE_GROUP",
    "PERP_SEX",
    "PERP_RACE",
    "X_COORD_CD",
    "Y_COORD_CD",
    "Latitude",
    "Longitude",
]]


#-------------------------------
# Stage 2 - de-duplication: one row per 'ARREST_KEY', keeping the first
# row encountered for each key
#-------------------------------

clean_arrest_df = reduced_arrest_df.drop_duplicates(subset="ARREST_KEY")


#-------------------------------
# Stage 3 - completeness: keep only rows whose 'ARREST_BORO' is present
#-------------------------------

super_clean_arrest_df = clean_arrest_df[clean_arrest_df["ARREST_BORO"].notna()]


#-------------------------------
# Preview the first five rows of the cleaned dataframe
#-------------------------------

super_clean_arrest_df.head(5)

Unnamed: 0,ARREST_KEY,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude
0,261209118,K,45-64,F,BLACK,999335,186085,40.677426,-73.945615
1,262984267,K,25-44,M,BLACK,1009318,178259,40.655923,-73.90965
2,263664549,K,25-44,M,WHITE,982272,158771,40.602468,-74.00712
3,261345231,M,25-44,M,BLACK,999899,238684,40.821797,-73.943457
4,263536618,K,25-44,M,BLACK,1001437,183080,40.669175,-73.938042


In [15]:
#-------------------------------
# List column names for zipcode dataframe
# (reference for the column subset picked in the zipcode cleaning step)
#-------------------------------

zip_df.columns

Index(['zip', 'borough', 'neighborhood', 'population', 'density'], dtype='object')

In [16]:
#-------------------------------
# Stage 1 - column selection: keep only the zip -> borough mapping and
# save it as a new dataframe
#-------------------------------

reduced_zip_df = zip_df.loc[:, ["zip", "borough"]]


#-------------------------------
# Stage 2 - de-duplication: one row per 'zip', keeping the first row
# encountered for each zip code
#-------------------------------

clean_zip_df = reduced_zip_df.drop_duplicates(subset="zip")


#-------------------------------
# Stage 3 - completeness: keep only rows whose 'borough' is present
#-------------------------------

super_clean_zip_df = clean_zip_df[clean_zip_df["borough"].notna()]


#-------------------------------
# Preview the first five rows of the cleaned dataframe
#-------------------------------

super_clean_zip_df.head(5)

Unnamed: 0,zip,borough
0,10001,Manhattan
1,10002,Manhattan
2,10003,Manhattan
3,10004,Manhattan
4,10005,Manhattan
