# Data Preparation

#### Parsing the json **nyc_geo.json** into the dataframe with the following columns:
- Borough
- Neighborhood
- Latitude
- Longitude

In [1]:
import pandas as pd

In [2]:
import json

# Parse the neighborhood GeoJSON. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('nyc_geo.json') as f:
    data = json.load(f)

In [3]:
data['features'][0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [4]:
# Flatten every GeoJSON feature into one record per neighborhood.
# Coordinates are stored as [longitude, latitude], hence index 1 -> lat, 0 -> long.
# The per-feature 'bbox' property is left out of the records.
geo_list = [
    {
        'borough': feature['properties']['borough'],
        'neighborhood': feature['properties']['name'],
        'lat': feature['geometry']['coordinates'][1],
        'long': feature['geometry']['coordinates'][0],
    }
    for feature in data['features']
]

geo_df = pd.DataFrame(geo_list)

# Show any rows whose latitude is above 44 (the recorded output has none).
geo_df[geo_df.lat > 44]

Unnamed: 0,borough,neighborhood,lat,long


In [5]:
geo_df.borough.value_counts()

Queens           81
Brooklyn         70
Staten Island    63
Bronx            52
Manhattan        40
Name: borough, dtype: int64

In [6]:
# One sub-frame per borough, selected by exact match on the 'borough' column.
borough_col = geo_df['borough']
brooklyn = geo_df.loc[borough_col.eq('Brooklyn')]
bronx = geo_df.loc[borough_col.eq('Bronx')]
queens = geo_df.loc[borough_col.eq('Queens')]
staten_island = geo_df.loc[borough_col.eq('Staten Island')]
manhattan = geo_df.loc[borough_col.eq('Manhattan')]

In [37]:
# Extent of each borough's neighborhood points, keyed by borough name.
# Every value is laid out as [lat min, lat max, long min, long max].
bboxs = {
    name: [frame.lat.min(), frame.lat.max(), frame.long.min(), frame.long.max()]
    for name, frame in (
        ('brooklyn', brooklyn),
        ('bronx', bronx),
        ('queens', queens),
        ('staten_island', staten_island),
        ('manhattan', manhattan),
    )
}

In [38]:
bboxs

{'brooklyn': [40.57429256471601,
  40.7302009848647,
  -74.03197914537984,
  -73.86797598081334],
 'bronx': [40.801663627756206,
  40.90854282950666,
  -73.9261020935813,
  -73.78648845267413],
 'queens': [40.55740128845452,
  40.79278140360048,
  -73.95386782130745,
  -73.70884705889246],
 'staten_island': [40.50533376115642,
  40.6449815710044,
  -74.24656934235283,
  -74.06667766061771],
 'manhattan': [40.70710710727048,
  40.87655077879964,
  -74.01686930508617,
  -73.91065965862981]}

#### The bounding box provided in 'nyc_geo.json' is not sufficient, so I'm going to parse the data to create a new one.
---

#### Can use the 'geo_df' dataframe to look up neighborhoods if necessary during development
#### But I will reshape the dataframe into the 5 boroughs, each with its list of associated coordinates

In [8]:
def make_list(things):
    """Return a new list holding every item of ``things`` in iteration order.

    Args:
        things: Any iterable (list, tuple, generator, pandas index, ...).

    Returns:
        list: A fresh list; the input object itself is never returned.
    """
    # The built-in constructor does exactly what the manual append loop did.
    return list(things)

In [9]:
# Count rows for every (borough, lat, long) combination. The result is indexed
# by those three keys, so one borough's coordinates can be selected by name.
boroughs = geo_df.groupby(['borough', 'lat', 'long']).size()

In [10]:
boroughs.Bronx[:5]

lat        long      
40.801664  -73.913221    1
40.806239  -73.916100    1
40.806551  -73.854144    1
40.809730  -73.883315    1
40.815099  -73.895788    1
dtype: int64

---
#### Using additional datasets from NYC Open Data to collect information about the neighborhoods that can be used for segmentation.
<br>
* Retrieved csv files of Arrests, Air Quality, and Evictions reported in the 5 boroughs of NYC in 2022
<br>
<br>
* It may be interesting to find some correlation between these features, if any.

In [11]:
# Load the NYPD arrest CSV from the working directory and preview two rows.
# Note that some description fields hold the literal string '(null)'.
arrest_df = pd.read_csv('NYPD_Arrest_Data__Year_to_Date_.csv')
arrest_df.head(2)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,238859078,01/09/2022,,(null),,(null),PL 2650022,M,B,49,0,25-44,M,BLACK HISPANIC,1021536,251417,40.85668,-73.865212,POINT (-73.865212 40.85668)
1,239923883,01/31/2022,,(null),,(null),CPL5700600,9,Q,113,3,25-44,M,BLACK,1046367,186986,40.679701,-73.776047,POINT (-73.77604735 40.67970059)


In [12]:
# Load the air-quality CSV from the working directory and preview two rows.
# NOTE(review): the previewed rows are from Summer 2013/2014, while the notes
# describe 2022 data - confirm whether a time-period filter is still needed.
air_df = pd.read_csv('Air_Quality.csv')
air_df.head(2)

Unnamed: 0,Unique ID,Indicator ID,Name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value,Message
0,216498,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2013,06/01/2013,34.64,
1,216499,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2014,06/01/2014,33.22,


In [13]:
# Load the evictions CSV from the working directory and preview two rows.
# NOTE(review): the previewed rows were executed in 2018/2019, while the notes
# describe 2022 data - confirm whether a date filter is still needed.
eviction_df = pd.read_csv('Evictions.csv')
eviction_df.head(2)

Unnamed: 0,Court Index Number,Docket Number,Eviction Address,Eviction Apartment Number,Executed Date,Marshal First Name,Marshal Last Name,Residential/Commercial,BOROUGH,Eviction Postcode,Ejectment,Eviction/Legal Possession,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,70491/17,169289,120 ALDRICH ST.,16F,10/10/2018,Alfred,Locascio,Residential,BRONX,10475,Not an Ejectment,Possession,40.870146,-73.831665,10.0,12.0,46201.0,2128836.0,2051410000.0,Co-op City
1,55498/18A,487863,3041 HOLLAND AVENUE,UNIT 55N,06/19/2019,Danny,Weinheim,Residential,BRONX,10467,Not an Ejectment,Possession,40.870179,-73.865262,12.0,15.0,338.0,2055590.0,2045690000.0,Bronxdale


---
#### EDA 

In [14]:
#### Dropping unnecessary columns from the arrest dataframe
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell re-runnable: running it a second time no longer raises KeyError
# for columns that were already removed.
arrest_df = arrest_df.drop(
    columns=['ARREST_KEY', 'PD_CD', 'KY_CD', 'LAW_CODE', 'ARREST_PRECINCT',
             'JURISDICTION_CODE', 'X_COORD_CD', 'Y_COORD_CD'],
    errors='ignore',
)

In [15]:
arrest_df.head(5)

Unnamed: 0,ARREST_DATE,PD_DESC,OFNS_DESC,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,New Georeferenced Column
0,01/09/2022,(null),(null),M,B,25-44,M,BLACK HISPANIC,40.85668,-73.865212,POINT (-73.865212 40.85668)
1,01/31/2022,(null),(null),9,Q,25-44,M,BLACK,40.679701,-73.776047,POINT (-73.77604735 40.67970059)
2,01/25/2022,RAPE 3,RAPE,F,K,25-44,M,BLACK,40.664121,-73.947765,POINT (-73.9477648403751 40.664121282631)
3,03/03/2022,RAPE 1,RAPE,F,K,18-24,M,BLACK,40.695439,-73.983225,POINT (-73.9832253756043 40.6954388081238)
4,02/22/2022,RAPE 1,RAPE,F,B,45-64,M,BLACK,40.816206,-73.896001,POINT (-73.8960011932583 40.8162058439227)


In [16]:
arrest_df.isnull().sum()

ARREST_DATE                    0
PD_DESC                        0
OFNS_DESC                      0
LAW_CAT_CD                  1362
ARREST_BORO                    0
AGE_GROUP                      0
PERP_SEX                       0
PERP_RACE                      0
Latitude                       0
Longitude                      0
New Georeferenced Column       0
dtype: int64

## Let's take a look at the top 20 crimes across all boroughs

In [17]:
# Count arrests per offense description and show the 20 most frequent.
arrest_df.groupby('OFNS_DESC').size().sort_values(ascending=False).head(20)

OFNS_DESC
ASSAULT 3 & RELATED OFFENSES      23188
PETIT LARCENY                     15288
FELONY ASSAULT                    14190
MISCELLANEOUS PENAL LAW            9327
CRIMINAL MISCHIEF & RELATED OF     7717
DANGEROUS DRUGS                    7686
ROBBERY                            7310
GRAND LARCENY                      6432
DANGEROUS WEAPONS                  6011
BURGLARY                           4731
VEHICLE AND TRAFFIC LAWS           4668
OFFENSES AGAINST PUBLIC ADMINI     4203
OFFENSES INVOLVING FRAUD           3627
SEX CRIMES                         3192
OFF. AGNST PUB ORD SENSBLTY &      2938
FORGERY                            2795
POSSESSION OF STOLEN PROPERTY      1869
INTOXICATED & IMPAIRED DRIVING     1813
OTHER OFFENSES RELATED TO THEF     1804
FOR OTHER AUTHORITIES              1362
dtype: int64

#### Removing Null Values

In [18]:
# The CSV encodes missing descriptions as the literal string '(null)'.
# Turn those into real missing values so a later dropna() can remove the rows.
# The result is assigned back to the frame: calling an inplace method on a
# column reached via attribute access is chained assignment, which pandas
# warns about and which leaves the frame unchanged under copy-on-write.
arrest_df = arrest_df.assign(
    PD_DESC=arrest_df['PD_DESC'].replace({'(null)': None}),
    OFNS_DESC=arrest_df['OFNS_DESC'].replace({'(null)': None}),
)

In [19]:
# Remove, in place, every row that has a missing value in any column.
# This also discards rows whose only gap is LAW_CAT_CD (1362 nulls were
# reported for that column earlier in the notebook).
arrest_df.dropna(inplace=True)

In [20]:
arrest_df.PD_DESC.value_counts().head(10)

ASSAULT 3                                    16877
LARCENY,PETIT FROM OPEN AREAS,               15230
ASSAULT 2,1,UNCLASSIFIED                     10476
ROBBERY,OPEN AREA UNCLASSIFIED                7270
PUBLIC ADMINISTRATION,UNCLASSI                6439
LARCENY,GRAND FROM OPEN AREAS, UNATTENDED     5554
TRAFFIC,UNCLASSIFIED MISDEMEAN                4397
BURGLARY,UNCLASSIFIED,UNKNOWN                 4244
MENACING,UNCLASSIFIED                         4052
CONTROLLED SUBSTANCE, POSSESSI                3670
Name: PD_DESC, dtype: int64

In [None]:
arrest_df.head(1)

Unnamed: 0,ARREST_DATE,PD_DESC,OFNS_DESC,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,New Georeferenced Column
2,01/25/2022,RAPE 3,RAPE,F,K,25-44,M,BLACK,40.664121,-73.947765,POINT (-73.9477648403751 40.664121282631)


In [None]:
def borough_finder(lat, long, boxes=None):
    """Return the name of the first borough whose bounding box contains a point.

    Args:
        lat: Latitude of the point.
        long: Longitude of the point.
        boxes: Optional mapping of borough name to
            ``[lat_min, lat_max, long_min, long_max]``. Defaults to the
            notebook-level ``bboxs`` dictionary.

    Returns:
        str | None: The matching borough key, or ``None`` when the point lies
        outside every box. Box edges count as inside.

    Note:
        The boxes are axis-aligned extents and can overlap; when several
        contain the point, the first one in the mapping's order is returned.
    """
    if boxes is None:
        boxes = bboxs
    for name, (lat_min, lat_max, long_min, long_max) in boxes.items():
        if lat_min <= lat <= lat_max and long_min <= long <= long_max:
            return name
    return None

{'brooklyn': [<bound method NDFrame._add_numeric_operations.<locals>.min of 46     40.625801
  47     40.611009
  48     40.645103
  49     40.730201
  50     40.595260
           ...    
  283    40.703176
  289    40.598525
  295    40.681999
  296    40.609378
  300    40.646926
  Name: lat, Length: 70, dtype: float64>,
  <bound method NDFrame._add_numeric_operations.<locals>.max of 46     40.625801
  47     40.611009
  48     40.645103
  49     40.730201
  50     40.595260
           ...    
  283    40.703176
  289    40.598525
  295    40.681999
  296    40.609378
  300    40.646926
  Name: lat, Length: 70, dtype: float64>,
  <bound method NDFrame._add_numeric_operations.<locals>.min of 46    -74.030621
  47    -73.995180
  48    -74.010316
  49    -73.954241
  50    -73.973471
           ...    
  283   -73.988753
  289   -73.959185
  295   -73.890346
  296   -73.948415
  300   -73.948177
  Name: long, Length: 70, dtype: float64>,
  <bound method NDFrame._add_numeric_operations.

In [29]:
# Iterating the dictionary yields its keys, i.e. the borough names.
for borough_name in bboxs:
    print(borough_name)

brooklyn
bronx
queens
staten_island
manhattan


In [23]:
# display(geo_df.loc[(geo_df['lat'] >= 40.662) & (geo_df['lat'] <= 40.669) | (geo_df['long'] >= -73.941) & (geo_df['long'] <= -73.950)])

Unnamed: 0,borough,neighborhood,lat,long
60,Brooklyn,Brownsville,40.66395,-73.910235
148,Queens,South Ozone Park,40.66855,-73.809865
167,Queens,Springfield Gardens,40.66623,-73.760421
187,Queens,Lindenwood,40.663918,-73.849638
188,Queens,Laurelton,40.667884,-73.740256
260,Brooklyn,New Lots,40.662744,-73.885118


In [25]:
# geo_df['Brooklyn']

---
#### Data Visualization

In [17]:
import matplotlib.pyplot as plt