# Data Preparation

#### Parsing the json **nyc_geo.json** into the dataframe with the following columns:
- Borough
- Neighborhood
- Latitude
- Longitude

In [15]:
import pandas as pd

In [16]:
import json
f = open('nyc_geo.json')
data = json.load(f)

In [24]:
data['features'][0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [30]:
geo_list = []
for i in data['features']:
    feature_dict = {'borough': i['properties']['borough'],
                    'neighborhood': i['properties']['name'], 
                    'lat': i['geometry']['coordinates'][0],
                    'long': i['geometry']['coordinates'][1],}
                    # 'bbox': i['properties']['bbox']}
    geo_list.append(feature_dict)
    
    
geo_df = pd.DataFrame(geo_list)

geo_df.tail(10)

Unnamed: 0,borough,neighborhood,lat,long
296,Brooklyn,Madison,-73.948415,40.609378
297,Bronx,Bronxdale,-73.861726,40.852723
298,Bronx,Allerton,-73.859319,40.865788
299,Bronx,Kingsbridge Heights,-73.901523,40.870392
300,Brooklyn,Erasmus,-73.948177,40.646926
301,Manhattan,Hudson Yards,-74.000111,40.756658
302,Queens,Hammels,-73.80553,40.587338
303,Queens,Bayswater,-73.765968,40.611322
304,Queens,Queensbridge,-73.945631,40.756091
305,Staten Island,Fox Hills,-74.08174,40.617311


#### Can use 'geo_df' data frame to reference neighborhood if neccesary for development
#### But I will manipulate the dataframe into 5 boroughs with list of coordinates associated

In [35]:
def make_list(things):
    thing_list = []
    for thing in things:
        thing_list.append(thing)
    return thing_list

In [42]:
boroughs = geo_df.groupby(['borough', 'lat', 'long']).size()

In [47]:
boroughs.Bronx[:5]

lat         long     
-73.926102  40.836623    1
-73.919672  40.847898    1
-73.917190  40.881395    1
-73.916556  40.843826    1
-73.916100  40.806239    1
dtype: int64

---
#### Using different data for NYC_Open_Data to collect information about the neigborhoods that can be used for segmentation.
<br>
* Retrieved csv files of Arrests, Air Quality, and Evictions reported in the 5 boroughs of NYC in 2022
<br>
<br>
* It may be interesting to find some coorelation between these features if any. 

In [4]:
arrest_df = pd.read_csv('NYPD_Arrest_Data__Year_to_Date_.csv')
arrest_df.head(2)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,238859078,01/09/2022,,(null),,(null),PL 2650022,M,B,49,0,25-44,M,BLACK HISPANIC,1021536,251417,40.85668,-73.865212,POINT (-73.865212 40.85668)
1,239923883,01/31/2022,,(null),,(null),CPL5700600,9,Q,113,3,25-44,M,BLACK,1046367,186986,40.679701,-73.776047,POINT (-73.77604735 40.67970059)


In [5]:
air_df = pd.read_csv('Air_Quality.csv')
air_df.head(2)

Unnamed: 0,Unique ID,Indicator ID,Name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value,Message
0,216498,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2013,06/01/2013,34.64,
1,216499,386,Ozone (O3),Mean,ppb,CD,313,Coney Island (CD13),Summer 2014,06/01/2014,33.22,


In [6]:
eviction_df = pd.read_csv('Evictions.csv')
eviction_df.head(2)

Unnamed: 0,Court Index Number,Docket Number,Eviction Address,Eviction Apartment Number,Executed Date,Marshal First Name,Marshal Last Name,Residential/Commercial,BOROUGH,Eviction Postcode,Ejectment,Eviction/Legal Possession,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,70491/17,169289,120 ALDRICH ST.,16F,10/10/2018,Alfred,Locascio,Residential,BRONX,10475,Not an Ejectment,Possession,40.870146,-73.831665,10.0,12.0,46201.0,2128836.0,2051410000.0,Co-op City
1,55498/18A,487863,3041 HOLLAND AVENUE,UNIT 55N,06/19/2019,Danny,Weinheim,Residential,BRONX,10467,Not an Ejectment,Possession,40.870179,-73.865262,12.0,15.0,338.0,2055590.0,2045690000.0,Bronxdale


---
#### EDA 

In [7]:
#### Dropping Unnessesary Columns from arrest dataframe
arrest_df.drop(['ARREST_KEY', 'PD_CD', 'KY_CD', 'LAW_CODE', 'ARREST_PRECINCT', 'JURISDICTION_CODE', 'X_COORD_CD', 'Y_COORD_CD'], axis=1, inplace=True)

In [8]:
arrest_df.head(5)

Unnamed: 0,ARREST_DATE,PD_DESC,OFNS_DESC,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,Latitude,Longitude,New Georeferenced Column
0,01/09/2022,(null),(null),M,B,25-44,M,BLACK HISPANIC,40.85668,-73.865212,POINT (-73.865212 40.85668)
1,01/31/2022,(null),(null),9,Q,25-44,M,BLACK,40.679701,-73.776047,POINT (-73.77604735 40.67970059)
2,01/25/2022,RAPE 3,RAPE,F,K,25-44,M,BLACK,40.664121,-73.947765,POINT (-73.9477648403751 40.664121282631)
3,03/03/2022,RAPE 1,RAPE,F,K,18-24,M,BLACK,40.695439,-73.983225,POINT (-73.9832253756043 40.6954388081238)
4,02/22/2022,RAPE 1,RAPE,F,B,45-64,M,BLACK,40.816206,-73.896001,POINT (-73.8960011932583 40.8162058439227)


In [29]:
arrest_df.isnull().sum()

ARREST_DATE                    0
PD_DESC                        0
OFNS_DESC                      0
LAW_CAT_CD                  1362
ARREST_BORO                    0
AGE_GROUP                      0
PERP_SEX                       0
PERP_RACE                      0
Latitude                       0
Longitude                      0
New Georeferenced Column       0
dtype: int64

## Lets take a look at the top 20 crimes within all boroughs

In [9]:
arrest_df.groupby('OFNS_DESC').size().sort_values(ascending=False).head(20)

OFNS_DESC
ASSAULT 3 & RELATED OFFENSES      23188
PETIT LARCENY                     15288
FELONY ASSAULT                    14190
MISCELLANEOUS PENAL LAW            9327
CRIMINAL MISCHIEF & RELATED OF     7717
DANGEROUS DRUGS                    7686
ROBBERY                            7310
GRAND LARCENY                      6432
DANGEROUS WEAPONS                  6011
BURGLARY                           4731
VEHICLE AND TRAFFIC LAWS           4668
OFFENSES AGAINST PUBLIC ADMINI     4203
OFFENSES INVOLVING FRAUD           3627
SEX CRIMES                         3192
OFF. AGNST PUB ORD SENSBLTY &      2938
FORGERY                            2795
POSSESSION OF STOLEN PROPERTY      1869
INTOXICATED & IMPAIRED DRIVING     1813
OTHER OFFENSES RELATED TO THEF     1804
FOR OTHER AUTHORITIES              1362
dtype: int64

In [None]:
arrest_df

---
#### Data Visualization

In [61]:
import matplotlib.pyplot as plt