# DB Project, data prep #1

## Feature Generation, Data Merging

In [1]:
# Importing relevant packages, loading relevant data.

import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
import warnings
# NOTE(review): this silences every warning, including the pandas DtypeWarning and
# FutureWarning messages that would otherwise flag problems in the cells below.
warnings.filterwarnings('ignore')

# The code-like identifier columns are read as text so type inference cannot turn
# them into numbers (losing leading zeros) or produce a mix of str and int values;
# the cells below compare and join these columns as strings.
crime_df = pd.read_csv('../raw_data/Crimes_-_2015_to_Present.csv', dtype={'IUCR': str})
crime_class_df = pd.read_csv('../raw_data/Crimes_classes.csv', dtype={'crime_id': str})
beats_df = pd.read_csv('../intermediate_data/beats.csv')
weather_df = pd.read_csv('../raw_data/weather_2015-2020.csv')

In [2]:
# Parsing the report timestamps, indexing the incidents by them, and keeping
# only the 2015-2020 records.

crime_df['Date'] = pd.to_datetime(crime_df['Date'])
crime_df = crime_df.set_index('Date')

# A boolean mask is used rather than a label slice: slicing with bounds that are
# not present in the index requires a sorted DatetimeIndex (recent pandas raises
# KeyError otherwise), and nothing here guarantees the file is in date order.
# The upper bound is exclusive so every timestamp on 2020-12-31 is kept, and the
# mask leaves the row order untouched.
in_range = (crime_df.index >= '2015-01-01') & (crime_df.index < '2021-01-01')
crime_df = crime_df[in_range]

In [3]:
# Creating calendar features from the timestamp index: month, ISO week of the
# year, day of the month, and hour of the day.

crime_df['Month'] = crime_df.index.month
# DatetimeIndex.week was deprecated and then removed from pandas; isocalendar()
# yields the same ISO week numbers. The values are cast to plain int64 and
# assigned positionally so the column does not depend on index alignment or on
# the UInt32 extension dtype that isocalendar() returns.
crime_df['Week'] = crime_df.index.isocalendar().week.astype('int64').to_numpy()
crime_df['Day'] = crime_df.index.day
crime_df['Hour'] = crime_df.index.hour

def find_watch(n):
    """Return the watch label for an hour value.

    Hours in [0, 8) map to 'First', hours in [8, 16) to 'Second', and any
    hour of 16 or more to 'Third'. Values that satisfy none of these
    comparisons (negative numbers, NaN) yield None.
    """
    # Checked from the highest threshold down, so each test only needs a lower bound.
    if n >= 16:
        return 'Third'
    if n >= 8:
        return 'Second'
    if n >= 0:
        return 'First'
    return None
        

# Label every incident with its watch, derived from the hour of the report.
crime_df['Watch'] = crime_df['Hour'].apply(find_watch)

# Truncate each timestamp to midnight so the index identifies the calendar day.
# normalize() keeps the index a datetime64 DatetimeIndex; converting to Python
# datetime.date objects would give an object-dtype index, and joining that
# against another frame's DatetimeIndex relies on pandas coercing between the
# two types, which it does not do consistently across versions.
crime_df.index = crime_df.index.normalize()

In [4]:
# Removing the reports whose IUCR code is one of the non-criminal codes below.

non_criminal_iucr = ['5073', '5093', '5094', '5113', '5114']
is_non_criminal = crime_df['IUCR'].isin(non_criminal_iucr)
crime_df = crime_df[~is_non_criminal]

In [5]:
# Parsing the weather DATE column into a new datetime 'Date' column and using
# that as the index (the original DATE column stays in the frame).

weather_df = (
    weather_df
    .assign(Date=pd.to_datetime(weather_df['DATE']))
    .set_index('Date')
)

In [6]:
# Attaching the weather columns to every crime record by matching the two
# indexes; a left join keeps crime rows that have no weather match.

crime_df = crime_df.merge(
    weather_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [7]:
# Saving the full incident-level temporal data (crime records joined with weather) for analysis.

crime_df.to_csv("../intermediate_data/df_temporal_to_graph.csv")

In [8]:
# Preparing crime classes to merge into crime data, and merging.

# Left-pad any class code shorter than four characters with zeros so it can match
# the four-character IUCR codes in the crime data. This is done as a vectorized
# column assignment: writing to the row objects yielded by DataFrame.iterrows()
# is not guaranteed to change the DataFrame itself, so padding inside such a
# loop can be silently lost.
crime_class_df['crime_id'] = crime_class_df['crime_id'].str.zfill(4)

# Left join: crime rows without a matching class keep NaN in the class columns.
crime_df = pd.merge(crime_df, crime_class_df, how = 'left', left_on = 'IUCR', right_on = 'crime_id')

In [9]:
# Attaching the beat attributes to each crime record (left join on the beat number).
# NOTE(review): the column filter a few cells below keeps only a fixed column list,
# and beats_df is merged again after aggregation, so this merge looks redundant
# unless beats_df supplies one of the kept columns - confirm before removing.

crime_df = crime_df.merge(
    beats_df,
    how='left',
    left_on='Beat',
    right_on='beat_num',
)

In [11]:
# Binary flag: 1.0 where category_1 equals 'serious', 0.0 everywhere else
# (rows with a missing category_1 compare as not equal and therefore get 0.0).
# The flag is built in a single column assignment; filling a column selected
# from the frame with fillna(inplace=True) is a chained in-place update that
# does not write back to the frame under pandas Copy-on-Write.
crime_df['serious'] = (crime_df['category_1'] == 'serious').astype(float)

In [12]:
# Keeping only the columns needed for the aggregation below; all others are dropped.
# DataFrame.filter silently skips any listed column that is not present.

col_list = [
    'ID', 'Arrest', 'Domestic', 'Beat', 'Year', 'Month', 'Watch',
    'PRCP', 'SNOW', 'TMAX', 'TMIN', 'serious',
]

df_filtered = crime_df.filter(items=col_list)

In [13]:
# Row-level frame holding just the four grouping key columns.
groups = df_filtered.loc[:, ['Year', 'Month', 'Watch', 'Beat']]

In [14]:
# One row per (Year, Month, Watch, Beat) group, computed in a single pass:
#   Total    - number of non-null incident IDs
#   Serious  - sum of the serious flag
#   Arrest   - sum of the Arrest column
#   Domestic - sum of the Domestic column
#   TMAX, TMIN, PRCP, SNOW - mean of the corresponding weather column
# sort=False keeps the groups in order of first appearance in df_filtered, and
# as_index=False returns the keys as ordinary columns on a fresh 0..n-1 index.
group_keys = ['Year', 'Month', 'Watch', 'Beat']

groups = (
    df_filtered
    .groupby(group_keys, sort=False, as_index=False)
    .agg(
        Total=('ID', 'count'),
        Serious=('serious', 'sum'),
        Arrest=('Arrest', 'sum'),
        Domestic=('Domestic', 'sum'),
        TMAX=('TMAX', 'mean'),
        TMIN=('TMIN', 'mean'),
        PRCP=('PRCP', 'mean'),
        SNOW=('SNOW', 'mean'),
    )
)

In [15]:
# Label the groups with the most serious crimes in each (Year, Month):
# high_crime is 1.0 where the percentile rank of Serious within that month is
# above 0.75, and 0.0 otherwise.
# The rank is kept in a local Series instead of a temporary column, and the flag
# is written with one column assignment; filling a selected column with
# fillna(inplace=True) is a chained in-place update that does not write back to
# the frame under pandas Copy-on-Write.
serious_pct_rank = groups.groupby(['Year', 'Month'])['Serious'].rank(pct=True)
groups['high_crime'] = (serious_pct_rank > .75).astype(float)

In [16]:
# Attaching the beat attributes to the aggregated groups (left join on the beat
# number). From here on crime_df holds the aggregated, per-group data.

crime_df = groups.merge(
    beats_df,
    how='left',
    left_on='Beat',
    right_on='beat_num',
)

In [17]:
# Removing two columns that came in with beats_df and are not needed downstream.
crime_df = crime_df.drop(columns=["Unnamed: 0", 'beat'])

In [18]:
# Sanity check: column names, non-null counts and dtypes of the aggregated frame.
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59093 entries, 0 to 59092
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year                 59093 non-null  int64  
 1   Month                59093 non-null  int64  
 2   Watch                59093 non-null  object 
 3   Beat                 59093 non-null  int64  
 4   Total                59093 non-null  int64  
 5   Serious              59093 non-null  float64
 6   Arrest               59093 non-null  int64  
 7   Domestic             59093 non-null  int64  
 8   TMAX                 59093 non-null  float64
 9   TMIN                 59093 non-null  float64
 10  PRCP                 59093 non-null  float64
 11  SNOW                 59093 non-null  float64
 12  high_crime           59093 non-null  float64
 13  beat_num             59093 non-null  int64  
 14  district             59093 non-null  int64  
 15  sector               59093 non-null 

In [19]:
# Executes the geoprocessing helper notebook.
# NOTE(review): compute_spatial_lag, called a few cells below, is not defined in
# this notebook and is presumably provided by this helper - confirm.
%run geoprocessing_helpers.ipynb

../raw_data/Boundaries - Police Beats (current).geojson EPSG:3435
../raw_data/Boundaries - Census Blocks - 2010.geojson EPSG:3435


In [20]:
# Loading the police beat boundaries from GeoJSON with geopandas.
beats = gpd.read_file("../raw_data/Boundaries - Police Beats (current).geojson")

In [21]:
# Converting Beat to a zero-padded four-character string so it can be matched
# against the string beat_num key of the boundary data in the merge below.
beat_as_text = crime_df["Beat"].astype("string")
crime_df["Beat"] = beat_as_text.str.zfill(4)

In [22]:
# Joining the beat boundaries to the aggregated crime data. merge() defaults to an
# inner join, so only beats present in both frames are kept.
# NOTE(review): crime_df still carries its own beat_num column from beats_df, so
# overlapping column names should come out with _x/_y suffixes - confirm that
# downstream code expects this.
crime_data = beats.merge(crime_df, left_on="beat_num", right_on="Beat")

In [23]:
# compute_spatial_lag is not defined in this notebook.
# NOTE(review): presumably it adds a "high_crime_geog_lag" column holding a spatial
# lag of "high_crime", computed separately per "Year" - confirm against the helper.
crime_data = compute_spatial_lag(crime_data, "high_crime", 
                                 "high_crime_geog_lag", 
                                 "Year")

In [24]:
# Saving aggregate geospatial data for analysis.
# The geometry column has not been dropped yet at this point, so it is written to this CSV.

crime_data.to_csv("../intermediate_data/df_geospatial_to_graph.csv")

In [25]:
# Removing the geometry column in place before writing the labeled table.
# Only columns= is passed: an axis argument is ignored whenever columns= is given.
crime_data.drop(columns="geometry", inplace=True)

In [27]:
# Saving the final per-group table with the high_crime label (geometry column removed).
crime_data.to_csv("../intermediate_data/high_crime_labeled.csv")