In [2]:
# Importing relevant packages, loading relevant data.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
import warnings
warnings.filterwarnings('ignore')



In [3]:
crime_df = pd.read_csv('/Users/marielwiechers/Documents/MSCAPP/Spring 2021/Machine Learning/Crime project/Crimes_-_2015_to_Present.csv')


In [4]:
crime_class_df = pd.read_csv('../raw_data/Crimes_classes.csv')
beats_df = pd.read_csv('../intermediate_data/beats.csv')
weather_df = pd.read_csv('../raw_data/weather_2015-2020.csv')

In [5]:
# Transforming crime data to datetime, indexing by such, limiting data to 2015-2020.
crime_df['Date'] = pd.to_datetime(crime_df['Date'])
crime_df = crime_df.set_index('Date')
crime_df = crime_df['20150101' : '20201231']

In [6]:
# Creating features for month, week of the current year, day of the week, and hour of day .

crime_df['Month'] = crime_df.index.month
crime_df['Week'] = crime_df.index.week
crime_df['Day'] = crime_df.index.day
crime_df['Hour'] = crime_df.index.hour

def find_watch(n):
    if 0 <= n < 8:
        return 'First'
    elif 8 <= n < 16:
        return 'Second'
    elif 16 <= n:
        return 'Third'
        

crime_df['Watch'] = crime_df['Hour'].apply(find_watch)

crime_df.index = crime_df.index.date

In [7]:
# Transforming weather data to datetime, indexing by such.

weather_df['Date'] = pd.to_datetime(weather_df['DATE'])
weather_df = weather_df.set_index('Date')

In [8]:
# Merging weather data to crime data.

crime_df = pd.merge(crime_df, weather_df, how = 'left', left_index = True, right_index = True)

In [9]:
# Preparing crime classes to merge into crime data, and merging.

for index, row in crime_class_df.iterrows():
    if len(row['crime_id']) == 3:
        row['crime_id'] = '0' + row['crime_id']

df = pd.merge(crime_df, crime_class_df, how = 'left', left_on = 'IUCR', right_on = 'crime_id')

In [10]:
df.loc[df.category_1 == 'serious', 'serious'] = 1
df.serious.fillna(0, inplace=True)

In [11]:
# Dropping undesired features.

col_list = ['ID','Arrest', 'Domestic', 'Beat', 'Year', 'Month', 'Watch', 
            'PRCP', 'SNOW','TMAX', 'TMIN', 'serious']

df_filtered = df.filter(col_list)

In [30]:
groups = df_filtered[['Year','Month','Watch','Beat']]

In [31]:
groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch','Beat'])['ID'].count().reset_index()).drop_duplicates()

groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch','Beat'])['serious'].sum().reset_index())
groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch','Beat'])['Arrest'].sum().reset_index())
groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch','Beat'])['Domestic'].sum().reset_index())

In [32]:
groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch'])['TMAX'].mean().reset_index(), how='left')
groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch'])['TMIN'].mean().reset_index(), how='left')
groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch'])['PRCP'].mean().reset_index(), how='left')
groups = groups.merge(df_filtered.groupby(['Year','Month', 'Watch'])['SNOW'].mean().reset_index(), how='left')

groups.rename({'ID':'Total', 'serious':'Serious'}, axis=1, inplace=True)

In [33]:
groups.head()

Unnamed: 0,Year,Month,Watch,Beat,Total,Serious,Arrest,Domestic,TMAX,TMIN,PRCP,SNOW
0,2015,1,First,411,16,9.0,3,5,33.294118,20.394993,0.048605,0.302474
1,2015,1,First,1532,24,13.0,9,7,33.294118,20.394993,0.048605,0.302474
2,2015,1,First,2024,7,6.0,1,0,33.294118,20.394993,0.048605,0.302474
3,2015,1,First,223,11,8.0,0,4,33.294118,20.394993,0.048605,0.302474
4,2015,1,First,214,8,8.0,1,2,33.294118,20.394993,0.048605,0.302474


In [34]:
groups['rank'] = groups.groupby(['Year','Month'])['Serious'].rank(pct=True)
groups.loc[groups['rank'] > .75, "high_crime"] = 1
groups['high_crime'].fillna(0, inplace=True)
groups.drop('rank', axis = 1, inplace=True)

In [35]:
# Merging beat data to crime data.  

crime_df = pd.merge(groups, beats_df, how = 'left', left_on = 'Beat', right_on = 'beat_num')

In [36]:
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59093 entries, 0 to 59092
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year                 59093 non-null  int64  
 1   Month                59093 non-null  int64  
 2   Watch                59093 non-null  object 
 3   Beat                 59093 non-null  int64  
 4   Total                59093 non-null  int64  
 5   Serious              59093 non-null  float64
 6   Arrest               59093 non-null  int64  
 7   Domestic             59093 non-null  int64  
 8   TMAX                 59093 non-null  float64
 9   TMIN                 59093 non-null  float64
 10  PRCP                 59093 non-null  float64
 11  SNOW                 59093 non-null  float64
 12  high_crime           59093 non-null  float64
 13  Unnamed: 0           59093 non-null  int64  
 14  beat                 59093 non-null  int64  
 15  beat_num             59093 non-null 

In [37]:
crime_df.drop(["Unnamed: 0", 'beat'], axis=1, inplace=True)

In [38]:
crime_df.to_csv("../intermediate_data/high_crime(with PRCP).csv")