# Imports

In [1]:
#    OS interaction
import os

#    Data Manipulation
import pandas as pd

#    Pandas Settings
#    widen the display limits so wide/long frames are not truncated when previewed
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

#    Linear Algebra
import numpy as np

#    Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#    DateTime Manipulation
import datetime as dt

# Load Data

In [2]:
#    project root is resolved as the parent of the current working directory
project_dir = os.path.abspath('..')

#    location of the cleaned 911 calls file, relative to the project root
data_path = '/data/detroit_911_calls_cleaned.csv'

#    data_path already starts with a separator, so plain concatenation is enough
csv_path = f'{project_dir}{data_path}'

#    read the whole csv into memory
df = pd.read_csv(filepath_or_buffer=csv_path)

In [3]:
#    sanity check on the load: report (rows, columns), then preview the first records
print(f'{df.shape}')
df.head(n=3)

(2727156, 24)


Unnamed: 0,X,Y,incident_id,agency,incident_address,zip_code,priority,callcode,calldescription,category,call_timestamp,precinct_sca,respondingunit,officerinitiated,intaketime,dispatchtime,traveltime,totalresponsetime,time_on_scene,totaltime,neighborhood,block_id,council_district,oid
0,-82.986962,42.408358,201626400002,DPD,Promenade Ave & Roseberry St,48213,3,825030,DISTURBANCE,DISTURB,2016-09-20T09:36:12.000Z,910A,,No,2.2,528.6,15.5,546.6,120.5,667.1,Wade,261635000000000.0,4.0,260872651
1,-83.185213,42.430914,201626400019,DPD,Ardmore St & W 7 Mile Rd,48235,2,811020,AUTO X UNK INJ / IMPAIRED,ACCUNK,2016-09-20T09:46:32.000Z,1201,,No,2.7,5.2,6.4,14.5,59.1,73.6,Tri-Point,261635400000000.0,2.0,260872652
2,-83.146057,42.43877,201626400020,DPD,Pembroke Ave & Prairie St,48221,3,374030,RECOVER AUTO,RECAUTO,2016-09-20T09:48:24.000Z,123B,,No,4.9,,,,,,Oak Grove,261635400000000.0,2.0,260872653


### Initial Filtering

The first logical step is to move forward with only the columns needed for this project. The most obvious are X and Y (our longitude and latitude coordinates), along with call_timestamp, priority and calldescription. The remaining columns are unnecessary for our purposes.

In [4]:
#    columns carried forward: coordinates, timestamp, description and priority
pertinent_cols = ['X', 'Y', 'call_timestamp', 'calldescription', 'priority']

#    select the working columns and, in the same step, rename calldescription
#    so that its header follows the snake_case convention of call_timestamp
df2 = df[pertinent_cols].rename(columns={'calldescription': 'call_description'})

#    preview the trimmed frame
df2.head(n=3)

Unnamed: 0,X,Y,call_timestamp,call_description,priority
0,-82.986962,42.408358,2016-09-20T09:36:12.000Z,DISTURBANCE,3
1,-83.185213,42.430914,2016-09-20T09:46:32.000Z,AUTO X UNK INJ / IMPAIRED,2
2,-83.146057,42.43877,2016-09-20T09:48:24.000Z,RECOVER AUTO,3


### Secondary Filtering

If we examine the call_description column, we find that not all observations relate to 911 responses; some instead represent administrative functions.

In [6]:
#    the 20 most frequent call descriptions, most common first
df2['call_description'].value_counts().head(20)

SPECIAL ATTENTION                 393478
TRAFFIC STOP                      385731
DISTURBANCE                       186912
START OF SHIFT INFORMATION        151182
REMARKS                           139002
INVESTIGATE PERSON                104818
UNKNOWN PROBLEM                    95698
HANGUP CALLS                       88672
TOWING DETAIL                      76439
ASSAULT AND BATTERY                72014
FELONIOUS ASSAULT IP               47555
BUS BOARDING                       44551
LARCENY REPORT                     41048
AUTO X UNK INJ / IMPAIRED          36121
VERIFIED ALR / PERSON W/O CODE     28770
UDAA REPORT                        28114
PERSON WITH WEAPON                 26513
BURGLARY OCCUPIED RESD I/P         25261
SHOTS FIRED IP                     24837
INVESTIGATE AUTO                   24437
Name: call_description, dtype: int64

Two of these call descriptions describe non-police functions: 'START OF SHIFT INFORMATION' and 'REMARKS'. So let's define a new dataframe that includes every observation except those whose call description is one of these two values.

In [10]:
#    administrative (non-police) call descriptions, selected by label rather than
#    by their position in the frequency ranking so a change in counts cannot
#    silently pick different categories
admin_descriptions = ['START OF SHIFT INFORMATION', 'REMARKS']

#    df2 (not df) must be used here: only df2 carries the renamed
#    call_description column, df still has the original calldescription header
admin_calls = df2.call_description.value_counts().loc[admin_descriptions]

#    display the counts for the two administrative categories
admin_calls

START OF SHIFT INFORMATION    151182
REMARKS                       139002
Name: call_description, dtype: int64

In [10]:
#    administrative (non-police) call descriptions to exclude; selected by label
#    instead of by rank position ([3:5]) so the filter stays correct even if the
#    frequency ordering of call descriptions changes
admin_descriptions = ['START OF SHIFT INFORMATION', 'REMARKS']

#    per-description counts, raises KeyError if either label is absent from the data
admin_calls = df2.call_description.value_counts().loc[admin_descriptions]

#    keep every observation whose description is not administrative; the explicit
#    copy makes df3 independent of df2 so that later column assignments on df3
#    do not trigger SettingWithCopyWarning
df3 = df2.loc[~df2['call_description'].isin(admin_descriptions)].copy()

#    invariant: exactly the administrative rows were removed
assert len(df3) == len(df2) - admin_calls.sum()

print(df3.shape)

df3.head(3)

(2436972, 5)


Unnamed: 0,X,Y,call_timestamp,call_description,priority
0,-82.986962,42.408358,2016-09-20T09:36:12.000Z,DISTURBANCE,3
1,-83.185213,42.430914,2016-09-20T09:46:32.000Z,AUTO X UNK INJ / IMPAIRED,2
2,-83.146057,42.43877,2016-09-20T09:48:24.000Z,RECOVER AUTO,3


# DateTime

The first thing to be done here is to parse the call timestamps and encode each aspect of the DateTime information as a separate column. We will also notice that the timestamps use UTC time, so it makes sense to localize the timezone as we move forward.