In [None]:
# Alex Tresselt
# CS 7180
# 12/13/18
# Crime Prediction Model

In [1]:
%pylab inline
import pandas as pd
import numpy as np

from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('Call_Data.csv')

In [3]:
# Format time columns
df['Arrived Time'] = pd.to_datetime(df['Arrived Time'], format="%b %d %Y %I:%M:%S:%f%p")
df['Original Time Queued'] = pd.to_datetime(df['Original Time Queued'], format="%m/%d/%Y %I:%M:%S %p")

In [4]:
# Trim dataset to events in 2017 (to speed up processing and save space)
df = df.drop(df[df['Original Time Queued'] <= '2017-01-01 00:00:00'].index)
df = df.drop(df[df['Original Time Queued'] >= '2018-01-01 00:00:00'].index)

In [5]:
# Trim all calls with a priority greater than 3
df = df.drop(df[df['Priority'] >= 4].index)

Unnamed: 0,CAD Event Number,Event Clearance Description,Call Type,Priority,Initial Call Type,Final Call Type,Original Time Queued,Arrived Time,Precinct,Sector,Beat
72665,2017000065223,STREET CHECK WRITTEN,ONVIEW,2,TRESPASS,--PROWLER - TRESPASS,2017-02-22 12:28:36,2017-02-22 12:28:36,SOUTHWEST,FRANK,F2
72894,2017000125726,ASSISTANCE RENDERED,911,2,TRESPASS,--PROWLER - TRESPASS,2017-04-10 19:18:07,2017-04-10 19:55:19,WEST,QUEEN,Q1
73541,2017000137184,ASSISTANCE RENDERED,911,2,SHOPLIFT - THEFT,--THEFT - SHOPLIFT,2017-04-19 19:45:13,2017-04-19 19:52:33,EAST,CHARLIE,C1
73777,2017000137185,REPORT WRITTEN (NO ARREST),911,2,SHOPLIFT - THEFT,"--ASSAULTS, OTHER",2017-04-19 19:45:35,2017-04-19 20:21:57,NORTH,JOHN,J2
74449,2017000319109,ASSISTANCE RENDERED,ONVIEW,2,TRESPASS,--DISTURBANCE - OTHER,2017-08-28 16:44:04,2017-08-28 16:44:04,NORTH,JOHN,J3


In [8]:
# Create more time columns 
df['hour'] = df['Original Time Queued'].dt.hour
df['day'] = df['Original Time Queued'].dt.weekday
df['month'] = df['Original Time Queued'].dt.month

In [11]:
# Drop duplicate/unfounded/non-events
df.drop(df.loc[df['Event Clearance Description']=='NO POLICE ACTION POSSIBLE OR NECESSARY'].index, inplace=True)
df.drop(df.loc[df['Event Clearance Description']=='(NOT CURRENTLY USED) ALARM NO RESPONSE'].index, inplace=True)
df.drop(df.loc[df['Event Clearance Description']=='DUPLICATED OR CANCELLED BY RADIO'].index, inplace=True)
df.drop(df.loc[df['Event Clearance Description']=='FALSE COMPLAINT/UNFOUNDED'].index, inplace=True)
df.drop(df.loc[df['Event Clearance Description']=='NO SUCH ADDRESS OR LOCATION'].index, inplace=True)
df.drop(df.loc[df['Event Clearance Description']=='RADIO BROADCAST AND CLEAR'].index, inplace=True)
df.drop(df.loc[df['Event Clearance Description']=='RESPONDING UNIT(S) CANCELLED BY RADIO'].index, inplace=True)
df.drop(df.loc[df['Event Clearance Description']=='UNABLE TO LOCATE INCIDENT OR COMPLAINANT'].index, inplace=True)

In [12]:
# Drop unneeded columns
df = df.drop(columns=['CAD Event Number', 'Event Clearance Description', 'Priority', 'Arrived Time', 
                      'Sector', 'Precinct', 'Original Time Queued', 'Call Type', 'Initial Call Type'])

In [13]:
df.head()

Unnamed: 0,Final Call Type,Beat,hour,day,month
72665,--PROWLER - TRESPASS,F2,12,2,2
72894,--PROWLER - TRESPASS,Q1,19,0,4
73541,--THEFT - SHOPLIFT,C1,19,2,4
73777,"--ASSAULTS, OTHER",J2,19,2,4
74449,--DISTURBANCE - OTHER,J3,16,0,8


In [14]:
# Encode and transform labels for the classifier
le = LabelEncoder()
le.fit(df['Final Call Type'])
df['Crime'] = le.transform(df['Final Call Type'])
df = df.drop(columns=['Final Call Type'])
df.head()

Unnamed: 0,Beat,hour,day,month,Crime
72665,F2,12,2,2,105
72894,Q1,19,0,4,105
73541,C1,19,2,4,121
73777,J2,19,2,4,15
74449,J3,16,0,8,46


In [15]:
# Encode Beat labels
leB = LabelEncoder()
leB.fit(df['Beat'])
df['Beat'] = leB.transform(df['Beat'])

In [16]:
# Train/Test split
X = df.iloc[:, :4]  # Beat, hour, day, month
y = df.iloc[:, -1]  # Crime
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
# Fit model
clf_dtc = DecisionTreeClassifier()
clf_dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
# Predict
y_pred = clf_dtc.predict(X_test)
print(np.average(y_test == y_pred))

0.07249822717223543


In [19]:
# Predict Probabilities
probs = clf_dtc.predict_proba([[1,12,2,2]])
# Get top 5
p = np.argsort(probs, axis=1)[:,-5:]
prob = p[0]
prob
# Transform them back to crime labels
for p in prob:
    print(str(le.inverse_transform([p])))

['--PERSON - FOUND PERSON']
['--PERSON - A.W.O.L.']
['--NARCOTICS - NARS REPORT']
['--MISCHIEF OR NUISANCE - GENERAL']
['--PROWLER - TRESPASS']


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [20]:
# Export classifier
joblib.dump(clf_dtc, 'model/DTclassifier.joblib')

['model/DTclassifier.joblib']

In [23]:
# Export labels
joblib.dump(le, 'model/le.joblib')

['model/le.joblib']