In [1]:
%%time
# Data manipulation
import numpy as np
import pandas as pd

# Creating training and test sets
import sklearn

# File system management
import os.path

# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

#training/test split
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, ShuffleSplit

# run logistic regression and vary some parameters
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt

# here we can change some of the parameters interactively
from ipywidgets import widgets as wd
from sklearn.model_selection import cross_val_score, cross_validate

#for weights standardization
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from matplotlib import pyplot as plt

# Support vector machines
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

Wall time: 20.5 s


In [2]:
%%time
# Constants
#DATAPATH_BASE = 'https://machinelearningi.blob.core.windows.net/group-project/'
#DATAPATH_SAS_TOKEN = '?sv=2019-02-02&ss=bfqt&srt=sco&sp=rwdlacup&se=2020-04-27T11:12:37Z&st=2020-01-23T04:12:37Z&spr=https&sig=jpIpjrp8dIg9eyUyPpmgTe5yj9i1ZoCSru5kBVHcUO8%3D'
#DATAPATH_FILENAME = 'Arrest_Data_from_2010_to_Present.csv'
#DATAPATH_SMALL_FILENAME = 'Arrest_Data_from_2010_to_Present_Small.csv'

# Fully qualified paths ready to use
#DATA_SOURCE = "".join([DATAPATH_BASE, DATAPATH_FILENAME, DATAPATH_SAS_TOKEN])

# Options
#pd.set_option('float_format', '{:.2f}'.format)  # Reign in the scientific notation for reasonable values

# Load data for analysis; only read if needed because the import can take a long time
#try:
#    if len(df.index) < 1:
#        df_raw = pd.read_csv(DATA_SOURCE) # If we get here, the dataframe was empty
#except:   
#    df_raw = pd.read_csv(DATA_SOURCE) # If we get here, the dataframe did not exist

df_raw = pd.read_csv("Arrest_Data_From_2010_to_Present.csv")

df = df_raw

print("The dataset has {:,} rows and {:,} columns".format(*df.shape))
    

The dataset has 1,324,973 rows and 17 columns
Wall time: 3.6 s


In [3]:
%%time
# Time - filter out 0 and missing
df = df[df['Time'] != 0]
df['Time'] = df['Time'].astype(str) 
df = df[df['Time'] != 'nan']

# Time - Convert float to string. Get rid of decimals. Replace missing or invalid values with '0000'.
df['Time'] = df['Time'].astype(str).str.split(".", expand = True)[0].replace(to_replace = ['2400','nan'], value = '0000') 

# Time - Fill time column with leading zeros to have 4 characters total
df['Time'] = df['Time'].apply(lambda x: '{0:0>4}'.format(x))

# Time - Add colon to Time values by converting attribute to a datetime variable 
df['Time'] = pd.to_datetime(df['Time'], format = '%H%M').dt.time

# Age - Drop the observations where Age is less than 16
df.drop(df[df['Age'] < 16].index, inplace = True) 

# Arrest Type Code - Drop the observations where Arrest Type Code = 'D'
df.drop(df[df['Arrest Type Code'] == 'D'].index, inplace = True) 

# Descent Code - Re-classify any descent not in (B,H,O,W) into 0
descent_list = ['B','H','O','W']
df['Descent Code'] = np.where(np.isin(df['Descent Code'],descent_list),df['Descent Code'],'O')

# Get hour
df['Hour'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.hour

# Convert Arrest Date to datetime
df['Arrest Date'] = pd.to_datetime(df['Arrest Date'])

# Extract year, month, and day of week and add to dataframe as new attributes
#df['arrest_year']= df['Arrest Date'].dt.year
df['arrest_month']= df['Arrest Date'].dt.month
df['arrest_day_of_week'] = df['Arrest Date'].dt.weekday_name

# remove unecessary columns
df.drop(['Cross Street','Charge Description','Charge','Charge Group Description','Time',
         'Arrest Date','Report ID','Address','Area Name','Location'], axis=1, inplace=True)

# Change data types
df['Age'] = df['Age'].astype(np.int8)
df['Reporting District'] = df['Reporting District'].astype(np.str)
df['Area ID'] = df['Area ID'].astype(np.str)
df['Charge Group Code'] = df['Charge Group Code'].astype(np.str)
df['Hour'] = df['Hour'].astype(np.str)
#df['arrest_year'] = df['arrest_year'].astype(np.str)
df['arrest_month'] = df['arrest_month'].astype(np.str)
df['arrest_day_of_week'] = df['arrest_day_of_week'].astype(np.str)

df_lightgbm = df
# print clean dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1277176 entries, 0 to 1324972
Data columns (total 10 columns):
Area ID               1277176 non-null object
Reporting District    1277176 non-null object
Age                   1277176 non-null int8
Sex Code              1277176 non-null object
Descent Code          1277176 non-null object
Charge Group Code     1277176 non-null object
Arrest Type Code      1277176 non-null object
Hour                  1277176 non-null object
arrest_month          1277176 non-null object
arrest_day_of_week    1277176 non-null object
dtypes: int8(1), object(9)
memory usage: 98.7+ MB
Wall time: 10.9 s


In [4]:
%%time
# Create buckets for Age
df['age_range'] = pd.cut(df.Age,[16,25,35,45,55,65,75,1e6],4,labels=[0,1,2,3,4,5,6]) # this creates a new variable
df['age_range'] = df.age_range.astype(np.int)

# Replace the current Sex atribute with something slightly more intuitive and readable
df['IsMale'] = df['Sex Code']=='M' 
df.IsMale = df.IsMale.astype(np.int)

# Perform one-hot encoding of the categorical data "DOW"
tmp_df = pd.get_dummies(df['arrest_day_of_week'],prefix='DOW',drop_first=True)
df = pd.concat((df,tmp_df),axis=1) # add back into the dataframe

# Perform one-hot encoding of the categorical data "Area ID"
tmp_df = pd.get_dummies(df['Area ID'],prefix='Area',drop_first=True)
df = pd.concat((df,tmp_df),axis=1) # add back into the dataframe

# Perform one-hot encoding of the categorical data "Charge Group Code"
tmp_df = pd.get_dummies(df['Charge Group Code'],prefix='Charge',drop_first=True)
df = pd.concat((df,tmp_df),axis=1) # add back into the dataframe

# Perform one-hot encoding of the categorical data "Hour"
tmp_df = pd.get_dummies(df['Hour'],prefix='Hour',drop_first=True)
df = pd.concat((df,tmp_df),axis=1) # add back into the dataframe

# Perform one-hot encoding of the categorical data "Month"
tmp_df = pd.get_dummies(df['arrest_month'],prefix='Month',drop_first=True)
df = pd.concat((df,tmp_df),axis=1) # add back into the dataframe


Wall time: 2.87 s


In [5]:
%%time

df_arrest = df
df_descent = df

#Final encoding steps for Arrest Type Code classification dataset
# Encode Arrest Type Code as Categorical
cleanup_arrest = {"Arrest Type Code": {"F": 0, "M": 1, "I": 2, "O":3}}
df_arrest.replace(cleanup_arrest,inplace=True)

# Perform one-hot encoding of the categorical data "Descent Code"
tmp_df = pd.get_dummies(df_arrest['Descent Code'],prefix='Descent',drop_first=True)
df_arrest = pd.concat((df_arrest,tmp_df),axis=1) # add back into the dataframe

df_arrest.drop(['Sex Code','Descent Code','arrest_day_of_week','Area ID','Reporting District','Charge Group Code',
         'Age','Hour','arrest_month'], axis=1, inplace=True)

df_arrest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1277176 entries, 0 to 1324972
Data columns (total 94 columns):
Arrest Type Code    1277176 non-null int64
age_range           1277176 non-null int32
IsMale              1277176 non-null int32
DOW_Monday          1277176 non-null uint8
DOW_Saturday        1277176 non-null uint8
DOW_Sunday          1277176 non-null uint8
DOW_Thursday        1277176 non-null uint8
DOW_Tuesday         1277176 non-null uint8
DOW_Wednesday       1277176 non-null uint8
Area_10             1277176 non-null uint8
Area_11             1277176 non-null uint8
Area_12             1277176 non-null uint8
Area_13             1277176 non-null uint8
Area_14             1277176 non-null uint8
Area_15             1277176 non-null uint8
Area_16             1277176 non-null uint8
Area_17             1277176 non-null uint8
Area_18             1277176 non-null uint8
Area_19             1277176 non-null uint8
Area_2              1277176 non-null uint8
Area_20             1277176

grid search allows automated testing for things like KNN where you specify a grid of k values to test and record the results

In [6]:
df_arrest_small = pd.DataFrame.sample(df_arrest, frac = .05, random_state = 34128)
len(df_arrest_small)

63859

In [None]:
%%time
#knn example
#create parameter grid
knn = KNeighborsClassifier(n_neighbors = 5)

X = df_arrest_small #data
y = df_arrest_small['IsMale'] #target

k_range = list(range(1, 31))
param_grid = dict(n_neighbors = k_range)

grid = GridSearchCV(knn, param_grid, cv = 10, scoring = 'accuracy', return_train_score = False)

grid.fit(X, y)

In [None]:
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

on 1/10th% of the data, grid searchs says that 92% accuracy was achieved with k = 4 and 6

In [None]:
#plt.plot(k_range, )
score = pd.DataFrame(grid.cv_results_)['mean_test_score']
plt.plot(k_range, score)
plt.xlabel('Value of K for KNN')
plt.ylabel('Mean Score Accuracy')

In [None]:
knn  = KNeighborsClassifier(n_neighbors = 4, weights = 'uniform')
knn.fit(X, y)

df_arrest_test = pd.DataFrame.sample(df_arrest, frac = .01, random_state = 3412)

X = df_arrest_test #data
y = df_arrest_test['IsMale'] #target

#knn.predict(df_arrest_test)

Multiple parameter grid search

passes dict of possible variables to run function (knn), can add more? limited value depending on the type of analysis being performed

In [None]:
#multi parameter grid search

k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

param_grid = dict(n_neighbors = k_range, weights = weight_options)
print(param_grid)

In [None]:
%%time
grid = GridSearchCV(knn, param_grid, cv = 10, scoring = 'accuracy', return_train_score = False)
grid.fit(X, y)
results = pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
results

In [None]:

#score_dist = pd.DataFrame(grid.cv_results_)['mean_test_score']

weight_u = pd.DataFrame(grid.cv_results_)[pd.DataFrame(grid.cv_results_)['param_weights'] == 'uniform']#results from uniform parameter weights
weight_d = pd.DataFrame(grid.cv_results_)[pd.DataFrame(grid.cv_results_)['param_weights'] == 'distance']

plt.plot(k_range, weight_u['mean_test_score'], color = 'orange')
plt.plot(k_range, weight_d['mean_test_score'], color = 'blue')
plt.xlabel('Value of K for KNN')
plt.ylabel('Mean Score Accuracy')
plt.legend()

In [None]:
%%time 
from sklearn.model_selection import RandomizedSearchCV

k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

param_dist = dict(n_neighbors = k_range, weights = weight_options)

df_arrest_test = pd.DataFrame.sample(df_arrest, frac = .005, random_state = 3412)

X = df_arrest_test #data
y = df_arrest_test['IsMale'] #target


rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5, return_train_score=False)
rand.fit(X, y)

pd.DataFrame(rand.cv_results_)[['mean_test_score', 'std_test_score', 'params']]


In [None]:
%%time 
from sklearn.model_selection import RandomizedSearchCV

k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

param_dist = dict(n_neighbors = k_range, weights = weight_options)

df_arrest_test = pd.DataFrame.sample(df_arrest, frac = .05, random_state = 3412)

X = df_arrest_test #data
y = df_arrest_test['age_range'] #target


rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5, return_train_score=False)
rand.fit(X, y)

pd.DataFrame(rand.cv_results_)[['mean_test_score', 'std_test_score', 'params']]


In [None]:
df_arrest_test.columns

Test portion 2: fit arima model with grid search

In [None]:
%%time
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make predictions
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out of sample error
    error = mean_squared_error(test, predictions)
    return error

In [None]:
%%time
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order,mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

In [None]:
#univariate data test, looking only at predicting... felonies over time?? arrests over time


In [None]:
%%time
#dprep = {df_raw['Arrest Type Code'], }
f_rate = pd.DataFrame()

f_rate['Date'] = df_raw['Arrest Date']
f_rate['Arrest Type'] = df_raw['Arrest Type Code']
a_rate = f_rate.groupby('Date').count()

plt.plot(a_rate, color = 'orange')

In [None]:
%%time
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)

In [None]:
%%time
a_rate_small = pd.DataFrame.sample(a_rate, frac = .2, random_state = 34128)
len(a_rate_small)
plt.plot(a_rate_small)

In [None]:
%%time
#gridsearch to search for time series model parameters #hmm 10min on .01% data, oh wait that's because 
#its only 39 occuranceshours on 10% data, 
from pandas import read_csv
from pandas import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
 
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make predictions
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out of sample error
    error = mean_squared_error(test, predictions)
    return error
 
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order,mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
 
#load dataset
def parser(x):
	return datetime.strptime('190'+x, '%Y-%m')
series = a_rate_small
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)

In [None]:
#fit arima model to ts and get predictions based on grid search results
#ARIMA(0, 1, 1) MSE=10460.118 has the lowest MSE

from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(a_rate_small, order=(0, 1, 1))
model_fit = model.fit(disp=0)
print(model_fit.summary())

In [None]:
model_fit.plot_predict(dynamic = False)
plt.show()

11. Deployment

(5 points)

How useful is your model for interested parties (i.e., the companies or organizations that might want to use it for prediction)? How would you measure the model's value if it was used by these parties? How would your deploy your model for interested parties? What other data should be collected? How often would the model need to be updated, etc.?

Model deployment - although minority report style predictive modeling would be (cool), the amount of cleaning necessary to process the data would likely bottleneck the process for a single county, let alone large municipalities. An ideal use would be obtaining relevant retrospective analysis on things that could influence anything intersecting the data gathered during the arrests. Changes in demographic, criminal definition, (literally anything on a macro sociological scale if you abstract out far enough) can possibly be reflected in arrest data types. The ability to condense wide scale sociological change into easily understandable and verifiable conclusions would be the primary function of this type of retrospective analysis, although (other stuff) Conclusions like “The new highschool police mentorship program in this county has reduced arrests in high school age people by 15% ” or “This new spanish language program for officers has reduced hispanic misdemeanors by 10%” would be valuable information for citizens, politicians, and civil servants who have influence in the process of creating/implementing such civil programs.  




### Model Comparison 

#### Task 1 - Classifying Arrest Type Code

|Model| AUC | Precision | Recall | F1 Score | Support | 
| :-- | :-- | :-- | :-- | :-- | :-- |
| "model 1" |     |     |     |     | 
| "model 2" |     |     |     |     |
| "model 3" |     |     |     |     |

#### Task 2 - Classifying Descent Code

|Model| AUC | Precision | Recall | F1 Score | Support | 
| :-- | :-- | :-- | :-- | :-- | :-- |
| "model 1" |     |     |     |     | 
| "model 2" |     |     |     |     |
| "model 3" |     |     |     |     |

(ROC/AUC) was chosen as the comparison metric for the 3 models on both tasks because (reasons go here). 

For classifying arrest code, the (winning model type) achieved a final ROC of (roc here),  (slightly/much) larger  compared to the others. This may be due to the unbalanced nature of the data; 60% misdemeanors, 30% felonies, less than 10% infractions and other.  This may fit the structural and computational assumptions of (winning model) better than (losing models). (evidence to back that up)

For Classifying Descent Code, the (winning model type) had a final ROC of (ROC here), (much/ only marginally) better than (the other two, with (roc2, roc3)). Descent code was also a fairly unbalanced dataset, with almost 50% of the arrests recorded as Hispanic, 30% black, and 15% white. (winning model) operates on (assumptions of winning model) which are more closely reflected in the data than (the assumptions of the other models).  

