## Assignment 4
### Author: Emily McAfee
#### Predictive Policing

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Smote
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Forward feature selection
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# LASSO & ridge regression
from sklearn import linear_model

## 1. Read data

In [2]:
# Import data
filename = "http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data"
ccdf = pd.read_csv(filename, header = None)
ccdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03


In [3]:
# Replace ?s with NaNs
ccdf = ccdf.replace(to_replace = '?', value = float('NaN'))
ccdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,,,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.0,,0.67
2,24,,,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.0,,0.43
3,34,5.0,81440.0,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,,,,,0.0,,0.12
4,42,95.0,6096.0,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.0,,0.03


In [4]:
# Make towns index
ccdf.set_index(ccdf[3], inplace = True)
ccdf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lakewoodcity,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.06,0.04,0.9,0.5,0.32,0.14,0.20
Tukwilacity,53,,,Tukwilacity,1,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.00,,0.67
Aberdeentown,24,,,Aberdeentown,1,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.00,,0.43
Willingborotownship,34,5,81440,Willingborotownship,1,0.04,0.77,1.00,0.08,0.12,...,0.02,0.39,0.28,,,,,0.00,,0.12
Bethlehemtownship,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.00,,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TempleTerracecity,12,,,TempleTerracecity,10,0.01,0.40,0.10,0.87,0.12,...,0.01,0.28,0.05,,,,,0.00,,0.09
Seasidecity,6,,,Seasidecity,10,0.05,0.96,0.46,0.28,0.83,...,0.02,0.37,0.20,,,,,0.00,,0.45
Waterburytown,9,9,80070,Waterburytown,10,0.16,0.37,0.25,0.69,0.04,...,0.08,0.32,0.18,0.08,0.06,0.78,0,0.91,0.28,0.23
Walthamcity,25,17,72600,Walthamcity,10,0.08,0.51,0.06,0.87,0.22,...,0.03,0.38,0.33,0.02,0.02,0.79,0,0.22,0.18,0.19


In [5]:
# Drop column 3 (idk why it stayed)
ccdf = ccdf.drop(3, axis = 1)
ccdf

Unnamed: 0_level_0,0,1,2,4,5,6,7,8,9,10,...,118,119,120,121,122,123,124,125,126,127
3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lakewoodcity,8,,,1,0.19,0.33,0.02,0.90,0.12,0.17,...,0.12,0.26,0.20,0.06,0.04,0.9,0.5,0.32,0.14,0.20
Tukwilacity,53,,,1,0.00,0.16,0.12,0.74,0.45,0.07,...,0.02,0.12,0.45,,,,,0.00,,0.67
Aberdeentown,24,,,1,0.00,0.42,0.49,0.56,0.17,0.04,...,0.01,0.21,0.02,,,,,0.00,,0.43
Willingborotownship,34,5,81440,1,0.04,0.77,1.00,0.08,0.12,0.10,...,0.02,0.39,0.28,,,,,0.00,,0.12
Bethlehemtownship,42,95,6096,1,0.01,0.55,0.02,0.95,0.09,0.05,...,0.04,0.09,0.02,,,,,0.00,,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TempleTerracecity,12,,,10,0.01,0.40,0.10,0.87,0.12,0.16,...,0.01,0.28,0.05,,,,,0.00,,0.09
Seasidecity,6,,,10,0.05,0.96,0.46,0.28,0.83,0.32,...,0.02,0.37,0.20,,,,,0.00,,0.45
Waterburytown,9,9,80070,10,0.16,0.37,0.25,0.69,0.04,0.25,...,0.08,0.32,0.18,0.08,0.06,0.78,0,0.91,0.28,0.23
Walthamcity,25,17,72600,10,0.08,0.51,0.06,0.87,0.22,0.10,...,0.03,0.38,0.33,0.02,0.02,0.79,0,0.22,0.18,0.19


In [6]:
# Change entire dataframe to numeric
ccdf2 = ccdf.apply(pd.to_numeric)

ccdf2.dtypes

0        int64
1      float64
2      float64
4        int64
5      float64
        ...   
123    float64
124    float64
125    float64
126    float64
127    float64
Length: 127, dtype: object

In [7]:
# Replace NaNs with means
ccdf2.mean()
ccdf3 = ccdf2.fillna(ccdf2.mean())

In [8]:
ccdf3.head()

Unnamed: 0_level_0,0,1,2,4,5,6,7,8,9,10,...,118,119,120,121,122,123,124,125,126,127
3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lakewoodcity,8,58.826829,46188.336597,1,0.19,0.33,0.02,0.9,0.12,0.17,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
Tukwilacity,53,58.826829,46188.336597,1,0.0,0.16,0.12,0.74,0.45,0.07,...,0.02,0.12,0.45,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.67
Aberdeentown,24,58.826829,46188.336597,1,0.0,0.42,0.49,0.56,0.17,0.04,...,0.01,0.21,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.43
Willingborotownship,34,5.0,81440.0,1,0.04,0.77,1.0,0.08,0.12,0.1,...,0.02,0.39,0.28,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.12
Bethlehemtownship,42,95.0,6096.0,1,0.01,0.55,0.02,0.95,0.09,0.05,...,0.04,0.09,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.03


In [9]:
min(ccdf[127])
max(ccdf[127])
ccdf[127].mean()

0.23797893681043028

In [10]:
# Drop "non-predictive" columns
ccdf4 = ccdf3.drop([0,1,2,4], axis = 1)
ccdf4.head()

Unnamed: 0_level_0,5,6,7,8,9,10,11,12,13,14,...,118,119,120,121,122,123,124,125,126,127
3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lakewoodcity,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
Tukwilacity,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.02,0.12,0.45,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.67
Aberdeentown,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.01,0.21,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.43
Willingborotownship,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.02,0.39,0.28,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.12
Bethlehemtownship,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.04,0.09,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.03


In [11]:
ccdf4.shape

(1994, 123)

In [12]:
# Change variable we should be predicting to binary (i.e. more than average crime)
ccdf4[127].mean()

# This seems a little low so we'll change it to anything >50
ccdf5 = ccdf4.copy()
ccdf5[127] = np.where(ccdf4[127] > .5, 1, 0)

ccdf5.head()

Unnamed: 0_level_0,5,6,7,8,9,10,11,12,13,14,...,118,119,120,121,122,123,124,125,126,127
3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lakewoodcity,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0
Tukwilacity,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.02,0.12,0.45,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,1
Aberdeentown,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.01,0.21,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0
Willingborotownship,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.02,0.39,0.28,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0
Bethlehemtownship,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.04,0.09,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0


In [13]:
print('How many neighborhoods may need extra support: ', sum(ccdf5[127]))
print('How many neighborhoods that do not need extra support: ', len(ccdf5) - (sum(ccdf5[127])))

How many neighborhoods may need extra support:  277
How many neighborhoods that do not need extra support:  1717


## 2. Apply 3 techniques for filter selection

In [14]:
# Fix class imbalance
# Establish variables
x = ccdf5.loc[:, ccdf5.columns != 127]

# Establish target label (variable we are trying to predict)
y = ccdf5[127]

# Apply SMOTE for class imbalance
# What did the imbalance look like before SMOTE
print('Original dataset shape {}'.format(Counter(y)))

# Apply SMOTE
sm = SMOTE(random_state = 42)
x_res, y_res = sm.fit_sample(x,y)

# What does the imbalance look like after SMOTE
print('Resampled dataset shape {}'.format(Counter(y_res)))

Original dataset shape Counter({0: 1717, 1: 277})
Resampled dataset shape Counter({0: 1717, 1: 1717})


## Filter methods

Mutual Information

In [15]:
# Establish variables
# Establish features
x = x_res.iloc[:,0]

# Establish target label (variable we are trying to predict)
y = y_res

# Calculation Correlation
corr = np.corrcoef(x, y)[0, 1]
print("Correlation between X and Y is %.2f"%corr)
# Calculate Mutual Information

from sklearn.metrics import mutual_info_score

def calc_MI(x, y, bins):
    c_xy = np.histogram2d(x, y, bins)[0]
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi

mi = calc_MI(x, y, 20)
print("Mutual information=%.2f"%mi)

Correlation between X and Y is 0.29
Mutual information=0.07


^population v. ViolentCrimesPerPop: total number of violent crimes per 100K popuation (numeric - decimal) GOAL attribute (to be predicted)

## Wrapper methods

stepwise selection (backward)

In [16]:
# Establish model we will be using (linear regression)
estimator = LinearRegression()

# Establish how many features you will select (1 = remove one variable)
selector = RFE(estimator, 5, step=1)

# Fit model to data
selector = selector.fit(x_res, y_res)

# Print the mask of selected features
print(selector.support_)

# Print ranking of features (selected features are ranked 1, 6th is removed first, 2nd is the last removed)
print(selector.ranking_)


[ True False False False False False False False False False  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False  True False False False False
 False False False False False False False False False False False False
 False False]
[  1  88  32  56  93 115  38  36  37  39   1  99  34 112  98  87  40  69
  35  82 114  66  77  86 100  79 113  18  30  41  45  46  24  68  55  91
  54  71  84  50  85  83  43   2   1 117  73  92  26   5  58  27  94  96
 103 116  29  95  28  31  60  61  44 

In [17]:
# See which features we would like to keep
np.where(selector.support_ == True)

(array([  0,  10,  44,  97, 103]),)

## Embedded methods

LASSO regression

In [19]:
# LASSO
alpha = .05
clf = linear_model.Lasso(alpha=alpha)
clf.fit(x_res, y_res)

print(clf.coef_)

print(clf.intercept_)

[ 0.         -0.          0.00406642 -0.08873408  0.          0.
 -0.          0.          0.          0.          0.          0.
 -0.         -0.         -0.         -0.          0.          0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.          0.          0.          0.
  0.         -0.          0.         -0.         -0.         -0.
  0.         -0.          0.          0.          0.          0.
  0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.          0.50641252  0.          0.          0.
  0.          0.          0.          0.          0.          0.
 -0.          0.          0.          0.         -0.         -0.
  0.         -0.          0.          0.         -0.          0.
 -0.         -0.          0.          0.         -0.          0.
  0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.         -0.          0.          0.
  0.          0.         

In [20]:
# What are the indices of our selected features?
np.where(clf.coef_ != 0)

(array([ 2,  3, 50]),)

## 3. Describe your findings

In an attempt to predict the amount of violent crimes in particular neighborhoods, we investigated socio-economic, law-enforcement, and FBI data. Before we can attempt any machine learning models on the data we recognized a fair amount of class imbalance (i.e. far more places with less crime than those with too much - which makes sense). We took care of the class imbalances so that future machine learning models would be less biased. We then applied filter, wrapper, and embedded methods for feature selection. For our *filter* method we found mutual information between our predictor variable and a variable of interest. We found that population size and amount of crime in an area is moderately positively correlated (*r* = .29), while the mutual infoormation (non-linear relationship) was .07. These results make sense, as there should be some type of relationship between population and amount of crime, and a positvely linear one would be most logical (i.e. more people = more crime). For our *wrapper* methods we chose the stepwise selection method of backward model selection. We found the top five features that most significantly impact our prediction for y. Our embedded methods required a very low alpha to see any signnificant features, which we eventually got with alpha = .05 and for 3 features.