### Summary
Here, I evaluate classification models for their ability to predict peremptory juror strikes by the state of Mississippi, focusing on the importance of race in the decision. 

Data were gathered by the reporting team at APM Reports for the podcast "In The Dark"

### Model Prep

In [222]:
import pandas as pd

In [25]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [3]:
with open('model.pickle', 'rb') as read_file:
    df = pickle.load(read_file)

In [5]:
df.head()

Unnamed: 0,struck_by_prosecution,race,same_race,prior_jury,crime_victim,fam_crime_victim,accused,fam_accused,fam_law_enforcement,know_def,know_vic,know_wit,know_attny,prior_info,death_hesitation
2,0,1,0,False,False,False,False,False,False,False,False,False,False,False,False
3,0,1,0,False,False,False,False,False,False,False,False,False,False,False,False
4,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False
5,0,1,0,False,False,False,False,False,True,False,False,False,False,False,False
6,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [19]:
X.shape, y.shape

((2216, 14), (2216,))

In [258]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

In [259]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1551, 14), (665, 14), (1551,), (665,))

### Class Imbalance
This juror sample resulted in 25% being struck by the state and 75% moving on to the defense. To better predict the minority class (those that are struck), I will oversample and evaluate the aoc score.

In [184]:
from imblearn.over_sampling import RandomOverSampler

Using TensorFlow backend.


In [185]:
from collections import Counter

In [197]:
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X_train, y_train)

In [198]:
Counter(y_resampled)

Counter({0: 1161, 1: 1161})

### Logisitic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

In [260]:
logr = LogisticRegression()
logr.fit(X_train, y_train)
logr.score(X_test, y_test)



0.8180451127819549

In [261]:
logr.score(X_train, y_train) 

0.8252740167633784

In [262]:
logr.coef_

array([[ 1.97998189,  0.23595699,  0.01463397,  0.33991134,  0.55632363,
         1.74419517,  1.73442961, -0.55260197,  1.39656531,  0.20952216,
         0.03120787,  0.48940616, -0.21476876,  1.24157926]])

In [263]:
confusion_matrix(y_test, logr.predict(X_test))

array([[480,  19],
       [102,  64]])

In [264]:
roc_auc_score(y_test, logr.predict_proba(X_test)[:,1])

0.825289132481831

#### Oversampled

In [199]:
logr_o = LogisticRegression()
logr_o.fit(X_resampled, y_resampled)
logr_o.score(X_test, y_test)



0.7669172932330827

In [200]:
logr_o.score(X_resampled, y_resampled)

0.7583979328165374

In [201]:
confusion_matrix(y_test, logr_o.predict(X_test))

array([[389, 114],
       [ 41, 121]])

In [205]:
roc_auc_score(y_test, logr_o.predict_proba(X_test)[:,1])

0.8148884470952066

### Decision Tree

In [74]:
from sklearn.tree import DecisionTreeClassifier

In [100]:
dt = DecisionTreeClassifier(max_depth = 6)
dt.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [101]:
dt.score(X_test, y_test)

0.8135338345864662

In [102]:
dt.score(X_train, y_train)

0.8330109606705351

In [110]:
dt.feature_importances_

array([0.54226714, 0.00589773, 0.00561561, 0.        , 0.01034082,
       0.05626232, 0.22054766, 0.02077061, 0.07427627, 0.00494721,
       0.00151587, 0.01068012, 0.01720339, 0.02967526])

In [206]:
roc_auc_score(y_test, dt.predict_proba(X_test)[:,1])

0.7732555285570528

In [183]:
confusion_matrix(y_test, dt.predict(X_test))

array([[493,  10],
       [114,  48]])

### Random Forest

In [111]:
from sklearn.ensemble import RandomForestClassifier

In [166]:
rf = RandomForestClassifier(n_estimators = 100, max_depth = 4)

In [167]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [168]:
rf.score(X_test, y_test)

0.8225563909774436

In [169]:
rf.score(X_train, y_train) 

0.8265635074145713

In [170]:
confusion_matrix(y_test, rf.predict(X_test))

array([[495,   8],
       [110,  52]])

In [203]:
roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])

0.8222516751343789

In [174]:
rf.feature_importances_

array([0.41202961, 0.16553057, 0.00602516, 0.00434598, 0.00559758,
       0.04582226, 0.20390392, 0.02213455, 0.08425867, 0.00692172,
       0.00474002, 0.00773134, 0.01010099, 0.02085763])

### SVM

In [40]:
from sklearn import svm

#### Linear

In [41]:
svm_l = svm.SVC(kernel='linear')

In [42]:
svm_l.fit(X_train,y_train)
svm_l.score(X_test, y_test)

0.8150375939849624

In [43]:
svm_l.score(X_train, y_train)

0.8265635074145713

In [48]:
confusion_matrix(y_test, svm_l.predict(X_test))

array([[487,  16],
       [107,  55]])

In [209]:
roc_auc_score(y_test, svm_l.predict(X_test))

0.6538485138551408

#### Rbf

In [118]:
svm_r = svm.SVC(kernel='rbf')

In [119]:
svm_r.fit(X_train,y_train)
svm_r.score(X_test, y_test)



0.8195488721804511

In [120]:
svm_r.score(X_train, y_train)

0.8259187620889749

In [121]:
confusion_matrix(y_test, svm_r.predict(X_test))

array([[491,  12],
       [108,  54]])

In [178]:
roc_auc_score(y_test, svm_r.predict(X_test))

0.6547382372432075

### KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [65]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [66]:
knn.score(X_train, y_train)

0.7962604771115409

In [67]:
knn.score(X_test, y_test)

0.7984962406015037

In [50]:
confusion_matrix(y_test, knn.predict(X_test))

array([[444,  59],
       [ 75,  87]])

In [51]:
roc_auc_score(y_test, knn.predict_proba(X_test)[:,1])

0.7459624966251871

### Naive Bayes

In [58]:
from sklearn.naive_bayes import BernoulliNB

In [69]:
nb = BernoulliNB()

In [70]:
nb.fit(X_train, y_train)
nb.score(X_test, y_test)

0.8

In [71]:
nb.score(X_train, y_train)

0.7949709864603481

In [177]:
confusion_matrix(y_test, nb.predict(X_test))

array([[436,  67],
       [ 66,  96]])

In [73]:
roc_auc_score(y_test, nb.predict_proba(X_test)[:,1])

0.8185823331615246

## Best model: Logistic Regression

In [284]:
df.xtest = pd.DataFrame(X_test)

  """Entry point for launching an IPython kernel.


In [285]:
xtest['predict'] = logr.predict(X_test)

In [289]:
xtest['actual'] = y_test

In [290]:
xtest.head()

Unnamed: 0,race,same_race,prior_jury,crime_victim,fam_crime_victim,accused,fam_accused,fam_law_enforcement,know_def,know_vic,know_wit,know_attny,prior_info,death_hesitation,predict,actual
1392,1,1,False,False,False,False,True,False,False,False,False,False,False,False,1,1
2782,0,0,False,False,False,False,False,False,False,False,False,False,False,False,0,0
3153,1,1,False,False,False,False,False,True,True,False,False,False,False,False,1,0
2196,1,1,False,False,False,False,False,False,True,False,False,False,False,False,1,1
2250,1,0,False,False,False,False,False,False,False,False,False,False,False,False,0,0


In [301]:
xtest.predict.value_counts()

0    582
1     83
Name: predict, dtype: int64

In [302]:
def result(row):
    if (row['struck_by_prosecution'] == 0) and (row['predict'] == 0):
        val = 'True Negative'
    
    elif (row['struck_by_prosecution'] == 1) and (row['predict'] == 0):
        val = 'False Negative'
    
    elif (row['struck_by_prosecution'] == 0) and (row['predict'] == 1):
        val = 'False Positive'
    
    else:
        val = 'True Positive'
    
    return val

In [303]:
xtest['result'] = test.apply(result, axis = 1)

In [304]:
xtest.result.value_counts()

True Negative     122
False Negative     44
False Positive     24
True Positive       3
Name: result, dtype: int64

In [305]:
xtest.groupby('result')['race'].value_counts()

result          race
False Negative  1       31
                0       13
False Positive  0       18
                1        6
True Negative   0       99
                1       23
True Positive   1        2
                0        1
Name: race, dtype: int64