In [56]:
import pandas as pd
import numpy as np
import csv
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import pylab as pl

% pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
data = pd.read_csv('hackathon-CUSP-WI_clean.csv', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [1138]:
data.columns

Index([u'DA Case Status', u'Ref Date', u'Referral Type', u'Ref Agency',
       u'Primary Officer', u'Prosecutor', u'Reviewing Prosecutor', u'Unit',
       u'VW Assigned', u'Def Pros Counselor', u'Defendant Name',
       u'Defendant Address Line1', u'Defendant Address Line2',
       u'Defendant City', u'Defendant State', u'Defendant Zip',
       u'Defendant Race', u'Defendant Gender', u'Defendant DOB',
       u'Defendant's Age at Incident', u'Defendant’s Arrest Status',
       u'Defendant Case Status', u'CC# Year', u'Incident Date',
       u'Municipality', u'Incident Zip', u'Type', u'Summary Charge Statute #',
       u'Summary Charge Desc', u'Severity', u'Class', u'Charge Status',
       u'Charge Dispo', u'Dispo Date', u'Modifier', u'Enhancer1', u'Enhancer2',
       u'Enhancer3', u'Child Abuse', u'Child Neglect', u'Child Pornography',
       u'Child Sexual Assault', u'Child Support', u'Civil Forfeiture',
       u'Civil Traffic', u'Criminal', u'Criminal Traffic', u'Diversion',
       u'D

# drop items without known charge disposition

In [5]:
known_dispo = data.dropna(axis = 0, subset = ['Charge Dispo'])

In [6]:
# remove fields related to forfeiture and extradition per DA's advice
known_dispo = known_dispo[known_dispo['Civil Forfeiture'].isnull()]
known_dispo = known_dispo[known_dispo['Property Forfeiture'].isnull()]
known_dispo = known_dispo[known_dispo.Extradition.isnull()]

In [8]:
# filter to just felonies (could be run separately for Misdemeanors & University)
known_dispo = known_dispo[known_dispo.Severity != 'O  ']

In [9]:
known_dispo = known_dispo[known_dispo.Severity != '   ']

In [10]:
known_dispo = known_dispo[known_dispo.Severity == "M  "]

In [60]:
known_dispo = known_dispo[['Defendant Race', 'Defendant Gender', 'Class', 'Charge Dispo']]
known_dispo.dropna(inplace = True)

In [61]:
X = pd.get_dummies(known_dispo.iloc[:,:-1])

In [62]:
Y = known_dispo['Charge Dispo']

# we would expect that similar crime levels would not show a racial or class discrepancies in resultant charge dispositions. Therefore, I first classified the data using classifier based on Class, Race and Gender.  

In [63]:
bnb = RandomForestClassifier(max_depth = 5)
bnb.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [64]:
# Calculate percent incorrect
float(sum(Y != bnb.predict(X))) / len(Y) * 100 

37.24662162162162

In [65]:
# model does not really perform better than sorting everyone into No Contest 
Y.value_counts() / len(Y) * 100

Guilty Due to No Contest Plea                           62.697072
Dismissed on Prosecutors Motion                         15.230856
Charge Dismissed but Read In                             8.474099
Guilty Due to Guilty Plea                                7.164977
Deferred Prosecution or Sentence                         4.279279
Amended Complaint Filed                                  0.661599
Dismissed Before Initial Appearance                      0.408221
Dismissed on Defendants Motion                           0.309685
Found Guilty at Jury Trial                               0.225225
Amended By Prosecutor Or Court                           0.197072
Found Not Guilty at Jury Trial                           0.182995
Dismissed on Courts own Motion                           0.070383
Found Not Guilty at Court Trial                          0.028153
Discharged After Being Found Incompetent                 0.014077
Guilty but Not Guilty Due to Mental Disease/Defect       0.014077
Found Guil

In [66]:
known_dispo2 = known_dispo[known_dispo['Charge Dispo'] != 'Guilty Due to No Contest Plea']

In [80]:
X = pd.get_dummies(known_dispo2.iloc[:,:-1])
Y = known_dispo2['Charge Dispo']
bnb = RandomForestClassifier(max_depth = 10)
bnb.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [81]:
# Calculate percent incorrect
float(sum(Y != bnb.predict(X))) / len(Y) * 100 

55.05660377358491

In [82]:
Y.value_counts() / len(Y) * 100 # only slightly better

Dismissed on Prosecutors Motion                         40.830189
Charge Dismissed but Read In                            22.716981
Guilty Due to Guilty Plea                               19.207547
Deferred Prosecution or Sentence                        11.471698
Amended Complaint Filed                                  1.773585
Dismissed Before Initial Appearance                      1.094340
Dismissed on Defendants Motion                           0.830189
Found Guilty at Jury Trial                               0.603774
Amended By Prosecutor Or Court                           0.528302
Found Not Guilty at Jury Trial                           0.490566
Dismissed on Courts own Motion                           0.188679
Found Not Guilty at Court Trial                          0.075472
Guilty but Not Guilty Due to Mental Disease/Defect       0.037736
Found Guilty at Court Trial                              0.037736
Administrative Disposition                               0.037736
Dismissed-

# With this dataset, a vast number of items were either dismissed by prosecutors or guilty due to no contest plea leading to imbalanced classes. It may be worth rerunning with some of the observations from the dominant classes removed. However, for this dataset there were not enough instances in the lower classes for this to be doable. There are obviously also many other factors related to charge dispostion that are not accounted for in a model based on just gender, race and class of crime. It may also be interesting to further subset the dataset (for instance, just attempt to classify the outcomes of those people who did go to trial) to look for gender or racial imbalances. I intended to attempt to quantify gender/racial differences by creating 'fake' data by changing, for instance, the gender for all the data to male and female and then running the classifier model on both. An example is below:

In [115]:
test_allmale = X.copy()
test_allfemale = X.copy()

In [120]:
test_allmale['Defendant Gender_F   '] = 0
test_allmale['Defendant Gender_M   '] = 1
test_allmale['Defendant Gender_F   '] = 1
test_allmale['Defendant Gender_M   '] = 0

In [123]:
predicted_allmale = bnb.predict(test_allmale)
predicted_allfemale = bnb.predict(test_allfemale)

In [126]:
pd.Series(predicted_allmale).value_counts() / len(predicted_allmale)

Dismissed on Prosecutors Motion     0.892830
Guilty Due to Guilty Plea           0.049057
Charge Dismissed but Read In        0.042642
Deferred Prosecution or Sentence    0.013585
Found Not Guilty at Jury Trial      0.001887
dtype: float64

In [127]:
pd.Series(predicted_allfemale).value_counts() / len(predicted_allfemale)

Dismissed on Prosecutors Motion     0.715849
Charge Dismissed but Read In        0.225283
Guilty Due to Guilty Plea           0.049811
Deferred Prosecution or Sentence    0.008302
Found Not Guilty at Jury Trial      0.000755
dtype: float64

# From these results we can see the approximate effect that male v. female has on the model. That said, the model is not especially accurate so this can be due to covariance between the variables.