# Magistrates
* This notebook generates figures for the 'bail set by' page of the dashboard. 
* Note that some of these figures overlap with the magistrate section in 'dashboard.ipynb'

In [1]:
import ast
import datetime
import itertools
import matplotlib.pyplot as plt
from matplotlib import rc_file_defaults
from matplotlib import ticker

plt.style.use('ggplot')

import os
import sys
import numpy as np
import pandas as pd
import random
import regex as re
import seaborn as sns

from dateutil.relativedelta import relativedelta
from gensim.models.nmf import Nmf
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from scipy.stats import chi2_contingency, mannwhitneyu, wilcoxon
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from wordcloud import WordCloud

import plotly.express as px
import plotly.graph_objs as go
from ipywidgets import widgets
from plotly.subplots import make_subplots

import json
import copy

In [2]:
# Derive every data / figure directory from the current working directory.
# This only works when the notebook is run from its own folder, because the
# parent of cwd is taken as the source directory and its parent as the repo root.
cwd = os.getcwd()
cwd_split = os.path.split(cwd)
srcdir = os.path.dirname(cwd)
root = os.path.dirname(srcdir)

# Make modules that sit in the source directory importable from this notebook
sys.path.append(srcdir)

# Local-only locations
rawdir = os.path.join(root, "data/raw")  # raw csv files
tmpdir = os.path.join(root, "data/tmp")  # intermediate data products

# Locations that are kept in the repo
extdir = os.path.join(root, "data/external")  # data from external sources
cleandir = os.path.join(root, "data/cleaned")  # data for dashboard generation
figdir = os.path.join(root, "figures")  # final figures

In [3]:
# Project-local plotting helper; presumably found through the srcdir entry
# appended to sys.path in the previous cell — TODO confirm its location
import pltformat
# Color dictionary supplied by the helper (its keys/values are not visible here)
colorDict = pltformat.get_color_dictionary()

# 1. Import and preprocess

In [4]:
import preprocess

## 1(a) 2020 data

In [151]:
# File names for the 2020 run: two raw inputs and two outputs
docketcsv, courtcsv = "docket_reparse_03_02_2021.csv", "court_summary.csv"
outcsv, trimmedcsv = "processed_data.csv", "app_data.csv"

# Raw inputs are read from rawdir; the merged file is written to tmpdir
# and the trimmed file belongs in cleandir
docketpath, courtpath = [os.path.join(rawdir, fname) for fname in (docketcsv, courtcsv)]
outpath = os.path.join(tmpdir, outcsv)
trimmedpath = os.path.join(cleandir, trimmedcsv)


In [153]:
# Combine the 2020 docket and court-summary csv files into one cleaned
# dataframe. The result is also saved to `outpath`; verbose=True prints the
# number of imported rows and the column names.
df_2020 = preprocess.merge_and_clean_data(
    docketpath,
    courtpath,
    outPath=outpath,
    verbose=True,
)

Removing 13 cases for which prelim_hearing_dt - bail_date was more than 5...
> Imported 24225 rows with 25 columns:
age
age_group
arrest_dt
arresting_officer
attorney
attorney_type
bail_amount
bail_date
bail_paid
bail_set_bin
bail_set_by
bail_type
case_status
dob
is_bail_posted
is_philly_zipcode
offense_date
offense_type
offenses
prelim_hearing_dt
prelim_hearing_time
race
sex
statute
zip
> Saved new file


In [154]:
# Preview the first five rows of the cleaned 2020 data
df_2020.head()

Unnamed: 0,attorney,attorney_type,offenses,offense_date,statute,offense_type,bail_set_by,bail_amount,bail_paid,bail_date,...,arrest_dt,prelim_hearing_dt,prelim_hearing_time,sex,race,is_philly_zipcode,age,age_group,bail_set_bin,is_bail_posted
0,Defender Association of Philadelphia,Public,"[DUI: Gen Imp/Inc of Driving Safely - 1st Off,...",2019-12-31,"[75 § 3802, 75 § 3323]",[driving after imbibing alcohol or utilizing d...,"Bernard, Francis X.",0.0,0.0,2020-01-01,...,2020-01-01,2020-01-01,2021-04-04 04:05:00,Male,White,1,25.0,18 to 25,,0
1,Joseph Kevin Kelly,Private,[DUI: Gen Imp/Inc of Driving Safely - 1st Off],2019-12-31,[75 § 3802],[driving after imbibing alcohol or utilizing d...,"Bernard, Francis X.",0.0,0.0,2020-01-01,...,2020-01-01,2020-01-01,2021-04-04 04:07:00,Male,White,1,24.0,18 to 25,,0
2,Defender Association of Philadelphia,Public,[Retail Theft-Take Mdse],2019-12-31,[18 § 3929],[theft and related offenses],"Bernard, Francis X.",0.0,0.0,2020-01-01,...,2019-12-31,2020-01-01,2021-04-04 04:10:00,Male,Black,1,57.0,26 to 64,,0
3,Defender Association of Philadelphia,Public,"[Simple Assault, Recklessly Endangering Anothe...",2019-12-31,"[18 § 2701, 18 § 2705]","[assault, assault]","Bernard, Francis X.",30000.0,0.0,2020-01-01,...,2019-12-31,2020-01-01,2021-04-04 04:15:00,Male,Black,1,32.0,26 to 64,25k to 50k,0
4,Lee Mandell,Court Appointed,"[Robbery-Inflict Threat Imm Bod Inj, Conspirac...",2019-12-31,"[18 § 3701 §§ A1IV, 18 § 903 §§ C, 18 § 3921 §...","[robbery, inchoate crimes, theft and related o...",No Magistrate Found,30000.0,0.0,2020-01-01,...,2020-01-01,2020-01-01,2021-04-04 08:39:00,Male,Black,1,18.0,18 to 25,25k to 50k,0


## 1(b) 2021 data

In [71]:
# File names for the 2021 run (January-March): two raw inputs and two outputs.
# These reuse the variable names of the 2020 cell, so run order matters.
docketcsv, courtcsv = "2021-jan_feb_march_dockets.csv", "2021-jan_feb_march_court.csv"
outcsv, trimmedcsv = "processed_data_2021.csv", "app_data_2021.csv"

# Raw inputs are read from rawdir; the merged file is written to tmpdir
# and the trimmed file belongs in cleandir
docketpath, courtpath = [os.path.join(rawdir, fname) for fname in (docketcsv, courtcsv)]
outpath = os.path.join(tmpdir, outcsv)
trimmedpath = os.path.join(cleandir, trimmedcsv)


In [72]:
# Combine the 2021 docket and court-summary csv files into one cleaned
# dataframe. The result is also saved to `outpath`; verbose=True prints the
# number of imported rows and the column names.
df_2021 = preprocess.merge_and_clean_data(
    docketpath,
    courtpath,
    outPath=outpath,
    verbose=True,
)

Removing 1 cases for which prelim_hearing_dt - bail_date was more than 5...
> Imported 6017 rows with 25 columns:
age
age_group
arrest_dt
arresting_officer
attorney
attorney_type
bail_amount
bail_date
bail_paid
bail_set_bin
bail_set_by
bail_type
case_status
dob
is_bail_posted
is_philly_zipcode
offense_date
offense_type
offenses
prelim_hearing_dt
prelim_hearing_time
race
sex
statute
zip
> Saved new file


# 2. Select magistrates

## 2(a) Select magistrates for year 2020
* Select the 10 magistrates who handled the largest number of cases in 2020

In [167]:
# Ten most frequent 'bail_set_by' values in the 2020 data, most frequent first
magistrates_2020 = df_2020['bail_set_by'].value_counts().head(10).index.tolist()

In [168]:
# Show the 2020 selection (ordered by number of cases, largest first)
magistrates_2020

["O'Brien, James",
 'Bernard, Francis X.',
 'Stack, Patrick',
 'E-Filing Judge',
 'Rigmaiden-DeLeon, Marilyn',
 'Rainey, Debra',
 'Williams, Naomi',
 'Connor, Lauren',
 'Devlin, Kevin R.',
 'No Magistrate Found']

## 2(b) Select magistrates for year 2021
* Select all magistrates

In [169]:
# Every 'bail_set_by' value that occurs in the 2021 data, most frequent first
magistrates_2021 = df_2021['bail_set_by'].value_counts().index.tolist()

In [170]:
# Show the 2021 selection (ordered by number of cases, largest first)
magistrates_2021

['Stack, Patrick',
 'Rainey, Debra',
 'Connor, Lauren',
 'Rigmaiden-DeLeon, Marilyn',
 'Bernard, Francis X.',
 "O'Brien, James",
 'Williams, Naomi',
 'No Magistrate Found',
 'Caudo, Michael A.',
 'E-Filing Judge']

# 3. Plot year-end summary of bail type by magistrate
* For year-end summary, report for the 9 selected magistrates (who handled more than 500 cases) and all others (under category 'Other')

## 3(a) Prepare dataframe

In [172]:
def prep_barplot_data(df, magistrates=None):
    """
    Create the per-magistrate summary table behind the interactive bar plot.

    --- input ---
    df: (dataframe) of 2020 data or 2021 data. Needs the columns
        'bail_set_by', 'bail_type' and 'bail_amount'.
        NOTE: a 'magistrate' column is added to (or overwritten in) df in place.
    magistrates: (list) of selected magistrates (values of 'bail_set_by') for
        plotting. Everybody else is pooled under 'Other'.
        If None or empty, show all the magistrates involved in df.
    --- output ---
    df_magistrate: (dataframe) containing data for the interactive bar plot,
        indexed by magistrate (last name only) and sorted by ascending 'Total':
        - 'Monetary', 'Nonmonetary', 'Denied', 'ROR', 'Unsecured':
          percentage of the magistrate's cases, rounded to 2 decimals
        - '<bail type>_count': number of cases with that bail type
        - 'Total': number of cases with a known bail type
        - 'bail_amount': (str) summed bail amount divided by 1000, e.g. "$1,234"
    """
    bail_types = ['Monetary', 'Nonmonetary', 'Denied', 'ROR', 'Unsecured']
    count_cols = [bail + '_count' for bail in bail_types]

    # An empty selection means "keep everyone"; without this every row would
    # be relabelled 'Other'.
    if magistrates is None or len(magistrates) == 0:
        selected = df['bail_set_by'].dropna().unique()
    else:
        selected = list(magistrates)

    # create column (missing 'bail_set_by' values also end up in 'Other')
    df['magistrate'] = df['bail_set_by'].where(df['bail_set_by'].isin(selected), 'Other')

    # find bail type by magistrate
    df_magistrate = pd.crosstab(index=df['magistrate'], columns=df['bail_type'])

    # A bail type that never occurs in df has no crosstab column; add it with
    # zero counts so the fixed column list below always exists.
    missing = [bail for bail in bail_types if bail not in df_magistrate.columns]
    if missing:
        df_magistrate = df_magistrate.reindex(
            columns=list(df_magistrate.columns) + missing, fill_value=0)
    df_magistrate.columns.name = None

    # note: total excludes any entries with 'bail_type' == NaN
    df_magistrate['Total'] = df_magistrate.sum(axis=1)

    # get total bail amount set, for every group in the table (including 'Other')
    amount = df.groupby('magistrate')['bail_amount'].sum()
    df_magistrate['bail_amount'] = amount.reindex(df_magistrate.index)

    # keep the raw counts, then convert bail type count to percentage
    for bail, count_col in zip(bail_types, count_cols):
        df_magistrate[count_col] = df_magistrate[bail].astype(int)
    df_magistrate[bail_types] = (
        df_magistrate[bail_types].mul(100).div(df_magistrate['Total'], axis=0).round(2)
    )
    df_magistrate['Total'] = df_magistrate['Total'].astype(int)

    # format bail amount (value is expressed in thousands of dollars)
    df_magistrate['bail_amount'] = df_magistrate['bail_amount'].apply(
        lambda x: "${:,.0f}".format(x / 1000))

    # keep last names of magistrates
    df_magistrate = df_magistrate.rename(
        index=lambda x: x.split(',')[0] if x != "No Magistrate Found" else x)

    return df_magistrate.sort_values(by='Total', ascending=True)

In [175]:
# get data for 2020 and 2021
# 2020: only the ten most frequent 'bail_set_by' values are listed by name,
#       all remaining cases are pooled under 'Other'
# 2021: every 'bail_set_by' value present in the data is listed
# NOTE: prep_barplot_data also writes a 'magistrate' column into df_2020 / df_2021
df_magistrate_2020 = prep_barplot_data(df_2020, magistrates_2020)
df_magistrate_2021 = prep_barplot_data(df_2021, magistrates_2021)

In [177]:
# save data
# Destination files (inside cleandir) for the per-magistrate summary tables
magistrate_path_2020 = os.path.join(cleandir, "app_magistrate_data_2020.csv")
magistrate_path_2021 = os.path.join(cleandir, "app_magistrate_data_2021.csv")

# Writing is opt-in so that re-running the notebook does not overwrite the
# csv files kept in the repo; set the flag to True to refresh them.
SAVE_MAGISTRATE_DATA = False
if SAVE_MAGISTRATE_DATA:
    df_magistrate_2020.to_csv(magistrate_path_2020)
    df_magistrate_2021.to_csv(magistrate_path_2021)

## 3(b) Plot year-end summary interactive bar plot

In [178]:
# Read the stored per-magistrate summaries back from cleandir.
# From here on df_magistrate_2020 / df_magistrate_2021 hold the csv contents,
# so the magistrate names arrive as an ordinary 'magistrate' column.
magistrate_path_2020 = os.path.join(cleandir, "app_magistrate_data_2020.csv")
magistrate_path_2021 = os.path.join(cleandir, "app_magistrate_data_2021.csv")

df_magistrate_2020, df_magistrate_2021 = map(
    pd.read_csv, (magistrate_path_2020, magistrate_path_2021)
)

In [183]:
# Percentage columns and their matching count columns, in plotting order
bail_type = ["Monetary", "ROR", "Unsecured", "Nonmonetary", "Denied"]
bail_type_count = ["Monetary_count", "ROR_count", "Unsecured_count", "Nonmonetary_count", "Denied_count"]


def _unpack_summary(summary):
    """Split one summary table into the pieces used by the bar plot.

    Returns (percentages, counts, names, totals, amounts); percentages and
    counts are arrays with one row per entry of bail_type / bail_type_count
    and one column per magistrate.
    """
    percentages = summary[bail_type].to_numpy().T
    counts = summary[bail_type_count].to_numpy().T
    names = summary['magistrate'].tolist()
    totals = summary['Total'].astype(int)
    amounts = summary['bail_amount']
    return percentages, counts, names, totals, amounts


(data_2020, count_2020, names_2020,
 total_2020, bail_set_2020) = _unpack_summary(df_magistrate_2020)

(data_2021, count_2021, names_2021,
 total_2021, bail_set_2021) = _unpack_summary(df_magistrate_2021)

In [184]:
# Stacked horizontal bar chart of bail type share per magistrate, with a
# dropdown that switches between the 2020 and the 2021 traces.

def _hover_labels(names, percents, counts, totals, amounts=None):
    """Build one hover string per magistrate; append the bail amount if given."""
    labels = [
        "name: " + name + "<br>"
        + "percentage: " + str(pct) + "%" + "<br>"
        + "case count: " + str(n_cases) + " / " + str(n_total)
        for name, pct, n_cases, n_total in zip(names, percents, counts, totals)
    ]
    if amounts is None:
        return labels
    return [
        label + "<br>" + "total monetary bail amount set: " + str(amount)
        for label, amount in zip(labels, amounts)
    ]


def _add_year_traces(figure, names, percents, counts, totals, amounts, hidden=False):
    """Add one bar trace per bail type for a single year to `figure`."""
    # Only hidden traces carry an explicit `visible` setting
    extra = {"visible": False} if hidden else {}
    for idx, kind in enumerate(bail_type):
        # Bar label: show the percentage only when the segment is wide enough
        bar_text = [str(value) + "%" if value > 5 else "" for value in percents[idx]]
        # The total bail amount is only reported on the monetary segment
        hover = _hover_labels(
            names, percents[idx], counts[idx], totals,
            amounts if idx == 0 else None,
        )
        figure.add_trace(go.Bar(
            y=names,
            x=percents[idx],
            text=bar_text,
            textposition="inside",
            name=kind,
            hoverinfo='text',
            hovertext=hover,
            orientation='h',
            **extra))


fig = go.Figure()

# 2020 traces come first and are shown initially; 2021 traces start hidden
_add_year_traces(fig, names_2020, data_2020, count_2020, total_2020, bail_set_2020)
_add_year_traces(fig, names_2021, data_2021, count_2021, total_2021, bail_set_2021,
                 hidden=True)

fig.update_layout(
    barmode='stack',
    legend={'traceorder': 'normal'},
    xaxis_title="percentage",
    yaxis_title="magistrate",
    legend_title="bail type",
)

# Dropdown: each button toggles the visibility of the two groups of traces
# and updates the title accordingly
n_types = len(bail_type)
year_buttons = [
    dict(label=year,
         method="update",
         args=[{"visible": mask},
               {"title": "Bail type by actor in " + year}])
    for year, mask in (
        ("2020", [True] * n_types + [False] * n_types),
        ("2021", [False] * n_types + [True] * n_types),
    )
]
fig.update_layout(updatemenus=[dict(active=0, buttons=year_buttons)])

# Caption next to the dropdown
fig.update_layout(
    annotations=[
        dict(text="Select year", x=-0.15, xref="paper", y=1.06, yref="paper",
             align="left", showarrow=False)
    ])

# Plot sizing and initial title
fig.update_layout(
    margin=dict(t=100, b=0, l=0, r=0),
    title="Bail type by actor in 2020",
)

fig.show()