# Live summary
* Preprocess and prepare dataframe for the live summary on top of dashboard.
* Creates a smaller dataframe to use when generating the live summary.
* Contains code for generating the live summary. The code here is used to experiment with the layout of the summary. Very similar code (with different margins and column order) is used in app.py.


In [1]:
import ast
import datetime
import itertools
import matplotlib.pyplot as plt
from matplotlib import rc_file_defaults
from matplotlib import ticker

plt.style.use('ggplot')

import os
import sys
import numpy as np
import pandas as pd
import random
import regex as re
import seaborn as sns

from dateutil.relativedelta import relativedelta
from gensim.models.nmf import Nmf
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from scipy.stats import chi2_contingency, mannwhitneyu, wilcoxon
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from wordcloud import WordCloud

import plotly.express as px
import plotly.graph_objs as go
from ipywidgets import widgets
from plotly.subplots import make_subplots

import json
import copy

In [2]:
# Derive the project directory layout from the current working directory.
# NOTE: this only works when the notebook is started from its own folder,
# because every path below is computed relative to os.getcwd().
cwd = os.getcwd()
cwd_split = os.path.split(cwd)  # (parent directory, name of the notebook folder)
srcdir = os.path.dirname(cwd)
sys.path.append(srcdir)  # make modules in the parent directory importable
root = os.path.dirname(srcdir)

rawdir, tmpdir, extdir, cleandir, figdir = (
    os.path.join(root, subdir)
    for subdir in (
        "data/raw",       # Raw csv files; local only!
        "data/tmp",       # Intermediate data products; local only!
        "data/external",  # Data from external sources; in repo
        "data/cleaned",   # Data for dashboard generation; in repo
        "figures",        # Final figures; in repo
    )
)

In [3]:
# pltformat is a project-local module; presumably it lives in the directory that
# the path-setup cell appends to sys.path, so this import has to run after that
# cell -- TODO confirm.
import pltformat
# Color dictionary provided by pltformat (its contents are not visible from here).
colorDict = pltformat.get_color_dictionary()

# 1. Import and preprocess

In [4]:
import preprocess

## 1(a) 2020 data

In [14]:
# File names for the 2020 data. The docket file previously used here was
# "docket_reparse_03_02_2021.csv".
docketcsv, courtcsv = "2020_added_bail.csv", "court_summary.csv"
outcsv, trimmedcsv = "processed_data.csv", "app_data.csv"

# Raw inputs are read from rawdir; the processed output is written to tmpdir
# and the trimmed app data path points into cleandir.
docketpath, courtpath = (
    os.path.join(rawdir, filename) for filename in (docketcsv, courtcsv)
)
outpath = os.path.join(tmpdir, outcsv)
trimmedpath = os.path.join(cleandir, trimmedcsv)


In [15]:
# Merge and clean docket and court summary data and save to new csv file
df_2020 = preprocess.merge_and_clean_data(docketpath, courtpath,
                                     outPath=outpath, verbose=True)

Removing 13 cases for which prelim_hearing_dt - bail_date was more than 5...
> Imported 24225 rows with 25 columns:
age
age_group
arrest_dt
arresting_officer
attorney
attorney_type
bail_amount
bail_date
bail_paid
bail_set_bin
bail_set_by
bail_type
case_status
dob
is_bail_posted
is_philly_zipcode
offense_date
offense_type
offenses
prelim_hearing_dt
prelim_hearing_time
race
sex
statute
zip
> Saved new file


## 1(b) 2021 data

In [72]:
# File names for the 2021 data.
# The docket file is the most updated docket parsing (Sent by Adam on April 26);
# the docket file previously used here was "2021-jan_feb_march_dockets.csv".
docketcsv, courtcsv = "2021_added_bail.csv", "2021-jan_feb_march_court.csv"
outcsv, trimmedcsv = "processed_data_2021.csv", "app_data_2021.csv"

# Same directory convention as for 2020: inputs from rawdir, processed output
# to tmpdir, trimmed app data path in cleandir. These assignments overwrite the
# 2020 values of the same variables.
docketpath, courtpath = (
    os.path.join(rawdir, filename) for filename in (docketcsv, courtcsv)
)
outpath = os.path.join(tmpdir, outcsv)
trimmedpath = os.path.join(cleandir, trimmedcsv)


In [73]:
# Merge and clean docket and court summary data and save to new csv file
df_2021 = preprocess.merge_and_clean_data(docketpath, courtpath,
                                     outPath=outpath, verbose=True)

Removing 1 cases for which prelim_hearing_dt - bail_date was more than 5...
> Imported 6017 rows with 25 columns:
age
age_group
arrest_dt
arresting_officer
attorney
attorney_type
bail_amount
bail_date
bail_paid
bail_set_bin
bail_set_by
bail_type
case_status
dob
is_bail_posted
is_philly_zipcode
offense_date
offense_type
offenses
prelim_hearing_dt
prelim_hearing_time
race
sex
statute
zip
> Saved new file


# 2. Prepare aggregate data to use for figures  


In [124]:
# Stack the 2020 and 2021 frames into one dataframe with a fresh 0..n-1 index
df = pd.concat([df_2020, df_2021], ignore_index=True)

# Calendar year and month of the bail date; these are the grouping keys of the
# monthly summary built below.
bail_dates = df['bail_date']
df['bail_year'] = bail_dates.dt.year
df['bail_month'] = bail_dates.dt.month

In [125]:
# Flag monetary bail cases: 1 when bail_type is exactly 'Monetary', 0 otherwise.
# A vectorized comparison replaces the row-by-row apply(lambda ...); a missing
# bail_type compares unequal and therefore maps to 0, as before.
df['monetary_bail'] = df['bail_type'].eq('Monetary').astype('int64')


In [127]:
# Monthly summary, one row per (bail_year, bail_month):
#   bail_amount / bail_paid / monetary_bail -> column sums for that month
#   count                                   -> number of rows in that month
monthly_groups = df.groupby(["bail_year", "bail_month"])
df_summary = monthly_groups[["bail_amount", "bail_paid", "monetary_bail"]].sum()
df_summary["count"] = monthly_groups.size()

In [212]:
# Show the monthly summary: per-month sums plus the number of cases
df_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,bail_amount,bail_paid,monetary_bail,count
bail_year,bail_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020,1,108507700.0,2775850.0,1308,2516
2020,2,125310300.0,4069140.0,1265,2676
2020,3,74407400.0,2364740.0,798,1738
2020,4,89115200.0,2163680.0,510,880
2020,5,180731048.0,4661405.0,1002,2123
2020,6,98238899.0,2958600.0,830,1955
2020,7,123966800.0,2582670.0,891,1860
2020,8,160580400.0,3584930.0,926,2055
2020,9,170847000.0,3499280.0,1151,2245
2020,10,188363496.0,3889770.0,1311,2685


In [213]:
# save data
# NOTE(review): the write below is commented out, so the read in the next cell
# relies on an app_year_summary.csv that already exists in cleandir. On a fresh
# checkout, or whenever df_summary changes, re-enable the line to refresh the file.
summary_path = os.path.join(cleandir, "app_year_summary.csv")
#df_summary.to_csv(summary_path)

In [218]:
# Load the saved monthly summary from disk, replacing the in-memory df_summary.
# The first two csv columns (bail_year, bail_month) become the two-level row index.
df_summary = pd.read_csv(summary_path, index_col=[0, 1])

In [219]:
# Show the summary as loaded from the csv file
df_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,bail_amount,bail_paid,monetary_bail,count
bail_year,bail_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020,1,108507700.0,2775850.0,1308,2516
2020,2,125310300.0,4069140.0,1265,2676
2020,3,74407400.0,2364740.0,798,1738
2020,4,89115200.0,2163680.0,510,880
2020,5,180731048.0,4661405.0,1002,2123
2020,6,98238899.0,2958600.0,830,1955
2020,7,123966800.0,2582670.0,891,1860
2020,8,160580400.0,3584930.0,926,2055
2020,9,170847000.0,3499280.0,1151,2245
2020,10,188363496.0,3889770.0,1311,2685


In [220]:
# Most recent month that has 2021 data; it defines the year-to-date window.
last_month = int(df_summary.loc[2021].index.max())

# Column totals over every available month of each year
summary_2020 = df_summary.loc[2020].sum()
summary_2021 = df_summary.loc[2021].sum()

# 2020 totals restricted to the same Jan..last_month window, so they can be
# compared with the 2021 year-to-date numbers.
# Months are numbered 1-12, so the selection starts at 1: the previous
# range(last_month + 1) also asked for a month 0 that never exists, and a
# missing label in a MultiIndex selection is either silently dropped or raises
# a KeyError depending on the pandas version.
idx = pd.IndexSlice
ytm_months = list(range(1, last_month + 1))
summary_2020_YTM = df_summary.loc[idx[2020, ytm_months], :].sum()

In [221]:
# Add the share of cases with monetary bail (monetary_bail / count, as a
# fraction between 0 and 1) to each of the three summary Series.
for period_totals in (summary_2020, summary_2020_YTM, summary_2021):
    period_totals["monetary_bail_perct"] = (
        period_totals["monetary_bail"] / period_totals["count"]
    )

In [222]:
# find string corresponding to last month
# Lookup table: month number (1-12) -> three-letter English abbreviation
month = dict(enumerate(
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    start=1,
))
# Abbreviation of the latest 2021 month; used in the " Jan-<month>" row labels
month_str = month[last_month]

In [223]:
def _add_number_indicator(fig, value, x_range, y_range, font_size=40,
                          prefix=None, suffix=None, title=None):
    """Append one number-only ``go.Indicator`` trace to ``fig``.

    Parameters
    ----------
    fig : go.Figure
        Figure that receives the trace.
    value : number
        Value displayed by the indicator.
    x_range, y_range : sequence of two numbers
        Horizontal and vertical extent of the indicator, in the figure's
        0-1 domain coordinates.
    font_size : int
        Font size of the displayed number.
    prefix, suffix : str or None
        Text placed before / after the number; left out when None.
    title : str or None
        Heading shown with the indicator; left out when None.
    """
    number = {"font": {"size": font_size}}
    if prefix is not None:
        number["prefix"] = prefix
    if suffix is not None:
        number["suffix"] = suffix

    indicator_kwargs = {
        "mode": "number",
        "value": value,
        "number": number,
        "domain": {"x": list(x_range), "y": list(y_range)},
    }
    if title is not None:
        indicator_kwargs["title"] = {
            "text": "<span style='font-size:1.4em'>" + title + "</span>"
        }
    fig.add_trace(go.Indicator(**indicator_kwargs))


def _build_live_summary_figure(rows, columns, label_x_range=(0, 0.04)):
    """Build the live-summary grid: one row per period, one cell per metric.

    Parameters
    ----------
    rows : list of tuples
        ``(year, label_suffix, totals, y_range, show_titles)``. ``year`` and
        ``label_suffix`` form the small row label, ``totals`` is the summary
        Series the metric values are read from, ``y_range`` is the vertical
        band of the row, and ``show_titles`` says whether the metric headings
        are drawn in this row.
    columns : list of tuples
        ``(key, as_percent, prefix, suffix, title, x_range)``. ``key`` selects
        the value from ``totals``; it is multiplied by 100 when ``as_percent``
        is true.
    label_x_range : sequence of two numbers
        Horizontal band reserved for the row labels.

    Returns
    -------
    go.Figure
        Figure with the traces in row-major order (label first, then the
        metrics from left to right).
    """
    figure = go.Figure()
    for year, label_suffix, totals, y_range, show_titles in rows:
        # Small label at the left edge of the row
        _add_number_indicator(figure, year, label_x_range, y_range,
                              font_size=15, suffix=label_suffix)
        for key, as_percent, prefix, suffix, title, x_range in columns:
            value = totals[key] * 100 if as_percent else totals[key]
            _add_number_indicator(figure, value, x_range, y_range,
                                  prefix=prefix, suffix=suffix,
                                  title=title if show_titles else None)

    # Compact strip without top/bottom margins
    figure.update_layout(
        height=200,
        margin=dict(t=0, b=0)
    )
    return figure


# Metric columns, left to right:
# (summary key, show as percent, prefix, suffix, heading, x-range)
summary_columns = [
    ("bail_amount", False, "$", None, "Amount of bail set", (0.04, 0.28)),
    ("monetary_bail_perct", True, None, "%", "Percentage of bail set", (0.28, 0.52)),
    ("monetary_bail", False, None, None, "Number of people impacted", (0.52, 0.76)),
    ("bail_paid", False, "$", None, "Amount of bail paid", (0.76, 1)),
]

# Rows, top to bottom:
# (year label, label suffix, totals, y-range, draw headings)
# Only the top row carries the headings; the two rows below share them.
summary_rows = [
    (2020, None, summary_2020, (0.66, 1), True),                         # 2020 data
    (2020, " Jan-" + month_str, summary_2020_YTM, (0.33, 0.66), False),  # 2020 YTM data
    (2021, " Jan-" + month_str, summary_2021, (0.1, 0.33), False),       # 2021 data
]

fig = _build_live_summary_figure(summary_rows, summary_columns)
fig.show()