## Analyze registry entries from the California Department of Justice

In [1]:
import pandas as pd

In [2]:
# Read the "Master" sheet of the California DOJ workbook. The sheet's own
# header row is replaced by the short column names below, and cells holding
# "*" are read as missing values.
ldf = pd.read_excel(
    "../../data/states/california/PRA PHAM, SCOTT CARP-184_CACIStats_Year_2009_to_2021.xlsx",
    sheet_name="Master",
    header=0,
    names=["year", "type", "race", "gender", "count"],
    na_values="*",
)

ldf.head()

Unnamed: 0,year,type,race,gender,count
0,2009,SUSPECT,1,M,266.0
1,2009,SUSPECT,1,X,
2,2009,SUSPECT,2,F,76.0
3,2009,SUSPECT,2,M,141.0
4,2009,SUSPECT,2,X,


In [3]:
# Translate the numeric race codes into short labels, keep only the SUSPECT
# rows, and total the counts for every (year, race) pair.
raceguide = {
    1: "asian",  # non pacific island asian
    2: "pac",  # pacific islands
    3: "other",
    4: "black",  # contains native american too
    5: "hispanic",
    6: "white",
}

# Relabelling runs on every row before the SUSPECT filter, so a race code
# that is missing from raceguide raises KeyError regardless of the row's type.
df = (
    ldf.assign(race=lambda frame: frame["race"].apply(lambda code: raceguide[code]))
    .loc[lambda frame: frame["type"] == "SUSPECT"]
    .groupby(["year", "race"])["count"]
    .sum()
    .reset_index()
)

df.head()

Unnamed: 0,year,race,count
0,2009,asian,266.0
1,2009,black,1941.0
2,2009,hispanic,7309.0
3,2009,other,1073.0
4,2009,pac,217.0


In [4]:
# Total count per year, summed over all race groups.
yearly = (
    df.groupby("year")["count"]
    .sum()
    .to_frame()
)
# need this for appeals analysis later
yearly.to_csv("../../outputs/CA_yearly_subs.csv")
yearly

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2009,14957.0
2010,15090.0
2011,15757.0
2012,8902.0
2013,7141.0
2014,6666.0
2015,6253.0
2016,6699.0
2017,6826.0
2018,6732.0


In [5]:
# Mean of the yearly totals.
yearly.mean()

count    8706.384615
dtype: float64

In [6]:
# helper to calculate percents across rows
def percent(row):
    """Return every entry of ``row`` divided by the sum of the row.

    The result is a fraction (0.25 rather than 25) and keeps the index of
    ``row``.
    """
    return row / row.sum()

In [7]:
# Share of each year's count that falls in each race group: one row per
# year, one column per race, every row scaled by its own total.
yearly_race = (
    df.groupby(["year", "race"])
    .sum()
    .unstack()
    .apply(percent, axis=1)
)

yearly_race

Unnamed: 0_level_0,count,count,count,count,count,count
race,asian,black,hispanic,other,pac,white
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2009,0.017784,0.129772,0.488668,0.071739,0.014508,0.277529
2010,0.023724,0.13214,0.498675,0.068125,0.016103,0.261233
2011,0.022403,0.146665,0.496541,0.070762,0.01631,0.247319
2012,0.020894,0.145136,0.480004,0.089418,0.01494,0.249607
2013,0.021145,0.132895,0.482005,0.093124,0.015964,0.254866
2014,0.023552,0.136214,0.451695,0.108611,0.011851,0.268077
2015,0.02111,0.144731,0.445386,0.119143,0.014073,0.255557
2016,0.027168,0.139125,0.445887,0.110614,0.014629,0.262577
2017,0.022707,0.130237,0.443745,0.1109,0.011427,0.280984
2018,0.030452,0.140226,0.342246,0.166964,0.014112,0.306001


In [8]:
# Average share per race across the years; each year carries equal weight.
(
    yearly_race
    .mean()
    .to_frame("")
)

Unnamed: 0,race,Unnamed: 2
count,asian,0.023825
count,black,0.137637
count,hispanic,0.379791
count,other,0.137499
count,pac,0.014781
count,white,0.306467


In [9]:
# total count across every year and race group in df (SUSPECT rows only)
df["count"].sum()

113183.0

In [10]:
# Load the state-level census demographics and keep only the California rows.
census = pd.read_csv("../../data/national/state_S0901.csv").loc[
    lambda table: table["state"] == "California"
]

census.sort_values("year")

Unnamed: 0,fips,state,total,white,black,native,asian,pac,other,two,hispanic,non_hispanic_white,year
108,0400000US06,California,9291319.0,5175265,576062,74331,984880,37165,1793225,650392,4645660,2666609,2010
420,0400000US06,California,9271954.0,5266470,565589,74176,992099,37088,1668952,676853,4691609,2605419,2011
56,0400000US06,California,9251565.0,5328901,555094,74013,999169,37006,1572766,693867,4727550,2544180,2012
476,0400000US06,California,9212181.0,5315428,543519,73697,994916,36849,1529222,718550,4735061,2496501,2013
316,0400000US06,California,9182670.0,5298401,532595,73461,1000911,36731,1505958,743796,4738258,2460956,2014
160,0400000US06,California,9146067.0,5268135,521326,73169,1006067,36584,1499955,749977,4728517,2423708,2015
528,0400000US06,California,9112964.0,5185277,510326,63791,1011539,36452,1530978,774602,4720515,2396710,2016
4,0400000US06,California,9088594.0,5116878,499873,72709,1017923,36354,1554150,799796,4716980,2372123,2017
212,0400000US06,California,9047984.0,5057823,488591,72384,1022422,36192,1556253,814319,4695904,2343428,2018
264,0400000US06,California,8997470.0,5011591,485863,71980,1025712,35990,1547565,818770,4669687,2321347,2019


In [11]:
## risk ratios for grouping by year
def rr(grp, census_df=None):
    """Compute Black and Hispanic risk ratios, relative to white, for one year.

    Written for ``df.groupby("year").apply(rr)``: ``grp`` holds the rows of a
    single year and ``grp.name`` is that year.

    Args:
        grp: DataFrame with ``race`` and ``count`` columns whose ``name``
            attribute is the year of the group.
        census_df: Optional DataFrame with ``year``, ``white``, ``black``,
            ``native`` and ``hispanic`` columns. When omitted, the
            module-level ``census`` table is used.

    Returns:
        A Series with "Risk Ratio, Black" and "Risk Ratio, Hispanic", or
        None when the year is outside 2010-2020 or has no census row.
    """
    year = int(grp.name)

    if year > 2020 or year < 2010:  # if year outside census bounds
        return None

    if census_df is None:
        census_df = census

    # state populations by race
    state = census_df.loc[census_df["year"] == year]
    if state.empty:
        # A year inside the bounds can still be missing from the table;
        # without this guard the .values[0] lookups below raise IndexError.
        return None

    # The registry's "black" category also contains Native American entries,
    # so the census denominator adds the two populations together.
    pop_black = state["black"].values[0] + state["native"].values[0]
    # NOTE(review): the census table also has a `non_hispanic_white` column;
    # `white` presumably includes Hispanic white residents - confirm that
    # this is the intended denominator.
    pop_white = state["white"].values[0]
    pop_hisp = state["hispanic"].values[0]

    def registered(race):
        # Total registry count for one race label within this year's rows.
        return grp.loc[grp["race"] == race, "count"].sum()

    # rates
    rate_white = registered("white") / pop_white
    rate_black = registered("black") / pop_black
    rate_hisp = registered("hispanic") / pop_hisp

    # risk ratios
    return pd.Series({
        "Risk Ratio, Black": rate_black / rate_white,
        "Risk Ratio, Hispanic": rate_hisp / rate_white,
    })

# Risk ratios of substantiations and established, one row per year.
# Years for which rr returns nothing show up as NaN rows.
df.groupby("year").apply(rr)



Unnamed: 0_level_0,"Risk Ratio, Black","Risk Ratio, Hispanic"
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2009,,
2010,4.024994,2.126547
2011,4.881673,2.253701
2012,4.925288,2.167656
2013,4.490512,2.12301
2014,4.442151,1.884136
2015,5.018576,1.941692
2016,4.785437,1.865314
2017,4.1421,1.713135
2018,4.131663,1.204647


In [12]:
# Average of the yearly risk ratios; years without a ratio are skipped.
df.groupby("year").apply(rr).mean().to_frame("")

Unnamed: 0,Unnamed: 1
"Risk Ratio, Black",4.228239
"Risk Ratio, Hispanic",1.635626


---

---

---