## Analyze Substantiations from the Arizona Department of Children and Families

In [1]:
import pandas as pd

In [2]:
# excel file has a tab for each year 
# cycle through the tabs

# build year range
years = range(2010, 2022)
# concat this later
container = []

for each in years:
    df = (
        pd
        .read_excel(
            "../../data/states/arizona/Central Registry Media Request 5.xlsx",
            sheet_name = str(each)
        )
    )

    # group by race and mal because age range is included and is not important
    container.append(
        df
        .groupby(["Race", "Maltreatment"])
        .sum()
        .reset_index()
        .assign(
            year = each
        )
    )
    
df = pd.concat(container)
df.head()

Unnamed: 0,Race,Maltreatment,Count,year
0,African American,Neglect,751.0,2010
1,African American,Physical Abuse,264.0,2010
2,African American,Sexual Abuse,7.0,2010
3,American Indian,Neglect,417.0,2010
4,American Indian,Physical Abuse,89.0,2010


In [3]:
# Check to make sure data is not redacted due to small counts
for each in range(2011, 2022):
    
    test = (
        pd
        .read_excel(
            "../../data/states/arizona/Central Registry Media Request 5.xlsx",
            sheet_name = str(each)
        )
    )
    # each row should be nan
    print(test.loc[lambda x: x["Count"].isna()])


    AgeCategory Gender Race Maltreatment  Count
149         NaN    NaN  NaN          NaN    NaN
    AgeCategory Gender Race Maltreatment  Count
152         NaN    NaN  NaN          NaN    NaN
    AgeCategory Gender Race Maltreatment  Count
149         NaN    NaN  NaN          NaN    NaN
    AgeCategory Gender Race Maltreatment  Count
149         NaN    NaN  NaN          NaN    NaN


    AgeCategory Gender Race Maltreatment  Count
154         NaN    NaN  NaN          NaN    NaN
    AgeCategory Gender Race Maltreatment  Count
154         NaN    NaN  NaN          NaN    NaN
    AgeCategory Gender Race Maltreatment  Count
151         NaN    NaN  NaN          NaN    NaN
    AgeCategory Gender Race Maltreatment  Count
152         NaN    NaN  NaN          NaN    NaN


    AgeCategory Gender Race Maltreatment  Count
152         NaN    NaN  NaN          NaN    NaN
    AgeCategory Gender Race Maltreatment  Count
147         NaN    NaN  NaN          NaN    NaN
   AgeCategory Gender Race Maltreatment  Count
80         NaN    NaN  NaN          NaN    NaN


In [4]:
# write to file
df.to_csv("../../outputs/AZ_yearly_subs.csv", index = False)

In [5]:
# yearly totals
df.groupby(["year"]).sum()

Unnamed: 0_level_0,Count
year,Unnamed: 1_level_1
2010,9954.0
2011,11659.0
2012,14452.0
2013,15640.0
2014,18137.0
2015,18664.0
2016,16886.0
2017,15934.0
2018,15949.0
2019,14441.0


In [6]:
# get state population from census
state = (
    pd
    .read_csv("../../data/national/state_B03002.csv")
    .loc[ lambda x: x["state"] == "Arizona" ]
    .sort_values("year")
    .reset_index()
    .drop("index", axis = 1)
)

state.head()

Unnamed: 0,total,non_hispanic,white,black,native,asian,pac,other,two,hispanic,fips,state,year
0,6246816,4432142,3667031,228860,253612,162134,11053,9792,99660,1814674,0400000US04,Arizona,2010
1,6337373,4472998,3686433,238499,256668,168066,11148,9152,103032,1864375,0400000US04,Arizona,2011
2,6410979,4508033,3701932,246474,257107,173231,11568,8063,109658,1902946,0400000US04,Arizona,2012
3,6479703,4543755,3716047,252752,258904,178627,11818,7539,118068,1935948,0400000US04,Arizona,2013
4,6561516,4584490,3734853,257620,262626,186451,11720,7032,124188,1977026,0400000US04,Arizona,2014


In [7]:
# Risk assessments by year
# apply to whole frame of summed races, pops and reg counts
def rr(df):
    
    year = df.name
    
    if int(year) > 2019:
        year = 2019
        
    pop = state.loc[ lambda x: x["year"] == year ]["total"].values[0]
    
    pop_black = state["black"].values[0]
    pop_white = state["white"].values[0]
    pop_hisp = state["hispanic"].values[0]
    
    # registry population in total
    subgrp = df["Count"].sum()
        
    subgrp_black = (
        df
        .loc[ lambda x: x["Race"] == "African American" ]
        ["Count"]
        .values[0]
    )
    
    subgrp_white = (
        df
        .loc[ lambda x: x["Race"] == "White" ]
        ["Count"]
        .values[0]
    )
    
    subgrp_hisp = (
        df
        .loc[ lambda x: x["Race"] == "Hispanic" ]
        ["Count"]
        .values[0]
    )
  
    # rates
    rate_white = subgrp_white/pop_white
    rate_black = subgrp_black/pop_black
    rate_hisp = subgrp_hisp/pop_hisp

    
    # risk ratios
    rrBlackWhite = rate_black/rate_white
    rrHispWhite = rate_hisp/rate_white
    
    return pd.Series({
        "Risk Ratio Black Over White": rrBlackWhite,
        "Risk Ratio, Hispanic Over White": rrHispWhite
    })

# Risk ratios of substantiations
az_rr = (
    df
    .groupby(["Race", "year"])
    .sum()
    .reset_index()
    .groupby("year")
    .apply(rr)
)

az_rr

Unnamed: 0_level_0,Risk Ratio Black Over White,"Risk Ratio, Hispanic Over White"
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,3.575445,1.400417
2011,3.873158,1.38179
2012,3.822649,1.45674
2013,3.866959,1.518403
2014,4.023179,1.421877
2015,4.419567,1.51425
2016,4.271866,1.569331
2017,4.56607,1.603007
2018,4.838754,1.611323
2019,5.609834,1.653794


In [8]:
# average risk ratios 
az_rr.mean().to_frame("")

Unnamed: 0,Unnamed: 1
Risk Ratio Black Over White,4.426512
"Risk Ratio, Hispanic Over White",1.518638


In [9]:
# state makeup
total_pop = (
    state
    .loc[ lambda x: x["year"] == 2019 ]
    .iloc[0]["total"]
)

(
    state
    .loc[ lambda x: x["year"] == 2019 ]
    .apply(
        lambda x: x
        .apply(
            lambda y: y/total_pop if type(y) != str else y
        ), 
        axis = 1
    ).T
)

Unnamed: 0,9
total,1.0
non_hispanic,0.686728
white,0.54707
black,0.042141
native,0.039203
asian,0.03214
pac,0.001827
other,0.001507
two,0.02284
hispanic,0.313272


In [10]:
total_pop

7050299

---

---

---