## Analyze substantiation data from the New Jersey Department of Children and Families

In [1]:
import pandas as pd

In [2]:
# Read the Excel workbook and flatten its two-level row index into ordinary
# columns, as the first step towards a long ("tidy") table without
# multi-indexing. The actual melt to long form happens in the next cell.
ldf = (
    pd.read_excel(
        "../../data/states/new_jersey/Buzzfeed data request August 2021(1).xlsx",
        index_col = [0,1], # first two sheet columns form a two-level row index
        header = [0,1], # first two sheet rows form a two-level column header
        skipfooter = 1 # skip the sheet's last row (presumably a grand-total row -- TODO confirm)
    )
    .reset_index() # turn the index levels into columns so they can be renamed and forward filled
    .rename(
        columns = {
            "level_0": "year",
            "level_1": "risk_level"
        }
    )
    # Fill year / risk level downward over missing values; presumably the sheet
    # only writes them on the first row of each group -- TODO confirm.
    .assign(
        year = lambda f: f["year"].ffill(),
        risk_level = lambda f: f["risk_level"].ffill()
    )
)

ldf.head()

Unnamed: 0_level_0,year,risk_level,Unnamed: 2_level_0,Substantiated,Substantiated,Substantiated,Substantiated,Substantiated,Substantiated,Established,...,Unfounded,Unfounded,Unfounded,Pending,Pending,Pending,Pending,Pending,Pending,Grand Total
Risk Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Allegation Type,Black or African American,Hispanic,Missing or Undetermined,Other,White,Total,Black or African American,...,Other,White,Total,Black or African American,Hispanic,Missing or Undetermined,Other,White,Total,Unnamed: 33_level_1
0,2015,Low,Emotional Abuse,,,,,,,,...,,1.0,3.0,,,,,,,15.0
1,2015,Low,Multiple,6.0,16.0,1.0,3.0,6.0,32.0,17.0,...,11.0,53.0,220.0,,,,,,,1033.0
2,2015,Low,Neglect,29.0,30.0,,3.0,18.0,80.0,100.0,...,62.0,497.0,1827.0,,,,,,,7704.0
3,2015,Low,Physical Abuse,4.0,6.0,,1.0,,11.0,47.0,...,37.0,146.0,640.0,,,,,,,3478.0
4,2015,Low,Sexual Abuse,47.0,119.0,4.0,5.0,34.0,209.0,3.0,...,7.0,71.0,298.0,,,,,,,1040.0


In [3]:
# Continue reformatting: melt the wide (determination x race) columns into
# one row per year / risk level / allegation type / determination / race,
# then drop the subtotal rows so counts are not double counted.
df = (     
    ldf
    .set_index(["year", "risk_level", ("Unnamed: 2_level_0", "Allegation Type")])
    .melt(
        value_vars = ldf.columns.to_list()[3:], # every column after the three identifier columns
        ignore_index = False # indices are important
    )
    .reset_index() # make them cols again
    .rename( columns = {
        ("Unnamed: 2_level_0","Allegation Type"): "allegation_type",
        None: "determination", # name displays as NaN but really it's None
        "Risk Level": "race" # looks weird, but correct step
    })
    .loc[ lambda x: # removes total rows (and the "Pending" determination)
         ~x["determination"].str.contains("total|pending", case = False) &
         ~x["risk_level"].str.contains("total", case = False) & 
         (x["race"] != "Total") &
         (x["allegation_type"] != "Total")
    ]
    .assign(
        year = lambda f: pd.to_numeric(f["year"]) # make year numeric for later grouping / census lookup
    )
)

df.head(3)

Unnamed: 0,year,risk_level,allegation_type,determination,race,value
0,2015,Low,Emotional Abuse,Substantiated,Black or African American,
1,2015,Low,Multiple,Substantiated,Black or African American,6.0
2,2015,Low,Neglect,Substantiated,Black or African American,29.0


In [4]:
# helper to calculate percents across rows
def percent(row):
    """Return each entry of ``row`` as a fraction of the row's sum.

    ``row`` is a pandas Series; the result keeps the same index.
    """
    return row / row.sum()

In [5]:
# The analysis focuses on substantiated findings only
is_substantiated = df["determination"].eq("Substantiated")
# "Risk Assessment not Required" indicates not an official child abuse investigation
is_investigation = df["risk_level"].ne("Risk Assessment not Required")
subs = df.loc[is_substantiated & is_investigation]

subs.head()

Unnamed: 0,year,risk_level,allegation_type,determination,race,value
0,2015,Low,Emotional Abuse,Substantiated,Black or African American,
1,2015,Low,Multiple,Substantiated,Black or African American,6.0
2,2015,Low,Neglect,Substantiated,Black or African American,29.0
3,2015,Low,Physical Abuse,Substantiated,Black or African American,4.0
4,2015,Low,Sexual Abuse,Substantiated,Black or African American,47.0


In [6]:
# sanity check: count rows per (risk level, determination) pair in the selection
subs.value_counts(subset=["risk_level", "determination"])

risk_level  determination
High        Substantiated    150
Low         Substantiated    150
Moderate    Substantiated    150
Very High   Substantiated    150
dtype: int64

In [7]:
# Substantiation counts summed per year, kept as a one-column frame
yearly = subs.groupby("year")[["value"]].sum()
# written to disk because the appeals analysis needs it later
yearly.to_csv("../../outputs/NJ_yearly_subs.csv")
yearly

Unnamed: 0_level_0,value
year,Unnamed: 1_level_1
2015,3500.0
2016,3102.0
2017,2765.0
2018,2678.0
2019,2188.0
2020,1475.0


In [8]:
# mean of the yearly substantiation totals
yearly.mean()

value    2618.0
dtype: float64

In [9]:
# yearly race
# Sum only the numeric "value" column. Recent pandas versions no longer drop
# the text columns silently when a whole groupby is summed, so select it
# explicitly to keep the table to counts only.
(
    subs
    .groupby(["year", "race"])[["value"]]
    .sum()
    .unstack()
)

Unnamed: 0_level_0,value,value,value,value,value
race,Black or African American,Hispanic,Missing or Undetermined,Other,White
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015,1083.0,1056.0,47.0,179.0,1135.0
2016,862.0,1065.0,42.0,137.0,996.0
2017,788.0,890.0,42.0,143.0,902.0
2018,764.0,905.0,33.0,134.0,842.0
2019,610.0,765.0,35.0,120.0,658.0
2020,364.0,570.0,23.0,64.0,454.0


In [10]:
# yearly race by percent
# NOTE(review): this is computed from `df` (every non-pending determination),
# not from `subs` like the neighbouring cells -- confirm that is intended.
# Sum only the numeric "value" column so the text columns never reach
# `percent` on pandas versions that no longer drop them automatically.
(
    df
    .groupby(["year", "race"])[["value"]]
    .sum()
    .unstack()
    .apply(percent, axis = 1) # each year's row becomes shares of that year's total
)

Unnamed: 0_level_0,value,value,value,value,value
race,Black or African American,Hispanic,Missing or Undetermined,Other,White
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015,0.310478,0.291531,0.0291,0.044178,0.324712
2016,0.312194,0.302543,0.029465,0.042333,0.313464
2017,0.300737,0.310724,0.032826,0.044001,0.311712
2018,0.299048,0.313576,0.033988,0.044595,0.308794
2019,0.296867,0.3274,0.037029,0.043889,0.294816
2020,0.285258,0.323996,0.039811,0.044868,0.306068


In [11]:
# grand total of the selected substantiation counts
subs.loc[:, "value"].sum()

15708.0

In [12]:
# Load the state-level census demographics, then keep only New Jersey's rows
all_states = pd.read_csv("../../data/national/state_S0901.csv")
census = all_states.loc[all_states["state"] == "New Jersey"]

census.head()

Unnamed: 0,fips,state,total,white,black,native,asian,pac,other,two,hispanic,non_hispanic_white,year
30,0400000US34,New Jersey,1993559.0,1251955,285079,3987,185401,0,161478,103665,512345,950928,2017
82,0400000US34,New Jersey,2049546.0,1317858,307432,6149,176261,0,151666,90180,459098,1057566,2012
134,0400000US34,New Jersey,2073770.0,1331360,315213,4148,170049,0,172123,78803,435492,1105319,2010
186,0400000US34,New Jersey,2017223.0,1272868,294515,6052,183567,0,159361,100861,494220,990456,2015
238,0400000US34,New Jersey,1967255.0,1233469,279350,3935,182955,0,161315,104265,515421,928544,2018


In [13]:
## risk ratios for grouping by year
def rr(grp, census_df=None):
    """Compute Black and Hispanic risk ratios, relative to white, for one year.

    Intended for ``groupby("year").apply(rr)``.

    Parameters
    ----------
    grp : pandas.DataFrame
        One year's rows, with "race" and "value" columns. ``grp.name`` must
        hold the year (groupby sets this on each group).
    census_df : pandas.DataFrame, optional
        Population table with "year", "white", "black" and "hispanic"
        columns. Defaults to the notebook-level ``census`` frame.

    Returns
    -------
    pandas.Series
        "Risk Ratio, Black" and "Risk Ratio, Hispanic": each group's
        per-capita rate divided by the white per-capita rate.

    Raises
    ------
    IndexError
        If the census table has no row for the group's year.
    """
    if census_df is None:
        census_df = census  # notebook-level census table

    year = int(grp.name)

    # state populations by race for this year
    state = census_df.loc[census_df["year"] == year]
    if state.empty:
        # same exception type the bare `.values[0]` lookup used to raise,
        # but with a message that says what is actually missing
        raise IndexError(f"no census row for year {year}")

    def rate(race_label, pop_column):
        # count for one race divided by that race's state population
        count = grp.loc[grp["race"] == race_label, "value"].sum()
        return count / state[pop_column].values[0]

    # rates
    rate_white = rate("White", "white")
    rate_black = rate("Black or African American", "black")
    rate_hisp = rate("Hispanic", "hispanic")

    # risk ratios
    return pd.Series({
        "Risk Ratio, Black": rate_black / rate_white,
        "Risk Ratio, Hispanic": rate_hisp / rate_white
    })

# Black and Hispanic risk ratios of substantiations, one row per year
subs.groupby("year").apply(rr)

Unnamed: 0_level_0,"Risk Ratio, Black","Risk Ratio, Hispanic"
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,4.123904,2.396244
2016,3.748344,2.675321
2017,3.83657,2.411069
2018,4.00646,2.572187
2019,4.145251,2.725829
2020,3.432007,2.797853


In [14]:
# average each risk ratio across the years
rr_per_year = subs.groupby("year").apply(rr)
rr_per_year.mean().to_frame("")

Unnamed: 0,Unnamed: 1
"Risk Ratio, Black",3.882089
"Risk Ratio, Hispanic",2.596417


---

---

---