## Analyze substantiation data from Texas Department of Family and Protective Services

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../../data/states/texas/TX_full.csv")
df.head(3)

Unnamed: 0,Fiscal Year,Region Code,Region,County Code,County,Disposition,Perpetrator Count,Confirmed Perpetrator,Perpetrator Age,Perpetrator Gender,Perpetrator Race Ethnicity,Perpetrator Marital Status,Relation to Victim
0,2010,4,4-Tyler,1,Anderson,Ruled Out,5,Unconfirmed,36-45,Female,Anglo,Unknown,Parent
1,2010,4,4-Tyler,1,Anderson,Ruled Out,6,Unconfirmed,18-25,Female,Anglo,Married,Parent
2,2010,4,4-Tyler,1,Anderson,Ruled Out,6,Unconfirmed,18-25,Female,Anglo,Unknown,Parent


In [3]:
# Load county census demos
county = (
    pd
    .read_csv(
        "../../data/national/county_B03002.csv",
        dtype = {
            "fips": object,
        }
    )
    .loc[ lambda x: x["state"] == "Texas" ]
)

county.head(3)

Unnamed: 0,fips,name,total,non_hispanic,white,black,native,asian,pac,other,two,hispanic,year,state,county
2523,0500000US48001,"Anderson County, Texas",57810,47527,34069,12111,181,321,9,16,820,10283,2019,Texas,Anderson
2524,0500000US48003,"Andrews County, Texas",18036,7885,7246,122,0,65,31,0,421,10151,2019,Texas,Andrews
2525,0500000US48005,"Angelina County, Texas",87322,67996,52832,12747,105,957,5,102,1248,19326,2019,Texas,Angelina


In [4]:
# Load state census demos
state = (
    pd
    .read_csv(
        "../../data/national/state_B03002.csv"
    )
    .loc[ lambda x: x["state"] == "Texas" ]
)

state

Unnamed: 0,total,non_hispanic,white,black,native,asian,pac,other,two,hispanic,fips,state,year
43,26538614,16342247,11635757,3070821,65378,1110772,18751,33723,407045,10196367,0400000US48,Texas,2015
95,24774187,15557947,11349192,2856383,69329,927023,17758,40018,298244,9216240,0400000US48,Texas,2011
147,24311891,15394414,11286712,2810118,67744,892981,16874,43791,276194,8917477,0400000US48,Texas,2010
199,28635442,17341185,11850477,3367449,65132,1396953,21477,55897,583800,11294257,0400000US48,Texas,2020
251,26092033,16129390,11562453,3015767,65974,1053474,18730,33114,379878,9962643,0400000US48,Texas,2014
303,25639373,15921646,11488269,2956545,66100,1005797,18011,34413,352511,9717727,0400000US48,Texas,2013
355,27419612,16745703,11755493,3199022,65883,1222975,20170,39153,443007,10673909,0400000US48,Texas,2017
407,26956435,16543285,11705684,3134962,63336,1161742,18990,35509,423062,10413150,0400000US48,Texas,2016
459,25208897,15729227,11415017,2903204,67134,966343,17955,37097,322477,9479670,0400000US48,Texas,2012
511,27885195,16963639,11807263,3269253,68452,1292813,20381,42354,463123,10921556,0400000US48,Texas,2018


In [5]:
# group by year
yearly = (
    df
    .loc[ lambda x: x["Confirmed Perpetrator"] == "Confirmed" ]
    .groupby("Fiscal Year")
    ["Perpetrator Count"]
    .sum()
    .to_frame()
)
yearly.to_csv("../../outputs/TX_yearly_subs.csv")
yearly

Unnamed: 0_level_0,Perpetrator Count
Fiscal Year,Unnamed: 1_level_1
2010,53520
2011,53041
2012,51819
2013,53634
2014,53870
2015,53924
2016,47582
2017,50961
2018,52819
2019,53394


In [6]:
# by year and race
dfyr = (
    df
    .loc[ lambda x: x["Confirmed Perpetrator"] == "Confirmed" ]
    .groupby(["Fiscal Year", "Perpetrator Race Ethnicity"])
    ["Perpetrator Count"]
    .sum()
    .unstack()
)

dfyr

Perpetrator Race Ethnicity,African American,Anglo,Asian,Hispanic,Native American,Other
Fiscal Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,9700,20793,337,20660,123,1907
2011,9359,20006,309,21145,107,2115
2012,8796,20018,330,20709,60,1906
2013,9163,20907,321,21143,92,2008
2014,9540,21705,310,20292,79,1944
2015,9796,21400,389,20321,116,1902
2016,8855,18851,326,17769,67,1714
2017,9882,18787,352,19507,61,2372
2018,11026,19206,363,19898,85,2241
2019,11288,18757,370,20849,80,2050


In [7]:
# 2019 snapshot
(
    dfyr
    .loc[ 2019 ]
    .to_frame()
    .assign(
        percent_of_whole = lambda f: f/(dfyr.loc[2019].sum())
    )
)

Unnamed: 0_level_0,2019,percent_of_whole
Perpetrator Race Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1
African American,11288,0.21141
Anglo,18757,0.351294
Asian,370,0.00693
Hispanic,20849,0.390475
Native American,80,0.001498
Other,2050,0.038394


In [8]:
# Registry risk ratios for whole state
def rrs(r):
    
    def div(num, denom):
        if denom == 0:
            return 0
        return num/denom
    
    year = r["Fiscal Year"]
    
    if year > 2020: # ignore years with no census data
        return
        
    #get state for this year
    syear = state.loc[ lambda x: x["year"] == year ]

    # total pop in state
    total = syear["total"].values[0]
    black_pop = syear["black"].values[0]
    white_pop = syear["white"].values[0]
    hisp_pop = syear["hispanic"].values[0]
    
    # total registry county
    reg = r.sum()
    
    # count racial population of reg
    white_reg = r["Anglo"]
    black_reg = r["African American"]
    hisp_reg = r["Hispanic"]
    
    # calculate risk ratios for state population
    rate_black = div(black_reg, black_pop)
    
    rate_white = div(white_reg, white_pop)
    
    rate_hisp = div(hisp_reg, hisp_pop)

    rrBlackWhite = div(rate_black, rate_white)
    rrHispWhite = div(rate_hisp, rate_white)
    
    res = pd.Series({
        "Year": year,
        "State pop": total,
        "Reg pop": reg,
        "Percent black": div(black_pop,total),
        "RR Black to White": rrBlackWhite,
        "RR Hisp to White": rrHispWhite
    })
    
    return res

stateRR = (
    dfyr
    .reset_index()
    .apply(
        rrs,
        axis = 1
    )
)

stateRR

Unnamed: 0,Year,State pop,Reg pop,Percent black,RR Black to White,RR Hisp to White
0,2010.0,24311891.0,55530.0,0.115586,1.873689,1.257589
1,2011.0,24774187.0,55052.0,0.115297,1.858736,1.301543
2,2012.0,25208897.0,53831.0,0.115166,1.727681,1.245724
3,2013.0,25639373.0,55647.0,0.115313,1.703006,1.195542
4,2014.0,26092033.0,55884.0,0.115582,1.685159,1.085027
5,2015.0,26538614.0,55939.0,0.115711,1.734503,1.083629
6,2016.0,26956435.0,49598.0,0.116297,1.753956,1.059603
7,2017.0,27419612.0,52978.0,0.116669,1.932907,1.143537
8,2018.0,27885195.0,54837.0,0.11724,2.073394,1.12005
9,2019.0,28260856.0,55413.0,0.117785,2.143525,1.185467


In [9]:
# average across all years
stateRR.mean().to_frame("")

Unnamed: 0,Unnamed: 1
Year,2015.0
State pop,26520230.0
Reg pop,54619.82
Percent black,0.1162039
RR Black to White,1.880003
RR Hisp to White,1.170675


### 2019 County Risk Ratios

In [10]:
# county, year, race, 2019
cy19 = (
    df
    .loc[ lambda x: x["Confirmed Perpetrator"] == "Confirmed" ]
    .loc[ lambda x: x["Fiscal Year"] == 2019 ]
    .groupby(["County", "Perpetrator Race Ethnicity"])
    ["Perpetrator Count"]
    .sum()
    .unstack()
)

cy19.head()

Perpetrator Race Ethnicity,African American,Anglo,Asian,Hispanic,Native American,Other
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Anderson,59.0,125.0,,19.0,,2.0
Andrews,2.0,23.0,,30.0,,
Angelina,50.0,89.0,,26.0,1.0,6.0
Aransas,4.0,68.0,3.0,29.0,,3.0
Archer,,14.0,,,,


In [11]:
# Registry risk ratios for county
def rrc(row):
    def div(num, denom):
        if denom == 0:
            return 0
        return num/denom
    
    county_name = row["County"]    

    try:
    # get county census data
        census = (
            county
            .loc[ lambda x: 
                 (x["year"] == 2019) &
                 (x["county"] == county_name)
            ]
        )
        # total pop in state
        total = census["total"].values[0]
    except Exception as e: # out of state doesn't have census data, ignore it
        return
    
    black_pop = census["black"].values[0]
    white_pop = census["white"].values[0]
    hisp_pop = census["hispanic"].values[0]
        
    # total registry county
    reg = row[[
        "African American", 
        "Anglo",
        "Asian",
        "Hispanic",
        "Native American",
        "Other"
    ]].sum()
    
    # count racial population of reg
    white_reg = row["Anglo"]
    black_reg = row["African American"]
    hisp_reg = row["Hispanic"]
    
    # calculate risk ratios for state population
    rate_black = div(black_reg, black_pop)
    rate_nonBlack = div((reg - black_reg), (total - black_pop))
    
    rate_white = div(white_reg, white_pop)
    
    rate_hisp = div(hisp_reg, hisp_pop)
    rate_nonHisp = div((reg - hisp_reg), (total - hisp_pop))
    
    rrBlack = div(rate_black, rate_nonBlack)
    rrHisp = div(rate_hisp, rate_nonHisp)

    rrBlackWhite = div(rate_black, rate_white)
    rrHispWhite = div(rate_hisp, rate_white)
    
    res = pd.Series({
        "County": county_name,
        "County pop": total,
        "Percent black": div(black_pop,total),
        "Reg pop": reg,
        "RR Black to White": rrBlackWhite,
        "RR Hisp to White": rrHispWhite
    })
    
    return res

In [12]:
# apply county risk ratios
countyRR = (
    cy19
    .reset_index()
    .apply(
        rrc,
        axis = 1
    )
)

In [13]:
# subset only large counties
(
    countyRR
    .loc[ 
        lambda x: 
        (x["County pop"] > 100000) &
        (x["Percent black"] > 0.1)
    ]
    .sort_values("RR Black to White", ascending = False)
)

Unnamed: 0,County,County pop,Percent black,Reg pop,RR Black to White,RR Hisp to White
19,Brazos,222981.0,0.102802,256.0,5.427344,2.025045
156,McLennan,251089.0,0.142607,914.0,3.696029,1.588997
55,Dallas,2606868.0,0.222562,5102.0,3.346815,1.486269
214,Tarrant,2049770.0,0.16141,4390.0,3.198931,1.390983
98,Harris,4646630.0,0.185735,5527.0,3.058187,1.400418
77,Fort Bend,765394.0,0.201167,442.0,2.597154,2.289142
13,Bell,348574.0,0.215151,1072.0,1.893825,0.924074
207,Smith,227449.0,0.172192,460.0,1.891995,1.224977
126,Kaufman,123804.0,0.117153,293.0,1.854712,0.944259
82,Galveston,332885.0,0.123481,585.0,1.826112,1.23654


---
---
---