# References

[1](https://www.nytimes.com/2019/04/25/us/us-mass-incarceration-rate.html) Robertson, C. (2019, 25 April). Crime Is Down, Yet U.S. Incarceration Rates Are Still Among the Highest in the World. _The New York Times_, https://www.nytimes.com/2008/04/23/world/americas/23iht-23prison.12253738.html. Accessed 12 December, 2019.

[2](https://www.nytimes.com/2008/04/23/world/americas/23iht-23prison.12253738.html) Liptak, A. (2008, 23 April). U.S. prison population dwarfs that of other nations. _The New York Times_, https://www.nytimes.com/2008/04/23/world/americas/23iht-23prison.12253738.html. Accessed 12 December, 2019.

[3](https://www.bjs.gov/index.cfm?ty=dcdetail&iid=261) Bureau of Justice Statistics. Data Collection: Annual Survey of Jails (ASJ). _Bureau of Justice Statistics_, https://www.bjs.gov/index.cfm?ty=dcdetail&iid=261. Accessed 18 November, 2019.

[4](https://www.bjs.gov/index.cfm?ty=dcdetail&iid=404) Bureau of Justice Statistics. Data Collection: Census of Jail Inmates. _Bureau of Justice Statistics_, https://www.bjs.gov/index.cfm?ty=dcdetail&iid=404. Accessed 18 November, 2019.

[5](https://www.bjs.gov/index.cfm?ty=dcdetail&iid=255) Bureau of Justice Statistics. Data Collection: Census of State and Federal Adult Correctional Facilities (CSFACF). _Bureau of Justice Statistics_, https://www.bjs.gov/index.cfm?ty=dcdetail&iid=255. Accessed 18 November, 2019.

[6](https://catalog.data.gov/dataset/sentenced-inmates-in-correctional-facilities) Lansing, M. Sentenced Inmates in Correctional Facilities. _Data.ct.gov_, https://catalog.data.gov/dataset/sentenced-inmates-in-correctional-facilities. Accessed 2 December 2019.

[7](https://www.tandfonline.com/doi/full/10.1080/00031305.2014.917055?scroll=top&needAccess=true)  Peter H. Westfall (2014) Kurtosis as Peakedness, 1905–2014. R.I.P., The American Statistician, 68:3, 191-195, DOI: 10.1080/00031305.2014.917055.
https://www.tandfonline.com/doi/full/10.1080/00031305.2014.917055?scroll=top&needAccess=true

[8](https://museumandmemorial.eji.org/museum) The Legacy Museum: From Enslavement to Mass Incarceration. _eji.org_. https://museumandmemorial.eji.org/museum. Accessed 12 December, 2019.

# Data Preparation

## Cleaning
The following is some code that was used to clean and organize data that was retreived from the BJS sources.

In [5]:
import re
import pandas as pd
import pickle
import json
import numpy as np
import scipy.linalg as la
import statsmodels.api as sm
from sklearn import linear_model, model_selection, metrics
import sklearn
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import sys
np.set_printoptions(threshold=sys.maxsize)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# regex things that I want
comma = re.compile(r',')
perc = re.compile(r'%')
make_perc = re.compile(r'(.*)(\(%.*\))')
# make_perc = re.compile(r'hello')
perc = re.compile(r'(.*)(\()([0-9.]*)(%?\))(.?)')
fix_dash = re.compile(r'(\s*-\s)([A-z\s]*)')
comma = re.compile(r',')
u = re.compile(r'(u)([0-9]*)')
letter = re.compile(r'([\(\)A-z]?)([0-9]*)([\(\)A-z]?)')


In [None]:
data = pd.read_csv('male_incarceration_data.csv')
cols = data.columns
for col in cols[3:]:
    items = data[col]
    for i,item in enumerate(items):
        new_item = float(comma.sub('', item))
        data[col][i] = new_item
data.to_csv('male_incarceration_data.csv')

In [None]:
data = pd.read_csv('incarceration_by_race.csv')
cols = data.columns
for col in cols[3:]:
    items = data[col]
#     print(col, items)
    for i,item in enumerate(items):
#         print(type(item))
        if type(item) != int:
            new_item = float(comma.sub('', item))
        else:
            new_item = item
        data[col][i] = new_item
data.to_csv('incarceration_by_race.csv')

In [None]:
data = pd.read_csv('female_incarceration_data.csv')
cols = data.columns
for col in cols[3:]:
    items = data[col]
#     print(col, items)
    for i,item in enumerate(items):
#         print(type(item))
        if type(item) != int:
            new_item = float(comma.sub('', item))
        else:
            new_item = item
        data[col][i] = new_item
data.to_csv('female_incarceration_data.csv')

In [None]:
data = pd.read_csv('incarceration_count.csv')
cols = data.colum
for col in cols[2:]:
    items = data[col]
#     print(col, items)
    for i,item in enumerate(items):
#         print(type(item))
        if type(item) != int and type(item) != float:
#             print(type(item) != float)
            new_item = float(comma.sub('', item))
        else:
            new_item = item
        data[col][i] = new_item
data.to_csv('incarceration_counts.csv')

## Feature Engineering

Here is code that I used to engineer different features that I used for regression

In [None]:
df = pd.read_csv('Sentenced_Inmates_in_Correctional_Facilities.csv')
df.columns

In [None]:
df = pd.read_csv('Sentenced_Inmates_in_Correctional_Facilities.csv')
df.RACE.replace(
    to_replace={
        'WHITE   ':'WHITE','BLACK   ':'BLACK','ASIAN   ':'ASIAN'
    }, inplace=True
)
df.to_csv('individuals.csv')

In [None]:
df['LATEST ADMISSION DATE'] = pd.to_datetime(df['LATEST ADMISSION DATE'])
# df['END SENTENCE DATE'] = pd.to_datetime(df['END SENTENCE DATE'])
mask = df['END SENTENCE DATE'] is np.nan
esd = 'END SENTENCE DATE'
nesd = 'N END SENTENCE DATE'
lad = 'LATEST ADMISSION DATE'
sd = 'SENTENCE DAYS'
new_end = np.empty_like(df[esd])
for i in range(len(new_end)):
    to_add = float(df.iloc[i][sd])
    if (to_add / 365) > 300:
        to_add = 300. * 365
    new_end[i] = df.iloc[i][lad] + pd.DateOffset(days=to_add)
df[esd] = new_end
df.to_csv('fixed_date.csv')

In [None]:
df = pd.read_csv('individuals.csv').sample(500000)

offence = pd.get_dummies(df.OFFENSE)
race = pd.get_dummies(df.RACE)
race.drop('AMER IND', axis=1, inplace=True)
race = pd.get_dummies(df.RACE)
race.drop('AMER IND', axis=1, inplace=True)

gender = pd.get_dummies(df.GENDER).drop('F', axis=1)


# becasue of bad encoding I need to clean these variables
 
t1 = offence['53A021'] + offence['53A021                                  ']
t2 = offence['ALTERNATE PENALTY FOR DRUG SALES        '] + offence['ALTERNATE PENALTY FOR DRUG SALES']
t3 = offence['APPLICATION TO PURCHASE A FIREARM'] + offence['APPLICATION TO PURCHASE A FIREARM       ']
t4 = offence['CARRY PIST/RVOLV W/O PERMIT'] + offence['CARRY PIST/RVOLV W/O PERMIT             ']
t5 = offence['CARRYING OR SALE OF DANGEROUS WEAPON'] + offence['CARRYING OR SALE OF DANGEROUS WEAPON    ']
t6 = offence['CHEATING AT GAMBLING'] + offence['CHEATING AT GAMBLING                    ']
t7 = offence['CONSPIRACY'] + offence['CONSPIRACY                              ']
t8 = offence['CREDIT CARD THEFT'] + offence['CREDIT CARD THEFT                       ']
t9 = offence['CRIMINAL ATTEMPT'] + offence['CRIMINAL ATTEMPT                        ']
t10 = offence['CRIMINAL LIABILITY FOR ANOTHER PERSON'] + offence['CRIMINAL LIABILITY FOR ANOTHER PERSON   ']
t11 = offence['CRUELTY TO ANIMALS'] + offence['CRUELTY TO ANIMALS                      ']
t12 = offence['CRUELTY TO PERSONS'] + offence['CRUELTY TO PERSONS                      ']
t13 = offence['DRIVING WHILE LICENSE SUSP FOR DWI'] + offence['DRIVING WHILE LICENSE SUSP FOR DWI      ']
t14 = offence['DRUGS NEAR PROBIBITED PLACE'] + offence['DRUGS NEAR PROBIBITED PLACE             ']
t15 = offence['ENTICING A MINOR'] + offence['ENTICING A MINOR                        ']
t16 = offence['ESCAPE FROM CUSTODY'] + offence['ESCAPE FROM CUSTODY                     ']
t17 = offence['FAILURE TO OBEY AN OFFICER'] + offence['FAILURE TO OBEY AN OFFICER              ']
t18 = offence['FALSELY REPORTING INCIDENT'] + offence['FALSELY REPORTING INCIDENT              ']
t19 = offence['HOME INVASION'] + offence['HOME INVASION                           ']
t20 = offence['ILLEGAL USE OF CREDIT CARD'] + offence['ILLEGAL USE OF CREDIT CARD              ']
t21 = offence['INTERFERENCE WITH SEARCH'] + offence['INTERFERENCE WITH SEARCH                ']
t22 = offence['INTERSTATE CUSTODY COMPACT'] + offence['INTERSTATE CUSTODY COMPACT              ']
t23 = offence['ISSUING A BAD CHECK'] + offence['ISSUING A BAD CHECK                     ']
t24 = offence['LARCENY DEFINED'] + offence['LARCENY DEFINED                         ']
t25 = offence['OPERATE UNDER INFLU OF LIQ OR DRUGS'] + offence['OPERATE UNDER INFLU OF LIQ OR DRUGS     ']
t26 = offence['POSS OF ASSAULT WEAPON PROHIBITED'] + offence['POSS OF ASSAULT WEAPON PROHIBITED       ']
t27 = offence['POSSESS OF LT 4 OZ MJ OR CONTROLD SUB'] + offence['POSSESS OF LT 4 OZ MJ OR CONTROLD SUB   ']
t28 = offence['POSSESSION OF NARCOTICS'] + offence['POSSESSION OF NARCOTICS                 ']
t29 = offence['PROHIB ACTS RE: CONTROLLED SUBSTANCES'] + offence['PROHIB ACTS RE: CONTROLLED SUBSTANCES   ']
t30 = offence['PROHIBITED ACTIVITIES'] + offence['PROHIBITED ACTIVITIES                   ']
t31 = offence['REGULATION OF RESTRICTED SUBSTANCES'] + offence['REGULATION OF RESTRICTED SUBSTANCES     ']
t32 = offence['ROBBERY BY CARJACKING'] + offence['ROBBERY BY CARJACKING                   ']
t33 = offence['ROBBERY, FIRST DEGREE                BF'] + offence['ROBBERY, FIRST DEGREE                 BF'] + offence['ROBBERY, FIRST DEGREE                BF ']
t34 = offence['SALE OF HEROIN, COC BY NON-DEPENDENT'] + offence['SALE OF HEROIN, COC BY NON-DEPENDENT    ']
t35 = offence['SALE, CARRY & BRAND OF FACSIMILE'] + offence['SALE, CARRY & BRAND OF FACSIMILE        ']
t36 = offence['STRANGULATION 3RD DEGREE'] + offence['STRANGULATION 3RD DEGREE                ']
t37 = offence['USING MOTOR VEHICLE W/O PERMISSION'] + offence['USING MOTOR VEHICLE W/O PERMISSION      ']
t38 = offence['VIOLATION OF CONDITIONS OF RELEASE'] + offence['VIOLATION OF CONDITIONS OF RELEASE      ']
t39 = offence['VIOLATION OF PROBATION OR COND DISCHG'] + offence['VIOLATION OF PROBATION OR COND DISCHG   ']
t40 = offence['VOYEURISM'] + offence['VOYEURISM                               ']
t41 = offence['YOUTHFUL OFFENDER'] + offence['YOUTHFUL OFFENDER                       ']

offence['53A021']
offence['ALTERNATE PENALTY FOR DRUG SALES'] = t1
offence['ALTERNATE PENALTY FOR DRUG SALES'] = t2
offence['CARRY PIST/RVOLV W/O PERMIT'] = t4
offence['CARRYING OR SALE OF DANGEROUS WEAPON'] = t5
offence['CHEATING AT GAMBLING'] = t6
offence['CONSPIRACY'] = t7
offence['CREDIT CARD THEFT'] = t7
offence['CRIMINAL ATTEMPT'] = t9
offence['CRIMINAL LIABILITY FOR ANOTHER PERSON'] = t10
offence['CRUELTY TO ANIMALS'] = t11
offence['CRUELTY TO PERSONS'] = t12
offence['DRIVING WHILE LICENSE SUSP FOR DWI'] = t13
offence['DRUGS NEAR PROBIBITED PLACE'] = t14
offence['ENTICING A MINOR'] = t15
offence['ESCAPE FROM CUSTODY'] = t16
offence['FAILURE TO OBEY AN OFFICER'] = t17
offence['FALSELY REPORTING INCIDENT'] = t18
offence['HOME INVASION'] = t19
offence['ILLEGAL USE OF CREDIT CARD'] = t20
offence['INTERFERENCE WITH SEARCH'] = t21
offence['INTERSTATE CUSTODY COMPACT'] = t22
offence['ISSUING A BAD CHECK'] = t23
offence['LARCENY DEFINED'] = t24
offence['OPERATE UNDER INFLU OF LIQ OR DRUGS'] = t25 
offence['POSS OF ASSAULT WEAPON PROHIBITED'] = t26
offence['POSSESS OF LT 4 OZ MJ OR CONTROLD SUB'] = t27
offence['POSSESSION OF NARCOTICS'] = t28
offence['PROHIB ACTS RE: CONTROLLED SUBSTANCES'] = t29
offence['PROHIBITED ACTIVITIES'] = t30
offence['REGULATION OF RESTRICTED SUBSTANCES'] = t31 
offence['ROBBERY BY CARJACKING'] = t32
offence['ROBBERY, FIRST DEGREE                 BF'] = t33 
offence['SALE OF HEROIN, COC BY NON-DEPENDENT'] = t34
offence['SALE, CARRY & BRAND OF FACSIMILE'] = t35
offence['STRANGULATION 3RD DEGREE'] = t36
offence['USING MOTOR VEHICLE W/O PERMISSION'] = t37 
offence['VIOLATION OF CONDITIONS OF RELEASE'] = t38 
offence['VIOLATION OF PROBATION OR COND DISCHG'] = t39
offence['VOYEURISM'] = t40
offence['YOUTHFUL OFFENDER'] = t41 

In [None]:
# now we drop all the excess categories 
# AND one of the categories to fix linear dependence
offence.drop(
    [
        '53A021', # this is the dependence
        'ALTERNATE PENALTY FOR DRUG SALES        ',
        'APPLICATION TO PURCHASE A FIREARM       ',
        'CARRY PIST/RVOLV W/O PERMIT             ',
        'CARRYING OR SALE OF DANGEROUS WEAPON    ',
        'CHEATING AT GAMBLING                    ',
        'CONSPIRACY                              ',
        'CREDIT CARD THEFT                       ',
        'CRIMINAL ATTEMPT                        ',
        'CRIMINAL LIABILITY FOR ANOTHER PERSON   ',
        'CRUELTY TO ANIMALS                      ',
        'CRUELTY TO PERSONS                      ',
        'DRIVING WHILE LICENSE SUSP FOR DWI      ',
        'DRUGS NEAR PROBIBITED PLACE             ',
        'ENTICING A MINOR                        ',
        'ESCAPE FROM CUSTODY                     ',
        'FAILURE TO OBEY AN OFFICER              ',
        'FALSELY REPORTING INCIDENT              ',
        'HOME INVASION                           ',
        'ILLEGAL USE OF CREDIT CARD              ',
        'INTERFERENCE WITH SEARCH                ',
        'INTERSTATE CUSTODY COMPACT              ',
        'ISSUING A BAD CHECK                     ',
        'LARCENY DEFINED                         ',
        'OPERATE UNDER INFLU OF LIQ OR DRUGS     ',
        'POSS OF ASSAULT WEAPON PROHIBITED       ',
        'POSSESS OF LT 4 OZ MJ OR CONTROLD SUB   ',
        'POSSESSION OF NARCOTICS                 ',
        'PROHIB ACTS RE: CONTROLLED SUBSTANCES   ',
        'PROHIBITED ACTIVITIES                   ',
        'REGULATION OF RESTRICTED SUBSTANCES     ',
        'ROBBERY BY CARJACKING                   ',
        'ROBBERY, FIRST DEGREE                BF',
        'ROBBERY, FIRST DEGREE                BF ',
        'SALE OF HEROIN, COC BY NON-DEPENDENT    ',
        'SALE, CARRY & BRAND OF FACSIMILE        ',
        'STRANGULATION 3RD DEGREE                ',
        'USING MOTOR VEHICLE W/O PERMISSION      ',
        'VIOLATION OF CONDITIONS OF RELEASE      ',
        'VIOLATION OF PROBATION OR COND DISCHG   ',
        'VOYEURISM                               ',
        'YOUTHFUL OFFENDER                       '
    ], axis=1, inplace=True
)

one = pd.DataFrame(np.ones_like(df.AGE))
reg_df = pd.concat(
    [
#         df['IDENTIFIER'],
#         df['LATEST ADMISSION DATE'],
        df['AGE'],
        df['SENTENCE DAYS'],
        offence,
        gender,
        race, 
        one
    ], axis=1
)

reg_df.to_csv('regression_df.csv')

In [None]:
df = pd.read_csv('individuals.csv').sample(3000000)

race = pd.get_dummies(df.RACE)
race.drop('AMER IND', axis=1, inplace=True)

one = pd.DataFrame(np.ones_like(df.AGE))
reg_df = pd.concat(
    [
#         df['IDENTIFIER'],
#         df['LATEST ADMISSION DATE'],
        df['AGE'],
        df['SENTENCE DAYS'],
#         gender,
        race, 
        one
    ], axis=1
)
reg_df.to_csv('race_regression_df.csv')

# Extras
Here I have inserted the full results from the resgressions that I ran.

## Full regression

In [6]:
regr_df = pd.read_csv('small_regression_df.csv')
regr_df.dropna(inplace=True)
sentence = regr_df['SENTENCE DAYS']
regr_df.drop(
    ['SENTENCE DAYS','Unnamed: 0','IDENTIFIER','LATEST ADMISSION DATE'], axis=1, inplace=True
)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    regr_df.astype(float), sentence, test_size=.3
)
res_gen = sm.OLS(y_train, X_train).fit()
print(res_gen.summary())

                            OLS Regression Results                            
Dep. Variable:          SENTENCE DAYS   R-squared:                       0.582
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     162.4
Date:                Thu, 12 Dec 2019   Prob (F-statistic):               0.00
Time:                        14:50:27   Log-Likelihood:            -2.6340e+05
No. Observations:               22824   AIC:                         5.272e+05
Df Residuals:                   22629   BIC:                         5.288e+05
Df Model:                         194                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

## Race Only Regression


In [7]:
race_regr_df = pd.read_csv('race_regression_df.csv')
race_regr_df.dropna(inplace=True)
sentence = race_regr_df['SENTENCE DAYS']
race_regr_df.drop(
    ['SENTENCE DAYS','Unnamed: 0'], axis=1, inplace=True
)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    race_regr_df.astype(float), sentence, test_size=.3
)
res_gen = sm.OLS(y_train, X_train).fit()
print(res_gen.summary())

                            OLS Regression Results                            
Dep. Variable:          SENTENCE DAYS   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     6083.
Date:                Thu, 12 Dec 2019   Prob (F-statistic):               0.00
Time:                        14:52:13   Log-Likelihood:            -9.7050e+06
No. Observations:              809642   AIC:                         1.941e+07
Df Residuals:                  809636   BIC:                         1.941e+07
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
AGE          645.7792      3.728    173.207      0.0

In [11]:
# df = pd.read_csv('individuals.csv')
for thing in regr_df.columns:
    print(thing)

AGE
1ST DEGREE MURDER, OTHER HOMICIDES     F
53-202A
53A189C
ABUSE OF PERSONS-1ST DEGREE           CF
ABUSE OF PERSONS-2ND DEGREE           DF
AGGAVTD SEX ASSLT-VCTM < 13YR         AF
ALLOWING PERSON UNDER 16 TO OPERATE MV M
ALTERING OR REMOVING IDENT NUMBER      F
ALTERNATE PENALTY FOR DRUG SALES
APPLICATION TO PURCHASE A FIREARM
ARSON MURDER                           F
ARSON, FIRST DEGREE                   AF
ARSON, SECOND DEGREE                  BF
ARSON, THIRD DEGREE                   CF
ASSAULT 2ND DEGREE WITH A FIREARM     DF
ASSAULT 2ND WITH MV WHILE INTOXICATED DF
ASSAULT 2ND, VICTIM 60 OR OVER        DF
ASSAULT 3RD DEGREE, VICTIM OVER 59    AM
ASSAULT ON POLICE OR FIRE OFFICER     CF
ASSAULT, 1ST VICTIM 60 OR OVER        BF
ASSAULT, FIRST DEGREE                 BF
ASSAULT, SECOND DEGREE                DF
ASSAULT, THIRD DEGREE                 AM
BREACH OF PEACE                       BM
BRIBERY OF WITNESS                    DF
BURGLARY, FIRST DEGREE                BF
BURGLARY, F

# Code Documentation

Here are the code blocks, in order that I removed from the final report pdf.

In [None]:
import pandas as pd
import numpy as np
import scipy.linalg as la
import scipy.stats as stats
import statsmodels.api as sm
from sklearn import linear_model, model_selection, metrics
import sklearn
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import pprint
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load in bjs data
incar = pd.read_csv('incarceration_trends.csv')
race = pd.read_csv('incarceration_by_race.csv')
crime = pd.read_csv('crime_data.csv')

In [None]:
# print samples from the data sets
print(incar[['Year','State prisons','Population']].sample(3))

In [None]:
race['White_rate']
cols = ['Geography','Total','White','Black','White_rate','Black_rate']
print(race[cols].sample(2))

In [None]:
cols = ['Year','Violent crime','Murder','Rape','Robbery','Assault']
print(crime[cols].sample(3))

In [None]:
# load in connecticut data sets
inmates = pd.read_csv('individuals.csv')
race_regr_df = pd.read_csv('race_regression_df.csv')

In [None]:
# print samples
cols = ['LATEST ADMISSION DATE','AGE','RACE','SENTENCE DAYS']
print(inmates[cols].sample(3))

In [None]:
#  disclose sample sizes for races
races = {'BLACK':'Blacks','WHITE':'Whites',
         'HISPANIC':'Hispanic','AMER IND':'American Indian',
         'ASIAN':'Asian'
        }
for rac in races.keys():
    mask = inmates.RACE == rac
    print(f'Sample size for {races[rac]}: {len(inmates[mask])}')

In [None]:
# plot the incarceration trends against crime
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(
    incar['Year'].values,
    incar['State prisons'].values / (incar.Population*10**4),
    label='State Prisons', color='tab:blue')

ax.plot(
    incar['Year'].values,
    incar['Local jails'].values / (incar.Population*10**4),
    label='Local Jails', color='tab:orange')
ax2 = ax.twinx()
ax2.plot(
    crime.Year, crime['Violent crime'],
    label='violent crime', color='tab:red'
)
ax2.set_ylabel('rate per 100,000')
ax2.legend()
ax.legend()
ax.set_title('Incarceration and Crime Trends Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('rate per 100,000')
plt.show()

In [None]:
# plot histogram of sentence lengths
fig, ax = plt.subplots(1, figsize=(10,5))
ax.set_xlim(0,10000)
mask = inmates['SENTENCE DAYS'] < 10000
ax.hist(
    inmates['SENTENCE DAYS'][mask].values,
    bins=200
)
plt.show()

In [None]:
# plot histograms of incarceration rates by race
fig, ax = plt.subplots(2,2, figsize=(10,7))
ax[0,0].hist(race['White_rate'])
ax[0,0].set_title('White Incarceration Rates')
ax[0,0].set_xlabel('rate per 100,000')

ax[0,1].hist(race['Black_rate'], color='tab:orange')
ax[0,1].set_title('Black Incarceration Rates')
ax[0,1].set_xlabel('rate per 100,000')

ax[1,0].hist(race['Hisp_rate'], color='tab:green')
ax[1,0].set_title('Hispanic Incarceration Rates')
ax[1,0].set_xlabel('rate per 100,000')

ax[1,1].hist(race['Asian_rate'], color='tab:gray')
ax[1,1].set_title('Asian Incarceration Rates')
ax[1,1].set_xlabel('rate per 100,000')
fig.tight_layout()
plt.show()

In [None]:
# plot other stats related to sentence lenghts by race
races = inmates.groupby(by='RACE')
print(
    'Median sentence lengths:\n {}'.format(races['SENTENCE DAYS'].median())
)
print('\n')
print(
    'Standard deviation of sentence lengths:\n {}'.format(races['SENTENCE DAYS'].std())
)

In [None]:
# plot boxplot of sentence lengths
fig, ax = plt.subplots(1, figsize=(10,7))
ax.set_xlim(-50,20000)
bp = inmates.boxplot(
    ['SENTENCE DAYS'], by=['RACE'],
    vert=False, grid=False,
    ax=ax
)
ax.set_title('distribution of sentence lengths')
ax.set_xlabel('sentence length in days')
plt.show()

In [None]:
# plot histograms of sentence lengths by race 
fig, ax = plt.subplots(3,2, figsize=(8,5))
indexer = {
    (0,0):'BLACK',
    (0,1):'WHITE',
    (1,0):'ASIAN',
    (1,1):'HISPANIC',
    (2,0):'AMER IND'
}
for key in indexer.keys():
    ax[key].set_xlim(-50,20000)
    mask = races.get_group(indexer[key])['SENTENCE DAYS'] < 20000
    ax[key].hist(
        races.get_group(indexer[key])['SENTENCE DAYS'][mask],
        bins=50
    )
    ax[key].set_xlabel('distribution of sentence length in days')
    ax[key].set_title(indexer[key])

fig.tight_layout()
plt.show()

In [None]:
# plot the kurtosis of sentence lengths by race
out = '{} Kurtosis: {}'
kurts = {
    'White':
    stats.kurtosis(races.get_group('WHITE')['SENTENCE DAYS']),
    'Black':
    stats.kurtosis(races.get_group('BLACK')['SENTENCE DAYS']),
    'Hispanic':
    stats.kurtosis(races.get_group('HISPANIC')['SENTENCE DAYS']),
    'American Indian':
    stats.kurtosis(races.get_group('AMER IND')['SENTENCE DAYS']),
    'Asian':
    stats.kurtosis(races.get_group('ASIAN')['SENTENCE DAYS']),
}

for key in kurts.keys():
    print(
        out.format(key, kurts[key])
    )

In [None]:
# explore results of regression
race_regr_df.dropna(inplace=True)
sentence = race_regr_df['SENTENCE DAYS']
race_regr_df.drop(
    ['SENTENCE DAYS','Unnamed: 0'], axis=1, inplace=True
)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    race_regr_df.astype(float), sentence, test_size=.3
)
res_gen = sm.OLS(y_train, X_train).fit()
print('Regression Results:\n')
print(f'\tMethod: Least Squares')
print(f'\tR-squared value: {res_gen.rsquared}')
print(f'\tAIC: {res_gen.aic}')
print(f'\tBIC: {res_gen.bic}')

In [None]:
# plot sentence lengths of people that are convicted of 
# criminal liability
race_mask1 = inmates.RACE != 'AMER IND'
race_mask2 = inmates.RACE != 'ASIAN'
race_mask = np.logical_and(race_mask1,race_mask2)

mask = inmates[race_mask]['OFFENSE'] =='CRIMINAL LIABILITY FOR ANOTHER PERSON'

fig, ax = plt.subplots(1, figsize=(5,5))
ax.set_xlim(-50,80000)
bp = inmates[race_mask][mask].boxplot(
    ['SENTENCE DAYS'], by=['RACE'],
    vert=False, grid=False,
    ax=ax
)
ax.set_title('Distribution of Sentence Lengths for CRIMINAL LIABILITY FOR ANOTHER PERSON')
ax.set_xlabel('sentence length in days')
plt.show()

In [None]:
# plot kurtosis related to criminal liability
mask = inmates[race_mask]['OFFENSE'] == 'CRIMINAL LIABILITY FOR ANOTHER PERSON'


out = '{} Kurtosis for\n\t{}: {}'
kurts = {
    'White':
    stats.kurtosis(
        races.get_group('WHITE')['SENTENCE DAYS'][race_mask][mask]
    ),
    'Black':
    stats.kurtosis(
        races.get_group('BLACK')['SENTENCE DAYS'][race_mask][mask]
    ),
    'Hispanic':
    stats.kurtosis(
        races.get_group('HISPANIC')['SENTENCE DAYS'][race_mask][mask]
    )
#     'American Indian':
#     stats.kurtosis(races.get_group('AMER IND')['SENTENCE DAYS'][mask]),
#     'Asian':
#     stats.kurtosis(races.get_group('ASIAN')['SENTENCE DAYS'][mask]),
}

for key in kurts.keys():
    print(
        out.format(key, 'CRIMINAL LIABILITY FOR ANOTHER PERSON', kurts[key])
)

In [None]:
# plot a sample of offenses
race_mask1 = inmates.RACE != 'AMER IND'
race_mask2 = inmates.RACE != 'ASIAN'
race_mask = np.logical_and(race_mask1,race_mask2)
offenses = [
    'MURDER                                AF',
    'SALE OF CONTROLLED SUBSTANCE           F',
    'SALE OF HALLUCIGEN/NARCOTIC SUBSTANCE  F',
    'ROBBERY, FIRST DEGREE                 BF'
#     'CRIMINAL LIABILITY FOR ANOTHER PERSON'
]
for off in offenses:
    
    mask = inmates[race_mask]['OFFENSE'] == off
    # inmates[mask]['RACE'].sample(10)
    fig, ax = plt.subplots(1, figsize=(5,5))
    lim = np.percentile(
        inmates['SENTENCE DAYS'][race_mask][mask],
        99
    )
    ax.set_xlim(-50,lim)
    bp = inmates[race_mask][mask].boxplot(
        ['SENTENCE DAYS'], by=['RACE'],
        vert=False, grid=False,
        ax=ax
    )
    ax.set_title(f'distribution of sentence lengths for {off}')
    ax.set_xlabel('sentence length in days')
    plt.show()
    out = '{} Kurtosis: {}'
    kurts = {
        'White':
        stats.kurtosis(
            races.get_group('WHITE')['SENTENCE DAYS'][race_mask][mask]
        ),
        'Black':
        stats.kurtosis(
            races.get_group('BLACK')['SENTENCE DAYS'][race_mask][mask]
        ),
        'Hispanic':
        stats.kurtosis(
            races.get_group('HISPANIC')['SENTENCE DAYS'][race_mask][mask]
        )
#         'American Indian':
#         stats.kurtosis(races.get_group('AMER IND')['SENTENCE DAYS'][mask]),
#         'Asian':
#         stats.kurtosis(races.get_group('ASIAN')['SENTENCE DAYS'][mask]),
    }

    for key in kurts.keys():
        print(
            out.format(key, kurts[key])
    )