In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read in individual datasets for each state
all_states = os.listdir('clean_data/')
all_states = {f[:2]:pd.read_csv(f"clean_data/{f}") for f in all_states if len(f)<7}
print("Total number of states (including Federal Bureau of Prisons):", len(all_states.keys()))
all_states['MA']

Total number of states (including Federal Bureau of Prisons): 51


Unnamed: 0,name,abbreviation,staff_tests,prisoner_tests,total_staff_cases,total_prisoner_cases,total_staff_deaths,total_prisoner_deaths,date,week_num
0,Massachusetts,MA,,,4.0,9.0,0.0,0.0,2020-03-26,2020-13
1,Massachusetts,MA,,,7.0,20.0,0.0,0.0,2020-04-01,2020-14
2,Massachusetts,MA,,,26.0,46.0,0.0,3.0,2020-04-08,2020-15
3,Massachusetts,MA,,,42.0,90.0,0.0,4.0,2020-04-15,2020-16
4,Massachusetts,MA,,296.0,93.0,127.0,0.0,7.0,2020-04-22,2020-17
5,Massachusetts,MA,,747.0,120.0,257.0,0.0,7.0,2020-04-29,2020-18
6,Massachusetts,MA,,1906.0,151.0,351.0,0.0,7.0,2020-05-06,2020-19


In [3]:
# Read in population data and fillna using OLS
pop = pd.read_csv('clean_data/population.csv')

# Fillin the nan values using a linear model
from sklearn.linear_model import LinearRegression
train = pop[pop['staff_pop']>0].copy()
test = pop[pop['staff_pop'].isna()].copy()
X_train, X_test, y_train = train[['prison_pop']], test[['prison_pop']], train['staff_pop']
ols = LinearRegression().fit(X_train, y_train)
y_pred = ols.predict(X_test)
y_pred = list(map(int, y_pred))
pop.loc[pop['staff_pop'].isnull(), 'staff_pop'] = y_pred
assert pop.isna().sum()['staff_pop'] == 0

# Define 2 types of targets for OLS

In [4]:
# Define 2 types of targets for OLS
# 1. Percentage (in %) of for most recent cumulative prisoner/staff cases over prisoner/staff population
# 2. The slope of cumulative prisoner/staff cases

state_names = [c for c in list(all_states.keys()) if 'US' not in c]
result = {'state':[], 'prison_perc':[], 'staff_perc':[], 'prison_slope':[], 'staff_slope':[]}

for state in state_names:
    result['state'].append(state)
    state_df = all_states[state]
    # Normalize cases by population
    prisoner_pop = pop[pop['abbreviation'] == state]['prison_pop'].values[0]
    staff_pop = pop[pop['abbreviation'] == state]['staff_pop'].values[0]
    state_df['total_prisoner_cases'] /= prisoner_pop
    state_df['total_staff_cases'] /= staff_pop
    
    result['prison_perc'].append(state_df['total_prisoner_cases'].max() * 100)
    result['staff_perc'].append(state_df['total_staff_cases'].max() * 100)
    prison_slope = (state_df['total_prisoner_cases'].max() - state_df['total_prisoner_cases'].min()) / 7
    staff_slope = (state_df['total_staff_cases'].max() - state_df['total_staff_cases'].min()) / 7
    result['prison_slope'].append(prison_slope)
    result['staff_slope'].append(staff_slope)
result = pd.DataFrame(result)
result

Unnamed: 0,state,prison_perc,staff_perc,prison_slope,staff_slope
0,MN,0.960982,2.164066,0.001373,0.003092
1,AZ,0.165571,0.609185,0.000237,0.0
2,WA,0.141593,0.397456,0.000202,0.000503
3,WV,0.0,0.066401,0.0,9.5e-05
4,AL,0.037818,0.271609,5.4e-05,0.000365
5,NC,1.855579,0.373038,0.002651,0.000533
6,MO,0.123077,0.181818,0.00017,0.00026
7,NV,0.0,0.571429,0.0,0.000765
8,OR,0.30433,0.452196,0.000435,0.000646
9,TX,0.953441,1.397167,0.001361,0.001988
