In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, I/O
import pingouin as pg # for two-sample t-testing

In [2]:
# Load the monthly per-state new-deaths table and discard the 'Unnamed: 0'
# column (presumably an index that was written out with the CSV -- confirm).
data = (
    pd.read_csv("../Datasets/Refined_Datasets/state_new_deaths_monthly.csv")
    .drop(columns=['Unnamed: 0'])
)
data.head()

Unnamed: 0,Month,State,New Deaths
0,2020-02,Alabama,0.0
1,2020-03,Alabama,23.0
2,2020-04,Alabama,249.0
3,2020-05,Alabama,358.0
4,2020-06,Alabama,320.0


In [3]:
# Months of the two relief acts, written as 'YYYY-MM' strings so they can be
# compared directly against the 'Month' column:
#   ARP_2021 - American Rescue Plan Act of 2021
#   CA_2020  - CARES Act of 2020
ARP_2021 = '2021-03'
CA_2020 = '2020-03'
# Preview the rows that fall in the ARP month.
data.loc[data['Month'].eq(ARP_2021)].head()

Unnamed: 0,Month,State,New Deaths
13,2021-03,Alabama,625.0
29,2021-03,Alaska,23.0
45,2021-03,Arizona,987.0
61,2021-03,Arkansas,383.0
77,2021-03,California,6439.0


In [4]:
# New deaths for months strictly after the CARES Act month and strictly before
# the ARP month (both act months themselves are excluded).
new_deaths_after_CA_2020 = data.loc[
    data['Month'].gt(CA_2020) & data['Month'].lt(ARP_2021),
    'New Deaths',
]
new_deaths_after_CA_2020

2      249.0
3      358.0
4      320.0
5      630.0
6      602.0
       ...  
792     37.0
793    128.0
794    223.0
795    158.0
796     75.0
Name: New Deaths, Length: 550, dtype: float64

In [5]:
# New deaths for every month strictly before the CARES Act month.
new_deaths_before_CA_2020 = data.loc[data['Month'].lt(CA_2020), 'New Deaths']
new_deaths_before_CA_2020

0      0.0
16     0.0
32     0.0
48     0.0
64     0.0
80     0.0
96     0.0
112    0.0
128    0.0
144    0.0
160    0.0
176    0.0
192    0.0
208    0.0
224    0.0
240    0.0
256    0.0
272    0.0
288    0.0
304    0.0
320    0.0
336    0.0
352    0.0
368    0.0
384    0.0
400    0.0
416    0.0
432    0.0
448    0.0
464    0.0
480    0.0
496    0.0
512    0.0
528    0.0
544    0.0
560    0.0
576    0.0
592    0.0
608    0.0
624    0.0
640    0.0
656    0.0
672    0.0
688    0.0
704    0.0
720    0.0
736    1.0
752    0.0
768    0.0
784    0.0
Name: New Deaths, dtype: float64

In [6]:
# Mean of 'New Deaths' over all rows dated before the CARES Act month.
avg_new_deaths_pre_CA_2020 = new_deaths_before_CA_2020.mean()
avg_new_deaths_pre_CA_2020  # bare last expression so the notebook displays the value

0.02

In [7]:
# Mean of 'New Deaths' over the rows selected into new_deaths_after_CA_2020
# (months strictly between the CARES Act month and the ARP month).
avg_new_deaths_after_CA = new_deaths_after_CA_2020.mean()
avg_new_deaths_after_CA  # bare last expression so the notebook displays the value

920.7327272727273

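The gap between these two means already looks large, but a difference in means alone does not establish statistical significance. Note also that the pre-CARES sample covers only a single month (2020-02), in which almost every state reported zero new deaths. Let's conduct a t-test and see what it shows.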

In [8]:
# Two-sample t-test: new deaths after the CARES Act month vs. before it.
test = pg.ttest(new_deaths_after_CA_2020, new_deaths_before_CA_2020)
# Pull the p-value out of the first row of the result table as a plain float.
p = float(test['p-val'].iloc[0])
# True when the p-value is below the 0.05 threshold (95% confidence level).
sig = p < 0.05
print(p)    # the p-value first
print(sig)  # then whether the difference is significant

7.451914543148873e-33
True


In [9]:
# New deaths in the window leading up to the ARP month: months strictly after
# the CARES Act month and strictly before the ARP month.
new_deaths_before_ARP_2021 = data['New Deaths'][
    (data['Month'] > CA_2020) & (data['Month'] < ARP_2021)
]
new_deaths_before_ARP_2021

2      249.0
3      358.0
4      320.0
5      630.0
6      602.0
       ...  
792     37.0
793    128.0
794    223.0
795    158.0
796     75.0
Name: New Deaths, Length: 550, dtype: float64

In [10]:
# New deaths for every month strictly after the ARP month.
new_deaths_after_ARP_2021 = data.loc[data['Month'].gt(ARP_2021), 'New Deaths']
new_deaths_after_ARP_2021

14     342.0
15      17.0
30      34.0
31       0.0
46     357.0
       ...  
767      7.0
782    235.0
783     17.0
798      7.0
799      0.0
Name: New Deaths, Length: 100, dtype: float64

In [11]:
# Mean of 'New Deaths' over the rows selected into new_deaths_before_ARP_2021
# (months strictly between the CARES Act month and the ARP month).
avg_new_deaths_before_ARP = new_deaths_before_ARP_2021.mean()
avg_new_deaths_before_ARP  # bare last expression so the notebook displays the value

920.7327272727273

In [12]:
# Mean of 'New Deaths' over all rows dated after the ARP month.
avg_new_deaths_after_ARP_2021 = new_deaths_after_ARP_2021.mean()
avg_new_deaths_after_ARP_2021  # bare last expression so the notebook displays the value

243.08

Again, even before t-testing, the difference between these means looks large. A t-test will likely confirm that it is statistically significant.

In [13]:
# Two-sample t-test: new deaths after the ARP month vs. the window before it.
test = pg.ttest(new_deaths_after_ARP_2021, new_deaths_before_ARP_2021)
# Read the p-value from the first row of the result table as a plain float.
p = test['p-val'].iat[0].item()
# True when the p-value is below the 0.05 threshold (95% confidence level).
sig = p < 0.05
# Show the p-value first, then the significance flag, one per line.
print(p, sig, sep='\n')

6.844282239913526e-14
True
