In [202]:
import glob
import pandas as pd

# Path to csv files
covid_data = 'Data/covid/*.csv'
vaccination_usa = 'Data/vaccinations/us.csv'
vaccination_usa_state = 'Data/vaccinations/us_state_vaccinations.csv'

In [203]:
data = pd.DataFrame()

# Read all css files per day comnine in one dataset
for file in glob.glob(covid_data):
    df = pd.read_csv(file)
    # Check if "Last_update" column has null values, fill it with forward fill
    # propagate[s] last valid observation forward to next valid
    if df['Last_Update'].isna().sum() > 0:
        df.fillna(method='ffill', inplace=True)
    # Remove invalid states from dataset
    df = df[~df['Province_State'].isin(['Diamond Princess', 'Grand Princess', 'Recovered'])]
    # Set Recovered field to 0 when no confirmed cases
    df['Recovered'] = df.apply(lambda x: x['Confirmed'] if x['Confirmed']==0 else x['Recovered'], axis=1)
    data = data.append(df,ignore_index=True)
    

In [204]:
# Convert data to YYYY-MM-DD format
data['Last_Update'] = pd.to_datetime(data['Last_Update'], format='%Y/%m/%d %H:%M:%S').dt.strftime('%Y-%m-%d')
data

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate,Total_Test_Results,Case_Fatality_Ratio
0,Alabama,US,2020-06-08,32.3182,-86.9023,20777,692,11395.0,8413.0,1.0,418.095585,259566.0,2022.0,3.375610,84000001.0,USA,5293.824320,9.863415,,
1,Alaska,US,2020-06-08,61.3707,-152.4044,543,10,382.0,151.0,2.0,74.226466,64904.0,,1.841621,84000002.0,USA,8872.181479,,,
2,American Samoa,US,2020-06-08,-14.2710,-170.1320,0,0,0.0,0.0,60.0,0.000000,174.0,,,16.0,ASM,312.719038,,,
3,Arizona,US,2020-06-08,33.7298,-111.4312,26989,1051,5517.0,20421.0,4.0,370.793369,281621.0,3352.0,3.894179,84000004.0,USA,3869.102206,12.419875,,
4,Arkansas,US,2020-06-08,34.9697,-92.3731,9426,154,6424.0,2848.0,5.0,312.346329,160273.0,844.0,1.633779,84000005.0,USA,5310.914824,8.953957,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17355,Virginia,US,2020-04-25,37.7693,-78.1700,11594,411,1672.0,11183.0,51.0,146.625304,69440.0,1837.0,3.544937,84000051.0,USA,878.183637,15.844402,,
17356,Washington,US,2020-04-25,47.4009,-121.4905,12977,762,,12255.0,53.0,171.894387,160324.0,455.0,5.563690,84000053.0,USA,2123.664613,3.506203,,
17357,West Virginia,US,2020-04-25,38.4912,-80.9545,1010,32,439.0,978.0,54.0,76.341763,29811.0,97.0,3.168317,84000054.0,USA,2253.291388,9.603960,,
17358,Wisconsin,US,2020-04-25,44.2685,-89.6165,5356,263,,5093.0,55.0,103.506165,59929.0,1353.0,4.910381,84000055.0,USA,1158.144318,25.261389,,


In [205]:
# Find all missing values
data.isna().sum()

Province_State              0
Country_Region              0
Last_Update                 0
Lat                         0
Long_                       0
Confirmed                   0
Deaths                      0
Recovered                2492
Active                      9
FIPS                        0
Incident_Rate               0
People_Tested            5544
People_Hospitalized     12168
Mortality_Rate           5746
UID                         0
ISO3                        0
Testing_Rate                0
Hospitalization_Rate    12168
Total_Test_Results      11816
Case_Fatality_Ratio     11915
dtype: int64

In [206]:
# Drop columns that have more than 60-70% of missing values
# 17360 rows / 11915 = 0.68
data.drop(['People_Hospitalized', 'Hospitalization_Rate', 
                'Total_Test_Results', 'Case_Fatality_Ratio'], axis=1, inplace=True)

In [191]:
data.isna().sum()

Province_State       0
Country_Region       0
Last_Update          0
Lat                  0
Long_                0
Confirmed            0
Deaths               0
Recovered         2492
Active               9
FIPS                 0
Incident_Rate        0
People_Tested     5544
Mortality_Rate    5746
UID                  0
ISO3                 0
Testing_Rate         0
dtype: int64

In [207]:
# Create tmp DataFrame with missing values and States
df_nan = data[['Province_State','Recovered', 'People_Tested', 'Mortality_Rate', 'Active']]
# Group by State and use interpolate method to fill out missing values with previous value for the same state
df_interpolated = df_nan.groupby('Province_State').apply(lambda x: x.interpolate(method='linear'))
# Check that missing value amount is drastically reduced
df_interpolated.isna().sum()

Province_State      0
Recovered         185
People_Tested       0
Mortality_Rate     45
Active              0
dtype: int64

In [208]:
df_interpolated

Unnamed: 0,Province_State,Recovered,People_Tested,Mortality_Rate,Active
0,Alabama,11395.0,259566.0,3.375610,8413.0
1,Alaska,382.0,64904.0,1.841621,151.0
2,American Samoa,0.0,174.0,,0.0
3,Arizona,5517.0,281621.0,3.894179,20421.0
4,Arkansas,6424.0,160273.0,1.633779,2848.0
...,...,...,...,...,...
17355,Virginia,1672.0,69440.0,3.544937,11183.0
17356,Washington,0.0,160324.0,5.563690,12255.0
17357,West Virginia,439.0,29811.0,3.168317,978.0
17358,Wisconsin,461729.0,59929.0,4.910381,5093.0


In [210]:
# Group by state and  fill out missing values with mean value 
df_means = df_interpolated.groupby('Province_State').transform(lambda x:x.fillna(x.mean()))
# Update original DataFrame with new values
data.update(df_means)
data.isna().sum()

Province_State    0
Country_Region    0
Last_Update       0
Lat               0
Long_             0
Confirmed         0
Deaths            0
Recovered         0
Active            0
FIPS              0
Incident_Rate     0
People_Tested     0
Mortality_Rate    0
UID               0
ISO3              0
Testing_Rate      0
dtype: int64

In [211]:
data

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,Mortality_Rate,UID,ISO3,Testing_Rate
0,Alabama,US,2020-06-08,32.3182,-86.9023,20777,692,11395.0,8413.0,1.0,418.095585,259566.0,3.375610,84000001.0,USA,5293.824320
1,Alaska,US,2020-06-08,61.3707,-152.4044,543,10,382.0,151.0,2.0,74.226466,64904.0,1.841621,84000002.0,USA,8872.181479
2,American Samoa,US,2020-06-08,-14.2710,-170.1320,0,0,0.0,0.0,60.0,0.000000,174.0,2.841323,16.0,ASM,312.719038
3,Arizona,US,2020-06-08,33.7298,-111.4312,26989,1051,5517.0,20421.0,4.0,370.793369,281621.0,3.894179,84000004.0,USA,3869.102206
4,Arkansas,US,2020-06-08,34.9697,-92.3731,9426,154,6424.0,2848.0,5.0,312.346329,160273.0,1.633779,84000005.0,USA,5310.914824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17355,Virginia,US,2020-04-25,37.7693,-78.1700,11594,411,1672.0,11183.0,51.0,146.625304,69440.0,3.544937,84000051.0,USA,878.183637
17356,Washington,US,2020-04-25,47.4009,-121.4905,12977,762,0.0,12255.0,53.0,171.894387,160324.0,5.563690,84000053.0,USA,2123.664613
17357,West Virginia,US,2020-04-25,38.4912,-80.9545,1010,32,439.0,978.0,54.0,76.341763,29811.0,3.168317,84000054.0,USA,2253.291388
17358,Wisconsin,US,2020-04-25,44.2685,-89.6165,5356,263,461729.0,5093.0,55.0,103.506165,59929.0,4.910381,84000055.0,USA,1158.144318


In [53]:
# tmp1 = data[data['Province_State'] == 'California']
# tmp1[tmp1['Recovered'].notnull()]

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,Mortality_Rate,UID,ISO3,Testing_Rate
1629,California,US,2020-05-02,36.1162,-119.6816,52536,2126,0.0,49900.0,6.0,132.690209,654985.0,4.086418,84000006.0,USA,1670.51276
2525,California,US,2020-04-20,36.1162,-119.6816,33946,1225,749.0,32461.0,6.0,85.914781,290500.0,3.636526,84000006.0,USA,740.908504
3477,California,US,2020-04-15,36.1162,-119.6816,26699,860,489.0,25826.0,6.0,68.061564,216486.0,3.226411,84000006.0,USA,552.13879
3533,California,US,2020-04-14,36.1162,-119.6816,25373,767,427.0,24589.0,6.0,64.669453,202208.0,3.028869,84000006.0,USA,515.723328
4877,California,US,2020-04-18,36.1162,-119.6816,30785,1140,703.0,29351.0,6.0,77.766063,251614.0,3.738808,84000006.0,USA,641.731334
4933,California,US,2020-04-19,36.1162,-119.6816,31660,1177,721.0,30254.0,6.0,80.163495,280900.0,3.744711,84000006.0,USA,716.424092
5772,California,US,2020-04-12,36.1162,-119.6816,22805,640,367.0,22155.0,6.0,58.137726,190328.0,2.81202,84000006.0,USA,485.423868
5829,California,US,2020-04-13,36.1162,-119.6816,23946,714,391.0,23217.0,6.0,61.035048,190882.0,2.987756,84000006.0,USA,486.836823
7621,California,US,2020-05-01,36.1162,-119.6816,50712,2031,0.0,48099.0,6.0,127.854538,625337.0,4.051466,84000006.0,USA,1594.896734
9693,California,US,2020-04-16,36.1162,-119.6816,27687,956,548.0,26721.0,6.0,70.589069,246400.0,3.457745,84000006.0,USA,628.433237


In [77]:
# # data.groupby('Province_State').transform(pd.DataFrame.interpolate).isna().sum()
# # data.isna().sum()
# # data.groupby('Province_State')
# # data[data['Lat'].isnull()]['Province_State'].unique()
# # data[~data['Province_State'].isin(['Diamond Princess', 'Grand Princess', 'Recovered'])]
# data.apply(lambda x: x['Confirmed'] if x['Confirmed']==0 else x['Recovered'], axis=1).isna().sum()

2492

In [75]:
# data[data.isnull().any(1)]
# #
# grp = data.groupby('Province_State').get_group('American Samoa')
# # df['B'].apply(lambda x: x if df['A'] == 0 else df['A']
# tmp = grp.apply(lambda x: x['Confirmed'] if x['Confirmed']==0 else x['Recovered'], axis=1)
# tmp[tmp != 0]

Series([], dtype: int64)

In [20]:
# data[data['Last_Update'].str.contains('2021-02-10') == True]

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate,Total_Test_Results,Case_Fatality_Ratio
4818,Alabama,US,2021-02-10 05:30:29,32.3182,-86.9023,474666,8579,252880.0,213207.0,1.0,9680.768725,,,,84000001.0,USA,44824.150017,,2197811.0,1.807376
4819,Alaska,US,2021-02-10 05:30:29,61.3707,-152.4044,55893,280,,55613.0,2.0,7640.404896,,,,84000002.0,USA,213418.039902,,1561249.0,0.500957
4820,American Samoa,US,2021-02-10 05:30:29,-14.271,-170.132,0,0,,0.0,60.0,0.0,,,,16.0,ASM,3846.084722,,2140.0,
4821,Arizona,US,2021-02-10 05:30:29,33.7298,-111.4312,787268,14286,108965.0,664017.0,4.0,10816.027055,,,,84000004.0,USA,49043.890565,,3569766.0,1.81463
4822,Arkansas,US,2021-02-10 05:30:29,34.9697,-92.3731,308848,5148,288774.0,14926.0,5.0,10234.196787,,,,84000005.0,USA,84247.817287,,2542434.0,1.666839
4823,California,US,2021-02-10 08:30:40,36.1162,-119.6816,3442672,45009,,3397663.0,6.0,8712.929161,,,,84000006.0,USA,112834.20829,,44583304.0,1.307386
4824,Colorado,US,2021-02-10 05:30:29,39.0598,-105.3111,407210,5746,21611.0,379853.0,8.0,7071.169784,,,,84000008.0,USA,98713.172474,,5684631.0,1.411066
4825,Connecticut,US,2021-02-10 05:30:29,41.5978,-72.7554,264608,7298,9800.0,247510.0,9.0,7421.786801,,,,84000009.0,USA,171221.896021,,6104552.0,2.758042
4826,Delaware,US,2021-02-10 05:30:29,39.3185,-75.5071,81210,1245,,79989.0,10.0,8339.803073,,,,84000010.0,USA,132045.649665,,1285813.0,1.503509
4827,Diamond Princess,US,2021-02-10 05:30:29,,,49,0,,49.0,88888.0,,,,,84088888.0,USA,,,,0.0


In [11]:
# data['Last_Update'].str.match('2021-01-20')

0        False
1        False
2        False
3        False
4        False
         ...  
17993    False
17994    False
17995    False
17996    False
17997    False
Name: Last_Update, Length: 17998, dtype: object

In [8]:
# data.groupby(['Province_State', 'Last_Update']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Country_Region,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,UID,ISO3,Testing_Rate,Hospitalization_Rate,Total_Test_Results,Case_Fatality_Ratio
Province_State,Last_Update,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Alabama,2020-04-12,US,32.3182,-86.9023,3667,93,,3470.0,1.0,75.988020,21583.0,437.0,2.610160,84000001.0,USA,460.300152,12.264945,,
Alabama,2020-04-13,US,32.3182,-86.9023,3870,99,,3635.0,1.0,79.634933,29182.0,457.0,2.651312,84000001.0,USA,622.363852,12.238886,,
Alabama,2020-04-14,US,32.3182,-86.9023,4041,114,,3839.0,1.0,84.305541,33117.0,493.0,2.883886,84000001.0,USA,706.285508,12.471541,,
Alabama,2020-04-15,US,32.3182,-86.9023,4307,118,,3957.0,1.0,86.907433,34077.0,525.0,2.895706,84000001.0,USA,726.759406,12.883436,,
Alabama,2020-04-16,US,32.3182,-86.9023,4465,133,,4212.0,1.0,92.665716,36391.0,553.0,3.060990,84000001.0,USA,776.110032,12.727273,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,2021-02-12,US,42.7560,-107.3025,52979,647,51525.0,807.0,56.0,9153.896527,,,,84000056.0,USA,103860.674305,,601103.0,1.221239
Wyoming,2021-02-13,US,42.7560,-107.3025,53086,647,51640.0,799.0,56.0,9172.384360,,,,84000056.0,USA,104461.096933,,604578.0,1.218777
Wyoming,2021-02-14,US,42.7560,-107.3025,53086,647,51640.0,799.0,56.0,9172.384360,,,,84000056.0,USA,104461.096933,,604578.0,1.218777
Wyoming,2021-02-15,US,42.7560,-107.3025,53136,647,51716.0,773.0,56.0,9181.023535,,,,84000056.0,USA,104461.096933,,604578.0,1.217630


In [212]:
vac_usa_data = pd.read_csv(vaccination_usa)
vac_usa_data

Unnamed: 0,location,date,vaccine,source_url,total_vaccinations,people_vaccinated,people_fully_vaccinated
0,United States,2020-12-20,Pfizer/BioNTech,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,556208,556208.0,
1,United States,2020-12-21,Pfizer/BioNTech,https://covid.cdc.gov/covid-data-tracker/#vacc...,614117,614117.0,
2,United States,2020-12-23,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1008025,1008025.0,
3,United States,2020-12-26,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1944585,1944585.0,
4,United States,2020-12-28,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2127143,2127143.0,
5,United States,2020-12-30,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2794588,2794588.0,
6,United States,2021-01-02,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4225756,4225756.0,
7,United States,2021-01-04,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4563260,4563260.0,
8,United States,2021-01-05,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4836469,4836469.0,
9,United States,2021-01-06,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,5306797,5306797.0,


In [215]:
vac_usa_data.isna().sum()

location                    0
date                        0
vaccine                     0
source_url                  0
total_vaccinations          0
people_vaccinated           1
people_fully_vaccinated    15
dtype: int64

In [330]:
# Fill missing value with mean value in people_vaccinated column
vac_usa_data['people_vaccinated'].fillna((vac_usa_data['people_vaccinated'].mean()), inplace=True)
vac_usa_data.isna().sum()

location                   0
date                       0
vaccine                    0
source_url                 0
total_vaccinations         0
people_vaccinated          0
people_fully_vaccinated    0
dtype: int64

In [331]:
vac_usa_data['people_fully_vaccinated'].fillna(0, inplace=True)
vac_usa_data.isna().sum()

location                   0
date                       0
vaccine                    0
source_url                 0
total_vaccinations         0
people_vaccinated          0
people_fully_vaccinated    0
dtype: int64

In [536]:
# Load data per state
vac_state_data = pd.read_csv(vaccination_usa_state)
vac_state_data.isna().sum()

date                                     0
location                                 0
total_vaccinations                     270
total_distributed                      312
people_vaccinated                      324
people_fully_vaccinated_per_hundred    532
total_vaccinations_per_hundred         430
people_fully_vaccinated                406
people_vaccinated_per_hundred          455
distributed_per_hundred                443
daily_vaccinations_raw                  65
daily_vaccinations                      65
daily_vaccinations_per_million         240
share_doses_used                       312
dtype: int64

In [530]:
vac_state_data

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.44,7.69,,,,0.207
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.52,7.73,5906.0,5906.0,1205.0,0.222
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.212
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.27,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.226
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2358,2021-02-12,Wyoming,89138.0,122000.0,65857.0,3.98,15.40,23032.0,11.38,21.08,9623.0,3103.0,5361.0,0.731
2359,2021-02-13,Wyoming,89138.0,122000.0,65857.0,3.98,15.40,23032.0,11.38,21.08,0.0,2469.0,4266.0,0.731
2360,2021-02-14,Wyoming,99099.0,122200.0,71653.0,4.69,17.12,27132.0,12.38,21.11,9961.0,3892.0,6725.0,0.811
2361,2021-02-15,Wyoming,,,,,,,,,543.5,3312.0,5723.0,


In [531]:
vac_states = vac_state_data['location'].unique()
vac_states

array(['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
       'Bureau of Prisons', 'California', 'Colorado', 'Connecticut',
       'Delaware', 'Dept of Defense', 'District of Columbia',
       'Federated States of Micronesia', 'Florida', 'Georgia', 'Guam',
       'Hawaii', 'Idaho', 'Illinois', 'Indian Health Svc', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Long Term Care', 'Louisiana',
       'Maine', 'Marshall Islands', 'Maryland', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
       'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
       'New York State', 'North Carolina', 'North Dakota',
       'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Republic of Palau', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
       'United States', 'Utah', 'Vermont', 'Veterans Health',
       'Virgin Islands', 'Virginia', 'Washington', 'West V

In [532]:
data_states = data['Province_State'].unique()
data_states 

array(['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
       'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
       'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Northern Mariana Islands',
       'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia',
       'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'],
      dtype=object)

In [533]:
# Find difference between two tables 
np.setdiff1d(vac_states, data_states)

array(['Bureau of Prisons', 'Dept of Defense',
       'Federated States of Micronesia', 'Indian Health Svc',
       'Long Term Care', 'Marshall Islands', 'New York State',
       'Republic of Palau', 'United States', 'Veterans Health'],
      dtype=object)

In [540]:
# Drop island's data
vac_state_data = vac_state_data[~vac_state_data['location'].isin(['Federated States of Micronesia', 'Marshall Islands', 'Republic of Palau'])]
# Change "New York State" to "New York"

vac_state_data['location'].mask(vac_state_data['location'] == 'New York State', 'New York', inplace=True)

In [542]:
# Aggregate Indian Health Svc and Indiana
tmp = vac_state_data[vac_state_data['location'].isin(['Indian Health Svc', 'Indiana'])]
tmp = tmp.groupby('date', as_index=False).agg({ 'total_vaccinations': np.sum,
                          'total_distributed': np.sum,
                          'people_vaccinated': np.sum,
                          'people_fully_vaccinated_per_hundred': np.mean,
                          'total_vaccinations_per_hundred': np.mean,
                          'people_fully_vaccinated': np.sum, 
                          'people_vaccinated_per_hundred': np.mean,
                          'distributed_per_hundred': np.mean,
                          'daily_vaccinations_raw': np.sum,
                          'daily_vaccinations': np.sum,
                          'daily_vaccinations_per_million': np.mean,
                          'share_doses_used': np.mean,
                                                    
})
# Add location column
tmp['location'] = 'Indiana'
vac_state_data.drop(vac_state_data[vac_state_data['location'].isin(['Indian Health Svc', 'Indiana'])].index, inplace = True)
vac_state_data.append(tmp)

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.44,7.69,,,,0.2070
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.52,7.73,5906.0,5906.0,1205.0,0.2220
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.2120
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.27,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.2260
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,2021-02-12,Indiana,1220537.0,2040075.0,910940.0,3.17,12.98,303264.0,9.80,20.20,63170.0,37904.0,3452.0,0.5760
32,2021-02-13,Indiana,1381585.0,2079625.0,1011753.0,3.98,15.13,362806.0,11.14,20.62,161048.0,50821.0,5708.0,0.6295
33,2021-02-14,Indiana,1437488.0,2083925.0,1043572.0,4.27,15.73,386689.0,11.44,20.62,55903.0,57954.0,6559.0,0.6535
34,2021-02-15,Indiana,0.0,0.0,0.0,,,0.0,,,19779.0,59451.0,6908.0,


In [544]:
tmp = vac_state_data[vac_state_data['location'].isin(['Bureau of Prisons', 'Dept of Defense', 'Long Term Care',
       'United States', 'Veterans Health', 'District of Columbia'])]
tmp = tmp.groupby('date', as_index=False).agg({ 'total_vaccinations': np.sum,
                          'total_distributed': np.sum,
                          'people_vaccinated': np.sum,
                          'people_fully_vaccinated_per_hundred': np.mean,
                          'total_vaccinations_per_hundred': np.mean,
                          'people_fully_vaccinated': np.sum, 
                          'people_vaccinated_per_hundred': np.mean,
                          'distributed_per_hundred': np.mean,
                          'daily_vaccinations_raw': np.sum,
                          'daily_vaccinations': np.sum,
                          'daily_vaccinations_per_million': np.mean,
                          'share_doses_used': np.mean,
                                                    
})
# Add location column
tmp['location'] = 'District of Columbia'
vac_state_data.drop(vac_state_data[vac_state_data['location'].isin(['Bureau of Prisons', 'Dept of Defense', 'Long Term Care',
       'United States', 'Veterans Health', 'District of Columbia'])].index, inplace = True)
vac_state_data.append(tmp)

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.150,1.590,7270.0,1.440,7.690,,,,0.2070
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.190,1.710,9245.0,1.520,7.730,5906.0,5906.0,1205.0,0.2220
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.880,,1.640,8.880,8260.0,7083.0,1445.0,0.2120
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.270,2.050,13488.0,1.770,9.070,8267.0,7478.0,1525.0,0.2260
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,2021-02-12,District of Columbia,56219143.0,72357325.0,37538066.0,4.170,15.375,12671769.0,11.115,24.125,2267412.0,1886555.0,4037.0,0.7394
55,2021-02-13,District of Columbia,58701161.0,73243875.0,38807683.0,4.405,15.860,13711014.0,11.370,24.260,2482018.0,1881864.0,3937.0,0.7612
56,2021-02-14,District of Columbia,61228680.0,73422925.0,40115771.0,4.700,16.560,14763552.0,11.770,24.285,2527519.0,1884378.0,4079.5,0.7890
57,2021-02-15,District of Columbia,0.0,0.0,0.0,,,0.0,,,1291226.5,1873587.0,4064.5,


In [546]:
# Group by location and use interpolate method to fill out missing values with previous value for the same state
df_interpolated = vac_state_data.groupby('location').apply(lambda x: x.interpolate(method='linear'))
# Check that missing value amount is drastically reduced
df_interpolated.isna().sum()

date                                    0
location                                0
total_vaccinations                      0
total_distributed                       0
people_vaccinated                      11
people_fully_vaccinated_per_hundred    32
total_vaccinations_per_hundred          0
people_fully_vaccinated                32
people_vaccinated_per_hundred          11
distributed_per_hundred                 0
daily_vaccinations_raw                 54
daily_vaccinations                     54
daily_vaccinations_per_million         54
share_doses_used                        0
dtype: int64

In [547]:
# Group by state and  fill out missing values with mean value 
df_means = df_interpolated.groupby('location').transform(lambda x:x.fillna(x.mean()))
# Update original DataFrame with new values
vac_state_data.update(df_means)
vac_state_data.isna().sum()

date                                   0
location                               0
total_vaccinations                     0
total_distributed                      0
people_vaccinated                      0
people_fully_vaccinated_per_hundred    0
total_vaccinations_per_hundred         0
people_fully_vaccinated                0
people_vaccinated_per_hundred          0
distributed_per_hundred                0
daily_vaccinations_raw                 0
daily_vaccinations                     0
daily_vaccinations_per_million         0
share_doses_used                       0
dtype: int64

In [548]:
vac_state_data

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.00,0.150,1.590,7270.0,1.44,7.69,15958.514286,15116.571429,3082.971429,0.2070
1,2021-01-13,Alabama,84040.0,378975.0,74792.00,0.190,1.710,9245.0,1.52,7.73,5906.000000,5906.000000,1205.000000,0.2220
2,2021-01-14,Alabama,92300.0,435350.0,80480.00,0.230,1.880,11366.5,1.64,8.88,8260.000000,7083.000000,1445.000000,0.2120
3,2021-01-15,Alabama,100567.0,444650.0,86956.00,0.270,2.050,13488.0,1.77,9.07,8267.000000,7478.000000,1525.000000,0.2260
4,2021-01-16,Alabama,108124.0,444650.0,93796.75,0.285,2.205,14202.5,1.91,9.07,7557.000000,7498.000000,1529.000000,0.2430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2358,2021-02-12,Wyoming,89138.0,122000.0,65857.00,3.980,15.400,23032.0,11.38,21.08,9623.000000,3103.000000,5361.000000,0.7310
2359,2021-02-13,Wyoming,89138.0,122000.0,65857.00,3.980,15.400,23032.0,11.38,21.08,0.000000,2469.000000,4266.000000,0.7310
2360,2021-02-14,Wyoming,99099.0,122200.0,71653.00,4.690,17.120,27132.0,12.38,21.11,9961.000000,3892.000000,6725.000000,0.8110
2361,2021-02-15,Wyoming,99642.5,122200.0,71996.00,4.725,17.215,27331.5,12.44,21.11,543.500000,3312.000000,5723.000000,0.8155
