# Load data from local SCV files

In [1]:
# Load from local files
import pandas as pd
import numpy as np
import glob

# Path to csv files
covid_data = 'Data/covid/*.csv'
vaccination_usa = 'Data/vaccinations/us.csv'
vaccination_usa_state = 'Data/vaccinations/us_state_vaccinations.csv'


def read_multiple_csv(regex_path):
    data = pd.DataFrame()
    # Read all css files per day comnine in one dataset
    for file in glob.glob(covid_data):
        df = pd.read_csv(file)
        # Check if "Last_update" column has null values, fill it with forward fill
        # propagate[s] last valid observation forward to next valid
        if df['Last_Update'].isna().sum() > 0:
            df.fillna(method='ffill', inplace=True)
        # Remove invalid states from dataset
        df = df[~df['Province_State'].isin(['Diamond Princess', 'Grand Princess', 'Recovered'])]
        # Set Recovered field to 0 when no confirmed cases
        df['Recovered'] = df.apply(lambda x: x['Confirmed'] if x['Confirmed']==0 else x['Recovered'], axis=1)
        data = data.append(df,ignore_index=True)
    return data


# Load data
vac_usa_data = pd.read_csv(vaccination_usa)
vac_state_data = pd.read_csv(vaccination_usa_state)
covid_data = read_multiple_csv(covid_data)

# Load SCV files from remote github repository

In [2]:
from urllib.request import urlopen
import pandas as pd
import numpy as np
import re

# Example of github link
# https://github.com/owid/covid-19-data/blob/master/public/data/vaccinations/us_state_vaccinations.csv

# Hopkins covid 19 dataset
git_raw_domain = 'https://raw.githubusercontent.com/'
covid_url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports_us"

# URL Link to cvs file for vaccination in USA by day
url_vaccination_usa = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/'\
                       + 'country_data/United%20States.csv'
# URL Link to cvs file for vaccination in USA by state
url_vaccination_state = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/'\
                       + 'us_state_vaccinations.csv'


def load_file_from_github(url, raw_domain):
    """
        Load file from github:
            url - github url for all csv files
            raw_domain  - domain to get raw csv file
    """
    # Get github page
    txt = urlopen(url).read().decode('utf8')
    links = re.findall("href=[\"\'](.*?csv)[\"\']", txt)
    # Create pandas DataFrame
    data = pd.DataFrame()

    for link in links:
        df = pd.read_csv(raw_domain + link.replace('blob/', ''),error_bad_lines=False)
        # Check if "Last_update" column has null values, fill it with forward fill
        # propagate[s] last valid observation forward to next valid
        if df['Last_Update'].isna().sum() > 0:
            # Each file should be for one specific day
            df.fillna(method='ffill', inplace=True)
        # Remove invalid states from dataset
        df = df[~df['Province_State'].isin(['Diamond Princess', 'Grand Princess', 'Recovered'])]
        # Set Recovered field to 0 when no confirmed cases
        df['Recovered'] = df.apply(lambda x: x['Confirmed'] if x['Confirmed']==0 else x['Recovered'], axis=1)
        data = data.append(df,ignore_index=True)
    return data

# Load data
vac_usa_data = pd.read_csv(url_vaccination_usa,error_bad_lines=False)
vac_state_data = pd.read_csv(url_vaccination_state,error_bad_lines=False)
covid_data = load_file_from_github(covid_url, git_raw_domain)

In [3]:
# Sort dataset by date
vac_usa_data = vac_usa_data.sort_values(by='date')
vac_usa_data

Unnamed: 0,location,date,vaccine,source_url,total_vaccinations,people_vaccinated,people_fully_vaccinated
0,United States,2020-12-20,Pfizer/BioNTech,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,556208,556208.0,
1,United States,2020-12-21,Pfizer/BioNTech,https://covid.cdc.gov/covid-data-tracker/#vacc...,614117,614117.0,
2,United States,2020-12-23,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1008025,1008025.0,
3,United States,2020-12-26,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1944585,1944585.0,
4,United States,2020-12-28,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2127143,2127143.0,
5,United States,2020-12-30,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2794588,2794588.0,
6,United States,2021-01-02,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4225756,4225756.0,
7,United States,2021-01-04,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4563260,4563260.0,
8,United States,2021-01-05,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4836469,4836469.0,
9,United States,2021-01-06,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,5306797,5306797.0,


In [4]:
# Sort dataset by date
vac_state_dat = vac_state_data.sort_values(by='date')
vac_state_dat

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
2145,2020-12-20,United States,556208.0,,,,0.17,,,,,,,
2146,2020-12-21,United States,614117.0,,,,0.18,,,,57909.000000,57909.0,174.0,
2147,2020-12-22,United States,,,,,,,,,196954.000000,127432.0,384.0,
2148,2020-12-23,United States,1008025.0,,,,0.30,,,,196954.000000,150606.0,454.0,
2149,2020-12-24,United States,,,,,,,,,312186.666667,191001.0,575.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284,2021-02-19,Vermont,127619.0,159650.0,86590.0,6.54,20.45,40829.0,13.88,25.58,9435.000000,3940.0,6314.0,0.799
974,2021-02-19,Long Term Care,6289188.0,,,,,,,,107192.000000,111629.0,,
1013,2021-02-19,Louisiana,859565.0,987625.0,572666.0,6.13,18.49,285057.0,12.32,21.24,64054.000000,22636.0,4869.0,0.870
662,2021-02-19,Hawaii,301369.0,397750.0,210213.0,6.42,21.28,90892.0,14.85,28.09,53977.000000,13223.0,9339.0,0.758


In [28]:
# Sort dataset by date
covid_data = covid_data.sort_values(by=['Last_Update', 'Province_State'])
covid_data

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,UID,ISO3,Testing_Rate,People_Tested,Mortality_Rate
2744,Alabama,US,2020-04-12,32.3182,-86.9023,3667,93,91214.076271,3470.0,1.0,75.988020,84000001.0,USA,460.300152,21583.0,2.610160
2745,Alaska,US,2020-04-12,61.3707,-152.4044,272,8,66.000000,264.0,2.0,45.504049,84000002.0,USA,1344.711576,8038.0,2.941176
2797,American Samoa,US,2020-04-12,-14.2710,-170.1322,0,0,0.000000,0.0,60.0,0.000000,16.0,ASM,5.391708,3.0,0.000000
2746,Arizona,US,2020-04-12,33.7298,-111.4312,3542,115,66.000000,3427.0,4.0,48.662422,84000004.0,USA,578.522286,42109.0,3.246753
2747,Arkansas,US,2020-04-12,34.9697,-92.3731,1280,27,367.000000,1253.0,5.0,49.439423,84000005.0,USA,761.753354,19722.0,2.109375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2739,Virginia,US,2021-02-19,37.7693,-78.1700,557896,7090,45305.000000,505501.0,51.0,6536.169622,84000051.0,USA,66543.510711,2787106.0,1.928971
2740,Washington,US,2021-02-19,47.4009,-121.4905,332007,4803,0.000000,327204.0,53.0,4359.969339,84000053.0,USA,65631.125743,2592766.0,2.078735
2741,West Virginia,US,2021-02-19,38.4912,-80.9545,128760,2236,116436.000000,10088.0,54.0,7184.678489,84000054.0,USA,118076.530552,849461.0,1.774398
2742,Wisconsin,US,2021-02-19,44.2685,-89.6165,610055,6816,541515.000000,61724.0,55.0,10477.662778,84000055.0,USA,113344.195915,3564737.0,0.864590


In [5]:
covid_data.rename(columns = {'Province_State':'location', 'Last_Update': 'date' }, inplace = True)
covid_data

Unnamed: 0,location,Country_Region,date,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,UID,ISO3,Testing_Rate,Hospitalization_Rate,People_Tested,Mortality_Rate
0,Alabama,US,2021-01-02 05:30:44,32.3182,-86.9023,365747,4872,202137.0,158738.0,1.0,7459.375895,1885216.0,,1.332068,84000001.0,USA,38448.804196,,,
1,Alaska,US,2021-01-02 05:30:44,61.3707,-152.4044,46986,206,7165.0,39615.0,2.0,6422.844801,1275750.0,,0.438428,84000002.0,USA,174391.185778,,,
2,American Samoa,US,2021-01-02 05:30:44,-14.2710,-170.1320,0,0,0.0,0.0,60.0,0.000000,2140.0,,,16.0,ASM,3846.084722,,,
3,Arizona,US,2021-01-02 05:30:44,33.7298,-111.4312,530267,9015,76934.0,444318.0,4.0,7285.171274,2878868.0,,1.700087,84000004.0,USA,39551.860582,,,
4,Arkansas,US,2021-01-02 05:30:44,34.9697,-92.3731,229442,3711,199247.0,26484.0,5.0,7602.945718,2051488.0,,1.617402,84000005.0,USA,67979.497674,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17523,Virginia,US,2021-01-01 05:30:27,37.7693,-78.1700,349584,5032,30707.0,313845.0,51.0,4095.638473,4255991.0,,1.439425,84000051.0,USA,49862.123205,,,
17524,Washington,US,2021-01-01 05:30:27,47.4009,-121.4905,246752,3461,,243291.0,53.0,3240.386963,3805390.0,,1.402623,84000053.0,USA,49972.993711,,,
17525,West Virginia,US,2021-01-01 05:30:27,38.4912,-80.9545,85334,1338,59508.0,24488.0,54.0,4761.551368,1514520.0,,1.567957,84000054.0,USA,84508.692646,,,
17526,Wisconsin,US,2021-01-01 05:30:27,44.2685,-89.6165,520438,5242,447500.0,67696.0,55.0,8938.495481,5348488.0,,1.007229,84000055.0,USA,91860.002192,,,


## Handle NaN values in COVID dataset 

In [6]:
# Convert data to YYYY-MM-DD format
covid_data['date'] = pd.to_datetime(covid_data['date'], format='%Y/%m/%d %H:%M:%S').dt.strftime('%Y-%m-%d')
covid_data

Unnamed: 0,location,Country_Region,date,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,UID,ISO3,Testing_Rate,Hospitalization_Rate,People_Tested,Mortality_Rate
0,Alabama,US,2021-01-02,32.3182,-86.9023,365747,4872,202137.0,158738.0,1.0,7459.375895,1885216.0,,1.332068,84000001.0,USA,38448.804196,,,
1,Alaska,US,2021-01-02,61.3707,-152.4044,46986,206,7165.0,39615.0,2.0,6422.844801,1275750.0,,0.438428,84000002.0,USA,174391.185778,,,
2,American Samoa,US,2021-01-02,-14.2710,-170.1320,0,0,0.0,0.0,60.0,0.000000,2140.0,,,16.0,ASM,3846.084722,,,
3,Arizona,US,2021-01-02,33.7298,-111.4312,530267,9015,76934.0,444318.0,4.0,7285.171274,2878868.0,,1.700087,84000004.0,USA,39551.860582,,,
4,Arkansas,US,2021-01-02,34.9697,-92.3731,229442,3711,199247.0,26484.0,5.0,7602.945718,2051488.0,,1.617402,84000005.0,USA,67979.497674,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17523,Virginia,US,2021-01-01,37.7693,-78.1700,349584,5032,30707.0,313845.0,51.0,4095.638473,4255991.0,,1.439425,84000051.0,USA,49862.123205,,,
17524,Washington,US,2021-01-01,47.4009,-121.4905,246752,3461,,243291.0,53.0,3240.386963,3805390.0,,1.402623,84000053.0,USA,49972.993711,,,
17525,West Virginia,US,2021-01-01,38.4912,-80.9545,85334,1338,59508.0,24488.0,54.0,4761.551368,1514520.0,,1.567957,84000054.0,USA,84508.692646,,,
17526,Wisconsin,US,2021-01-01,44.2685,-89.6165,520438,5242,447500.0,67696.0,55.0,8938.495481,5348488.0,,1.007229,84000055.0,USA,91860.002192,,,


In [7]:
# Find all missing values
covid_data.isna().sum()

location                    0
Country_Region              0
date                        0
Lat                         0
Long_                       0
Confirmed                   0
Deaths                      0
Recovered                2525
Active                      9
FIPS                        0
Incident_Rate               0
Total_Test_Results      11816
People_Hospitalized     12336
Case_Fatality_Ratio     11918
UID                         0
ISO3                        0
Testing_Rate                0
Hospitalization_Rate    12336
People_Tested            5712
Mortality_Rate           5914
dtype: int64

In [8]:
# Drop columns that have more than 60-70% of missing values
# 17360 rows / 11915 = 0.68
covid_data.drop(['People_Hospitalized', 'Hospitalization_Rate', 
                'Total_Test_Results', 'Case_Fatality_Ratio'], axis=1, inplace=True)

In [24]:
covid_data.isna().sum()

Province_State       0
Country_Region       0
Last_Update          0
Lat                  0
Long_                0
Confirmed            0
Deaths               0
Recovered         2525
Active               9
FIPS                 0
Incident_Rate        0
UID                  0
ISO3                 0
Testing_Rate         0
People_Tested     5712
Mortality_Rate    5914
dtype: int64

In [9]:
# Create tmp DataFrame with missing values and States
df_nan = covid_data[['location','Recovered', 'People_Tested', 'Mortality_Rate', 'Active']]
# Group by State and use interpolate method to fill out missing values with previous value for the same state
df_interpolated = df_nan.groupby('location').apply(lambda x: x.interpolate(method='linear'))
# Check that missing value amount is drastically reduced
df_interpolated.isna().sum()

location             0
Recovered          294
People_Tested     2744
Mortality_Rate    2744
Active               0
dtype: int64

In [10]:
df_interpolated

Unnamed: 0,location,Recovered,People_Tested,Mortality_Rate,Active
0,Alabama,202137.0,,,158738.0
1,Alaska,7165.0,,,39615.0
2,American Samoa,0.0,,,0.0
3,Arizona,76934.0,,,444318.0
4,Arkansas,199247.0,,,26484.0
...,...,...,...,...,...
17523,Virginia,30707.0,2787106.0,1.928971,313845.0
17524,Washington,0.0,2592766.0,2.078735,243291.0
17525,West Virginia,59508.0,849461.0,1.774398,24488.0
17526,Wisconsin,447500.0,3564737.0,0.864590,67696.0


In [11]:
# Group by state and  fill out missing values with mean value 
df_means = df_interpolated.groupby('location').transform(lambda x:x.fillna(x.mean()))
# Update original DataFrame with new values
covid_data.update(df_means)
covid_data.isna().sum()

location          0
Country_Region    0
date              0
Lat               0
Long_             0
Confirmed         0
Deaths            0
Recovered         0
Active            0
FIPS              0
Incident_Rate     0
UID               0
ISO3              0
Testing_Rate      0
People_Tested     0
Mortality_Rate    0
dtype: int64

In [12]:
covid_data

Unnamed: 0,location,Country_Region,date,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,UID,ISO3,Testing_Rate,People_Tested,Mortality_Rate
0,Alabama,US,2021-01-02,32.3182,-86.9023,365747,4872,202137.0,158738.0,1.0,7459.375895,84000001.0,USA,38448.804196,8.071685e+05,2.216728
1,Alaska,US,2021-01-02,61.3707,-152.4044,46986,206,7165.0,39615.0,2.0,6422.844801,84000002.0,USA,174391.185778,3.551041e+05,1.152205
2,American Samoa,US,2021-01-02,-14.2710,-170.1320,0,0,0.0,0.0,60.0,0.000000,16.0,ASM,3846.084722,1.095072e+03,2.797253
3,Arizona,US,2021-01-02,33.7298,-111.4312,530267,9015,76934.0,444318.0,4.0,7285.171274,84000004.0,USA,39551.860582,1.049418e+06,2.832888
4,Arkansas,US,2021-01-02,34.9697,-92.3731,229442,3711,199247.0,26484.0,5.0,7602.945718,84000005.0,USA,67979.497674,7.218591e+05,1.594698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17523,Virginia,US,2021-01-01,37.7693,-78.1700,349584,5032,30707.0,313845.0,51.0,4095.638473,84000051.0,USA,49862.123205,2.787106e+06,1.928971
17524,Washington,US,2021-01-01,47.4009,-121.4905,246752,3461,0.0,243291.0,53.0,3240.386963,84000053.0,USA,49972.993711,2.592766e+06,2.078735
17525,West Virginia,US,2021-01-01,38.4912,-80.9545,85334,1338,59508.0,24488.0,54.0,4761.551368,84000054.0,USA,84508.692646,8.494610e+05,1.774398
17526,Wisconsin,US,2021-01-01,44.2685,-89.6165,520438,5242,447500.0,67696.0,55.0,8938.495481,84000055.0,USA,91860.002192,3.564737e+06,0.864590


## Handle NaN values in Vaccination dataset in USA

In [13]:
vac_usa_data

Unnamed: 0,location,date,vaccine,source_url,total_vaccinations,people_vaccinated,people_fully_vaccinated
0,United States,2020-12-20,Pfizer/BioNTech,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,556208,556208.0,
1,United States,2020-12-21,Pfizer/BioNTech,https://covid.cdc.gov/covid-data-tracker/#vacc...,614117,614117.0,
2,United States,2020-12-23,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1008025,1008025.0,
3,United States,2020-12-26,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1944585,1944585.0,
4,United States,2020-12-28,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2127143,2127143.0,
5,United States,2020-12-30,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2794588,2794588.0,
6,United States,2021-01-02,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4225756,4225756.0,
7,United States,2021-01-04,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4563260,4563260.0,
8,United States,2021-01-05,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4836469,4836469.0,
9,United States,2021-01-06,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,5306797,5306797.0,


In [14]:
vac_usa_data.isna().sum()

location                    0
date                        0
vaccine                     0
source_url                  0
total_vaccinations          0
people_vaccinated           1
people_fully_vaccinated    15
dtype: int64

In [15]:
# Fill missing value with mean value in people_vaccinated column
vac_usa_data['people_vaccinated'].fillna((vac_usa_data['people_vaccinated'].mean()), inplace=True)
vac_usa_data.isna().sum()

location                    0
date                        0
vaccine                     0
source_url                  0
total_vaccinations          0
people_vaccinated           0
people_fully_vaccinated    15
dtype: int64

In [16]:
vac_usa_data['people_fully_vaccinated'].fillna(0, inplace=True)
vac_usa_data.isna().sum()

location                   0
date                       0
vaccine                    0
source_url                 0
total_vaccinations         0
people_vaccinated          0
people_fully_vaccinated    0
dtype: int64

## Handle NaN values in Vaccination dataset in USA by state

In [17]:
vac_state_data

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.15,1.59,7270.0,1.44,7.69,,,,0.207
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.19,1.71,9245.0,1.52,7.73,5906.0,5906.0,1205.0,0.222
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.88,,1.64,8.88,8260.0,7083.0,1445.0,0.212
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.27,2.05,13488.0,1.77,9.07,8267.0,7478.0,1525.0,0.226
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,2021-02-15,Wyoming,,,,,,,,,543.5,3312.0,5723.0,
2554,2021-02-16,Wyoming,100186.0,122200.0,72339.0,4.76,17.31,27531.0,12.50,21.11,543.5,3390.0,5857.0,0.820
2555,2021-02-17,Wyoming,100186.0,127075.0,72339.0,4.76,17.31,27531.0,12.50,21.96,0.0,2953.0,5102.0,0.788
2556,2021-02-18,Wyoming,105426.0,127075.0,74983.0,5.20,18.22,30110.0,12.96,21.96,5240.0,3702.0,6396.0,0.830


In [18]:
vac_state_data.isna().sum()

date                                     0
location                                 0
total_vaccinations                     270
total_distributed                      315
people_vaccinated                      327
people_fully_vaccinated_per_hundred    544
total_vaccinations_per_hundred         442
people_fully_vaccinated                409
people_vaccinated_per_hundred          467
distributed_per_hundred                455
daily_vaccinations_raw                  65
daily_vaccinations                      65
daily_vaccinations_per_million         252
share_doses_used                       315
dtype: int64

In [19]:
# Find all unique states in vac_state_data dataset
vac_states = vac_state_data['location'].unique()
vac_states

array(['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
       'Bureau of Prisons', 'California', 'Colorado', 'Connecticut',
       'Delaware', 'Dept of Defense', 'District of Columbia',
       'Federated States of Micronesia', 'Florida', 'Georgia', 'Guam',
       'Hawaii', 'Idaho', 'Illinois', 'Indian Health Svc', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Long Term Care', 'Louisiana',
       'Maine', 'Marshall Islands', 'Maryland', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
       'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
       'New York State', 'North Carolina', 'North Dakota',
       'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Republic of Palau', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
       'United States', 'Utah', 'Vermont', 'Veterans Health',
       'Virgin Islands', 'Virginia', 'Washington', 'West V

In [20]:
# Find all unique states in covid_date dataset
data_states = covid_data['location'].unique()
data_states 

array(['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
       'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
       'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Northern Mariana Islands',
       'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia',
       'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'],
      dtype=object)

In [21]:
# Find difference between two tables 
np.setdiff1d(vac_states, data_states)

array(['Bureau of Prisons', 'Dept of Defense',
       'Federated States of Micronesia', 'Indian Health Svc',
       'Long Term Care', 'Marshall Islands', 'New York State',
       'Republic of Palau', 'United States', 'Veterans Health'],
      dtype=object)

In [23]:
# Drop island's data
vac_state_data = vac_state_data[~vac_state_data['location'].isin(['Federated States of Micronesia', 'Marshall Islands', 'Republic of Palau'])]
# Change "New York State" to "New York"
vac_state_data['location'].mask(vac_state_data['location'] == 'New York State', 'New York', inplace=True)

In [24]:
# Aggregate Indian Health Svc and Indiana
tmp = vac_state_data[vac_state_data['location'].isin(['Indian Health Svc', 'Indiana'])]
tmp = tmp.groupby('date', as_index=False).agg({ 'total_vaccinations': np.sum,
                          'total_distributed': np.sum,
                          'people_vaccinated': np.sum,
                          'people_fully_vaccinated_per_hundred': np.mean,
                          'total_vaccinations_per_hundred': np.mean,
                          'people_fully_vaccinated': np.sum, 
                          'people_vaccinated_per_hundred': np.mean,
                          'distributed_per_hundred': np.mean,
                          'daily_vaccinations_raw': np.sum,
                          'daily_vaccinations': np.sum,
                          'daily_vaccinations_per_million': np.mean,
                          'share_doses_used': np.mean,
                                                    
})
# Add location column
tmp['location'] = 'Indiana'
vac_state_data.drop(vac_state_data[vac_state_data['location'].isin(['Indian Health Svc', 'Indiana'])].index, inplace = True)
vac_state_data.append(tmp)

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.150,1.590,7270.0,1.440,7.690,,,,0.2070
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.190,1.710,9245.0,1.520,7.730,5906.0,5906.0,1205.0,0.2220
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.880,,1.640,8.880,8260.0,7083.0,1445.0,0.2120
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.270,2.050,13488.0,1.770,9.070,8267.0,7478.0,1525.0,0.2260
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,2021-02-15,Indiana,0.0,0.0,0.0,,,0.0,,,19779.0,59451.0,6908.0,
35,2021-02-16,Indiana,1477046.0,2142125.0,1065697.0,4.500,16.220,403939.0,11.690,21.460,19779.0,59800.0,7017.0,0.6540
36,2021-02-17,Indiana,1504246.0,2144075.0,1080231.0,4.850,17.880,416487.0,12.865,27.880,27200.0,57404.0,6325.5,0.6650
37,2021-02-18,Indiana,1527492.0,2150900.0,1093405.0,4.985,18.235,426451.0,13.090,27.950,23246.0,52875.0,5794.0,0.6750


In [25]:
tmp = vac_state_data[vac_state_data['location'].isin(['Bureau of Prisons', 'Dept of Defense', 'Long Term Care',
       'United States', 'Veterans Health', 'District of Columbia'])]
tmp = tmp.groupby('date', as_index=False).agg({ 'total_vaccinations': np.sum,
                          'total_distributed': np.sum,
                          'people_vaccinated': np.sum,
                          'people_fully_vaccinated_per_hundred': np.mean,
                          'total_vaccinations_per_hundred': np.mean,
                          'people_fully_vaccinated': np.sum, 
                          'people_vaccinated_per_hundred': np.mean,
                          'distributed_per_hundred': np.mean,
                          'daily_vaccinations_raw': np.sum,
                          'daily_vaccinations': np.sum,
                          'daily_vaccinations_per_million': np.mean,
                          'share_doses_used': np.mean,
                                                    
})
# Add location column
tmp['location'] = 'District of Columbia'
vac_state_data.drop(vac_state_data[vac_state_data['location'].isin(['Bureau of Prisons', 'Dept of Defense', 'Long Term Care',
       'United States', 'Veterans Health', 'District of Columbia'])].index, inplace = True)
vac_state_data.append(tmp)

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.0,0.150,1.590,7270.0,1.440,7.690,,,,0.2070
1,2021-01-13,Alabama,84040.0,378975.0,74792.0,0.190,1.710,9245.0,1.520,7.730,5906.0,5906.0,1205.0,0.2220
2,2021-01-14,Alabama,92300.0,435350.0,80480.0,,1.880,,1.640,8.880,8260.0,7083.0,1445.0,0.2120
3,2021-01-15,Alabama,100567.0,444650.0,86956.0,0.270,2.050,13488.0,1.770,9.070,8267.0,7478.0,1525.0,0.2260
4,2021-01-16,Alabama,,,,,,,,,7557.0,7498.0,1529.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2021-02-15,District of Columbia,0.0,0.0,0.0,,,0.0,,,1291226.5,1873587.0,4064.5,
58,2021-02-16,District of Columbia,63811133.0,75041375.0,41564279.0,5.030,17.275,15732076.0,12.155,24.525,1291226.5,1933471.0,4409.5,0.8026
59,2021-02-17,District of Columbia,65008653.0,75845525.0,42184289.0,5.160,17.575,16199875.0,12.320,24.640,1197520.0,1847245.0,4499.5,0.8038
60,2021-02-18,District of Columbia,66664421.0,76875025.0,42973368.0,5.360,17.935,16942740.0,12.480,24.785,1655768.0,1816099.0,4336.0,0.8128


In [26]:
# Group by location and use interpolate method to fill out missing values with previous value for the same state
df_interpolated = vac_state_data.groupby('location').apply(lambda x: x.interpolate(method='linear'))
# Check that missing value amount is drastically reduced
df_interpolated.isna().sum()

date                                    0
location                                0
total_vaccinations                      0
total_distributed                       0
people_vaccinated                      11
people_fully_vaccinated_per_hundred    32
total_vaccinations_per_hundred          0
people_fully_vaccinated                32
people_vaccinated_per_hundred          11
distributed_per_hundred                 0
daily_vaccinations_raw                 54
daily_vaccinations                     54
daily_vaccinations_per_million         54
share_doses_used                        0
dtype: int64

In [27]:
# Group by state and  fill out missing values with mean value 
df_means = df_interpolated.groupby('location').transform(lambda x:x.fillna(x.mean()))
# Update original DataFrame with new values
vac_state_data.update(df_means)
vac_state_data.isna().sum()

date                                   0
location                               0
total_vaccinations                     0
total_distributed                      0
people_vaccinated                      0
people_fully_vaccinated_per_hundred    0
total_vaccinations_per_hundred         0
people_fully_vaccinated                0
people_vaccinated_per_hundred          0
distributed_per_hundred                0
daily_vaccinations_raw                 0
daily_vaccinations                     0
daily_vaccinations_per_million         0
share_doses_used                       0
dtype: int64

In [28]:
vac_state_data

Unnamed: 0,date,location,total_vaccinations,total_distributed,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,2021-01-12,Alabama,78134.0,377025.0,70861.00,0.150,1.590,7270.0,1.44,7.69,17394.684211,15543.578947,3170.078947,0.2070
1,2021-01-13,Alabama,84040.0,378975.0,74792.00,0.190,1.710,9245.0,1.52,7.73,5906.000000,5906.000000,1205.000000,0.2220
2,2021-01-14,Alabama,92300.0,435350.0,80480.00,0.230,1.880,11366.5,1.64,8.88,8260.000000,7083.000000,1445.000000,0.2120
3,2021-01-15,Alabama,100567.0,444650.0,86956.00,0.270,2.050,13488.0,1.77,9.07,8267.000000,7478.000000,1525.000000,0.2260
4,2021-01-16,Alabama,108124.0,444650.0,93796.75,0.285,2.205,14202.5,1.91,9.07,7557.000000,7498.000000,1529.000000,0.2430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,2021-02-15,Wyoming,99642.5,122200.0,71996.00,4.725,17.215,27331.5,12.44,21.11,543.500000,3312.000000,5723.000000,0.8155
2554,2021-02-16,Wyoming,100186.0,122200.0,72339.00,4.760,17.310,27531.0,12.50,21.11,543.500000,3390.000000,5857.000000,0.8200
2555,2021-02-17,Wyoming,100186.0,127075.0,72339.00,4.760,17.310,27531.0,12.50,21.96,0.000000,2953.000000,5102.000000,0.7880
2556,2021-02-18,Wyoming,105426.0,127075.0,74983.00,5.200,18.220,30110.0,12.96,21.96,5240.000000,3702.000000,6396.000000,0.8300


In [29]:
# Merge two Dataframes together
complete_df = pd.merge(covid_data,
                       vac_state_data,
                       on=['date', 'location'], 
                       how='left')
complete_df

Unnamed: 0,location,Country_Region,date,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,...,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
0,Alabama,US,2021-01-02,32.3182,-86.9023,365747,4872,202137.0,158738.0,1.0,...,,,,,,,,,,
1,Alaska,US,2021-01-02,61.3707,-152.4044,46986,206,7165.0,39615.0,2.0,...,,,,,,,,,,
2,American Samoa,US,2021-01-02,-14.2710,-170.1320,0,0,0.0,0.0,60.0,...,,,,,,,,,,
3,Arizona,US,2021-01-02,33.7298,-111.4312,530267,9015,76934.0,444318.0,4.0,...,,,,,,,,,,
4,Arkansas,US,2021-01-02,34.9697,-92.3731,229442,3711,199247.0,26484.0,5.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17523,Virginia,US,2021-01-01,37.7693,-78.1700,349584,5032,30707.0,313845.0,51.0,...,,,,,,,,,,
17524,Washington,US,2021-01-01,47.4009,-121.4905,246752,3461,0.0,243291.0,53.0,...,,,,,,,,,,
17525,West Virginia,US,2021-01-01,38.4912,-80.9545,85334,1338,59508.0,24488.0,54.0,...,,,,,,,,,,
17526,Wisconsin,US,2021-01-01,44.2685,-89.6165,520438,5242,447500.0,67696.0,55.0,...,,,,,,,,,,


In [30]:
# Forward prapogate values
complete_df = complete_df.groupby('location').apply(lambda x: x.interpolate(method='linear'))
# Fill out all missing values with 0
complete_df.fillna(0, inplace =True)
complete_df[complete_df['location'] == 'Wyoming'].iloc[-50:]

Unnamed: 0,location,Country_Region,date,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,...,people_vaccinated,people_fully_vaccinated_per_hundred,total_vaccinations_per_hundred,people_fully_vaccinated,people_vaccinated_per_hundred,distributed_per_hundred,daily_vaccinations_raw,daily_vaccinations,daily_vaccinations_per_million,share_doses_used
14783,Wyoming,US,2020-11-13,42.756,-107.3025,20479,127,11585.0,8767.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
14839,Wyoming,US,2020-11-14,42.756,-107.3025,21341,127,12082.0,9132.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
14895,Wyoming,US,2020-11-15,42.756,-107.3025,21881,144,12247.0,9490.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
14951,Wyoming,US,2020-11-16,42.756,-107.3025,22494,144,12453.0,9897.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
15007,Wyoming,US,2020-11-17,42.756,-107.3025,23193,144,12902.0,10147.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
15063,Wyoming,US,2020-11-18,42.756,-107.3025,24453,155,13407.0,10891.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
15119,Wyoming,US,2020-11-19,42.756,-107.3025,25275,155,13752.0,11368.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
15175,Wyoming,US,2020-11-20,42.756,-107.3025,26169,176,14904.0,11089.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
15231,Wyoming,US,2020-11-21,42.756,-107.3025,27129,176,16316.0,10637.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829
15287,Wyoming,US,2020-11-22,42.756,-107.3025,27410,176,16530.0,10704.0,56.0,...,87289.0,6.33,21.48,36651.0,15.08,25.92,18884.0,5025.0,8682.0,0.829


In [31]:
# Safe pre-processed dataset in CSV file locally
complete_df.to_csv('processed_by_sate.csv', encoding='utf-8', index=False)

In [35]:
covid_data.group

Unnamed: 0,location,Country_Region,date,Lat,Long_,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,UID,ISO3,Testing_Rate,People_Tested,Mortality_Rate
0,Alabama,US,2021-01-02,32.3182,-86.9023,365747,4872,202137.0,158738.0,1.0,7459.375895,84000001.0,USA,38448.804196,8.071685e+05,2.216728
1,Alaska,US,2021-01-02,61.3707,-152.4044,46986,206,7165.0,39615.0,2.0,6422.844801,84000002.0,USA,174391.185778,3.551041e+05,1.152205
2,American Samoa,US,2021-01-02,-14.2710,-170.1320,0,0,0.0,0.0,60.0,0.000000,16.0,ASM,3846.084722,1.095072e+03,2.797253
3,Arizona,US,2021-01-02,33.7298,-111.4312,530267,9015,76934.0,444318.0,4.0,7285.171274,84000004.0,USA,39551.860582,1.049418e+06,2.832888
4,Arkansas,US,2021-01-02,34.9697,-92.3731,229442,3711,199247.0,26484.0,5.0,7602.945718,84000005.0,USA,67979.497674,7.218591e+05,1.594698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17523,Virginia,US,2021-01-01,37.7693,-78.1700,349584,5032,30707.0,313845.0,51.0,4095.638473,84000051.0,USA,49862.123205,2.787106e+06,1.928971
17524,Washington,US,2021-01-01,47.4009,-121.4905,246752,3461,0.0,243291.0,53.0,3240.386963,84000053.0,USA,49972.993711,2.592766e+06,2.078735
17525,West Virginia,US,2021-01-01,38.4912,-80.9545,85334,1338,59508.0,24488.0,54.0,4761.551368,84000054.0,USA,84508.692646,8.494610e+05,1.774398
17526,Wisconsin,US,2021-01-01,44.2685,-89.6165,520438,5242,447500.0,67696.0,55.0,8938.495481,84000055.0,USA,91860.002192,3.564737e+06,0.864590


In [74]:
covid_grouped = covid_data.groupby('date', as_index=False).agg({ 
                          'Confirmed': np.sum,
                          'Deaths': np.sum,
                          'Recovered': np.sum,
                          'Active': np.sum, 
                          'FIPS': np.sum,
                          'Incident_Rate': np.mean,
                          'UID': np.mean,
                          'Testing_Rate': np.mean,
                          'People_Tested': np.sum,
                          'Mortality_Rate': np.mean})
covid_grouped

Unnamed: 0,date,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,UID,Testing_Rate,People_Tested,Mortality_Rate
0,2020-04-12,555461,22367,3.307164e+05,533142.0,1813.0,132.096557,7.650007e+07,939.956205,2.805892e+06,3.233762
1,2020-04-13,559705,23437,3.156668e+05,536688.0,1810.0,139.013118,7.636371e+07,994.309773,2.738799e+06,3.335102
2,2020-04-14,628793,26735,3.059833e+05,704483.0,1834.0,153.044290,7.663165e+07,1037.753196,3.277827e+06,3.424767
3,2020-04-15,636665,28719,2.937077e+05,608157.0,1822.0,152.156206,7.650007e+07,1091.853820,3.242755e+06,3.561394
4,2020-04-16,667979,33290,2.833841e+05,635024.0,1822.0,160.063317,7.650007e+07,1137.003935,3.401064e+06,3.672137
...,...,...,...,...,...,...,...,...,...,...,...
308,2021-02-15,27640389,485381,1.169335e+07,15512699.0,1822.0,7768.277804,7.650007e+07,98400.979447,7.818233e+07,3.129015
309,2021-02-16,27694359,486322,1.176557e+07,15483912.0,1822.0,7784.075027,7.650007e+07,98776.914728,7.818233e+07,3.129015
310,2021-02-17,27756818,488078,1.180296e+07,15497757.0,1822.0,7800.590947,7.650007e+07,99073.276745,7.818233e+07,3.129015
311,2021-02-18,27826633,490537,1.185089e+07,15508093.0,1822.0,7818.916195,7.650007e+07,99528.861160,7.818233e+07,3.129015


In [75]:
vac_usa_data

Unnamed: 0,location,date,vaccine,source_url,total_vaccinations,people_vaccinated,people_fully_vaccinated
0,United States,2020-12-20,Pfizer/BioNTech,https://www.cdc.gov/coronavirus/2019-ncov/vacc...,556208,556208.0,0.0
1,United States,2020-12-21,Pfizer/BioNTech,https://covid.cdc.gov/covid-data-tracker/#vacc...,614117,614117.0,0.0
2,United States,2020-12-23,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1008025,1008025.0,0.0
3,United States,2020-12-26,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,1944585,1944585.0,0.0
4,United States,2020-12-28,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2127143,2127143.0,0.0
5,United States,2020-12-30,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,2794588,2794588.0,0.0
6,United States,2021-01-02,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4225756,4225756.0,0.0
7,United States,2021-01-04,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4563260,4563260.0,0.0
8,United States,2021-01-05,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,4836469,4836469.0,0.0
9,United States,2021-01-06,"Moderna, Pfizer/BioNTech",https://covid.cdc.gov/covid-data-tracker/#vacc...,5306797,5306797.0,0.0


In [76]:
# Merge two Dataframes together
complete_df = pd.merge(covid_grouped,
                       vac_usa_data[['date', 'vaccine', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated']],
                       on=['date'], 
                       how='left')
complete_df.isna().sum()

date                         0
Confirmed                    0
Deaths                       0
Recovered                    0
Active                       0
FIPS                         0
Incident_Rate                0
UID                          0
Testing_Rate                 0
People_Tested                0
Mortality_Rate               0
vaccine                    266
total_vaccinations         266
people_vaccinated          266
people_fully_vaccinated    266
dtype: int64

In [77]:
# Forward prapogate values
complete_df = complete_df.interpolate(method ='pad', limit_direction ='forward')
# Fill out all missing values with 0
complete_df.fillna(0, inplace =True)
complete_df.iloc[-65:]

Unnamed: 0,date,Confirmed,Deaths,Recovered,Active,FIPS,Incident_Rate,UID,Testing_Rate,People_Tested,Mortality_Rate,vaccine,total_vaccinations,people_vaccinated,people_fully_vaccinated
248,2020-12-17,17056288,312810,6.600361e+06,10058941.0,1822.0,5136.937859,7.650007e+07,69018.801936,1.564784e+08,2.190583,0,0.0,0.0,0.0
249,2020-12-18,17295308,316271,6.684351e+06,10214148.0,1822.0,5200.060792,7.650007e+07,69577.019728,1.564784e+08,2.190583,0,0.0,0.0,0.0
250,2020-12-19,17546469,319220,6.758860e+06,10389399.0,1822.0,5266.374113,7.650007e+07,70030.726655,1.564784e+08,2.190583,0,0.0,0.0,0.0
251,2020-12-20,17738085,321856,6.885696e+06,10456287.0,1822.0,5314.400421,7.650007e+07,70627.674125,1.564784e+08,2.190583,Pfizer/BioNTech,556208.0,556208.0,0.0
252,2020-12-21,17925435,323554,6.937856e+06,10591717.0,1822.0,5360.341230,7.650007e+07,71040.912651,1.564784e+08,2.190583,Pfizer/BioNTech,614117.0,614117.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,2021-02-15,27640389,485381,1.169335e+07,15512699.0,1822.0,7768.277804,7.650007e+07,98400.979447,7.818233e+07,3.129015,"Moderna, Pfizer/BioNTech",52884356.0,38292270.0,14077440.0
309,2021-02-16,27694359,486322,1.176557e+07,15483912.0,1822.0,7784.075027,7.650007e+07,98776.914728,7.818233e+07,3.129015,"Moderna, Pfizer/BioNTech",55220364.0,39670551.0,15015434.0
310,2021-02-17,27756818,488078,1.180296e+07,15497757.0,1822.0,7800.590947,7.650007e+07,99073.276745,7.818233e+07,3.129015,"Moderna, Pfizer/BioNTech",56281827.0,40268009.0,15471536.0
311,2021-02-18,27826633,490537,1.185089e+07,15508093.0,1822.0,7818.916195,7.650007e+07,99528.861160,7.818233e+07,3.129015,"Moderna, Pfizer/BioNTech",57737767.0,41021049.0,16162358.0


In [78]:
# Safe pre-processed dataset in CSV file locally
complete_df.to_csv('processed_by_date_usa.csv', encoding='utf-8', index=False)