## Covid-19 Data Exploration and Cleanup

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import gmaps
import json

# Import API key
from api_keys import g_key

# Output File (CSV), need 3 files for 3 sets of data for comparison
output_data_file_1 = "output_data/covid_df_1.csv"
output_data_file_2 = "output_data/covid_df_2.csv"
output_data_file_3 = "output_data/covid_df_3.csv"

In [2]:
#File 1 reference

# Load file 1 to read
covid_data_1 = "Resources/daily.csv"

# Read csv file into DataFrame, set state as index
covid_df_1 = pd.read_csv(covid_data_1, index_col = "state")
#covid_df_1 = pd.read_csv(covid_data_1)
covid_df_1

Unnamed: 0_level_0,date,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,...,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK,20200717,2081.0,163333.0,,32.0,,,,0.0,,...,165414,0,0,e904def0e1f67ac528572f0506eef868c163a3f3,0,0,0,0,0,
AL,20200717,63091.0,500024.0,,1416.0,7584.0,,967.0,,521.0,...,563115,35,0,5bb063f5f4fe7c02d748b1fee5b811ac19d8f8ba,0,0,0,0,0,
AR,20200717,31762.0,375735.0,,464.0,2070.0,,,97.0,298.0,...,407497,12,76,c1ffb0bb728cd1ac03f834ec70d20070b443f395,0,0,0,0,0,
AS,20200717,0.0,1037.0,,,,,,,,...,1037,0,0,b0c23fca3c4bb610a1625c60aca491eab6a71161,0,0,0,0,0,
AZ,20200717,138523.0,626445.0,,3466.0,6402.0,944.0,,687.0,,...,764968,91,106,fc227a78fd73620678a4d4cb836c128b38541778,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WA,20200126,2.0,0.0,,,,,,,,...,2,0,0,7acb526e14f20a29cc74a0b32a37328bc6eac6c2,0,0,0,0,0,
WA,20200125,2.0,0.0,,,,,,,,...,2,0,0,9b52ca94dd2a996822542ea5f17a7363e7ad91cf,0,0,0,0,0,
WA,20200124,2.0,0.0,,,,,,,,...,2,0,0,094154f68e74bfc30b977cdee888f9c07be4360e,0,0,0,0,0,
WA,20200123,2.0,0.0,,,,,,,,...,2,0,0,e16af2a6a8f060355ff5ba499a28309a262c0b1e,0,0,0,0,0,


In [3]:
# Narrow down columns of covid_df_1 to state, date, positive
df_1 = covid_df_1[["date", "positive"]]
df_1

Unnamed: 0_level_0,date,positive
state,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,20200717,2081.0
AL,20200717,63091.0
AR,20200717,31762.0
AS,20200717,0.0
AZ,20200717,138523.0
...,...,...
WA,20200126,2.0
WA,20200125,2.0
WA,20200124,2.0
WA,20200123,2.0


In [4]:
# Rename columns for df_1 positive to cases to be consistent with other dataframes
df_1.rename(columns = {"positive": "cases"})

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,20200717,2081.0
AL,20200717,63091.0
AR,20200717,31762.0
AS,20200717,0.0
AZ,20200717,138523.0
...,...,...
WA,20200126,2.0
WA,20200125,2.0
WA,20200124,2.0
WA,20200123,2.0


In [5]:
# Filter out GA, TX and WA
states = ["GA", "TX", "WA"]
filter_df_1  = df_1[df_1.index.isin(states)]
filter_df_1

Unnamed: 0_level_0,date,positive
state,Unnamed: 1_level_1,Unnamed: 2_level_1
GA,20200717,135183.0
TX,20200717,307572.0
WA,20200717,44313.0
GA,20200716,131275.0
TX,20200716,292656.0
...,...,...
WA,20200126,2.0
WA,20200125,2.0
WA,20200124,2.0
WA,20200123,2.0


In [6]:
# Rename state codes to Georgia, Texas and Washington
filter_df_1.rename(index={"GA": "Georgia", "TX": "Texas", "WA":"Washington"})

Unnamed: 0_level_0,date,positive
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Georgia,20200717,135183.0
Texas,20200717,307572.0
Washington,20200717,44313.0
Georgia,20200716,131275.0
Texas,20200716,292656.0
...,...,...
Washington,20200126,2.0
Washington,20200125,2.0
Washington,20200124,2.0
Washington,20200123,2.0


In [14]:
# Save to csv file
filter_df_1.to_csv("output_data/covid_df_1.csv")

In [12]:
# For reference for next section
#Group by state, date and positive, use this when plotting in next section, can't save groupby as csv 
group_df_1 = filter_df_1.groupby(["state", "date"])
group_df_1.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,positive
state,date,Unnamed: 2_level_1
GA,20200304,2.0
GA,20200305,2.0
GA,20200306,2.0
GA,20200307,6.0
GA,20200308,7.0
...,...,...
WA,20200713,40656.0
WA,20200714,41757.0
WA,20200715,42304.0
WA,20200716,43046.0


In [15]:
# Load file 2 to read
covid_data_2 = "Resources/us-counties.csv"

# Read csv file into DataFrame, set state as index
covid_df_2 = pd.read_csv(covid_data_2, index_col = "state")
covid_df_2

Unnamed: 0_level_0,date,county,fips,cases,deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Washington,1/21/2020,Snohomish,53061.0,1,0
Washington,1/22/2020,Snohomish,53061.0,1,0
Washington,1/23/2020,Snohomish,53061.0,1,0
Illinois,1/24/2020,Cook,17031.0,1,0
Washington,1/24/2020,Snohomish,53061.0,1,0
...,...,...,...,...,...
Wyoming,7/19/2020,Sweetwater,56037.0,173,2
Wyoming,7/19/2020,Teton,56039.0,208,1
Wyoming,7/19/2020,Uinta,56041.0,221,0
Wyoming,7/19/2020,Washakie,56043.0,43,5


In [16]:
# Narrow down columns of covid_df_2 to state, date, cases
df_2 = covid_df_2[["date", "cases"]]
df_2

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,1/21/2020,1
Washington,1/22/2020,1
Washington,1/23/2020,1
Illinois,1/24/2020,1
Washington,1/24/2020,1
...,...,...
Wyoming,7/19/2020,173
Wyoming,7/19/2020,208
Wyoming,7/19/2020,221
Wyoming,7/19/2020,43


In [18]:
# Filter out Georgia, Texas and Washington
states = ["Georgia", "Texas", "Washington"]
filter_df_2  = df_2[df_2.index.isin(states)]
filter_df_2

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,1/21/2020,1
Washington,1/22/2020,1
Washington,1/23/2020,1
Washington,1/24/2020,1
Washington,1/25/2020,1
...,...,...
Washington,7/19/2020,5
Washington,7/19/2020,294
Washington,7/19/2020,780
Washington,7/19/2020,66


In [19]:
# Save to csv file
filter_df_2.to_csv("output_data/covid_df_2.csv")

In [20]:
# For reference for next section
#Group by state, date and positive, use this when plotting in next section, can't save groupby as csv 
group_df_2 = filter_df_2.groupby(["state", "date"])
group_df_2.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,cases
state,date,Unnamed: 2_level_1
Georgia,3/10/2020,22
Georgia,3/11/2020,31
Georgia,3/12/2020,31
Georgia,3/13/2020,41
Georgia,3/14/2020,65
...,...,...
Washington,7/5/2020,37468
Washington,7/6/2020,38517
Washington,7/7/2020,39063
Washington,7/8/2020,39661


In [21]:
# Load file 3 to read
covid_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

# Read csv file into DataFrame, set state as index
covid_df_3 = pd.read_csv(covid_url, index_col = "state")
covid_df_3

Unnamed: 0_level_0,date,fips,cases,deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Washington,2020-01-21,53,1,0
Washington,2020-01-22,53,1,0
Washington,2020-01-23,53,1,0
Illinois,2020-01-24,17,1,0
Washington,2020-01-24,53,1,0
...,...,...,...,...
Virginia,2020-07-20,51,78375,2031
Washington,2020-07-20,53,49949,1521
West Virginia,2020-07-20,54,5142,100
Wisconsin,2020-07-20,55,46754,855


In [22]:
# Narrow down columns of covid_df_2 to state, date, cases
df_3 = covid_df_3[["date", "cases"]]
df_3

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,2020-01-21,1
Washington,2020-01-22,1
Washington,2020-01-23,1
Illinois,2020-01-24,1
Washington,2020-01-24,1
...,...,...
Virginia,2020-07-20,78375
Washington,2020-07-20,49949
West Virginia,2020-07-20,5142
Wisconsin,2020-07-20,46754


In [23]:
# Filter out Georgia, Texas and Washington
states = ["Georgia", "Texas", "Washington"]
filter_df_3  = df_3[df_3.index.isin(states)]
filter_df_3

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,2020-01-21,1
Washington,2020-01-22,1
Washington,2020-01-23,1
Washington,2020-01-24,1
Washington,2020-01-25,1
...,...,...
Texas,2020-07-19,338036
Washington,2020-07-19,48601
Georgia,2020-07-20,132788
Texas,2020-07-20,345672


In [24]:
# Save to csv file
filter_df_3.to_csv("output_data/covid_df_3.csv")

In [25]:
# For reference for next section
#Group by state, date and positive, use this when plotting in next section, can't save groupby as csv 
group_df_3 = filter_df_3.groupby(["state", "date"])
group_df_3.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,cases
state,date,Unnamed: 2_level_1
Georgia,2020-03-02,2
Georgia,2020-03-03,2
Georgia,2020-03-04,2
Georgia,2020-03-05,2
Georgia,2020-03-06,3
...,...,...
Washington,2020-07-16,46268
Washington,2020-07-17,47226
Washington,2020-07-18,47889
Washington,2020-07-19,48601


In [None]:
# Pull population data
# Downloaded data from https://www.kaggle.com/headsortails/covid19-us-county-jhu-data-demographics/data?select=us_county.csv
# Saved as csv named population datasets

#Population data
#Load file to read
pop_file = ("Resources/population datasets.csv")

# Read csv file into DataFrame
pop_df = pd.read_csv(pop_file)
pop_df

In [None]:
# Build dataset with state data
# Reference https://www.washingtonpost.com/graphics/2020/national/states-reopening-coronavirus-map/

data = {'State':['Georgia', 'Texas', 'Washington'], 
        'Governor':['Brian Kemp (R)', 'Greg Abbott (R)', 'Jay Inslee (D)'],
       'Statewide Shelter in Place Date':['4/3/20', '4/2/20', '3/23/20'],
       'Phase One Reopening Date':['4/24/20', '5/1/20', '5/4/20'],
       'Total cases (2/29/20 to 7/21/20)': [145575, 332434, 47743],
       'Population': [10297484, 27885195, 7294336]} 
  
# Create DataFrame 
state_df = pd.DataFrame(data) 
  
# Print the output. 
state_df