## Covid-19 Data Exploration and Cleanup

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import gmaps
import json

# Import API key
from api_keys import g_key

# Output File (CSV), need 3 files for 3 sets of data for comparison
output_data_file_1 = "output_data/covid_df_1.csv"
output_data_file_2 = "output_data/covid_df_2.csv"
output_data_file_3 = "output_data/covid_df_3.csv"

In [2]:
# Load file 1 to read
covid_data_1 = "Resources/daily.csv"

# Read csv file into DataFrame, set state as index
covid_df_1 = pd.read_csv(covid_data_1, index_col = "state")
#covid_df_1 = pd.read_csv(covid_data_1)
covid_df_1

Unnamed: 0_level_0,date,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,...,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK,20200717,2081.0,163333.0,,32.0,,,,0.0,,...,165414,0,0,e904def0e1f67ac528572f0506eef868c163a3f3,0,0,0,0,0,
AL,20200717,63091.0,500024.0,,1416.0,7584.0,,967.0,,521.0,...,563115,35,0,5bb063f5f4fe7c02d748b1fee5b811ac19d8f8ba,0,0,0,0,0,
AR,20200717,31762.0,375735.0,,464.0,2070.0,,,97.0,298.0,...,407497,12,76,c1ffb0bb728cd1ac03f834ec70d20070b443f395,0,0,0,0,0,
AS,20200717,0.0,1037.0,,,,,,,,...,1037,0,0,b0c23fca3c4bb610a1625c60aca491eab6a71161,0,0,0,0,0,
AZ,20200717,138523.0,626445.0,,3466.0,6402.0,944.0,,687.0,,...,764968,91,106,fc227a78fd73620678a4d4cb836c128b38541778,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WA,20200126,2.0,0.0,,,,,,,,...,2,0,0,7acb526e14f20a29cc74a0b32a37328bc6eac6c2,0,0,0,0,0,
WA,20200125,2.0,0.0,,,,,,,,...,2,0,0,9b52ca94dd2a996822542ea5f17a7363e7ad91cf,0,0,0,0,0,
WA,20200124,2.0,0.0,,,,,,,,...,2,0,0,094154f68e74bfc30b977cdee888f9c07be4360e,0,0,0,0,0,
WA,20200123,2.0,0.0,,,,,,,,...,2,0,0,e16af2a6a8f060355ff5ba499a28309a262c0b1e,0,0,0,0,0,


In [3]:
# Narrow down columns of covid_df_1 to state, date, positive
df_1 = covid_df_1[["date", "positive"]]
df_1

Unnamed: 0_level_0,date,positive
state,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,20200717,2081.0
AL,20200717,63091.0
AR,20200717,31762.0
AS,20200717,0.0
AZ,20200717,138523.0
...,...,...
WA,20200126,2.0
WA,20200125,2.0
WA,20200124,2.0
WA,20200123,2.0


In [4]:
# Rename columns for df_1 positive to cases to be consistent with other dataframes
df_1.rename(columns = {"positive": "cases"})

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,20200717,2081.0
AL,20200717,63091.0
AR,20200717,31762.0
AS,20200717,0.0
AZ,20200717,138523.0
...,...,...
WA,20200126,2.0
WA,20200125,2.0
WA,20200124,2.0
WA,20200123,2.0


In [5]:
# Filter out GA, TX and WA
states = ["GA", "TX", "WA"]
filter_df_1  = df_1[df_1.index.isin(states)]
filter_df_1

Unnamed: 0_level_0,date,positive
state,Unnamed: 1_level_1,Unnamed: 2_level_1
GA,20200717,135183.0
TX,20200717,307572.0
WA,20200717,44313.0
GA,20200716,131275.0
TX,20200716,292656.0
...,...,...
WA,20200126,2.0
WA,20200125,2.0
WA,20200124,2.0
WA,20200123,2.0


In [6]:
# Rename state codes to Georgia, Texas and Washington
filter_df_1.rename(index={"GA": "Georgia", "TX": "Texas", "WA":"Washington"})

Unnamed: 0_level_0,date,positive
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Georgia,20200717,135183.0
Texas,20200717,307572.0
Washington,20200717,44313.0
Georgia,20200716,131275.0
Texas,20200716,292656.0
...,...,...
Washington,20200126,2.0
Washington,20200125,2.0
Washington,20200124,2.0
Washington,20200123,2.0


In [14]:
# Save to csv file
filter_df_1.to_csv("output_data/covid_df_1.csv")

In [12]:
# For reference for next section
#Group by state, date and positive, use this when plotting in next section, can't save groupby as csv 
group_df_1 = filter_df_1.groupby(["state", "date"])
group_df_1.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,positive
state,date,Unnamed: 2_level_1
GA,20200304,2.0
GA,20200305,2.0
GA,20200306,2.0
GA,20200307,6.0
GA,20200308,7.0
...,...,...
WA,20200713,40656.0
WA,20200714,41757.0
WA,20200715,42304.0
WA,20200716,43046.0


In [None]:
# Load file 2 to read
covid_data_2 = "Resources/us-counties.csv"

# Read csv file into DataFrame, set state as index
covid_df_2 = pd.read_csv(covid_data_2, index_col = "state")
covid_df_2

In [None]:
# Narrow down columns of covid_df_2 to state, date, cases
df_2 = covid_df_2[["date", "cases"]]
df_2

In [None]:
# Load file 3 to read
covid_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

# Read csv file into DataFrame, set state as index
covid_df_3 = pd.read_csv(covid_url, index_col = "state")
covid_df_3

In [None]:
# URL to pull population data
# Downloaded data from https://www.kaggle.com/headsortails/covid19-us-county-jhu-data-demographics/data?select=us_county.csv
# Saved as csv named population datasets

#Population data
#Load file to read
pop_file = ("Resources/population datasets.csv")

# Read csv file into DataFrame
pop_df = pd.read_csv(pop_file)
pop_df

In [None]:
# Build dataset with state data
# Reference https://www.washingtonpost.com/graphics/2020/national/states-reopening-coronavirus-map/

data = {'State':['Georgia', 'Texas', 'Washington'], 
        'Governor':['Brian Kemp (R)', 'Greg Abbott (R)', 'Jay Inslee (D)'],
       'Statewide Shelter in Place Date':['4/3/20', '4/2/20', '3/23/20'],
       'Phase One Reopening Date':['4/24/20', '5/1/20', '5/4/20'],
       'Total cases (2/29/20 to 7/21/20)': [145575, 332434, 47743],
       'Population': [10297484, 27885195, 7294336]} 
  
# Create DataFrame 
state_df = pd.DataFrame(data) 
  
# Print the output. 
state_df