## Covid-19 Data Exploration and Cleanup

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import datetime


# Output File (CSV), need 3 files for 3 sets of data for comparison

output_data_file_1 = "output_data/data_source_1.csv"
output_data_file_2 = "output_data/data_source_2.csv"
output_data_file_3 = "output_data/data_source_3.csv"

### Source 1 Covid data import and clean up for file 1
#### This will be output into output_data/data_source_1

In [2]:
#Load data for source 1
#Resource = https://covidtracking.com/data/download

# Load file 1 to read
covid_data_1 = "Resources/daily.csv"

# Read csv file into DataFrame, set state as index
covid_df_1 = pd.read_csv(covid_data_1, index_col = "state")
covid_df_1

Unnamed: 0_level_0,date,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,...,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK,20200717,2081.0,163333.0,,32.0,,,,0.0,,...,165414,0,0,e904def0e1f67ac528572f0506eef868c163a3f3,0,0,0,0,0,
AL,20200717,63091.0,500024.0,,1416.0,7584.0,,967.0,,521.0,...,563115,35,0,5bb063f5f4fe7c02d748b1fee5b811ac19d8f8ba,0,0,0,0,0,
AR,20200717,31762.0,375735.0,,464.0,2070.0,,,97.0,298.0,...,407497,12,76,c1ffb0bb728cd1ac03f834ec70d20070b443f395,0,0,0,0,0,
AS,20200717,0.0,1037.0,,,,,,,,...,1037,0,0,b0c23fca3c4bb610a1625c60aca491eab6a71161,0,0,0,0,0,
AZ,20200717,138523.0,626445.0,,3466.0,6402.0,944.0,,687.0,,...,764968,91,106,fc227a78fd73620678a4d4cb836c128b38541778,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WA,20200126,2.0,0.0,,,,,,,,...,2,0,0,7acb526e14f20a29cc74a0b32a37328bc6eac6c2,0,0,0,0,0,
WA,20200125,2.0,0.0,,,,,,,,...,2,0,0,9b52ca94dd2a996822542ea5f17a7363e7ad91cf,0,0,0,0,0,
WA,20200124,2.0,0.0,,,,,,,,...,2,0,0,094154f68e74bfc30b977cdee888f9c07be4360e,0,0,0,0,0,
WA,20200123,2.0,0.0,,,,,,,,...,2,0,0,e16af2a6a8f060355ff5ba499a28309a262c0b1e,0,0,0,0,0,


In [3]:
# Narrow down columns of covid_df_1 to state, date, positive
df_1 = covid_df_1[["date", "positive", "negative"]]
df_1

Unnamed: 0_level_0,date,positive,negative
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,20200717,2081.0,163333.0
AL,20200717,63091.0,500024.0
AR,20200717,31762.0,375735.0
AS,20200717,0.0,1037.0
AZ,20200717,138523.0,626445.0
...,...,...,...
WA,20200126,2.0,0.0
WA,20200125,2.0,0.0
WA,20200124,2.0,0.0
WA,20200123,2.0,0.0


In [4]:
# Filter out FL, GA, NY, TX and WA
states = ["FL", "GA", "NY", "TX", "WA"]
filter_df_1  = df_1[df_1.index.isin(states)]
filter_df_1

Unnamed: 0_level_0,date,positive,negative
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FL,20200717,327241.0,2553527.0
GA,20200717,135183.0,1063495.0
NY,20200717,405551.0,4593898.0
TX,20200717,307572.0,2459749.0
WA,20200717,44313.0,708861.0
...,...,...,...
WA,20200126,2.0,0.0
WA,20200125,2.0,0.0
WA,20200124,2.0,0.0
WA,20200123,2.0,0.0


In [5]:
# Rename state codes to Florida, Georgia, New York, Texas and Washington
state_name_df = filter_df_1.rename(index={"FL": "Florida", "GA": "Georgia", "NY": "New York","TX": "Texas", "WA":"Washington"})
state_name_df

Unnamed: 0_level_0,date,positive,negative
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Florida,20200717,327241.0,2553527.0
Georgia,20200717,135183.0,1063495.0
New York,20200717,405551.0,4593898.0
Texas,20200717,307572.0,2459749.0
Washington,20200717,44313.0,708861.0
...,...,...,...
Washington,20200126,2.0,0.0
Washington,20200125,2.0,0.0
Washington,20200124,2.0,0.0
Washington,20200123,2.0,0.0


In [6]:
# Set date range for data
start_date = 20200301
end_date = 20200717
after_start_date = state_name_df["date"] >= start_date
before_end_date = state_name_df["date"] <= end_date
between_two_dates = after_start_date & before_end_date
filtered_dates_1 = state_name_df.loc[between_two_dates]
print(filtered_dates_1)

                date  positive   negative
state                                    
Florida     20200717  327241.0  2553527.0
Georgia     20200717  135183.0  1063495.0
New York    20200717  405551.0  4593898.0
Texas       20200717  307572.0  2459749.0
Washington  20200717   44313.0   708861.0
...              ...       ...        ...
Texas       20200304       1.0        NaN
Washington  20200304     626.0      417.0
Washington  20200303     534.0      264.0
Washington  20200302     450.0      123.0
Washington  20200301     377.0       12.0

[683 rows x 3 columns]


In [7]:
# Sort by date in ascending order
filtered_dates_1 = filtered_dates_1.sort_values(["state", "date"])
filtered_dates_1

Unnamed: 0_level_0,date,positive,negative
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Florida,20200304,2.0,24.0
Florida,20200305,9.0,31.0
Florida,20200306,9.0,55.0
Florida,20200307,14.0,100.0
Florida,20200308,17.0,118.0
...,...,...,...
Washington,20200713,40656.0,645349.0
Washington,20200714,41757.0,666517.0
Washington,20200715,42304.0,675930.0
Washington,20200716,43046.0,690840.0


In [8]:
print(state_name_df.dtypes)

date          int64
positive    float64
negative    float64
dtype: object


In [9]:
# Rename columns to match formatting of other data sets
state_name_df = state_name_df.rename(columns = {'positive': 'positive cases', 
                                                'negative': 'negative cases'})
state_name_df

Unnamed: 0_level_0,date,positive cases,negative cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Florida,20200717,327241.0,2553527.0
Georgia,20200717,135183.0,1063495.0
New York,20200717,405551.0,4593898.0
Texas,20200717,307572.0,2459749.0
Washington,20200717,44313.0,708861.0
...,...,...,...
Washington,20200126,2.0,0.0
Washington,20200125,2.0,0.0
Washington,20200124,2.0,0.0
Washington,20200123,2.0,0.0


In [10]:
# Change formatting of date to match those of other data sets
state_name_df['date'] = pd.to_datetime(state_name_df['date'], format = '%Y%m%d')
state_name_df

Unnamed: 0_level_0,date,positive cases,negative cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Florida,2020-07-17,327241.0,2553527.0
Georgia,2020-07-17,135183.0,1063495.0
New York,2020-07-17,405551.0,4593898.0
Texas,2020-07-17,307572.0,2459749.0
Washington,2020-07-17,44313.0,708861.0
...,...,...,...
Washington,2020-01-26,2.0,0.0
Washington,2020-01-25,2.0,0.0
Washington,2020-01-24,2.0,0.0
Washington,2020-01-23,2.0,0.0


In [11]:
# Alter variable type of positive and negative cases to match formatting of other data sets
state_name_df['positive cases'] = state_name_df['positive cases'].astype(int)
state_name_df.dropna(inplace = True)
state_name_df['negative cases'] = state_name_df['negative cases'].astype(int)
state_name_df

Unnamed: 0_level_0,date,positive cases,negative cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Florida,2020-07-17,327241,2553527
Georgia,2020-07-17,135183,1063495
New York,2020-07-17,405551,4593898
Texas,2020-07-17,307572,2459749
Washington,2020-07-17,44313,708861
...,...,...,...
Washington,2020-01-26,2,0
Washington,2020-01-25,2,0
Washington,2020-01-24,2,0
Washington,2020-01-23,2,0


In [12]:
#Save to csv file
state_name_df.to_csv('output_data/data_source_1.csv')

### Source 2 data import and clean up for file 2
#### This will be output into output_data/data_source_2


In [13]:
# Load data for source 2
# Reference https://www.kaggle.com/fireballbyedimyrnmom/us-counties-covid-19-dataset?select=us-counties.csv

covid_data_2 = "Resources/us-counties.csv"

# Read csv file into DataFrame
covid_df_2 = pd.read_csv(covid_data_2)
covid_df_2

Unnamed: 0,date,county,state,fips,cases,deaths
0,1/21/2020,Snohomish,Washington,53061.0,1,0
1,1/22/2020,Snohomish,Washington,53061.0,1,0
2,1/23/2020,Snohomish,Washington,53061.0,1,0
3,1/24/2020,Cook,Illinois,17031.0,1,0
4,1/24/2020,Snohomish,Washington,53061.0,1,0
...,...,...,...,...,...,...
350360,7/19/2020,Sweetwater,Wyoming,56037.0,173,2
350361,7/19/2020,Teton,Wyoming,56039.0,208,1
350362,7/19/2020,Uinta,Wyoming,56041.0,221,0
350363,7/19/2020,Washakie,Wyoming,56043.0,43,5


In [14]:
# Narrow down columns of covid_df_2 to state, date, cases
df_2 = covid_df_2[["date", "state", "cases"]]
df_2

Unnamed: 0,date,state,cases
0,1/21/2020,Washington,1
1,1/22/2020,Washington,1
2,1/23/2020,Washington,1
3,1/24/2020,Illinois,1
4,1/24/2020,Washington,1
...,...,...,...
350360,7/19/2020,Wyoming,173
350361,7/19/2020,Wyoming,208
350362,7/19/2020,Wyoming,221
350363,7/19/2020,Wyoming,43


In [15]:
# Set index to states
df_2 = df_2.set_index('state')
df_2

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,1/21/2020,1
Washington,1/22/2020,1
Washington,1/23/2020,1
Illinois,1/24/2020,1
Washington,1/24/2020,1
...,...,...
Wyoming,7/19/2020,173
Wyoming,7/19/2020,208
Wyoming,7/19/2020,221
Wyoming,7/19/2020,43


In [16]:
# Rename the 'cases' column to 'positive cases' to be consistent with other data sets
df_2 = df_2.rename(columns = {'cases': 'positive cases'})
df_2

Unnamed: 0_level_0,date,positive cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,1/21/2020,1
Washington,1/22/2020,1
Washington,1/23/2020,1
Illinois,1/24/2020,1
Washington,1/24/2020,1
...,...,...
Wyoming,7/19/2020,173
Wyoming,7/19/2020,208
Wyoming,7/19/2020,221
Wyoming,7/19/2020,43


In [17]:
# Filter out Florida, Georgia, New York, Texas and Washington
states = ["New York", "Georgia", "New York", "Texas", "Washington"]
filter_df_2  = df_2[df_2.index.isin(states)]
filter_df_2

Unnamed: 0_level_0,date,positive cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,1/21/2020,1
Washington,1/22/2020,1
Washington,1/23/2020,1
Washington,1/24/2020,1
Washington,1/25/2020,1
...,...,...
Washington,7/19/2020,5
Washington,7/19/2020,294
Washington,7/19/2020,780
Washington,7/19/2020,66


In [18]:
# Set date range for data
start_date = '3/1/2020'
end_date = '7/17/2020'
after_start_date = filter_df_2["date"] >= start_date
before_end_date = filter_df_2["date"] <= end_date
between_two_dates = after_start_date & before_end_date
filtered_dates_2 = filter_df_2.loc[between_two_dates]
print(filtered_dates_2)

                 date  positive cases
state                                
New York     3/1/2020               1
Texas        3/1/2020              11
Washington   3/1/2020              11
Washington   3/1/2020               2
Washington   3/1/2020               4
...               ...             ...
Washington  7/17/2020               5
Washington  7/17/2020             266
Washington  7/17/2020             760
Washington  7/17/2020              58
Washington  7/17/2020            9275

[51219 rows x 2 columns]


In [19]:
# Group duplicate dates for each state into a total positive case per day
group_df_2 = filter_df_2.groupby(["state", "date"])
group_df_2['positive cases'].sum()

state       date     
Georgia     3/10/2020       22
            3/11/2020       31
            3/12/2020       31
            3/13/2020       41
            3/14/2020       65
                         ...  
Washington  7/5/2020     37468
            7/6/2020     38517
            7/7/2020     39063
            7/8/2020     39661
            7/9/2020     40369
Name: positive cases, Length: 621, dtype: int64

In [20]:
# Save data frame to a CSV file
group_df_2 = filter_df_2.groupby(["state", "date"])
group_df_2.sum().reset_index().to_csv('output_data/group_df_2.csv')

In [21]:
# Open newly saved data CSV file
new = "output_data/group_df_2.csv"
new_df_2 = pd.read_csv(new)
new_df_2

Unnamed: 0.1,Unnamed: 0,state,date,positive cases
0,0,Georgia,3/10/2020,22
1,1,Georgia,3/11/2020,31
2,2,Georgia,3/12/2020,31
3,3,Georgia,3/13/2020,41
4,4,Georgia,3/14/2020,65
...,...,...,...,...
616,616,Washington,7/5/2020,37468
617,617,Washington,7/6/2020,38517
618,618,Washington,7/7/2020,39063
619,619,Washington,7/8/2020,39661


In [22]:
# Select for columns of interest
new_df_2 = new_df_2[['state', 'date', 'positive cases']]
new_df_2

Unnamed: 0,state,date,positive cases
0,Georgia,3/10/2020,22
1,Georgia,3/11/2020,31
2,Georgia,3/12/2020,31
3,Georgia,3/13/2020,41
4,Georgia,3/14/2020,65
...,...,...,...
616,Washington,7/5/2020,37468
617,Washington,7/6/2020,38517
618,Washington,7/7/2020,39063
619,Washington,7/8/2020,39661


In [23]:
# Alter date format to be consistent with other data frames
new_df_2['date'] = pd.to_datetime(new_df_2['date'])
new_df_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,state,date,positive cases
0,Georgia,2020-03-10,22
1,Georgia,2020-03-11,31
2,Georgia,2020-03-12,31
3,Georgia,2020-03-13,41
4,Georgia,2020-03-14,65
...,...,...,...
616,Washington,2020-07-05,37468
617,Washington,2020-07-06,38517
618,Washington,2020-07-07,39063
619,Washington,2020-07-08,39661


In [24]:
#Sort by date in ascending order
data_source_2 = new_df_2.sort_values(["state", "date"])
data_source_2

Unnamed: 0,state,date,positive cases
10,Georgia,2020-03-02,2
21,Georgia,2020-03-03,2
24,Georgia,2020-03-04,2
25,Georgia,2020-03-05,2
26,Georgia,2020-03-06,3
...,...,...,...
608,Washington,2020-07-15,45004
609,Washington,2020-07-16,46268
610,Washington,2020-07-17,47226
611,Washington,2020-07-18,47889


In [25]:
# Set index to state
data_source_2 = data_source_2.set_index('state')
data_source_2

Unnamed: 0_level_0,date,positive cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Georgia,2020-03-02,2
Georgia,2020-03-03,2
Georgia,2020-03-04,2
Georgia,2020-03-05,2
Georgia,2020-03-06,3
...,...,...
Washington,2020-07-15,45004
Washington,2020-07-16,46268
Washington,2020-07-17,47226
Washington,2020-07-18,47889


In [26]:
# Save to CSV file
data_source_2.to_csv('output_data/data_source_2.csv')

### Source 3 data import and clean up for file 3
#### This will be output into output_data/data_source_3


In [27]:
# Load data for source 3
#Reference = https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv

covid_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

# Read csv file into DataFrame, set state as index
covid_df_3 = pd.read_csv(covid_url, index_col = "state")
covid_df_3

Unnamed: 0_level_0,date,fips,cases,deaths
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Washington,2020-01-21,53,1,0
Washington,2020-01-22,53,1,0
Washington,2020-01-23,53,1,0
Illinois,2020-01-24,17,1,0
Washington,2020-01-24,53,1,0
...,...,...,...,...
Virginia,2020-07-24,51,82364,2067
Washington,2020-07-24,53,53279,1591
West Virginia,2020-07-24,54,5695,103
Wisconsin,2020-07-24,55,50895,887


In [28]:
# Narrow down columns of covid_df_2 to state, date, cases
df_3 = covid_df_3[["date", "cases"]]
df_3

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,2020-01-21,1
Washington,2020-01-22,1
Washington,2020-01-23,1
Illinois,2020-01-24,1
Washington,2020-01-24,1
...,...,...
Virginia,2020-07-24,82364
Washington,2020-07-24,53279
West Virginia,2020-07-24,5695
Wisconsin,2020-07-24,50895


In [29]:
# Filter out Florida, Georgia, New York, Texas and Washington
states = ["Florida", "Georgia", "New York", "Texas", "Washington"]
filter_df_3  = df_3[df_3.index.isin(states)]
filter_df_3

Unnamed: 0_level_0,date,cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,2020-01-21,1
Washington,2020-01-22,1
Washington,2020-01-23,1
Washington,2020-01-24,1
Washington,2020-01-25,1
...,...,...
Florida,2020-07-24,402304
Georgia,2020-07-24,147732
New York,2020-07-24,415163
Texas,2020-07-24,383662


In [30]:
# Rename column to match formatting in other data sets
filter_df_3 = filter_df_3.rename(columns = {'cases': 'positive cases'})
filter_df_3

Unnamed: 0_level_0,date,positive cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington,2020-01-21,1
Washington,2020-01-22,1
Washington,2020-01-23,1
Washington,2020-01-24,1
Washington,2020-01-25,1
...,...,...
Florida,2020-07-24,402304
Georgia,2020-07-24,147732
New York,2020-07-24,415163
Texas,2020-07-24,383662


In [31]:
# Set timeframe parameters
start_date = '2020-03-01'
end_date = '2020-07-17'
after_start_date = filter_df_3["date"] >= start_date
before_end_date = filter_df_3["date"] <= end_date
between_two_dates = after_start_date & before_end_date
filtered_dates_3 = filter_df_3.loc[between_two_dates]
print(filtered_dates_3)

                  date  positive cases
state                                 
Florida     2020-03-01               2
New York    2020-03-01               1
Texas       2020-03-01              11
Washington  2020-03-01              17
Florida     2020-03-02               2
...                ...             ...
Florida     2020-07-17          327233
Georgia     2020-07-17          124267
New York    2020-07-17          410254
Texas       2020-07-17          321254
Washington  2020-07-17           47226

[694 rows x 2 columns]


In [32]:
#Sort by date in ascending order
filtered_dates_3 = filtered_dates_3.sort_values(["state", "date"])
filtered_dates_3

Unnamed: 0_level_0,date,positive cases
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Florida,2020-03-01,2
Florida,2020-03-02,2
Florida,2020-03-03,3
Florida,2020-03-04,3
Florida,2020-03-05,4
...,...,...
Washington,2020-07-13,43538
Washington,2020-07-14,44257
Washington,2020-07-15,45004
Washington,2020-07-16,46268


In [33]:
# Save to csv file
filtered_dates_3.to_csv('output_data/data_source_3.csv')

### Population data import and clean up 
#### This will be output into output_data/pop_df


In [41]:
# Pull population data
# Downloaded data from https://www.kaggle.com/headsortails/covid19-us-county-jhu-data-demographics/data?select=us_county.csv
# Saved as csv named population datasets

#Population data
#Load file to read
pop_file = ("Resources/population datasets.csv")

# Read csv file into DataFrame
pop_df = pd.read_csv(pop_file, index_col = "state")
pop_df

Unnamed: 0_level_0,fips,county,state_code,male,female,median_age,population,female_percentage,lat,long
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,1001,Autauga County,AL,26874,28326,37.8,55200,51.315217,32.534923,-86.642730
Alabama,1003,Baldwin County,AL,101188,106919,42.8,208107,51.376936,30.727479,-87.722564
Alabama,1005,Barbour County,AL,13697,12085,39.9,25782,46.873788,31.869581,-85.393210
Alabama,1007,Bibb County,AL,12152,10375,39.9,22527,46.055844,32.998628,-87.126475
Alabama,1009,Blount County,AL,28434,29211,40.8,57645,50.673953,33.980869,-86.567380
...,...,...,...,...,...,...,...,...,...,...
Puerto Rico,72145,Vega Baja Municipio,,25580,27791,40.7,53371,52.071350,18.428461,-66.397926
Puerto Rico,72147,Vieques Municipio,,4332,4439,43.6,8771,50.609965,18.122662,-65.439095
Puerto Rico,72149,Villalba Municipio,,11169,11824,38.8,22993,51.424347,18.128155,-66.472816
Puerto Rico,72151,Yabucoa Municipio,,16541,17608,42.5,34149,51.562271,18.070468,-65.896311


In [42]:
# Narrow down columns of pop_df to state and population
narrow_pop_df = pop_df[["population"]]
narrow_pop_df

Unnamed: 0_level_0,population
state,Unnamed: 1_level_1
Alabama,55200
Alabama,208107
Alabama,25782
Alabama,22527
Alabama,57645
...,...
Puerto Rico,53371
Puerto Rico,8771
Puerto Rico,22993
Puerto Rico,34149


In [43]:
# Filter out Florida, Georgia, New York, Texas and Washington
states = ["Florida", "Georgia", "New York", "Texas", "Washington"]
pop_filter_df  = narrow_pop_df[narrow_pop_df.index.isin(states)]
pop_filter_df

Unnamed: 0_level_0,population
state,Unnamed: 1_level_1
Florida,263148
Florida,27785
Florida,182482
Florida,26979
Florida,576808
...,...
Washington,4189
Washington,60236
Washington,216812
Washington,48593


In [44]:
# Filter state and population
state_pop_df = pop_filter_df.groupby('state')['population'].sum()
state_pop_df

state
Florida       20598139
Georgia       10297484
New York      19618453
Texas         27885195
Washington     7294336
Name: population, dtype: int64

In [45]:
# Save to csv file
pop_filter_df.to_csv("output_data/state_pop_df.csv")

### Dataset for state data with state information
#### This will be output into output_data/state_df

In [39]:
# Build dataset with state data
# Reference https://www.washingtonpost.com/graphics/2020/national/states-reopening-coronavirus-map/

data = {'State':['Florida', 'Georgia', 'New York', 'Texas', 'Washington'], 
        'Governor':['Ron DeSantis (R)','Brian Kemp (R)', 'Andrew Cuomo (D)','Greg Abbott (R)', 'Jay Inslee (D)'],
       'Statewide Shelter in Place Date':['4/2/20','4/3/20', '3/22/20','4/2/20', '3/23/20'],
       'Phase One Reopening Date':['5/4/20', '4/24/20', 'Still closed','5/1/20', '5/4/20'],
       'Total cases (2/29/20 to 7/21/20)': [379619, 145575, 408886, 332434, 47743],
       'Population': [20598139, 10297484, 19618453, 27885195, 7294336]} 
  
# Create DataFrame 
state_df = pd.DataFrame(data) 
  
# Print the output
state_df

Unnamed: 0,State,Governor,Statewide Shelter in Place Date,Phase One Reopening Date,Total cases (2/29/20 to 7/21/20),Population
0,Florida,Ron DeSantis (R),4/2/20,5/4/20,379619,20598139
1,Georgia,Brian Kemp (R),4/3/20,4/24/20,145575,10297484
2,New York,Andrew Cuomo (D),3/22/20,Still closed,408886,19618453
3,Texas,Greg Abbott (R),4/2/20,5/1/20,332434,27885195
4,Washington,Jay Inslee (D),3/23/20,5/4/20,47743,7294336


In [40]:
# Save to csv file
state_df.to_csv("output_data/state_df.csv")