## Import Statements & Configurations


In [66]:
import pandas as pd
import lxml
import requests
pd.set_option('display.max_rows', None)

## Load state-level data 

In [67]:
url = 'https://www.worldometers.info/coronavirus/country/us/#nav-yesterday'
req = requests.get(url)
dfs = pd.read_html(req.text)

In [68]:
# Create a variable for the Covid Table by State
statesT = dfs[0]
print(statesT.head())

     #    USAState  TotalCases  NewCases  TotalDeaths  NewDeaths  \
0  NaN   USA Total    71597479  202900.0     888343.0      701.0   
1  1.0  California     7376878       NaN      78730.0        NaN   
2  2.0       Texas     5919216   54691.0      78423.0      154.0   
3  3.0     Florida     5328573       NaN      63574.0        NaN   
4  4.0    New York     4813820   23333.0      63619.0       56.0   

   TotalRecovered  ActiveCases  Tot Cases/1M pop  Deaths/1M pop   TotalTests  \
0      44235490.0   26473646.0          216305.0         2684.0  901998821.0   
1             NaN          NaN          186699.0         1993.0  131160392.0   
2             NaN          NaN          204140.0         2705.0   55606051.0   
3       3665356.0    1599643.0          248098.0         2960.0   50280048.0   
4       2597185.0    2153016.0          247452.0         3270.0   95256290.0   

   Tests/ 1M pop  Population                            Source    Projections  
0      2725050.0         NaN  

### Refactor the Table

In [69]:
# Remove the following columns: #, Source, & Projections Columns
statesT.drop(['#', 'Source', 'Projections'], axis=1, inplace=True)

# Remove the following row: Country Totals 
statesT.drop([0, 64], axis=0, inplace=True)
print(statesT.head())

     USAState  TotalCases  NewCases  TotalDeaths  NewDeaths  TotalRecovered  \
1  California     7376878       NaN      78730.0        NaN             NaN   
2       Texas     5919216   54691.0      78423.0      154.0             NaN   
3     Florida     5328573       NaN      63574.0        NaN       3665356.0   
4    New York     4813820   23333.0      63619.0       56.0       2597185.0   
5    Illinois     2773362       NaN      33446.0        NaN       1924335.0   

   ActiveCases  Tot Cases/1M pop  Deaths/1M pop   TotalTests  Tests/ 1M pop  \
1          NaN          186699.0         1993.0  131160392.0      3319489.0   
2          NaN          204140.0         2705.0   55606051.0      1917722.0   
3    1599643.0          248098.0         2960.0   50280048.0      2341031.0   
4    2153016.0          247452.0         3270.0   95256290.0      4896599.0   
5     815581.0          218861.0         2639.0   49488107.0      3905367.0   

   Population  
1  39512223.0  
2  28995881.0  
3 

In [77]:
# Verify State Column has no HTML.
# The method used to extract information did not include html in the column. Code was not needed.
# print(statesT['USAState'])

## Describe the Dataframe by printing the first few rows

In [71]:
# Print the dataframe first few rows
print(statesT.head())

     USAState  TotalCases  NewCases  TotalDeaths  NewDeaths  TotalRecovered  \
1  California     7376878       NaN      78730.0        NaN             NaN   
2       Texas     5919216   54691.0      78423.0      154.0             NaN   
3     Florida     5328573       NaN      63574.0        NaN       3665356.0   
4    New York     4813820   23333.0      63619.0       56.0       2597185.0   
5    Illinois     2773362       NaN      33446.0        NaN       1924335.0   

   ActiveCases  Tot Cases/1M pop  Deaths/1M pop   TotalTests  Tests/ 1M pop  \
1          NaN          186699.0         1993.0  131160392.0      3319489.0   
2          NaN          204140.0         2705.0   55606051.0      1917722.0   
3    1599643.0          248098.0         2960.0   50280048.0      2341031.0   
4    2153016.0          247452.0         3270.0   95256290.0      4896599.0   
5     815581.0          218861.0         2639.0   49488107.0      3905367.0   

   Population  
1  39512223.0  
2  28995881.0  
3 

## Print Dataframe Summary Statistics

In [72]:
# The stats will provide infomation such as min, max, the count, the standard deviation and the mean (avg).
print(statesT.describe())

         TotalCases      NewCases   TotalDeaths   NewDeaths  TotalRecovered  \
count  6.300000e+01     18.000000     60.000000   13.000000    4.500000e+01   
mean   1.136468e+06  11272.222222  14805.716667   53.923077    6.700809e+05   
std    1.458594e+06  13562.849785  18286.432025   51.527439    7.790418e+05   
min    3.000000e+00     42.000000      7.000000    1.000000    3.000000e+00   
25%    2.123280e+05   2730.750000   2393.750000   14.000000    1.436660e+05   
50%    6.813820e+05   6182.500000   9590.500000   42.000000    4.221640e+05   
75%    1.448960e+06  16298.750000  19053.750000   66.000000    8.976670e+05   
max    7.376878e+06  54691.000000  78730.000000  154.000000    3.665356e+06   

        ActiveCases  Tot Cases/1M pop  Deaths/1M pop    TotalTests  \
count  4.500000e+01         52.000000      52.000000  6.000000e+01   
mean   2.775032e+05     213541.442308    2491.923077  1.503331e+07   
std    4.256681e+05      37734.711237     747.857485  2.236571e+07   
min    0

## Print the top 5 states (name & value) for each of these metrics:

In [73]:
# New Cases
print(statesT[['USAState','NewCases']].sort_values(by=['NewCases'], ascending=False).head(5))

        USAState  NewCases
2          Texas   54691.0
12       Arizona   27681.0
4       New York   23333.0
6   Pennsylvania   17623.0
23       Alabama   16506.0


In [74]:
# Total Deaths
print(statesT[['USAState', 'TotalDeaths']].sort_values(by=['TotalDeaths'], ascending=False).head(5))

       USAState  TotalDeaths
1    California      78730.0
2         Texas      78423.0
4      New York      63619.0
3       Florida      63574.0
6  Pennsylvania      39487.0


In [75]:
# Total Cases/ 1M Pop.
tcases = statesT.iloc[:, [0, 6]]
print(tcases.sort_values(by=[tcases.columns[1]], ascending=False).head(5))

      USAState  ActiveCases
4     New York    2153016.0
3      Florida    1599643.0
5     Illinois     815581.0
11  New Jersey     809455.0
22  Washington     780732.0


In [76]:
# Total Deaths/ 1M Pop.
tdeaths = statesT.iloc[:, [0, 7]]
print(tdeaths.sort_values(by=[tdeaths.columns[1]], ascending=False).head(5))

        USAState  Tot Cases/1M pop
40  Rhode Island          308780.0
45  North Dakota          272944.0
47        Alaska          257827.0
28          Utah          257642.0
13     Tennessee          251062.0
