## Import Statements & Configurations


In [2]:
from io import StringIO

import lxml  # not referenced directly; pd.read_html uses it as its default HTML parser
import pandas as pd
import requests
# Show every row when a DataFrame is printed (None disables row truncation).
pd.options.display.max_rows = None

## Load state-level data 

In [3]:
# Download the Worldometers US page and parse every HTML table found on it.
# NOTE(review): the '#nav-yesterday' fragment is handled client-side and is never
# sent to the server, so it does not choose which table is returned; the table is
# chosen later by indexing into `dfs`. Confirm the intended table is being used.
url = 'https://www.worldometers.info/coronavirus/country/us/#nav-yesterday'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
req = requests.get(url, timeout=30)
# Fail here on an HTTP error status instead of trying to parse an error page.
req.raise_for_status()
# Wrap the markup in a file-like object: passing a literal HTML string to
# read_html is deprecated in newer pandas releases.
dfs = pd.read_html(StringIO(req.text))

In [4]:
# Create a variable for the Covid Table by State.
# Take a copy of the first parsed table so that any later clean-up of statesT
# never alters the raw entry in dfs, and re-running this cell always restores
# the unmodified table.
statesT = dfs[0].copy()
print(statesT.head())

     #    USAState  TotalCases  NewCases  TotalDeaths  NewDeaths  \
0  NaN   USA Total    71916547  187990.0     889192.0      569.0   
1  1.0  California     7448890   28014.0      78870.0       64.0   
2  2.0       Texas     5972238   53022.0      78491.0       68.0   
3  3.0     Florida     5351391       NaN      63576.0        NaN   
4  4.0    New York     4832399   18579.0      63824.0      205.0   

   TotalRecovered  ActiveCases  Tot Cases/1M pop  Deaths/1M pop   TotalTests  \
0      44361922.0   26665433.0          217269.0         2686.0  880232119.0   
1             NaN          NaN          188521.0         1996.0  131160392.0   
2             NaN          NaN          205968.0         2707.0   55862537.0   
3       3666925.0    1620890.0          249160.0         2960.0   50383801.0   
4       2601723.0    2166852.0          248407.0         3281.0   95500969.0   

   Tests/ 1M pop  Population                            Source    Projections  
0      2659290.0         NaN  

### Refactor the Table

In [5]:
# Remove the following columns: #, Source, & Projections.
# Remove the following rows: Country Totals (index labels 0 and 64).
# Label 0 is the 'USA Total' row shown by head(); label 64 is presumably a second
# totals row at the end of the table -- verify if the source page layout changes.
#
# The frame is rebuilt from the raw table in dfs[0] with non-mutating drops, so
# this cell can be re-run safely (an in-place drop raises KeyError on the second
# run because the labels are already gone).
statesT = (
    dfs[0]
    .drop(columns=['#', 'Source', 'Projections'])
    .drop(index=[0, 64])
)
print(statesT.head())

     USAState  TotalCases  NewCases  TotalDeaths  NewDeaths  TotalRecovered  \
1  California     7448890   28014.0      78870.0       64.0             NaN   
2       Texas     5972238   53022.0      78491.0       68.0             NaN   
3     Florida     5351391       NaN      63576.0        NaN       3666925.0   
4    New York     4832399   18579.0      63824.0      205.0       2601723.0   
5    Illinois     2773362       NaN      33446.0        NaN       1924335.0   

   ActiveCases  Tot Cases/1M pop  Deaths/1M pop   TotalTests  Tests/ 1M pop  \
1          NaN          188521.0         1996.0  131160392.0      3319489.0   
2          NaN          205968.0         2707.0   55862537.0      1926568.0   
3    1620890.0          249160.0         2960.0   50383801.0      2345862.0   
4    2166852.0          248407.0         3281.0   95500969.0      4909177.0   
5     815581.0          218861.0         2639.0   49488107.0      3905367.0   

   Population  
1  39512223.0  
2  28995881.0  
3 

In [14]:
# Verify that the state column contains no HTML markup.
# The extraction method used above did not include any HTML in this column, so no cleanup code was needed.
# print(statesT['USAState'])

## Describe the Dataframe by printing the first few rows

In [7]:
# Preview the cleaned table: show its first five rows.
print(statesT.head(n=5))

     USAState  TotalCases  NewCases  TotalDeaths  NewDeaths  TotalRecovered  \
1  California     7448890   28014.0      78870.0       64.0             NaN   
2       Texas     5972238   53022.0      78491.0       68.0             NaN   
3     Florida     5351391       NaN      63576.0        NaN       3666925.0   
4    New York     4832399   18579.0      63824.0      205.0       2601723.0   
5    Illinois     2773362       NaN      33446.0        NaN       1924335.0   

   ActiveCases  Tot Cases/1M pop  Deaths/1M pop   TotalTests  Tests/ 1M pop  \
1          NaN          188521.0         1996.0  131160392.0      3319489.0   
2          NaN          205968.0         2707.0   55862537.0      1926568.0   
3    1620890.0          249160.0         2960.0   50383801.0      2345862.0   
4    2166852.0          248407.0         3281.0   95500969.0      4909177.0   
5     815581.0          218861.0         2639.0   49488107.0      3905367.0   

   Population  
1  39512223.0  
2  28995881.0  
3 

## Print Dataframe Summary Statistics

In [8]:
# Summary statistics for each numeric column: count, mean (avg), standard
# deviation, min, the 25%/50%/75% percentiles, and max.
print(statesT.describe(percentiles=[0.25, 0.5, 0.75]))

         TotalCases      NewCases   TotalDeaths   NewDeaths  TotalRecovered  \
count  6.300000e+01     19.000000     60.000000   10.000000    4.500000e+01   
mean   1.141532e+06   9894.210526  14819.866667   56.900000    6.708617e+05   
std    1.468724e+06  13171.504455  18310.706689   57.011597    7.797275e+05   
min    3.000000e+00     38.000000      7.000000   12.000000    3.000000e+00   
25%    2.138480e+05   1396.500000   2393.750000   15.500000    1.436660e+05   
50%    6.813820e+05   3804.000000   9596.500000   52.500000    4.221640e+05   
75%    1.455528e+06  11132.500000  19071.000000   65.500000    8.976670e+05   
max    7.448890e+06  53022.000000  78870.000000  205.000000    3.666925e+06   

        ActiveCases  Tot Cases/1M pop  Deaths/1M pop    TotalTests  \
count  4.500000e+01         52.000000      52.000000  6.000000e+01   
mean   2.793075e+05     214316.519231    2493.519231  1.467054e+07   
std    4.289316e+05      37832.925438     748.262051  2.239961e+07   
min    0

## Print the top 5 states (name & value) for each of these metrics:

In [9]:
# New Cases: the five states with the highest values, largest first.
print(
    statesT.loc[:, ['USAState', 'NewCases']]
    .sort_values(by='NewCases', ascending=False)
    .head(n=5)
)

        USAState  NewCases
2          Texas   53022.0
1     California   28014.0
12       Arizona   22922.0
4       New York   18579.0
6   Pennsylvania   12333.0


In [10]:
# Total Deaths: the five states with the highest values, largest first.
print(
    statesT.loc[:, ['USAState', 'TotalDeaths']]
    .sort_values(by='TotalDeaths', ascending=False)
    .head(n=5)
)

       USAState  TotalDeaths
1    California      78870.0
2         Texas      78491.0
4      New York      63824.0
3       Florida      63576.0
6  Pennsylvania      39538.0


In [11]:
# Total Cases/ 1M Pop.
# Columns are picked by position (0 = state name, 7 = cases per 1M) rather than
# by header text; the positions assume the column drops above have been applied.
tcases = statesT.iloc[:, [0, 7]]
print(
    tcases
    .sort_values(by=tcases.columns[-1], ascending=False)
    .head(n=5)
)

        USAState  Tot Cases/1M pop
40  Rhode Island          308780.0
45  North Dakota          276933.0
47        Alaska          257827.0
28          Utah          257642.0
13     Tennessee          254234.0


In [12]:
# Total Deaths/ 1M Pop.
# Columns are picked by position (0 = state name, 8 = deaths per 1M) rather than
# by header text; the positions assume the column drops above have been applied.
tdeaths = statesT.iloc[:, [0, 8]]
print(
    tdeaths
    .sort_values(by=tdeaths.columns[-1], ascending=False)
    .head(n=5)
)

       USAState  Deaths/1M pop
32  Mississippi         3602.0
12      Arizona         3520.0
11   New Jersey         3463.0
23      Alabama         3432.0
24    Louisiana         3296.0


## Chris Navoczynski