<h1>Import Dependencies</h1>

In [1]:
from pathlib import Path

import lxml.html as lh
import pandas as pd
import requests

<h1>Get Website Data</h1>

In [2]:
# Worldometers page that carries the US state-level coronavirus table
url = (
    "https://www.worldometers.info/coronavirus/country/us/"
)

In [3]:
# Download the page with an HTTP GET.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error response
# instead of letting an error page be parsed as if it were the data table.
req = requests.get(url, timeout=30)
req.raise_for_status()

In [4]:
# Parse the raw response body into an lxml HTML element tree
doc = lh.fromstring(
    req.content
)

<h1>Get Table Data</h1>

In [5]:
# Collect every HTML table row (<tr>..</tr>) found anywhere in the document
tr_elements = doc.xpath("//tr")

In [6]:
# Sanity check that the rows really are tabular:
# show the number of cells in each of the first ten <tr> elements.
# Every entry should be the same number (one cell per column).
[len(row) for row in tr_elements[:10]]

[15, 15, 15, 15, 15, 15, 15, 15, 15, 15]

<h1>Get Table Header Data</h1>

In [7]:
# Build one (header_text, values) pair per column from the first table row.
# The values lists start empty and are filled by the data-row loop later on.
table_data = []
for position, header_cell in enumerate(tr_elements[0], start=1):
    # lxml's text_content() returns the cell's text with all markup removed
    header = header_cell.text_content()

    # Optional: echo each header with its 1-based position to confirm
    # that the header row was picked up correctly
    print(position, header)

    table_data.append((header, []))

1 #
2 USAState
3 TotalCases
4 NewCases
5 TotalDeaths
6 NewDeaths
7 TotalRecovered
8 ActiveCases
9 Tot Cases/1M pop
10 Deaths/1M pop
11 TotalTests
12 Tests/
1M pop

13 Population
14 Source
15 Projections


<h1>Get Table Data</h1>

In [8]:
# The header row (tr_elements[0]) was consumed when table_data was built;
# the data starts at the second <tr>. Each data row is split into its cells
# and every cell is appended to the value list of the matching column.

# Expected number of cells per row: one per header column (15 for this page).
# Deriving it from the headers avoids a hard-coded number that silently goes
# stale when the site adds or removes a column.
col_nbr = len(table_data)

for j in range(1, len(tr_elements)):

    # T is our j'th row
    T = tr_elements[j]

    # '//tr' matched every row in the document, not only those of our table.
    # The first row whose width differs from the header marks the end of the
    # table, so stop collecting there.
    if len(T) != col_nbr:
        break

    # i is the index of the column that cell t belongs to
    for i, t in enumerate(T.iterchildren()):
        # lxml's text_content() returns the cell's text without markup
        data = t.text_content()

        # The first column is kept as text. For every other column, plain
        # digit strings are converted to int.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a plain integer (empty cell, thousands separators,
                # free text): keep the raw string for later cleaning.
                pass

        # Append the value to the list of the i'th column
        table_data[i][1].append(data)

In [9]:
# Quality control: every column should have collected the same number of values
[len(values) for _title, values in table_data]

[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64]

<h1>Add Data to Dataframe</h1>

In [10]:
# Turn the (header, values) pairs into a header -> values mapping
# and build the dataframe from it
Dict = dict(table_data)
df = pd.DataFrame(Dict)

<h1>Drop Unnecessary Columns and Remove Substrings and Commas</h1>

In [11]:
# Drop the columns that are not needed, remove newline characters from the
# remaining column names, and strip commas out of the cell values.
df = (
    df.drop(columns=['#', 'Source', 'Projections'])
      .rename(columns=lambda label: label.replace('\n', ''))
      .replace(',', '', regex=True)
)

In [12]:
# Cast every column to text and remove newline characters from the start
# of each value.
for column in df.columns:
    df[column] = df[column].astype(str).map(lambda text: text.lstrip('\n'))

In [13]:
# Preview the first five rows after the text clean-up
df.head(n=5)

Unnamed: 0,USAState,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population
0,USA Total,22837347,134084.0,382637,1154.0,13444039,9010671,68994,1156,270944916,818558,
1,California,2676896,,29682,,1113950,1533264,67749,751,35826824,906728,39512223.0
2,Texas,1961165,,30527,,1574684,355954,67636,1053,16816301,579955,28995881.0
3,Florida,1477010,12313.0,22920,106.0,768664,685426,68769,1067,16715694,778280,21477737.0
4,New York,1164957,12743.0,39588,141.0,492692,632677,59884,2035,27321002,1404422,19453561.0


<h1>Convert Columns to Integers and Replace NaN with Zero</h1>

In [14]:
# Every column except the first one is selected for numeric conversion
int_cols = df.columns[1:]

In [15]:
# Parse the selected columns as numbers; errors="coerce" turns any value
# that cannot be parsed into NaN instead of raising.
df[int_cols] = df[int_cols].apply(pd.to_numeric, errors="coerce")

In [16]:
# Replace every NaN in the frame with zero
df = df.fillna(0)

In [17]:
# Inspect the first five rows after the numeric conversion
df.head(n=5)

Unnamed: 0,USAState,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population
0,USA Total,22837347,134084.0,382637.0,1154.0,13444039.0,9010671.0,68994.0,1156.0,270944916.0,818558.0,0.0
1,California,2676896,0.0,29682.0,0.0,1113950.0,1533264.0,67749.0,751.0,35826824.0,906728.0,39512223.0
2,Texas,1961165,0.0,30527.0,0.0,1574684.0,355954.0,67636.0,1053.0,16816301.0,579955.0,28995881.0
3,Florida,1477010,12313.0,22920.0,106.0,768664.0,685426.0,68769.0,1067.0,16715694.0,778280.0,21477737.0
4,New York,1164957,12743.0,39588.0,141.0,492692.0,632677.0,59884.0,2035.0,27321002.0,1404422.0,19453561.0


<h1>Calculate Case Fatality Death Rate</h1>

In [18]:
# Case fatality rate in percent: TotalDeaths per 100 TotalCases, rounded to
# two decimals. Rows whose TotalCases is 0 (for example a value that could not
# be parsed and was filled with 0) are masked to NaN first, so the rate comes
# out as NaN rather than inf from a division by zero.
df['cfdr'] = (
    df['TotalDeaths'] / df['TotalCases'].where(df['TotalCases'] != 0) * 100
).round(2)

<h1>Save Dataframe to CSV</h1>

In [19]:
# Write the final table to CSV (UTF-8, without the index column).
# The target folder is created first so the cell also works on a fresh
# checkout where data/ does not exist yet.
output_path = Path("data/covid_state_level_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, encoding='utf-8', index=False)