In [1]:
# importing the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
#Lets define the URL

url="https://www.worldometers.info/coronavirus/"

# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html

In [3]:
#we pick the id of the table we want to scrape and extract HTML code for that particular table only
covid_table = soup.find("table", attrs={"id": "main_table_countries_today"})

In [4]:
#the head will form our columns
head = covid_table.thead.find_all("tr") 
head #the headers are contained in this HTML code

[<tr>
 <th width="1%">#</th>
 <th width="100">Country,<br/>Other</th>
 <th width="20">Total<br/>Cases</th>
 <th width="30">New<br/>Cases</th>
 <th width="30">Total<br/>Deaths</th>
 <th width="30">New<br/>Deaths</th>
 <th width="30">Total<br/>Recovered</th>
 <th width="30">New<br/>Recovered</th>
 <th width="30">Active<br/>Cases</th>
 <th width="30">Serious,<br/>Critical</th>
 <th width="30">Tot Cases/<br/>1M pop</th>
 <th width="30">Deaths/<br/>1M pop</th>
 <th width="30">Total<br/>Tests</th>
 <th width="30">Tests/<br/>
 <nobr>1M pop</nobr>
 </th>
 <th width="30">Population</th>
 <th style="display:none" width="30">Continent</th>
 <th width="30">1 Case<br/>every X ppl</th><th width="30">1 Death<br/>every X ppl</th><th width="30">1 Test<br/>every X ppl</th>
 <th width="30">New Cases/1M pop</th>
 <th width="30">New Deaths/1M pop</th>
 <th width="30">Active Cases/1M pop</th>
 </tr>]

In [5]:
headings = []
for th in head[0].find_all("th"):
    # remove any newlines and extra spaces from left and right
    print(th.text)
    #headings.append(td.b.text.replace('\n', ' ').strip())
    headings.append(th.text.replace("\n","").strip())
print(headings)

#
Country,Other
TotalCases
NewCases
TotalDeaths
NewDeaths
TotalRecovered
NewRecovered
ActiveCases
Serious,Critical
Tot Cases/1M pop
Deaths/1M pop
TotalTests
Tests/
1M pop

Population
Continent
1 Caseevery X ppl
1 Deathevery X ppl
1 Testevery X ppl
New Cases/1M pop
New Deaths/1M pop
Active Cases/1M pop
['#', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases', 'Serious,Critical', 'Tot\xa0Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/1M pop', 'Population', 'Continent', '1 Caseevery X ppl', '1 Deathevery X ppl', '1 Testevery X ppl', 'New Cases/1M pop', 'New Deaths/1M pop', 'Active Cases/1M pop']


In [6]:
body = covid_table.tbody.find_all("tr") 
body[0] #here is one example of HTML snippet for one row

<tr class="total_row_world row_continent" data-continent="North America" style="display: none">
<td></td>
<td style="text-align:left;">
<nobr>North America</nobr>
</td>
<td>127,000,199</td>
<td></td>
<td>1,637,356</td>
<td></td>
<td>122,980,768</td>
<td>+626</td>
<td>2,382,075</td>
<td>5,948</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td data-continent="North America" style="display:none;">North America</td>
<!-- 1 Case every X -->
<td>
</td>
<!-- 1 Death every X -->
<td></td>
<!-- 1 test every X -->
<td></td>
<td></td>
<td></td>
<td></td>
</tr>

In [7]:
#lets declare empty list data that will hold all rows data
data = []
for r in range(1,len(body)):
    row = [] # empty lsit to hold one row data
    for tr in body[r].find_all("td"):
        row.append(tr.text.replace("\n","").strip())
        #append row data to row after removing newlines escape and triming unnecesary spaces
    data.append(row)

In [8]:
#We can now pass data into a pandas dataframe
#with headings as the columns
df = pd.DataFrame(data,columns=headings)
df.head(10)


Unnamed: 0,#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",...,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
0,,Asia,218134419,334.0,1547613,,201916516,16996.0,14670290,15101.0,...,,,,Asia,,,,,,
1,,Europe,249665850,4728.0,2065787,91.0,245795069,9281.0,1804994,5531.0,...,,,,Europe,,,,,,
2,,South America,68820634,,1357470,,66485092,,978072,10097.0,...,,,,South America,,,,,,
3,,Oceania,14538582,,29206,,14394010,,115366,69.0,...,,,,Australia/Oceania,,,,,,
4,,Africa,12830605,,258804,,12087518,,484283,547.0,...,,,,Africa,,,,,,
5,,,721,,15,,706,,0,0.0,...,,,,,,,,,,
6,,World,690991010,5062.0,6896251,91.0,663659679,26903.0,20435080,37293.0,...,,,,All,,,,,,
7,1.0,USA,107329712,,1168267,,105479716,,681729,782.0,...,1180815103.0,3526871.0,334805269.0,North America,3.0,287.0,0.0,,,2036.0
8,2.0,India,44994351,,531908,,44460975,,1468,,...,930797975.0,661721.0,1406631776.0,Asia,31.0,2645.0,2.0,,,1.0
9,3.0,France,40138560,,167642,,39948906,2631.0,22012,869.0,...,271490188.0,4139547.0,65584518.0,Europe,2.0,391.0,0.0,,,336.0


In [9]:
data = df[df["#"]!=""].reset_index(drop=True)

data = data.drop_duplicates(subset = ["Country,Other"])


In [10]:
data.columns


Index(['#', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths',
       'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases',
       'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests',
       'Tests/1M pop', 'Population', 'Continent', '1 Caseevery X ppl',
       '1 Deathevery X ppl', '1 Testevery X ppl', 'New Cases/1M pop',
       'New Deaths/1M pop', 'Active Cases/1M pop'],
      dtype='object')

In [11]:
# Columns to keep
cols = ['Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths',
       'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases',
       'Serious,Critical', 'TotalTests',"Continent"]

In [12]:
data_final = data[cols]
data_final.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Continent
0,USA,107329712,,1168267,,105479716,,681729,782.0,1180815103,North America
1,India,44994351,,531908,,44460975,,1468,,930797975,Asia
2,France,40138560,,167642,,39948906,2631.0,22012,869.0,271490188,Europe
3,Germany,38428685,,174352,,38240600,,13733,,122332384,Europe
4,Brazil,37671420,,703964,,36249161,,718295,,63776166,South America
