#  Using Webscraping to Extract  Data Example from 'https://www.worldometers.info/coronavirus/'


In [1]:
# import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os

In [2]:
# create url object
url = 'https://www.worldometers.info/coronavirus/#page-top'
# determine my_head User-Agent
my_head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'}
# request to get the page
html = requests.get(url,my_head)
# check connection
html.status_code # 200 is right connection

200

In [3]:
# using beautiful soup library to parsing the html page
soup = BeautifulSoup(html.text, 'lxml') # use text because we want to work with a string format


In [4]:
soup.prettify()

'<!DOCTYPE html>\n<!--[if IE 8]> <html lang="en" class="ie8"> <![endif]-->\n<!--[if IE 9]> <html lang="en" class="ie9"> <![endif]-->\n<!--[if !IE]><!-->\n<html lang="en">\n <!--<![endif]-->\n <head>\n  <meta charset="utf-8"/>\n  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n  <meta content="width=device-width, initial-scale=1" name="viewport"/>\n  <title>\n   COVID - Coronavirus Statistics - Worldometer\n  </title>\n  <meta content="Daily and weekly updated statistics tracking the number of COVID-19 cases, recovered, and deaths. Historical data with cumulative charts, graphs, and updates." name="description"/>\n  <!-- Favicon -->\n  <link href="/favicon/favicon.ico" rel="shortcut icon" type="image/x-icon"/>\n  <link href="/favicon/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>\n  <link href="/favicon/apple-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/>\n  <link href="/favicon/apple-icon-72x72.png" rel="apple-touch-icon" sizes="72x72"/>\n  <link href="/fa

In [5]:
# find the table what we want to use
tables = soup.find_all('table')
len(tables) # to know what we have?

3

In [6]:
# we can determine the table using id parameter
table = soup.find('table' , id='main_table_countries_today')
table.thead # i used table head to invoke the head of table that I want to use , then use the variable "table"

<thead>
<tr>
<th width="1%">#</th>
<th width="100">Country,<br/>Other</th>
<th width="20">Total<br/>Cases</th>
<th width="30">New<br/>Cases</th>
<th width="30">Total<br/>Deaths</th>
<th width="30">New<br/>Deaths</th>
<th width="30">Total<br/>Recovered</th>
<th width="30">New<br/>Recovered</th>
<th width="30">Active<br/>Cases</th>
<th width="30">Serious,<br/>Critical</th>
<th width="30">Tot Cases/<br/>1M pop</th>
<th width="30">Deaths/<br/>1M pop</th>
<th width="30">Total<br/>Tests</th>
<th width="30">Tests/<br/>
<nobr>1M pop</nobr>
</th>
<th width="30">Population</th>
<th style="display:none" width="30">Continent</th>
<th width="30">1 Case<br/>every X ppl</th><th width="30">1 Death<br/>every X ppl</th><th width="30">1 Test<br/>every X ppl</th>
<th width="30">New Cases/1M pop</th>
<th width="30">New Deaths/1M pop</th>
<th width="30">Active Cases/1M pop</th>
</tr>
</thead>

In [7]:
# now we gonna write code to get headers of table
headers = [i.text for i in table.thead.find_all('th')] # list to append data into it.
len(headers) # to know how many columns in the table, and to invoke it use variable headers to know.


22

In [8]:
# to scrap the values data
table_values = []
for row in table.tbody.find_all('tr'): # find table row
    tag_td = row.find_all('td')        # find table data 
    values = [i.text for i in tag_td] # use nested loop to determine value for every row
    table_values.append(values)       # put the date into the table_values list by append function
table_values  # to invoke the list

[['',
  '\nNorth America\n',
  '129,049,975',
  '',
  '1,653,030',
  '',
  '124,790,702',
  '+1,334',
  '2,606,243',
  '6,446',
  '',
  '',
  '',
  '',
  '',
  'North America',
  '\n',
  '',
  '',
  '',
  '',
  ''],
 ['',
  '\nAsia\n',
  '220,815,223',
  '+160',
  '1,551,658',
  '+11',
  '204,532,203',
  '+139',
  '14,731,362',
  '15,143',
  '',
  '',
  '',
  '',
  '',
  'Asia',
  '\n',
  '',
  '',
  '',
  '',
  ''],
 ['',
  '\nEurope\n',
  '250,847,202',
  '+41,170',
  '2,078,574',
  '+112',
  '246,688,330',
  '+38,035',
  '2,080,298',
  '5,615',
  '',
  '',
  '',
  '',
  '',
  'Europe',
  '\n',
  '',
  '',
  '',
  '',
  ''],
 ['',
  '\nSouth America\n',
  '69,181,719',
  '',
  '1,362,310',
  '',
  '66,560,799',
  '',
  '1,258,610',
  '10,095',
  '',
  '',
  '',
  '',
  '',
  'South America',
  '\n',
  '',
  '',
  '',
  '',
  ''],
 ['',
  '\nOceania\n',
  '14,698,885',
  '',
  '30,699',
  '',
  '14,561,855',
  '+2',
  '106,331',
  '49',
  '',
  '',
  '',
  '',
  '',
  'Australia/Ocean

In [9]:
my_data = pd.DataFrame(table_values, columns=headers) # use pandas data frame
my_data.head()   # to show the fifth first row  data from the table

Unnamed: 0,#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",...,TotalTests,Tests/\n1M pop\n,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
0,,\nNorth America\n,129049975,,1653030,,124790702,1334.0,2606243,6446,...,,,,North America,\n,,,,,
1,,\nAsia\n,220815223,160.0,1551658,11.0,204532203,139.0,14731362,15143,...,,,,Asia,\n,,,,,
2,,\nEurope\n,250847202,41170.0,2078574,112.0,246688330,38035.0,2080298,5615,...,,,,Europe,\n,,,,,
3,,\nSouth America\n,69181719,,1362310,,66560799,,1258610,10095,...,,,,South America,\n,,,,,
4,,\nOceania\n,14698885,,30699,,14561855,2.0,106331,49,...,,,,Australia/Oceania,\n,,,,,


In [10]:
# Drop and clearing unnecessary rows
my_data.drop(my_data.index[0:8], inplace=True)
#my_data.drop(my_data.index[222:229], inplace=True)
my_data.reset_index(inplace=True, drop=True)
# Drop “#” column
my_data.drop('#', inplace=True, axis=1)

In [11]:
my_data

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,...,TotalTests,Tests/\n1M pop\n,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
0,USA,109233586,,1181548,,107158346,,893692,1282,326260,...,1186431670,3543647,334805269,North America,3,283,0,,,2669
1,India,45001268,,533293,,,,,,31992,...,930797975,661721,1406631776,Asia,31,2638,2,,,0.2
2,France,40138560,,167642,,39970918,,0,869,612013,...,271490188,4139547,65584518,Europe,2,391,0,,,
3,Germany,38553102,,176793,,38240600,,135709,,459602,...,122332384,1458359,83883596,Europe,2,474,1,,,1618
4,Brazil,37949944,,706808,,36249161,,993975,,176222,...,63776166,296146,215353593,South America,6,305,3,,,4616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,Tokelau,80,,,,,,80,,58055,...,,,1378,Australia/Oceania,17,,,,,58055
227,Vatican City,29,,,,29,,0,,36295,...,,,799,Europe,28,,,,,
228,Western Sahara,10,,1,,9,,0,,16,...,,,626161,Africa,62616,626161,,,,
229,MS Zaandam,9,,2,,7,,0,,,...,,,,,,,,,,


In [12]:
# # Export to csv
my_data.to_csv('covid_data.csv', index=False)
# Try to read csv
my_data2 = pd.read_csv('covid_data.csv')

In [13]:
my_data2  # show the data 

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,...,TotalTests,Tests/\n1M pop\n,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
0,USA,109233586,,1181548,,107158346,,893692,1282,326260,...,1186431670,3543647,334805269,North America,3,283,0.0,,,2669
1,India,45001268,,533293,,,,,,31992,...,930797975,661721,1406631776,Asia,31,2638,2.0,,,0.2
2,France,40138560,,167642,,39970918,,0,869,612013,...,271490188,4139547,65584518,Europe,2,391,0.0,,,
3,Germany,38553102,,176793,,38240600,,135709,,459602,...,122332384,1458359,83883596,Europe,2,474,1.0,,,1618
4,Brazil,37949944,,706808,,36249161,,993975,,176222,...,63776166,296146,215353593,South America,6,305,3.0,,,4616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,Tokelau,80,,,,,,80,,58055,...,,,1378,Australia/Oceania,17,,,,,58055
227,Vatican City,29,,,,29,,0,,36295,...,,,799,Europe,28,,,,,
228,Western Sahara,10,,1,,9,,0,,16,...,,,626161,Africa,62616,626161,,,,
229,MS Zaandam,9,,2,,7,,0,,,...,,,,,,,,,,
