In [1]:
# Load the packages
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

In [2]:
# Defining the url of the site
base_site = "https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol"

# Making a get request.
# requests applies no timeout by default, so a stalled server would block this
# cell indefinitely; 10 seconds is a generous limit for fetching a single page.
response = requests.get(base_site, timeout=10)
response.status_code

200

In [3]:
# Extracting the HTML (the body of the response)
html = response.content

# Making the soup: parse the page with the "html.parser" backend
soup = BeautifulSoup(html, "html.parser")

## Absolute and Relative URLs

**Absolute URL**

It is the full URL of the page that you link to.

- contains the entire address, from the protocol (HTTPS) to the domain name (www.example.com)
- includes the location of the page within the site's folder structure (e.g. /foldernameA or /foldernameB)

*Example*:

`<a href="http://www.example.com/xyz.html">`

**Relative URL**

It assumes that the link you add is on the same site and is part of the same root domain

- does not use the full web address
- contains the location following the domain
- starts with a forward slash and keeps the browser within the current site

*Example*:

`<a href="/xyz.html">`

In [4]:
# Collect every anchor (<a>) tag in the parsed page and display the result
links = soup.find_all('a')
links

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="new" href="/w/index.php?title=Az%C9%99rbaycan_futbol_klublar%C4%B1&amp;action=edit&amp;redlink=1" title="Azərbaycan futbol klubları (səhifə mövcud deyil)">Azərbaycan futbol klubları</a>,
 <a href="#Yaranması"><span class="tocnumber">1</span> <span class="toctext">Yaranması</span></a>,
 <a href="#1937-1948"><span class="tocnumber">2</span> <span class="toctext">1937-1948</span></a>,
 <a href="#1949-1959"><span class="tocnumber">3</span> <span class="toctext">1949-1959</span></a>,
 <a href="#1960-1972"><span class="tocnumber">4</span> <span class="toctext">1960-1972</span></a>,
 <a href='#"Qızıl"_il'><span class="tocnumber">4.1</span> <span class="toctext">"Qızıl" il</span></a>,
 <a href="#1973-1976"><span class="tocnumber">5</span> <span class="toctext">1973-1976</span></a>,
 <a href="#1977-1988"><span class="tocnumber">6</span> 

In [13]:
# To obtain the absolute URL address we will use urljoin

from urllib.parse import urljoin

In [10]:
# Pick a single anchor to use as a worked example.
# NOTE(review): index 26 depends on the page content at scrape time (in the
# recorded output it is the "/wiki/Almaniya" link) and may point to a different
# anchor once the article changes.
link = links[26]
link

<a href="/wiki/Almaniya" title="Almaniya">Almaniya</a>

In [11]:
# Show the page URL that the relative link will be resolved against
base_site

'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol'

In [12]:
# Read the href attribute of the anchor; for this link it is a site-relative path
relative_url = link.get('href')
relative_url

'/wiki/Almaniya'

In [14]:
# Resolve the relative path against the page URL to obtain the absolute URL
full_url = urljoin(base_site, relative_url)
full_url

'https://az.wikipedia.org/wiki/Almaniya'

## Processing multiple links at once

In [15]:
# Display all collected anchors again before processing them in bulk
links

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="new" href="/w/index.php?title=Az%C9%99rbaycan_futbol_klublar%C4%B1&amp;action=edit&amp;redlink=1" title="Azərbaycan futbol klubları (səhifə mövcud deyil)">Azərbaycan futbol klubları</a>,
 <a href="#Yaranması"><span class="tocnumber">1</span> <span class="toctext">Yaranması</span></a>,
 <a href="#1937-1948"><span class="tocnumber">2</span> <span class="toctext">1937-1948</span></a>,
 <a href="#1949-1959"><span class="tocnumber">3</span> <span class="toctext">1949-1959</span></a>,
 <a href="#1960-1972"><span class="tocnumber">4</span> <span class="toctext">1960-1972</span></a>,
 <a href='#"Qızıl"_il'><span class="tocnumber">4.1</span> <span class="toctext">"Qızıl" il</span></a>,
 <a href="#1973-1976"><span class="tocnumber">5</span> <span class="toctext">1973-1976</span></a>,
 <a href="#1977-1988"><span class="tocnumber">6</span> 

In [16]:
# Inspect the href of every anchor; None marks a tag that has no href attribute
[anchor.get('href') for anchor in links]

[None,
 '#mw-head',
 '#searchInput',
 '/w/index.php?title=Az%C9%99rbaycan_futbol_klublar%C4%B1&action=edit&redlink=1',
 '#Yaranması',
 '#1937-1948',
 '#1949-1959',
 '#1960-1972',
 '#"Qızıl"_il',
 '#1973-1976',
 '#1977-1988',
 '#1989-1991',
 '#1992-2006',
 '#Klublar_beynəlxalq_arenada',
 '#Həmçinin_bax',
 '#Xarici_keçidlər',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&veaction=edit&section=1',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&action=edit&section=1',
 '/wiki/1911',
 '/wiki/Az%C9%99rbaycan',
 '/wiki/G%C3%BCrc%C3%BCstan',
 '/wiki/1914',
 '/wiki/Bak%C4%B1_Futbol_%C4%B0ttifaq%C4%B1',
 '/wiki/Bak%C4%B1',
 '/wiki/Batumi',
 '/wiki/%C4%B0r%C9%99van',
 '/wiki/Almaniya',
 '/wiki/%C4%B0ngilt%C9%99r%C9%99',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&veaction=edit&section=2',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&action=edit&section=2',
 '/wiki/%C6%8Fl%C9%99kb%C9%99r_M%C9%99mm%C9%99dov',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&veaction=edit&section=3',
 '/w/index.php?t

Notice that some links don't have a URL: `None` appears for anchors without an `href` attribute.

In [18]:
# Dropping the links without href attribute.
# `is not None` is the idiomatic identity test for None (PEP 8); `!= None`
# performs an equality comparison instead and is flagged by linters (E711).
clean_links = [anchor for anchor in links if anchor.get('href') is not None]
clean_links

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="new" href="/w/index.php?title=Az%C9%99rbaycan_futbol_klublar%C4%B1&amp;action=edit&amp;redlink=1" title="Azərbaycan futbol klubları (səhifə mövcud deyil)">Azərbaycan futbol klubları</a>,
 <a href="#Yaranması"><span class="tocnumber">1</span> <span class="toctext">Yaranması</span></a>,
 <a href="#1937-1948"><span class="tocnumber">2</span> <span class="toctext">1937-1948</span></a>,
 <a href="#1949-1959"><span class="tocnumber">3</span> <span class="toctext">1949-1959</span></a>,
 <a href="#1960-1972"><span class="tocnumber">4</span> <span class="toctext">1960-1972</span></a>,
 <a href='#"Qızıl"_il'><span class="tocnumber">4.1</span> <span class="toctext">"Qızıl" il</span></a>,
 <a href="#1973-1976"><span class="tocnumber">5</span> <span class="toctext">1973-1976</span></a>,
 <a href="#1977-1988"><span class="tocnumber">6</span> <span class="toctex

In [19]:
# Pull the raw href value out of every anchor that has one
relative_urls = [anchor.get('href') for anchor in clean_links]
relative_urls

['#mw-head',
 '#searchInput',
 '/w/index.php?title=Az%C9%99rbaycan_futbol_klublar%C4%B1&action=edit&redlink=1',
 '#Yaranması',
 '#1937-1948',
 '#1949-1959',
 '#1960-1972',
 '#"Qızıl"_il',
 '#1973-1976',
 '#1977-1988',
 '#1989-1991',
 '#1992-2006',
 '#Klublar_beynəlxalq_arenada',
 '#Həmçinin_bax',
 '#Xarici_keçidlər',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&veaction=edit&section=1',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&action=edit&section=1',
 '/wiki/1911',
 '/wiki/Az%C9%99rbaycan',
 '/wiki/G%C3%BCrc%C3%BCstan',
 '/wiki/1914',
 '/wiki/Bak%C4%B1_Futbol_%C4%B0ttifaq%C4%B1',
 '/wiki/Bak%C4%B1',
 '/wiki/Batumi',
 '/wiki/%C4%B0r%C9%99van',
 '/wiki/Almaniya',
 '/wiki/%C4%B0ngilt%C9%99r%C9%99',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&veaction=edit&section=2',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&action=edit&section=2',
 '/wiki/%C6%8Fl%C9%99kb%C9%99r_M%C9%99mm%C9%99dov',
 '/w/index.php?title=Az%C9%99rbaycanda_futbol&veaction=edit&section=3',
 '/w/index.php?title=Az

In [20]:
# Resolve every href against the page URL so that all entries are absolute URLs
full_urls = [urljoin(base_site, href) for href in relative_urls]
full_urls

['https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#mw-head',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#searchInput',
 'https://az.wikipedia.org/w/index.php?title=Az%C9%99rbaycan_futbol_klublar%C4%B1&action=edit&redlink=1',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#Yaranması',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1937-1948',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1949-1959',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1960-1972',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#"Qızıl"_il',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1973-1976',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1977-1988',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1989-1991',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1992-2006',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#Klublar_beynəlxalq_arenada',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycand

In [21]:
# Extracting only URLs pointing to Wikipedia (internal URLs).
# The decision is made on the parsed hostname rather than on a substring of the
# whole URL, so an external address that merely mentions "wikipedia.org" in its
# path or query string (for example an archived copy of a Wikipedia page) is
# not misclassified as internal.
def _is_wikipedia_url(url):
    """Return True when the host of `url` is wikipedia.org or one of its subdomains."""
    host = urlparse(url).hostname or ''
    return host == 'wikipedia.org' or host.endswith('.wikipedia.org')


internal_links = [url for url in full_urls if _is_wikipedia_url(url)]
internal_links

['https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#mw-head',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#searchInput',
 'https://az.wikipedia.org/w/index.php?title=Az%C9%99rbaycan_futbol_klublar%C4%B1&action=edit&redlink=1',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#Yaranması',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1937-1948',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1949-1959',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1960-1972',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#"Qızıl"_il',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1973-1976',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1977-1988',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1989-1991',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#1992-2006',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycanda_futbol#Klublar_beynəlxalq_arenada',
 'https://az.wikipedia.org/wiki/Az%C9%99rbaycand

## Extracting tables with Beautiful Soup

Tables are marked with the 'table' tag in HTML

- 'th' marks a column heading
- 'tr' marks a table row
- 'td' marks a table cell or data point (inside a row)

Note: A row ('tr') may include both 'th' and 'td' cells


In [30]:
# Page that contains the COVID-19 table to scrape
base_url = 'https://covid19.ncdc.gov.ng/'

# requests applies no timeout by default, so a stalled server would block this
# cell indefinitely; 10 seconds is a generous limit for fetching a single page.
response = requests.get(base_url, timeout=10)
response.status_code

200

In [31]:
# Parse the newly fetched page.
# NOTE: this rebinds `html` and `soup`, which previously held the Wikipedia
# page, so the earlier link-extraction cells would operate on this page if they
# were re-run after this one.
html = response.content
soup = BeautifulSoup(html, "html.parser")

In [36]:
# find() returns only the first <table> element found in the document
table = soup.find('table')
table

<table id="custom1">
<thead>
<tr>
<th>States Affected</th>
<th>No. of Cases (Lab Confirmed)</th>
<th>No. of Cases (on admission)</th>
<th>No. Discharged</th>
<th>No. of Deaths</th>
</tr>
</thead>
<tbody>
<tr>
<td>
Lagos
</td>
<td>104,204
</td>
<td>1,061
</td>
<td>102,372
</td>
<td>771
</td>
</tr>
<tr>
<td>
FCT
</td>
<td>29,508
</td>
<td>19
</td>
<td>29,240
</td>
<td>249
</td>
</tr>
<tr>
<td>
Rivers
</td>
<td>18,108
</td>
<td>-1
</td>
<td>17,954
</td>
<td>155
</td>
</tr>
<tr>
<td>
Kaduna
</td>
<td>11,628
</td>
<td>3
</td>
<td>11,536
</td>
<td>89
</td>
</tr>
<tr>
<td>
Oyo
</td>
<td>10,352
</td>
<td>0
</td>
<td>10,150
</td>
<td>202
</td>
</tr>
<tr>
<td>
 Plateau
</td>
<td>10,341
</td>
<td>6
</td>
<td>10,260
</td>
<td>75
</td>
</tr>
<tr>
<td>
Edo
</td>
<td>7,927
</td>
<td>4
</td>
<td>7,601
</td>
<td>322
</td>
</tr>
<tr>
<td>
Delta
</td>
<td>5,834
</td>
<td>552
</td>
<td>5,170
</td>
<td>112
</td>
</tr>
<tr>
<td>
Ogun
</td>
<td>5,810
</td>
<td>11
</td>
<td>5,717
</td>
<td>82
</td>
</tr>
<tr>

In [37]:
# Getting all rows in a given table: every <tr> nested inside it, which
# includes the header row from <thead> as the first element
rows = table.find_all('tr')
rows

[<tr>
 <th>States Affected</th>
 <th>No. of Cases (Lab Confirmed)</th>
 <th>No. of Cases (on admission)</th>
 <th>No. Discharged</th>
 <th>No. of Deaths</th>
 </tr>,
 <tr>
 <td>
 Lagos
 </td>
 <td>104,204
 </td>
 <td>1,061
 </td>
 <td>102,372
 </td>
 <td>771
 </td>
 </tr>,
 <tr>
 <td>
 FCT
 </td>
 <td>29,508
 </td>
 <td>19
 </td>
 <td>29,240
 </td>
 <td>249
 </td>
 </tr>,
 <tr>
 <td>
 Rivers
 </td>
 <td>18,108
 </td>
 <td>-1
 </td>
 <td>17,954
 </td>
 <td>155
 </td>
 </tr>,
 <tr>
 <td>
 Kaduna
 </td>
 <td>11,628
 </td>
 <td>3
 </td>
 <td>11,536
 </td>
 <td>89
 </td>
 </tr>,
 <tr>
 <td>
 Oyo
 </td>
 <td>10,352
 </td>
 <td>0
 </td>
 <td>10,150
 </td>
 <td>202
 </td>
 </tr>,
 <tr>
 <td>
  Plateau
 </td>
 <td>10,341
 </td>
 <td>6
 </td>
 <td>10,260
 </td>
 <td>75
 </td>
 </tr>,
 <tr>
 <td>
 Edo
 </td>
 <td>7,927
 </td>
 <td>4
 </td>
 <td>7,601
 </td>
 <td>322
 </td>
 </tr>,
 <tr>
 <td>
 Delta
 </td>
 <td>5,834
 </td>
 <td>552
 </td>
 <td>5,170
 </td>
 <td>112
 </td>
 </tr>,
 <tr>
 <td>
 Og

In [74]:
# Displaying each row in a table together with its position in `rows`;
# prettify() re-indents the markup so every tag and text node gets its own line
for index, row in enumerate(rows):
    print(index, row.prettify())

0 <tr>
 <th>
  States Affected
 </th>
 <th>
  No. of Cases (Lab Confirmed)
 </th>
 <th>
  No. of Cases (on admission)
 </th>
 <th>
  No. Discharged
 </th>
 <th>
  No. of Deaths
 </th>
</tr>

1 <tr>
 <td>
  Lagos
 </td>
 <td>
  104,204
 </td>
 <td>
  1,061
 </td>
 <td>
  102,372
 </td>
 <td>
  771
 </td>
</tr>

2 <tr>
 <td>
  FCT
 </td>
 <td>
  29,508
 </td>
 <td>
  19
 </td>
 <td>
  29,240
 </td>
 <td>
  249
 </td>
</tr>

3 <tr>
 <td>
  Rivers
 </td>
 <td>
  18,108
 </td>
 <td>
  -1
 </td>
 <td>
  17,954
 </td>
 <td>
  155
 </td>
</tr>

4 <tr>
 <td>
  Kaduna
 </td>
 <td>
  11,628
 </td>
 <td>
  3
 </td>
 <td>
  11,536
 </td>
 <td>
  89
 </td>
</tr>

5 <tr>
 <td>
  Oyo
 </td>
 <td>
  10,352
 </td>
 <td>
  0
 </td>
 <td>
  10,150
 </td>
 <td>
  202
 </td>
</tr>

6 <tr>
 <td>
  Plateau
 </td>
 <td>
  10,341
 </td>
 <td>
  6
 </td>
 <td>
  10,260
 </td>
 <td>
  75
 </td>
</tr>

7 <tr>
 <td>
  Edo
 </td>
 <td>
  7,927
 </td>
 <td>
  4
 </td>
 <td>
  7,601
 </td>
 <td>
  322
 </td>
</tr>

8 

### Option 1

In [105]:
# Build one list of cell texts per table row from the row's plain text.
# Each cell's text sits on its own line with blank lines in between, so split
# on newlines and keep only the lines that still contain something once trimmed.
# Filtering with a comprehension (instead of calling entire_row.remove() while
# looping over entire_row) cannot skip entries when two blank lines are
# adjacent, and strip() also removes stray padding such as the leading space
# in ' Plateau'.
# Caveat: a genuinely empty cell disappears as well, which would shift that
# row's columns; the per-cell approach shown in Option 2 does not have this
# limitation.
cells = []
for row in rows:
    entire_row = [line.strip() for line in row.text.split('\n') if line.strip()]
    cells.append(entire_row)

cells

[['States Affected',
  'No. of Cases (Lab Confirmed)',
  'No. of Cases (on admission)',
  'No. Discharged',
  'No. of Deaths'],
 ['Lagos', '104,204', '1,061', '102,372', '771'],
 ['FCT', '29,508', '19', '29,240', '249'],
 ['Rivers', '18,108', '-1', '17,954', '155'],
 ['Kaduna', '11,628', '3', '11,536', '89'],
 ['Oyo', '10,352', '0', '10,150', '202'],
 [' Plateau', '10,341', '6', '10,260', '75'],
 ['Edo', '7,927', '4', '7,601', '322'],
 ['Delta', '5,834', '552', '5,170', '112'],
 ['Ogun', '5,810', '11', '5,717', '82'],
 ['Kano', '5,384', '38', '5,219', '127'],
 ['Ondo', '5,173', '315', '4,749', '109'],
 ['Akwa Ibom', '5,010', '6', '4,960', '44'],
 ['Kwara', '4,691', '452', '4,175', '64'],
 ['Gombe', '3,313', '8', '3,239', '66'],
 ['Osun', '3,311', '29', '3,190', '92'],
 ['Enugu', '2,952', '13', '2,910', '29'],
 ['Anambra', '2,825', '46', '2,760', '19'],
 ['Nasarawa', '2,816', '432', '2,345', '39'],
 ['Imo', '2,691', '3', '2,630', '58'],
 ['Ekiti', '2,466', '0', '2,438', '28'],
 ['Katsin

In [106]:
import pandas as pd

In [113]:
# Build the DataFrame in one step: the first scraped row holds the column
# names, the remaining rows are the data.
# Passing the header through `columns=` replaces the "promote row 0 to header,
# then drop it in place" sequence, which carried the row label 0 over as the
# name of the columns index and relied on inplace mutation.
header, *records = cells
# Keep the data rows labelled from 1, matching their position in the scraped table
covid_df = pd.DataFrame(records, columns=header, index=range(1, len(cells)))
covid_df

Unnamed: 0,States Affected,No. of Cases (Lab Confirmed),No. of Cases (on admission),No. Discharged,No. of Deaths
1,Lagos,104204,1061,102372,771
2,FCT,29508,19,29240,249
3,Rivers,18108,-1,17954,155
4,Kaduna,11628,3,11536,89
5,Oyo,10352,0,10150,202
6,Plateau,10341,6,10260,75
7,Edo,7927,4,7601,322
8,Delta,5834,552,5170,112
9,Ogun,5810,11,5717,82
10,Kano,5384,38,5219,127


### .get_text() method

returns the text within the tag

### .contents attribute 
returns a list with all its children elements. If the current element does not contain nested HTML elements, then .contents[0] will be just the text inside it

In [118]:
# get_text(strip=True) returns the text inside the first <p> tag with the
# surrounding whitespace removed.
# NOTE(review): despite its name, `page` holds only that paragraph's text.
page = soup.find('p').get_text(strip=True)
page

'Get Your COVID-19 Risk Assessment'

In [125]:
# Without strip=True, get_text() keeps the newlines found between the cells,
# hence the blank lines in the printed output
for row in rows:
    print(row.get_text())


States Affected
No. of Cases (Lab Confirmed)
No. of Cases (on admission)
No. Discharged
No. of Deaths



Lagos

104,204

1,061

102,372

771




FCT

29,508

19

29,240

249




Rivers

18,108

-1

17,954

155




Kaduna

11,628

3

11,536

89




Oyo

10,352

0

10,150

202




 Plateau

10,341

6

10,260

75




Edo

7,927

4

7,601

322




Delta

5,834

552

5,170

112




Ogun

5,810

11

5,717

82




Kano

5,384

38

5,219

127




Ondo

5,173

315

4,749

109




Akwa Ibom

5,010

6

4,960

44




Kwara

4,691

452

4,175

64




Gombe

3,313

8

3,239

66




Osun

3,311

29

3,190

92




Enugu

2,952

13

2,910

29




Anambra

2,825

46

2,760

19




Nasarawa

2,816

432

2,345

39




Imo

2,691

3

2,630

58




Ekiti

2,466

0

2,438

28




Katsina

2,418

0

2,381

37




Benue

2,317

88

2,204

25




Abia

2,263

1

2,228

34




Ebonyi

2,064

28

2,004

32




Bauchi

2,028

2

2,002

24




Borno

1,629

5

1,580

44




Taraba

1,515

30

1,451

34




Bayelsa

In [115]:
# .contents lists the direct children of the first <p> tag, text nodes included
soup.find('p').contents

['\n',
 <a class="btn btn-success btn-sm text-white m-0" href="https://selfassessment.ncdc.gov.ng" target="_blank">Get Your COVID-19 Risk Assessment</a>,
 '\n']

In [130]:
# In this table's markup, contents[0] of each row is the newline text node that
# follows <tr>, so the first cell of the row sits at index 1
for row in rows:
    print(row.contents[1].text)

States Affected

Lagos


FCT


Rivers


Kaduna


Oyo


 Plateau


Edo


Delta


Ogun


Kano


Ondo


Akwa Ibom


Kwara


Gombe


Osun


Enugu


Anambra


Nasarawa


Imo


Ekiti


Katsina


Benue


Abia


Ebonyi


Bauchi


Borno


Taraba


Bayelsa


Adamawa


Niger


Cross River


Sokoto


Jigawa


Yobe


Kebbi


Zamfara


Kogi



### Option 2

In [114]:
# Passing a list of tag names matches either kind of cell, so the header row
# (<th>) and the data rows (<td>) are handled by the same call
for row in rows:
    print(row.find_all(['td', 'th']))

[<th>States Affected</th>, <th>No. of Cases (Lab Confirmed)</th>, <th>No. of Cases (on admission)</th>, <th>No. Discharged</th>, <th>No. of Deaths</th>]
[<td>
Lagos
</td>, <td>104,204
</td>, <td>1,061
</td>, <td>102,372
</td>, <td>771
</td>]
[<td>
FCT
</td>, <td>29,508
</td>, <td>19
</td>, <td>29,240
</td>, <td>249
</td>]
[<td>
Rivers
</td>, <td>18,108
</td>, <td>-1
</td>, <td>17,954
</td>, <td>155
</td>]
[<td>
Kaduna
</td>, <td>11,628
</td>, <td>3
</td>, <td>11,536
</td>, <td>89
</td>]
[<td>
Oyo
</td>, <td>10,352
</td>, <td>0
</td>, <td>10,150
</td>, <td>202
</td>]
[<td>
 Plateau
</td>, <td>10,341
</td>, <td>6
</td>, <td>10,260
</td>, <td>75
</td>]
[<td>
Edo
</td>, <td>7,927
</td>, <td>4
</td>, <td>7,601
</td>, <td>322
</td>]
[<td>
Delta
</td>, <td>5,834
</td>, <td>552
</td>, <td>5,170
</td>, <td>112
</td>]
[<td>
Ogun
</td>, <td>5,810
</td>, <td>11
</td>, <td>5,717
</td>, <td>82
</td>]
[<td>
Kano
</td>, <td>5,384
</td>, <td>38
</td>, <td>5,219
</td>, <td>127
</td>]
[<td>
Ondo
</td>, <

In [123]:
# One inner list per table row, holding the whitespace-trimmed text of each
# header (<th>) or data (<td>) cell in document order
cells = [
    [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
    for row in rows
]

cells

[['States Affected',
  'No. of Cases (Lab Confirmed)',
  'No. of Cases (on admission)',
  'No. Discharged',
  'No. of Deaths'],
 ['Lagos', '104,204', '1,061', '102,372', '771'],
 ['FCT', '29,508', '19', '29,240', '249'],
 ['Rivers', '18,108', '-1', '17,954', '155'],
 ['Kaduna', '11,628', '3', '11,536', '89'],
 ['Oyo', '10,352', '0', '10,150', '202'],
 ['Plateau', '10,341', '6', '10,260', '75'],
 ['Edo', '7,927', '4', '7,601', '322'],
 ['Delta', '5,834', '552', '5,170', '112'],
 ['Ogun', '5,810', '11', '5,717', '82'],
 ['Kano', '5,384', '38', '5,219', '127'],
 ['Ondo', '5,173', '315', '4,749', '109'],
 ['Akwa Ibom', '5,010', '6', '4,960', '44'],
 ['Kwara', '4,691', '452', '4,175', '64'],
 ['Gombe', '3,313', '8', '3,239', '66'],
 ['Osun', '3,311', '29', '3,190', '92'],
 ['Enugu', '2,952', '13', '2,910', '29'],
 ['Anambra', '2,825', '46', '2,760', '19'],
 ['Nasarawa', '2,816', '432', '2,345', '39'],
 ['Imo', '2,691', '3', '2,630', '58'],
 ['Ekiti', '2,466', '0', '2,438', '28'],
 ['Katsina

In [124]:
# Build the DataFrame in one step: the first scraped row holds the column
# names, the remaining rows are the data.
# Passing the header through `columns=` replaces the "promote row 0 to header,
# then drop it in place" sequence, which carried the row label 0 over as the
# name of the columns index and relied on inplace mutation.
header, *records = cells
# Keep the data rows labelled from 1, matching their position in the scraped table
covid_df = pd.DataFrame(records, columns=header, index=range(1, len(cells)))
covid_df

Unnamed: 0,States Affected,No. of Cases (Lab Confirmed),No. of Cases (on admission),No. Discharged,No. of Deaths
1,Lagos,104204,1061,102372,771
2,FCT,29508,19,29240,249
3,Rivers,18108,-1,17954,155
4,Kaduna,11628,3,11536,89
5,Oyo,10352,0,10150,202
6,Plateau,10341,6,10260,75
7,Edo,7927,4,7601,322
8,Delta,5834,552,5170,112
9,Ogun,5810,11,5717,82
10,Kano,5384,38,5219,127


## Using Pandas to extract tables

Pandas provides an extremely easy-to-use method for table extraction

It actually uses an HTML parser in the background (lxml by default, falling back to Beautiful Soup with html5lib), performing all the operations we executed above automatically

In [156]:
# To extract all tables on a page, use pandas.read_html()
# It takes a page URL, a file path or a file-like object as a parameter
# (recent pandas versions expect raw HTML text to be wrapped in io.StringIO)

In [154]:
# attrs narrows the search to tables carrying the given HTML attribute values.
# The key must be the real attribute name, "class": the "class_" spelling is
# only needed for BeautifulSoup keyword arguments (class is a Python keyword)
# and is not a valid HTML attribute, so it is not portable across read_html's
# parser backends.
# read_html returns a list of DataFrames, one per matching table.
table_MN = pd.read_html('https://en.wikipedia.org/wiki/Minnesota', attrs={'class': 'toccolours'})
table_MN

[                    Historical population  \
                                    Census   
 0                                    1850   
 1                                    1860   
 2                                    1870   
 3                                    1880   
 4                                    1890   
 5                                    1900   
 6                                    1910   
 7                                    1920   
 8                                    1930   
 9                                    1940   
 10                                   1950   
 11                                   1960   
 12                                   1970   
 13                                   1980   
 14                                   1990   
 15                                   2000   
 16                                   2010   
 17                                   2020   
 18                            2021 (est.)   
 19  Source: 1910–2020[77]2021 Est

In [152]:
?? table_MN 

In [153]:
# Confirm that read_html returned a list rather than a single DataFrame
type(table_MN)

list

In [155]:
# The first (index 0) element of the list is the DataFrame for the matched table
table_MN[0]

Unnamed: 0_level_0,Historical population,Historical population,Historical population,Historical population
Unnamed: 0_level_1,Census,Pop.,Unnamed: 2_level_1,%±
0,1850,6077,,—
1,1860,172023,,"2,730.7%"
2,1870,439706,,155.6%
3,1880,780773,,77.6%
4,1890,1310283,,67.8%
5,1900,1751394,,33.7%
6,1910,2075708,,18.5%
7,1920,2387125,,15.0%
8,1930,2563953,,7.4%
9,1940,2792300,,8.9%


In [None]:
# Read the COVID-19 table straight from the page, selecting it by its id attribute.
# NOTE(review): this rebinds `table`, which earlier held the BeautifulSoup <table>
# tag, to the list returned by read_html; the cell computing
# `rows = table.find_all('tr')` would fail if re-run after this one.
table = pd.read_html(base_url, attrs = {"id": "custom1"})
table

### Scraping an item from web page

In [None]:
def scrape_product(url):
    """This function takes url of product as input and 
    returns 'name', 'price', 'img_url', and 'info' as output"""
#     your code