# The most important several ways to scrap tables on the web

## Using requests and Beautiful Soup libraries

### scrap and find the table  

In [1]:
# import libraries
import requests
import pandas
from bs4 import BeautifulSoup

In [2]:
# create url object (I use wiki population to practice)
url = 'https://en.wikipedia.org/wiki/World_population'
# I am gonna to set User-Agent to scrap as a human not a python
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'} 
# create a get requests from web using request.get (lib) by using variable like response
response = requests.get(url, headers=headers)
# so check the status code of response , if 200 or not tell me
if response.status_code == 200: 
    print(f"The connection ok")
else: print('Error connection') 

The connection ok


In [3]:
# We gonna create a soup for parsing the web page (response) and get them as a text or content
                    # (it is easy to do this with 'lxml lib')
soup = BeautifulSoup(response.text, 'lxml')
# now i can check 
print(soup.prettify()) # this is to show the content as web
                           ## but is showing it on this environment as a text not an 'html' 

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   World population - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature

In [4]:
# now we are trying to know how many tables on this web page
tables = soup.find_all('table') # the tag of table always begin with <table>
print(len(tables))

29


In [5]:
# to now the table we want we can explore theme from web page (may be use id attr or class or get the number of table manual) with soup.find or find all and so on.
#table = soup.find('table') # on brackets put the number of table
                 # i use copy soup.find/find_all('table', {"id":"  "}) if you found it
                 # or soup.find/find_all('table', class_ = "  ") if you found it 
# but there an easier bu using soup.select('==go to the table that you want then press on "..."  that left of table and from choices press to copy the code on "copy selector"') 
table = soup.select('#mw-content-text > div.mw-parser-output > table:nth-child(50)')[0] # [0] to remove the list of the result 
table  # so that is a good solution to choose the table you want an you can skip the transaction above.

<table class="wikitable">
<caption>Current world population and latest projection according the <a href="/wiki/United_Nations" title="United Nations">UN</a>. Population in (millions) and percent of the global population in that year.<sup class="reference" id="cite_ref-90"><a href="#cite_note-90">[90]</a></sup>
</caption>
<tbody><tr>
<th>Region</th>
<th>2022 (percent)</th>
<th>2030 (percent)</th>
<th>2050 (percent)
</th></tr>
<tr>
<td><a href="/wiki/Sub-Saharan_Africa" title="Sub-Saharan Africa">Sub-Saharan Africa</a></td>
<td><b>1,152</b> (14.51%)</td>
<td><b>1,401</b> (16.46%)</td>
<td><b>2,094</b> (21.62%)
</td></tr>
<tr>
<td><a class="mw-redirect" href="/wiki/Northern_Africa" title="Northern Africa">Northern Africa</a> and <a class="mw-redirect" href="/wiki/Western_Asia" title="Western Asia">Western Asia</a></td>
<td><b>549</b> (6.91%)</td>
<td><b>617</b> (7.25%)</td>
<td><b>771</b> (7.96%)
</td></tr>
<tr>
<td><a href="/wiki/Central_Asia" title="Central Asia">Central Asia</a> and <a

In [6]:
# after we determined our table you can make some process
# 1- get the head_of_table
head_of_table = [i.text for i in table.tbody.find_all('th')]
head_of_table

# There is another thechnique 
    #1- to get headers and get values follow these code
        # rows = table.find_all('tr')         => here we will search for all table rows
        # table_head = [i.text for i in rows[0] if i.text != '\n'] => hear we are going to search in the first rows (thead)
            # table_values =  []  
            # for row in rows[1:]:  
            #     td_tag = row.find_all('td')  
            #     values = [i.text for i in td_tag] 
            #     table_values.append(values)      
            # table_values     

['Region', '2022 (percent)', '2030 (percent)', '2050 (percent)\n']

In [7]:
# now we going to get the table's value
table_values = []  # creat a list to append the results
for row in table.find_all('tr')[1:]:  # tr = table row # i used [1:] to delete the first row {if we not remove we get the first row on the table is None}
    td_tag = row.find_all('td')   # td = table data
    values = [i.text for i in td_tag] # loop to enclose the data in a list
    table_values.append(values)      
table_values    
    

[['Sub-Saharan Africa',
  '1,152 (14.51%)',
  '1,401 (16.46%)',
  '2,094 (21.62%)\n'],
 ['Northern Africa and Western Asia',
  '549 (6.91%)',
  '617 (7.25%)',
  '771 (7.96%)\n'],
 ['Central Asia and Southern Asia',
  '2,075 (26.13%)',
  '2,248 (26.41%)',
  '2,575 (26.58%)\n'],
 ['Eastern Asia and Southeastern Asia',
  '2,342 (29.49%)',
  '2,372 (27.87%)',
  '2,317 (23.92%)\n'],
 ['Europe and Northern America',
  '1,120 (14.10%)',
  '1,129 (13.26%)',
  '1,125 (11.61%)\n'],
 ['Latin America and the Caribbean',
  '658 (8.29%)',
  '695 (8.17%)',
  '749 (7.73%)\n'],
 ['Australia and New Zealand', '31 (0.39%)', '34 (0.40%)', '38 (0.39%)\n'],
 ['Oceania', '14 (0.18%)', '15 (0.18%)', '20 (0.21%)\n'],
 ['World', '7,942', '8,512', '9,687\n']]

In [8]:
df = pandas.DataFrame(table_values, columns=head_of_table)
df

Unnamed: 0,Region,2022 (percent),2030 (percent),2050 (percent)\n
0,Sub-Saharan Africa,"1,152 (14.51%)","1,401 (16.46%)","2,094 (21.62%)\n"
1,Northern Africa and Western Asia,549 (6.91%),617 (7.25%),771 (7.96%)\n
2,Central Asia and Southern Asia,"2,075 (26.13%)","2,248 (26.41%)","2,575 (26.58%)\n"
3,Eastern Asia and Southeastern Asia,"2,342 (29.49%)","2,372 (27.87%)","2,317 (23.92%)\n"
4,Europe and Northern America,"1,120 (14.10%)","1,129 (13.26%)","1,125 (11.61%)\n"
5,Latin America and the Caribbean,658 (8.29%),695 (8.17%),749 (7.73%)\n
6,Australia and New Zealand,31 (0.39%),34 (0.40%),38 (0.39%)\n
7,Oceania,14 (0.18%),15 (0.18%),20 (0.21%)\n
8,World,7942,8512,"9,687\n"


### Extract the href (links) from the web page

In [12]:
links = soup(href=True)
links

[<link href="/w/load.php?lang=en&amp;modules=codex-search-styles%7Cext.cite.styles%7Cext.math.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cjquery.makeCollapsible.styles%7Cjquery.tablesorter.styles%7Cmediawiki.page.gallery.styles%7Cskins.vector.icons%2Cstyles%7Cwikibase.client.init&amp;only=styles&amp;skin=vector-2022" rel="stylesheet"/>,
 <link href="/w/load.php?lang=en&amp;modules=site.styles&amp;only=styles&amp;skin=vector-2022" rel="stylesheet"/>,
 <link href="//upload.wikimedia.org" rel="preconnect"/>,
 <link href="//en.m.wikipedia.org/wiki/World_population" media="only screen and (max-width: 720px)" rel="alternate"/>,
 <link href="/static/apple-touch/wikipedia.png" rel="apple-touch-icon"/>,
 <link href="/static/favicon/wikipedia.ico" rel="icon"/>,
 <link href="/w/opensearch_desc.php" rel="search" title="Wikipedia (en)" type="application/opensearchdescription+xml"/>,
 <link href="//en.wikipedia.org/w/api.php?action=rsd" rel

### Extract the href (links) from the web page with condition

In [16]:
links = [a.get('href') for a in soup('a') if a.get('href',' ').startswith('http')] # we use "if a.get('href',' ')"
               # # if condition to restrict the result and ' ' to skip the error
# [So, the empty string as the default value in a.get('href', ' ') acts as a safeguard to ensure 
                    #the presence of a default value (an empty string)
                    # when the href attribute is absent. This helps avoid potential errors
                    # when applying methods like startswith() on a None type object]
print(len(links))

397


### Find all "a" tags and put it into a list and get href (another way)

In [40]:
list = []
tag = soup('a')
for i in tag:
    href = i.get('href')
    if href and href.startswith('http'):
        list.append(i.get('href'))
list

['https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 'https://am.wikipedia.org/wiki/%E1%8B%A8%E1%8B%93%E1%88%88%E1%88%9D_%E1%8B%A8%E1%88%85%E1%8B%9D%E1%89%A5_%E1%89%A5%E1%8B%9B%E1%89%B5',
 'https://ar.wikipedia.org/wiki/%D8%AA%D8%B9%D8%AF%D8%A7%D8%AF_%D8%A7%D9%84%D8%B3%D9%83%D8%A7%D9%86_%D9%81%D9%8A_%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85',
 'https://an.wikipedia.org/wiki/Poblaci%C3%B3n_mundial',
 'https://ast.wikipedia.org/wiki/Poblaci%C3%B3n_mundial',
 'https://az.wikipedia.org/wiki/D%C3%BCnya_%C9%99halisi',
 'https://be.wikipedia.org/wiki/%D0%9D%D0%B0%D1%81%D0%B5%D0%BB%D1%8C%D0%BD%D1%96%D1%86%D1%82%D0%B2%D0%B0_%D0%97%D1%8F%D0%BC%D0%BB%D1%96',
 'https://bg.wikipedia.org/wiki/%D0%9D%D0%B0%D1%81%D0%B5%D0%BB%D0%B5%D0%BD%D0%B8%D0%B5_%D0%BD%D0%B0_%D1%81%D0%B2%D0%B5%D1%82%D0%B0',
 'https://bs.wikipedia.org/wiki/Svjetsko_stanovni%C5%A1tvo',
 'https://ca.wikipedia.org/wiki/Poblaci%C3%B3_mundial',
 '

## Using pandas libraary 

### Scrap table with parameter 'Match' to get a specific table

In [47]:
file = pandas.read_html(url ,match='Current world population and latest projection according the')[0] # [0]to get the table in a dataframe not a list
file

Unnamed: 0,Region,2022 (percent),2030 (percent),2050 (percent)
0,Sub-Saharan Africa,"1,152 (14.51%)","1,401 (16.46%)","2,094 (21.62%)"
1,Northern Africa and Western Asia,549 (6.91%),617 (7.25%),771 (7.96%)
2,Central Asia and Southern Asia,"2,075 (26.13%)","2,248 (26.41%)","2,575 (26.58%)"
3,Eastern Asia and Southeastern Asia,"2,342 (29.49%)","2,372 (27.87%)","2,317 (23.92%)"
4,Europe and Northern America,"1,120 (14.10%)","1,129 (13.26%)","1,125 (11.61%)"
5,Latin America and the Caribbean,658 (8.29%),695 (8.17%),749 (7.73%)
6,Australia and New Zealand,31 (0.39%),34 (0.40%),38 (0.39%)
7,Oceania,14 (0.18%),15 (0.18%),20 (0.21%)
8,World,7942,8512,9687


In [None]:
# When we are using the pandas.read_html we can use some parameter like :
    # skiprows=list(range(21,243)) to skip rows in rang from 21 to 243 for example
    # math = ''  to match word in table to help us to get specific table
    # index_col = 'Rank'    to set spesific column as a index
    # keep_default_na=False     => to replace nan value to empty cells
    # converters = {'Date': get_year }   to convert column type , for example:
        # from datetime import datetime
        # def get_year(data_string):
        #     return datetime.strptime(data_string, '%d %b %Y').year
        # all_tables = pd.read_html(url,match='Rank',skiprows=list(range(21,243)),index_col = 'Rank',converters = {'Date': get_year })

In [None]:
# that is link describe the rols  'https://www.scrapehero.com/web-scraping-with-pandas/'