In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
#The below url contains html tables with data about world population.
url = "https://en.wikipedia.org/wiki/World_population"

In [5]:
# get the contents of the webpage in text format and store in a variable called data
data  = requests.get(url).text

In [6]:
soup = BeautifulSoup(data,"html.parser")

In [7]:
#find all html tables in the web page
tables = soup.find_all('table') # in html table is represented by the tag <table>

In [8]:
# we can see how many tables were found by checking the length of the tables list
len(tables)

26

In [9]:
for index, table in enumerate(tables):
    if("10 most densely populated countries" in str(table)):
        table_index=index
print(table_index)

5


In [12]:
print(tables[table_index].prettify())

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
 </caption>
 <tbody>
  <tr>
   <th>
    Rank
   </th>
   <th>
    Country
   </th>
   <th>
    Population
   </th>
   <th>
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th>
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon">
     <img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/23px-Flag_of_Singapore.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/35px-Flag_of_Singapore.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapo

In [13]:
population_data = pd.DataFrame(columns=["Rank", "Country", "Population", "Area", "Density"])

for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        rank = col[0].text
        country = col[1].text
        population = col[2].text.strip()
        area = col[3].text.strip()
        density = col[4].text.strip()
        population_data = population_data.append({"Rank":rank, "Country":country, "Population":population, "Area":area, "Density":density}, ignore_index=True)

population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,172510000,143998,1198
2,3,\n Palestine\n\n,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17710000,41526,426
9,10,Israel,9500000,22072,430


In [19]:
#Scrape data from HTML tables into a DataFrame using BeautifulSoup and read_html

pd.read_html(str(tables[5]), flavor='bs4')

[   Rank      Country  Population  Area(km2)  Density(pop/km2)
 0     1    Singapore     5704000        710              8033
 1     2   Bangladesh   172510000     143998              1198
 2     3    Palestine     5266785       6020               847
 3     4      Lebanon     6856000      10452               656
 4     5       Taiwan    23604000      36193               652
 5     6  South Korea    51781000      99538               520
 6     7       Rwanda    12374000      26338               470
 7     8        Haiti    11578000      27065               428
 8     9  Netherlands    17710000      41526               426
 9    10       Israel     9500000      22072               430]

In [25]:
population_data_read_html=pd.read_html(str(tables[5]), flavor='bs4')[0]
population_data_read_html

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,172510000,143998,1198
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17710000,41526,426
9,10,Israel,9500000,22072,430


In [28]:
#Scrape data from HTML tables into a DataFrame using read_html

#We can also use the read_html function to directly get DataFrames from a url.

dataframe_list = pd.read_html(url, flavor='bs4')

In [29]:
len(dataframe_list)

26

In [34]:
dataframe_list[5]

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,172510000,143998,1198
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17710000,41526,426
9,10,Israel,9500000,22072,430


In [38]:
dataframe_list[13]

Unnamed: 0,Year,World,Asia,Africa,Europe,Latin America/Caribbean,Northern America,Oceania
0,2000,6144,"3,741 (60.9%)",811 (13.2%),726 (11.8%),522 (8.5%),312 (5.1%),31 (0.5%)
1,2005,6542,"3,978 (60.8%)",916 (14.0%),729 (11.2%),558 (8.5%),327 (5.0%),34 (0.5%)
2,2010,6957,"4,210 (60.5%)","1,039 (14.9%)",736 (10.6%),591 (8.5%),343 (4.9%),37 (0.5%)
3,2015,7380,"4,434 (60.1%)","1,182 (16.0%)",743 (10.1%),624 (8.5%),357 (4.8%),40 (0.5%)
4,2020,7795,"4,641 (59.5%)","1,341 (17.2%)",748 (9.6%),654 (8.4%),369 (4.7%),43 (0.6%)
5,2025,8184,"4,823 (58.9%)","1,509 (18.4%)",746 (9.1%),682 (8.3%),380 (4.6%),45 (0.6%)
6,2030,8549,"4,974 (58.2%)","1,688 (19.8%)",741 (8.7%),706 (8.3%),391 (4.6%),48 (0.6%)
7,2035,8888,"5,096 (57.3%)","1,878 (21.1%)",735 (8.3%),726 (8.2%),401 (4.5%),50 (0.6%)
8,2040,9199,"5,189 (56.4%)","2,077 (22.6%)",728 (7.9%),742 (8.1%),410 (4.5%),53 (0.6%)
9,2045,9482,"5,253 (55.4%)","2,282 (24.1%)",720 (7.6%),754 (8.0%),418 (4.4%),55 (0.6%)


In [39]:
#We can also use the match parameter to select the specific table we want. If the table contains a string matching the text it will be read.

pd.read_html(url, match="10 most densely populated countries", flavor='bs4')[0]

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,172510000,143998,1198
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17710000,41526,426
9,10,Israel,9500000,22072,430
