# LSE Course 2 Week 5: Convener tutorial videos

## Web scraping with BeautifulSoup

### Video brief:
Scraping a website with BeautifulSoup: This tutorial is a demonstration of how to scrape data from a single source using the BeautifulSoup library. Leading up to the video, we introduce web scraping with Python and HTML, as well as BeautifulSoup.

# 

In [16]:
# install libraries
!pip install requests
!pip install bs4
!pip install lxml



In [17]:
# import packages
import bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [18]:
# import the url
url = "https://www.worldometers.info/world-population/population-by-country/"

page = requests.get(url)
page

<Response [200]>

In [19]:
if page.status_code == 200:
    html_doc = page.text
    
soup = BeautifulSoup(html_doc)

print(soup.prettify())

<!DOCTYPE html>
<!--[if IE 8]> <html lang="en" class="ie8"> <![endif]-->
<!--[if IE 9]> <html lang="en" class="ie9"> <![endif]-->
<!--[if !IE]><!-->
<html lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Population by Country (2022) - Worldometer
  </title>
  <meta content="List of countries and dependencies in the world ranked by population, from the most populated. Growth rate, median age, fertility rate, area, density, population density, urbanization, urban population, share of world population." name="description"/>
  <link href="/favicon/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="/favicon/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="/favicon/apple-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/>
  <link href="/favicon/apple-icon-72x72.png" rel="apple-touch-icon" siz

In [20]:
# extracting the contents of the table with the table id : 
table = soup.find('table', attrs={'id': 'example2'})
print(table.prettify())

<table cellspacing="0" class="table table-striped table-bordered" id="example2" width="100%">
 <thead>
  <tr>
   <th>
    #
   </th>
   <th>
    Country (or dependency)
   </th>
   <th>
    Population
    <br/>
    (2020)
   </th>
   <th>
    Yearly
    <br/>
    Change
   </th>
   <th>
    Net
    <br/>
    Change
   </th>
   <th>
    Density
    <br/>
    (P/Km²)
   </th>
   <th>
    Land Area
    <br/>
    (Km²)
   </th>
   <th>
    Migrants
    <br/>
    (net)
   </th>
   <th>
    Fert.
    <br/>
    Rate
   </th>
   <th>
    Med.
    <br/>
    Age
   </th>
   <th>
    Urban
    <br/>
    Pop %
   </th>
   <th>
    World
    <br/>
    Share
   </th>
  </tr>
 </thead>
 <tbody>
  <tr>
   <td>
    1
   </td>
   <td style="font-weight: bold; font-size:15px; text-align:left">
    <a href="/world-population/china-population/">
     China
    </a>
   </td>
   <td style="font-weight: bold;">
    1,439,323,776
   </td>
   <td>
    0.39 %
   </td>
   <td>
    5,540,090
   </td>
   <td>
    1

In [21]:
# all of the rows of the table
rows = table.find_all("tr")
rows

[<tr> <th>#</th> <th>Country (or dependency)</th> <th>Population<br/> (2020)</th> <th>Yearly<br/> Change</th> <th>Net<br/> Change</th> <th>Density<br/> (P/Km²)</th> <th>Land Area<br/> (Km²)</th> <th>Migrants<br/> (net)</th> <th>Fert.<br/> Rate</th> <th>Med.<br/> Age</th> <th>Urban<br/> Pop %</th> <th>World<br/> Share</th> </tr>,
 <tr> <td>1</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/china-population/">China</a></td> <td style="font-weight: bold;">1,439,323,776</td> <td>0.39 %</td> <td>5,540,090</td> <td>153</td> <td>9,388,211</td> <td>-348,399</td> <td>1.7</td> <td>38</td> <td>61 %</td> <td>18.47 %</td> </tr>,
 <tr> <td>2</td> <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/india-population/">India</a></td> <td style="font-weight: bold;">1,380,004,385</td> <td>0.99 %</td> <td>13,586,631</td> <td>464</td> <td>2,973,190</td> <td>-532,687</td> <td>2.2</td> <td>28</td> <td>35 %</td> <td>17.70 %</t

In [22]:
# storage for the extracted data
output = []

for country in rows:
    country_data = country.find_all("td")
    if country_data:
        # extract the text within each element
        country_text = [x.text for x in country_data]
        output.append(dict(
            zip(["Country (or dependency)", "Population (2020)",
                 "Yearly Change", "Net Change", "Density (P/Km2)",
                 "Land Area (Km2)", "Migrants (net)", "Fert. Rate",
                 "Med. Age", "Urbn Pop", "World Share"], country_text)
        ))

output

[{'Country (or dependency)': '1',
  'Population (2020)': 'China',
  'Yearly Change': '1,439,323,776',
  'Net Change': '0.39 %',
  'Density (P/Km2)': '5,540,090',
  'Land Area (Km2)': '153',
  'Migrants (net)': '9,388,211',
  'Fert. Rate': '-348,399',
  'Med. Age': '1.7',
  'Urbn Pop': '38',
  'World Share': '61 %'},
 {'Country (or dependency)': '2',
  'Population (2020)': 'India',
  'Yearly Change': '1,380,004,385',
  'Net Change': '0.99 %',
  'Density (P/Km2)': '13,586,631',
  'Land Area (Km2)': '464',
  'Migrants (net)': '2,973,190',
  'Fert. Rate': '-532,687',
  'Med. Age': '2.2',
  'Urbn Pop': '28',
  'World Share': '35 %'},
 {'Country (or dependency)': '3',
  'Population (2020)': 'United States',
  'Yearly Change': '331,002,651',
  'Net Change': '0.59 %',
  'Density (P/Km2)': '1,937,734',
  'Land Area (Km2)': '36',
  'Migrants (net)': '9,147,420',
  'Fert. Rate': '954,806',
  'Med. Age': '1.8',
  'Urbn Pop': '38',
  'World Share': '83 %'},
 {'Country (or dependency)': '4',
  'Popu

In [29]:
# transform data into json and csv
import json
import pandas as pd

# parse as a json object
output_json = json.dumps(output)

In [24]:
# read json using pandas, output to .csv
pd.read_json(output_json).to_csv("countries.csv", index=False)

In [25]:
# save the json file to .json
with open("countries.json", "w") as f:
    json.dump(output, f)

In [28]:
# import csv file with pandas
# data = pd.read_json("countries.json")
data = pd.read_csv("countries.csv")

# view
data

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km2),Land Area (Km2),Migrants (net),Fert. Rate,Med. Age,Urbn Pop,World Share
0,1,China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %
1,2,India,1380004385,0.99 %,13586631,464,2973190,-532687,2.2,28,35 %
2,3,United States,331002651,0.59 %,1937734,36,9147420,954806,1.8,38,83 %
3,4,Indonesia,273523615,1.07 %,2898047,151,1811570,-98955,2.3,30,56 %
4,5,Pakistan,220892340,2.00 %,4327022,287,770880,-233379,3.6,23,35 %
...,...,...,...,...,...,...,...,...,...,...,...
230,231,Montserrat,4992,0.06 %,3,50,100,,N.A.,N.A.,10 %
231,232,Falkland Islands,3480,3.05 %,103,0,12170,,N.A.,N.A.,66 %
232,233,Niue,1626,0.68 %,11,6,260,,N.A.,N.A.,46 %
233,234,Tokelau,1357,1.27 %,17,136,10,,N.A.,N.A.,0 %
