In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Practice web scraping

As you've seen, scraping the internet is a skill that can get you all sorts of information. Here are some little challenges that you can try to gain more experience in the field:

Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'
Find the number of titles that have changed in the United States Code since its last release point: url = 'http://uscode.house.gov/download/download.shtml'
Create a Python list with the top ten FBI's Most Wanted names: url = 'https://www.fbi.gov/wanted/topten'
Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: url = 'https://www.emsc-csem.org/Earthquake/'
List all language names and number of related articles in the order they appear in wikipedia.org: url = 'https://www.wikipedia.org/'
A list with the different kind of datasets available in data.gov.uk: url = 'https://data.gov.uk/'
Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

## Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'

In [2]:
url_wiki = "https://en.wikipedia.org/wiki/Python"

In [3]:
response = requests.get(url_wiki)
response.status_code

200

In [4]:
soup_wiki = BeautifulSoup(response.content,'html.parser')

In [9]:
soup_wiki.select('a')



[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="extiw" href="https://en.wiktionary.org/wiki/Python" title="wiktionary:Python">Python</a>,
 <a class="extiw" href="https://en.wiktionary.org/wiki/python" title="wiktionary:python">python</a>,
 <a class="mw-redirect" href="/wiki/Pythons" title="Pythons">Pythons</a>,
 <a href="/wiki/Python_(genus)" title="Python (genus)"><i>Python</i> (genus)</a>,
 <a href="#Computing"><span class="tocnumber">1</span> <span class="toctext">Computing</span></a>,
 <a href="#People"><span class="tocnumber">2</span> <span class="toctext">People</span></a>,
 <a href="#Roller_coasters"><span class="tocnumber">3</span> <span class="toctext">Roller coasters</span></a>,
 <a href="#Vehicles"><span class="tocnumber">4</span> <span class="toctext">Vehicles</span></a>,
 <a href="#Weaponry"><span class="tocnumber">5</span> <span class="toctext">Weaponry</span></

In [18]:
href_links = []
for i in soup_wiki.select('li a'):
    href_links.append(i['href'])

In [19]:
href_links

['/wiki/Pythons',
 '/wiki/Python_(genus)',
 '#Computing',
 '#People',
 '#Roller_coasters',
 '#Vehicles',
 '#Weaponry',
 '#Other_uses',
 '#See_also',
 '/wiki/Python_(programming_language)',
 '/wiki/CMU_Common_Lisp',
 '/wiki/PERQ#PERQ_3',
 '/wiki/Python_of_Aenus',
 '/wiki/Python_(painter)',
 '/wiki/Python_of_Byzantium',
 '/wiki/Python_of_Catana',
 '/wiki/Python_Anghelo',
 '/wiki/Python_(Efteling)',
 '/wiki/Python_(Busch_Gardens_Tampa_Bay)',
 '/wiki/Python_(Coney_Island,_Cincinnati,_Ohio)',
 '/wiki/Python_(automobile_maker)',
 '/wiki/Python_(Ford_prototype)',
 '/wiki/Python_(missile)',
 '/wiki/Python_(nuclear_primary)',
 '/wiki/Colt_Python',
 '/wiki/PYTHON',
 '/wiki/Python_(film)',
 '/wiki/Python_(mythology)',
 '/wiki/Monty_Python',
 '/wiki/Python_(Monty)_Pictures',
 '/wiki/Cython',
 '/wiki/Pyton',
 '/wiki/Pithon',
 '/wiki/Category:Disambiguation_pages',
 '/wiki/Category:Human_name_disambiguation_pages',
 '/wiki/Category:Disambiguation_pages_with_given-name-holder_lists',
 '/wiki/Category

 ## Find the number of titles that have changed in the United States Code since its last release point: url = 'http://uscode.house.gov/download/download.shtml' 

In [20]:
url_us = "http://uscode.house.gov/download/download.shtml"

In [21]:
response = requests.get(url_us)
response.status_code

200

In [22]:
soup_us = BeautifulSoup(response.content,'html.parser')

In [29]:
string = soup_us.select("#us\/usc\/t54")

In [31]:
string[0].get_text()

'\n\n          Title 54 - National Park Service and Related Programs ٭\n'

## Create a Python list with the top ten FBI's Most Wanted names: url = 'https://www.fbi.gov/wanted/topten'

In [32]:
url_fbi = "https://www.fbi.gov/wanted/topten"

In [33]:
response = requests.get(url_fbi)
response.status_code

200

In [34]:
soup_fbi = BeautifulSoup(response.content,'html.parser')

In [39]:
soup_fbi.select('h3 a')

[<a href="https://www.fbi.gov/wanted/topten/eugene-palmer">EUGENE PALMER</a>,
 <a href="https://www.fbi.gov/wanted/topten/rafael-caro-quintero">RAFAEL CARO-QUINTERO</a>,
 <a href="https://www.fbi.gov/wanted/topten/bhadreshkumar-chetanbhai-patel">BHADRESHKUMAR CHETANBHAI PATEL</a>,
 <a href="https://www.fbi.gov/wanted/topten/robert-william-fisher">ROBERT WILLIAM FISHER</a>,
 <a href="https://www.fbi.gov/wanted/topten/alejandro-castillo">ALEJANDRO ROSALES CASTILLO</a>,
 <a href="https://www.fbi.gov/wanted/topten/arnoldo-jimenez">ARNOLDO JIMENEZ</a>,
 <a href="https://www.fbi.gov/wanted/topten/jason-derek-brown">JASON DEREK BROWN</a>,
 <a href="https://www.fbi.gov/wanted/topten/alexis-flores">ALEXIS FLORES</a>,
 <a href="https://www.fbi.gov/wanted/topten/jose-rodolfo-villarreal-hernandez">JOSE RODOLFO VILLARREAL-HERNANDEZ</a>,
 <a href="https://www.fbi.gov/wanted/topten/yaser-abdel-said">YASER ABDEL SAID</a>]

In [45]:
most_wanted = []

for i in range(10):
    most_wanted.append(soup_fbi.select('h3 a')[i].get_text())
    

In [46]:
most_wanted

['EUGENE PALMER',
 'RAFAEL CARO-QUINTERO',
 'BHADRESHKUMAR CHETANBHAI PATEL',
 'ROBERT WILLIAM FISHER',
 'ALEJANDRO ROSALES CASTILLO',
 'ARNOLDO JIMENEZ',
 'JASON DEREK BROWN',
 'ALEXIS FLORES',
 'JOSE RODOLFO VILLARREAL-HERNANDEZ',
 'YASER ABDEL SAID']

## Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: url = 'https://www.emsc-csem.org/Earthquake/' 

In [47]:
url_20_eq = 'https://www.emsc-csem.org/Earthquake/'

In [63]:
response = requests.get(url_20_eq)
response.status_code


200

In [64]:
soup_eq = BeautifulSoup(response.content,'html.parser')

In [65]:
soup_eq.select('table tbody tr td.tabev6 a')

[<a href="/Earthquake/earthquake.php?id=979040">2021-05-02   20:18:23.9</a>,
 <a href="/Earthquake/earthquake.php?id=979041">2021-05-02   20:10:18.0</a>,
 <a href="/Earthquake/earthquake.php?id=979039">2021-05-02   20:08:30.0</a>,
 <a href="/Earthquake/earthquake.php?id=979038">2021-05-02   19:53:39.0</a>,
 <a href="/Earthquake/earthquake.php?id=979036">2021-05-02   19:35:15.0</a>,
 <a href="/Earthquake/earthquake.php?id=979035">2021-05-02   19:31:18.1</a>,
 <a href="/Earthquake/earthquake.php?id=979033">2021-05-02   19:26:23.3</a>,
 <a href="/Earthquake/earthquake.php?id=979042">2021-05-02   19:26:03.7</a>,
 <a href="/Earthquake/earthquake.php?id=979034">2021-05-02   19:16:08.0</a>,
 <a href="/Earthquake/earthquake.php?id=979032">2021-05-02   19:05:30.0</a>,
 <a href="/Earthquake/earthquake.php?id=979030">2021-05-02   18:56:32.0</a>,
 <a href="/Earthquake/earthquake.php?id=979029">2021-05-02   18:51:20.9</a>,
 <a href="/Earthquake/earthquake.php?id=979028">2021-05-02   18:48:47.9</a>,

In [67]:
soup_eq.select('table tbody tr td.tabev1')

[<td class="tabev1">36.45 </td>,
 <td class="tabev1">27.13 </td>,
 <td class="tabev1">22.00 </td>,
 <td class="tabev1">68.50 </td>,
 <td class="tabev1">19.21 </td>,
 <td class="tabev1">155.41 </td>,
 <td class="tabev1">26.43 </td>,
 <td class="tabev1">92.24 </td>,
 <td class="tabev1">20.29 </td>,
 <td class="tabev1">69.27 </td>,
 <td class="tabev1">40.84 </td>,
 <td class="tabev1">28.12 </td>,
 <td class="tabev1">45.95 </td>,
 <td class="tabev1">14.39 </td>,
 <td class="tabev1">5.64 </td>,
 <td class="tabev1">145.48 </td>,
 <td class="tabev1">17.02 </td>,
 <td class="tabev1">121.28 </td>,
 <td class="tabev1">8.67 </td>,
 <td class="tabev1">83.33 </td>,
 <td class="tabev1">26.87 </td>,
 <td class="tabev1">91.94 </td>,
 <td class="tabev1">37.50 </td>,
 <td class="tabev1">179.96 </td>,
 <td class="tabev1">19.22 </td>,
 <td class="tabev1">155.37 </td>,
 <td class="tabev1">24.00 </td>,
 <td class="tabev1">67.32 </td>,
 <td class="tabev1">40.29 </td>,
 <td class="tabev1">37.93 </td>,
 <td cl

In [69]:
date_time = []
coordinates = []
location = []

for i in range(len(soup_eq.select('table tbody tr'))):
    
    date_time.append(soup_eq.select('td.tabev6 a')[i].get_text())
    coordinates.append(soup_eq.select('td.tabev1')[i].get_text())
    location.append(soup_eq.select('td.tb_region')[i].get_text())

In [72]:
location


['\xa0DODECANESE IS.-TURKEY BORDER REG',
 '\xa0ANTOFAGASTA, CHILE',
 '\xa0ISLAND OF HAWAII, HAWAII',
 '\xa0ASSAM, INDIA',
 '\xa0TARAPACA, CHILE',
 '\xa0WESTERN TURKEY',
 '\xa0SLOVENIA',
 '\xa0EASTERN NEW GUINEA REG., P.N.G.',
 '\xa0LUZON, PHILIPPINES',
 '\xa0COSTA RICA',
 '\xa0ASSAM, INDIA',
 '\xa0OFF E. COAST OF N. ISLAND, N.Z.',
 '\xa0ISLAND OF HAWAII, HAWAII',
 '\xa0SALTA, ARGENTINA',
 '\xa0CENTRAL TURKEY',
 '\xa0OFFSHORE ATACAMA, CHILE',
 '\xa0WESTERN TURKEY',
 '\xa0PANAMA-COSTA RICA BORDER REGION',
 '\xa0KEP. MENTAWAI REGION, INDONESIA',
 '\xa0STRAIT OF GIBRALTAR',
 '\xa0OFFSHORE ANTOFAGASTA, CHILE',
 '\xa0ASSAM, INDIA',
 '\xa0OFF E. COAST OF N. ISLAND, N.Z.',
 '\xa0TARAPACA, CHILE',
 '\xa0VANUATU',
 '\xa0DOMINICAN REPUBLIC',
 '\xa0OFFSHORE ANTOFAGASTA, CHILE',
 '\xa0SAN JUAN, ARGENTINA',
 '\xa0SOUTHERN CALIFORNIA',
 '\xa0OFF COAST OF TARAPACA, CHILE',
 '\xa0FLORES REGION, INDONESIA',
 '\xa0GREECE',
 '\xa0ANTOFAGASTA, CHILE',
 '\xa0SULAWESI, INDONESIA',
 '\xa0MOLUCCA SEA',
 '\xa0S

In [73]:
eq_df = pd.DataFrame({'DateTime':date_time,'coordinates': coordinates,'location':location})

In [74]:
eq_df

Unnamed: 0,DateTime,coordinates,location
0,2021-05-02 20:18:23.9,36.45,DODECANESE IS.-TURKEY BORDER REG
1,2021-05-02 20:10:18.0,27.13,"ANTOFAGASTA, CHILE"
2,2021-05-02 20:08:30.0,22.0,"ISLAND OF HAWAII, HAWAII"
3,2021-05-02 19:53:39.0,68.5,"ASSAM, INDIA"
4,2021-05-02 19:35:15.0,19.21,"TARAPACA, CHILE"
5,2021-05-02 19:31:18.1,155.41,WESTERN TURKEY
6,2021-05-02 19:26:23.3,26.43,SLOVENIA
7,2021-05-02 19:26:03.7,92.24,"EASTERN NEW GUINEA REG., P.N.G."
8,2021-05-02 19:16:08.0,20.29,"LUZON, PHILIPPINES"
9,2021-05-02 19:05:30.0,69.27,COSTA RICA


## Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [75]:
#mw-content-text > div.mw-parser-output > table:nth-child(14) > tbody > tr:nth-child(1) > td:nth-child(2) > a

In [76]:
url_lan = "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"

In [77]:
response = requests.get(url_lan)
response.status_code

200

In [78]:
soup_lan = BeautifulSoup(response.content,'html.parser')

In [92]:
soup_lan.select('td:nth-child(2) > a')

[<a href="/wiki/Mandarin_Chinese" title="Mandarin Chinese">Mandarin Chinese</a>,
 <a href="/wiki/Spanish_language" title="Spanish language">Spanish</a>,
 <a href="/wiki/English_language" title="English language">English</a>,
 <a href="/wiki/Hindi" title="Hindi">Hindi</a>,
 <a href="/wiki/Hindustani_language" title="Hindustani language">Hindustani</a>,
 <a href="/wiki/Bengali_language" title="Bengali language">Bengali</a>,
 <a href="/wiki/Portuguese_language" title="Portuguese language">Portuguese</a>,
 <a href="/wiki/Russian_language" title="Russian language">Russian</a>,
 <a href="/wiki/Japanese_language" title="Japanese language">Japanese</a>,
 <a href="/wiki/Punjabi_language" title="Punjabi language">Western Punjabi</a>,
 <a href="/wiki/Marathi_language" title="Marathi language">Marathi</a>,
 <a href="/wiki/Telugu_language" title="Telugu language">Telugu</a>,
 <a href="/wiki/Wu_Chinese" title="Wu Chinese">Wu Chinese</a>,
 <a href="/wiki/Turkish_language" title="Turkish language">Tur

In [99]:
lang = []

for i in range(10):
    lang.append(soup_lan.select('td:nth-child(2) > a')[i].get_text())

In [100]:
lang

['Mandarin Chinese',
 'Spanish',
 'English',
 'Hindi',
 'Hindustani',
 'Bengali',
 'Portuguese',
 'Russian',
 'Japanese',
 'Western Punjabi']

In [101]:
soup_lan.select("td:nth-child(3)")

[<td>918
 </td>,
 <td>480
 </td>,
 <td>379
 </td>,
 <td>341
 </td>,
 <td>228
 </td>,
 <td>221
 </td>,
 <td>154
 </td>,
 <td>128
 </td>,
 <td>92.7
 </td>,
 <td>83.1
 </td>,
 <td>82.0
 </td>,
 <td>81.4
 </td>,
 <td>79.4
 </td>,
 <td>77.3
 </td>,
 <td>77.2
 </td>,
 <td>76.1
 </td>,
 <td>76.0
 </td>,
 <td>75.0
 </td>,
 <td>73.1
 </td>,
 <td>68.6
 </td>,
 <td>68.3
 </td>,
 <td>64.8
 </td>,
 <td>64.6
 </td>,
 <td>56.4
 </td>,
 <td>52.8
 </td>,
 <td>52.2
 </td>,
 <td>50.1
 </td>,
 <td>48.2
 </td>,
 <td>46.9
 </td>,
 <td>43.9
 </td>,
 <td>43.6
 </td>,
 <td>43.4
 </td>,
 <td>39.7
 </td>,
 <td>37.8
 </td>,
 <td>37.3
 </td>,
 <td>37.1
 </td>,
 <td>34.5
 </td>,
 <td>33.9
 </td>,
 <td>32.9
 </td>,
 <td>32.6
 </td>,
 <td>32.4
 </td>,
 <td>31.9
 </td>,
 <td>29.4
 </td>,
 <td>27.5
 </td>,
 <td>27.3
 </td>,
 <td>27.0
 </td>,
 <td>25.1
 </td>,
 <td>24.6
 </td>,
 <td>24.6
 </td>,
 <td>24.3
 </td>,
 <td>23.6
 </td>,
 <td>23.1
 </td>,
 <td>22.4
 </td>,
 <td>22.1
 </td>,
 <td>21.9
 </td>,
 <td>20.9
 </td>,


In [102]:
speakers_numbers = []

for i in range(10):
    speakers_numbers.append(soup_lan.select('td:nth-child(3)')[i].get_text())

In [103]:
lang_df = pd.DataFrame({'language name':lang,'no of speakers in millions':speakers_numbers})

In [104]:
lang_df

Unnamed: 0,language name,no of speakers in millions
0,Mandarin Chinese,918\n
1,Spanish,480\n
2,English,379\n
3,Hindi,341\n
4,Hindustani,228\n
5,Bengali,221\n
6,Portuguese,154\n
7,Russian,128\n
8,Japanese,92.7\n
9,Western Punjabi,83.1\n
