In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Retrieve the Wikipedia page for "Python" and create a list of the links on that page: url = 'https://en.wikipedia.org/wiki/Python'

In [2]:
url_wiki ='https://en.wikipedia.org/wiki/Python'

In [3]:
# Download the page with a GET request. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() aborts on a 4xx/5xx
# reply so an error page is never handed to the parser.
response = requests.get(url_wiki, timeout=10)
response.raise_for_status()
response.status_code

200

In [4]:
# Parse the downloaded HTML into a navigable tree (the "soup").
# NOTE: `response` is reassigned by the later request cells in this notebook,
# so this cell must run right after the Wikipedia request.
soup_wiki = BeautifulSoup(response.content, "html.parser")

In [42]:
# Collect every <a> tag that has an href attribute (href=True skips anchors without one).
links = soup_wiki.find_all("a",href=True)

In [6]:
# Pull the href out of each anchor and keep only the targets that start with
# 'http'; relative links are therefore left out.
link_urls = [
    href
    for href in (anchor['href'] for anchor in links)
    if href.startswith('http')
]


In [41]:
#link_urls[:10]

## Create a Python list with the names on the FBI's Ten Most Wanted list: url = 'https://www.fbi.gov/wanted/topten'

In [8]:
#query-results-0f737222c5054a81a120bce207b0446a > ul > li:nth-child(3) > h3 > a

In [9]:
url_FBI ='https://www.fbi.gov/wanted/topten'

In [10]:
# Download the page with a GET request, sending a browser-like User-Agent.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on a 4xx/5xx reply so an error page is never parsed.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
response = requests.get(url_FBI, headers=headers, timeout=10)
response.raise_for_status()
response.status_code

200

In [11]:
# Parse the downloaded HTML into a navigable tree (the "soup").
# NOTE: `response` is shared with the other sections of this notebook, so this
# cell must run right after the FBI request.
soup_FBI = BeautifulSoup(response.content, "html.parser")

In [12]:
#soup_FBI

In [13]:
# Exploratory check: list the <a> element inside each result's <h3>.
# NOTE(review): the hashed "query-results-..." id looks page-generated and may
# change when the page is rebuilt — confirm before relying on this selector.
soup_FBI.select("#query-results-0f737222c5054a81a120bce207b0446a > ul > li > h3 > a")

[<a href="https://www.fbi.gov/wanted/topten/wilver-villegas-palomino">WILVER VILLEGAS-PALOMINO</a>,
 <a href="https://www.fbi.gov/wanted/topten/vitelhomme-innocent">VITEL'HOMME INNOCENT</a>,
 <a href="https://www.fbi.gov/wanted/topten/alejandro-castillo">ALEJANDRO ROSALES CASTILLO</a>,
 <a href="https://www.fbi.gov/wanted/topten/alexis-flores">ALEXIS FLORES</a>,
 <a href="https://www.fbi.gov/wanted/topten/arnoldo-jimenez">ARNOLDO JIMENEZ</a>,
 <a href="https://www.fbi.gov/wanted/topten/omar-alexander-cardenas">OMAR ALEXANDER CARDENAS</a>,
 <a href="https://www.fbi.gov/wanted/topten/yulan-adonay-archaga-carias">YULAN ADONAY ARCHAGA CARIAS</a>,
 <a href="https://www.fbi.gov/wanted/topten/bhadreshkumar-chetanbhai-patel">BHADRESHKUMAR CHETANBHAI PATEL</a>,
 <a href="https://www.fbi.gov/wanted/topten/donald-eugene-fields-ii">DONALD EUGENE FIELDS II</a>,
 <a href="https://www.fbi.gov/wanted/topten/ruja-ignatova">RUJA IGNATOVA</a>]

In [14]:
# Every <h3> whose class list contains "title"; each one wraps a fugitive's name.
names = soup_FBI.find_all("h3", attrs={"class": "title"})

In [40]:
#names

In [16]:
# Start from an empty list; the next cell populates it with the cleaned names.
top_ten_most_wanted = []

In [17]:
# Build the list in a single assignment so that re-running this cell yields the
# same names instead of appending duplicates to the previous result.
top_ten_most_wanted = [name.text.strip() for name in names]

In [18]:
top_ten_most_wanted

['WILVER VILLEGAS-PALOMINO',
 "VITEL'HOMME INNOCENT",
 'ALEJANDRO ROSALES CASTILLO',
 'ALEXIS FLORES',
 'ARNOLDO JIMENEZ',
 'OMAR ALEXANDER CARDENAS',
 'YULAN ADONAY ARCHAGA CARIAS',
 'BHADRESHKUMAR CHETANBHAI PATEL',
 'DONALD EUGENE FIELDS II',
 'RUJA IGNATOVA']

## Create a list of the different kinds of datasets available on data.gov.uk: url = 'https://data.gov.uk/'

In [19]:
url_gov = "https://data.gov.uk/"

In [20]:
# Download the page with a GET request. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() aborts on a 4xx/5xx
# reply so an error page is never handed to the parser.
response = requests.get(url_gov, timeout=10)
response.raise_for_status()
response.status_code

200

In [21]:
# Parse the downloaded HTML into a navigable tree.
# NOTE: `response` is shared with the other sections of this notebook, so this
# cell must run right after the data.gov.uk request.
soup_gov = BeautifulSoup(response.content, "html.parser")

In [22]:
#soup_gov

In [23]:
#main-content > div:nth-child(3) > div
#soup_gov.select("#main-content > div > div")

In [24]:
# Start from an empty list; the loop further down fills it with the topic names.
dataset_types = []

In [25]:
# Select the topic headings with a CSS selector. Passing a multi-class string
# to class_ only matches when the class attribute is exactly that string, so it
# breaks if the classes are reordered or another class is added; the selector
# matches any <h3> that carries both classes.
categories_elements = soup_gov.select("h3.govuk-heading-s.dgu-topics__heading")

In [26]:
#categories_elements

In [27]:
# Build the list in a single assignment so that re-running this cell yields the
# same topics instead of appending duplicates to the previous result.
dataset_types = [category.text for category in categories_elements]


In [28]:
dataset_types

['Business and economy',
 'Crime and justice',
 'Defence',
 'Education',
 'Environment',
 'Government',
 'Government spending',
 'Health',
 'Mapping',
 'Society',
 'Towns and cities',
 'Transport',
 'Digital service performance',
 'Government reference data']

## Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [29]:
url_language = "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"

In [30]:
# Download the page with a GET request. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() aborts on a 4xx/5xx
# reply so an error page is never handed to the parser.
response = requests.get(url_language, timeout=10)
response.raise_for_status()
response.status_code

200

In [31]:
# Parse the downloaded HTML into a navigable tree.
# NOTE: `response` is shared with the other sections of this notebook, so this
# cell must run right after the languages request.
soup_language = BeautifulSoup(response.content , "html.parser")

In [32]:
# Grab the table to scrape: find() returns only the first <table> on the page
# whose class list contains "wikitable".
table = soup_language.find("table", class_="wikitable")


In [39]:
#table

In [34]:
# Nested search: find_all is called on the selected table rather than on the
# whole soup, so only that table's <tr> elements are collected.
rows = table.find_all("tr")

In [35]:
# Start from an empty list; the loop in the next cell fills it with
# (language, speakers) tuples.
data = []


In [36]:
# Rebuild `data` from scratch so that re-running this cell never appends
# duplicate rows to the result of a previous run.
TOP_N = 10  # number of languages to keep
data = []
for row in rows[1:]:  # rows[0] is the header row
    cols = row.find_all('td')
    # The language is read from cols[0] and the speaker count from cols[1], so
    # a row needs at least two <td> cells; anything shorter is skipped instead
    # of raising IndexError.
    if len(cols) < 2:
        continue
    language = cols[0].text.strip()
    speakers = cols[1].text.strip().replace(',', '')  # Remove commas from numbers
    data.append((language, speakers))
    if len(data) == TOP_N:  # stop once TOP_N valid rows have been collected
        break


In [37]:
# Turn the (language, speakers) tuples into a two-column DataFrame.
df = pd.DataFrame.from_records(data, columns=["Language", "Native Speakers"])

In [38]:
df

Unnamed: 0,Language,Native Speakers
0,Mandarin Chinese,939.0
1,Spanish,485.0
2,English,380.0
3,Hindi,345.0
4,Portuguese,236.0
5,Bengali,234.0
6,Russian,147.0
7,Japanese,123.0
8,Yue Chinese,86.1
9,Vietnamese,85.0
