# Lab | Web Scraping Multiple Pages

# 1. Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: 
# url ='https://en.wikipedia.org/wiki/Python'

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re


In [2]:
# Find url and store it in a variable
url = "https://en.wikipedia.org/wiki/Python"

In [3]:
# Download html with a get request
response = requests.get(url)

In [4]:

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the div element with id="mw-content-text"
    content_div = soup.find('div', id='mw-content-text')
    
    # Initialize a list to store links
    links = []
    
    # Find all <a> tags within the content_div
    for link in content_div.find_all('a', href=True):
        links.append(link['href'])
    
    # Display the list of links
    print(links)
else:
    print('Failed to retrieve data from the website')

['https://en.wiktionary.org/wiki/Python', 'https://en.wiktionary.org/wiki/python', '/w/index.php?title=Python&action=edit&section=1', '/wiki/Pythonidae', '/wiki/Python_(genus)', '/wiki/Python_(mythology)', '/w/index.php?title=Python&action=edit&section=2', '/wiki/Python_(programming_language)', '/wiki/CMU_Common_Lisp', '/wiki/PERQ#PERQ_3', '/w/index.php?title=Python&action=edit&section=3', '/wiki/Python_of_Aenus', '/wiki/Python_(painter)', '/wiki/Python_of_Byzantium', '/wiki/Python_of_Catana', '/wiki/Python_Anghelo', '/w/index.php?title=Python&action=edit&section=4', '/wiki/Python_(Efteling)', '/wiki/Python_(Busch_Gardens_Tampa_Bay)', '/wiki/Python_(Coney_Island,_Cincinnati,_Ohio)', '/w/index.php?title=Python&action=edit&section=5', '/wiki/Python_(automobile_maker)', '/wiki/Python_(Ford_prototype)', '/w/index.php?title=Python&action=edit&section=6', '/wiki/Python_(missile)', '/wiki/Python_(nuclear_primary)', '/wiki/Colt_Python', '/w/index.php?title=Python&action=edit&section=7', '/wiki

# 2.Find the number of titles that have changed in the United States Code since its last release point
#: url = 'https://uscode.house.gov/download/download.shtml'

In [5]:
# Find url and store it in a variable
url = "https://uscode.house.gov/download/download.shtml"

In [6]:
# Download html with a get request
response2 = requests.get(url)

In [7]:
# Check if the request was successful (status code 200)
response2.status_code # 200 status code means OK!

200

In [8]:
# Parse html (create the 'soup')
soup2 = BeautifulSoup(response2.content, "html.parser")

In [9]:
# Check that the html code looks like it should
# print(soup2.prettify())

In [10]:
soup2.select("div.usctitlechanged")

[<div class="usctitlechanged" id="us/usc/t2">
 
           Title 2 - The Congress
 
         </div>,
 <div class="usctitlechanged" id="us/usc/t5">
 
           Title 5 - Government Organization and Employees <span class="footnote"><a class="fn" href="#fn">٭</a></span>
 </div>,
 <div class="usctitlechanged" id="us/usc/t6">
 
           Title 6 - Domestic Security
 
         </div>,
 <div class="usctitlechanged" id="us/usc/t18">
 
           Title 18 - Crimes and Criminal Procedure <span class="footnote"><a class="fn" href="#fn">٭</a></span>
 </div>,
 <div class="usctitlechanged" id="us/usc/t19">
 
           Title 19 - Customs Duties
 
         </div>,
 <div class="usctitlechanged" id="us/usc/t42">
 
           Title 42 - The Public Health and Welfare
 
         </div>]

# 3. Display the top 10 languages by number of native speakers stored in a pandas dataframe: 
# url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [11]:
# Send a GET request to the URL
url = "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table with class 'wikitable sortable'
table = soup.find('table', class_='wikitable sortable')

# Extract the rows from the table
rows = table.find_all('tr')[1:11]  # Skip the header row, take only top 10 languages

# Initialize lists to store language data
languages = []
native_speakers = []

# Iterate through each row and extract language data
for row in rows:
    columns = row.find_all('td')
    language = columns[0].text.strip()
    speakers = columns[1].text.strip()
    languages.append(language)
    native_speakers.append(speakers)

# Create a pandas DataFrame
data = {'Language Rank': languages, 'Native Speakers': native_speakers}
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

  Language Rank   Native Speakers
0             1  Mandarin Chinese
1             2           Spanish
2             3           English
3             3            Arabic
4             5             Hindi
5             6           Bengali
6             7        Portuguese
7             8           Russian
8             9          Japanese
9            10   Western Punjabi


# 4.A list with the different kind of datasets available in data.gov.uk: url = 'https://data.gov.uk/'

In [12]:
# Send a GET request to the URL
url = "https://www.data.gov.uk/"
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the unordered list with class 'govuk-list dgu-topics__list'
ul = soup.find('ul', class_='govuk-list dgu-topics__list')

# Extract all list items from the unordered list
dataset_topics = ul.find_all('li')

# Iterate through each list item and print the topic and description
for topic in dataset_topics:
    topic_name = topic.find('h3', class_='govuk-heading-s dgu-topics__heading').text.strip()
    description = topic.find('p', class_='govuk-body').text.strip()
    print("Topic:", topic_name)
    print("Description:", description)
    print()

Topic: Business and economy
Description: Small businesses, industry, imports, exports and trade

Topic: Crime and justice
Description: Courts, police, prison, offenders, borders and immigration

Topic: Defence
Description: Armed forces, health and safety, search and rescue

Topic: Education
Description: Students, training, qualifications and the National Curriculum

Topic: Environment
Description: Weather, flooding, rivers, air quality, geology and agriculture

Topic: Government
Description: Staff numbers and pay, local councillors and department business plans

Topic: Government spending
Description: Includes all payments by government departments over £25,000

Topic: Health
Description: Includes smoking, drugs, alcohol, medicine performance and hospitals

Topic: Mapping
Description: Addresses, boundaries, land ownership, aerial photographs, seabed and land terrain

Topic: Society
Description: Employment, benefits, household finances, poverty and population

Topic: Towns and cities
De