# Exercises XP

##🌟 Exercise 1 : Parsing HTML With BeautifulSoup


In [6]:
from bs4 import BeautifulSoup
import requests

In [8]:
# Read the HTML content of the page
html = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>

    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>

    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>

    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>

    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>

</body>
</html>'''

In [11]:
# Create a BeautifulSoup object to parse this HTML
soup = BeautifulSoup(html, 'html.parser')

In [12]:
# Find the title of the webpage
title = soup.title.text
print(f"Website Title: {title}")

Website Title: Sports World


In [16]:
# Extract all paragraphs (<p> tags) from the page
paragraphs = soup.find_all('p')
print("Paragraphs on the page:")
for paragraph in paragraphs:
    print(paragraph.text)

Paragraphs on the page:
Your one-stop destination for the latest sports news and videos.
Read about the latest football matches and player news.
Watch highlights from the latest NBA games.
Get the latest updates from the world of Grand Slam tennis.


In [15]:
# Retrieve all links (URLs in <a href=""> tags) on the page
links = soup.find_all('a', href=True)
print("Links on the page:")
for link in links:
    print(link['href'])

Links on the page:
#football
#basketball
#tennis


##🌟 Exercise 2 : Scraping Robots.Txt From Wikipedia

Write a Python program to download and display the content of robot.txt for en.wikipedia.org



In [28]:
# URL
url = "https://en.wikipedia.org/robots.txt"

# Get request to fetch the content
response = requests.get(url)

print(response.text[:300])
websoup = BeautifulSoup(response.text, 'html.parser')

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https:/


##🌟 Exercise 3 : Extracting Headers From Wikipedia’s Main Page

Write a Python program to extract and display all the header tags from en.wikipedia.org/wiki/Main_Page.

In [31]:
# URL
url = "https://en.wikipedia.org/wiki/Main_Page"

# Get request to fetch the content
response = requests.get(url)

websoup = BeautifulSoup(response.text, 'html.parser')

In [35]:
# display all the header tags
websoup.find_all("h1")

[<h1 class="firstHeading mw-first-heading" id="firstHeading" style="display: none"><span class="mw-page-title-main">Main Page</span></h1>,
 <h1><span class="mw-headline" id="Welcome_to_Wikipedia">Welcome to <a href="/wiki/Wikipedia" title="Wikipedia">Wikipedia</a></span></h1>]

## 🌟 Exercise 4 : Checking For Page Title

Write a Python program to check whether a page contains a title or not.

In [36]:
def has_title(url):
    # Get request to fetch the HTML content
    response = requests.get(url)

    # Check if the page contains a title tag
    title_tag = soup.title
    if title_tag is not None:
      print(f"The page has a title: {title_tag.text}")
    else:
      print("The page does not contain a title.")


# URL of the webpage
url = "https://en.wikipedia.org/wiki/Main_Page"

has_title(url)

The page has a title: Sports World
