In [18]:
from bs4 import BeautifulSoup
import re

# Sample page parsed when parse_html_content() is called without an argument.
# The markup is kept exactly as given in the exercise.
_SAMPLE_HTML = """
    <!DOCTYPE html><html lang="en"><head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Sports World</title>
        <style>
            body { font-family: Arial, sans-serif; }
            header, nav, section, article, footer { margin: 20px; padding: 15px; }
            nav { background-color: #333; }
            nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
            nav a:hover { background-color: #ddd; color: black; }
            .video { text-align: center; margin: 20px 0; }
        </style></head><body>

        <header>
            <h1>Welcome to Sports World</h1>
            <p>Your one-stop destination for the latest sports news and videos.</p>
        </header>

        <nav>
            <a href="#football">Football</a>
            <a href="#basketball">Basketball</a>
            <a href="#tennis">Tennis</a>
        </nav>

        <section id="football">
            <h2>Football</h2>
            <article>
                <h3>Latest Football News</h3>
                <p>Read about the latest football matches and player news.</p>
                <div class="video">
                    <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                    </iframe>
                </div>
            </article>
        </section>

        <section id="basketball">
            <h2>Basketball</h2>
            <article>
                <h3>NBA Highlights</h3>
                <p>Watch highlights from the latest NBA games.</p>
                <div class="video">
                    <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                    </iframe>
                </div>
            </article>
        </section>

        <section id="tennis">
            <h2>Tennis</h2>
            <article>
                <h3>Grand Slam Updates</h3>
                <p>Get the latest updates from the world of Grand Slam tennis.</p>
                <div class="video">
                    <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
                </div>
            </article>
        </section>

        <footer>
            <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
                <label for="name">Name:</label><br>
                <input type="text" id="name" name="name"><br>
                <label for="email">Email:</label><br>
                <input type="email" id="email" name="email"><br>
                <label for="message">Message:</label><br>
                <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
                <input type="submit" value="Send">
            </form>
        </footer></body></html>
    """


def parse_html_content(html_doc=None):
    """
    Parse an HTML string and print its title, paragraphs, and links.

    Args:
        html_doc: HTML markup to parse. When omitted (or None), the built-in
            "Sports World" sample page from the exercise is used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        None. All results are written to standard output.

    NOTE: In a real-world scenario, you would use 'requests' or
    'urllib.request.urlopen' to fetch the HTML from a URL first and pass the
    resulting text in as ``html_doc``.
    """
    if html_doc is None:
        html_doc = _SAMPLE_HTML

    # 1. Create a BeautifulSoup object to parse the HTML
    # We use 'html.parser' as the standard parser
    soup = BeautifulSoup(html_doc, 'html.parser')
    print("Successfully created BeautifulSoup object.")
    print("-" * 50)

    # 2. Find the title of the webpage (<title> tag)
    title_tag = soup.find('title')
    if title_tag:
        print(f"Webpage Title: {title_tag.get_text().strip()}")
    else:
        print("Webpage Title: Not Found")
    print("-" * 50)

    # 3. Extract all paragraphs (<p> tags) from the page
    paragraph_tags = soup.find_all('p')
    print(f"Found {len(paragraph_tags)} Paragraphs (<p> tags):")
    for i, p in enumerate(paragraph_tags, 1):
        # .get_text().strip() keeps only the visible text and drops leading/trailing whitespace
        print(f"  {i}. {p.get_text().strip()}")
    print("-" * 50)

    # 4. Retrieve all links (URLs in <a href=""> tags) on the page
    link_tags = soup.find_all('a')
    print(f"Found {len(link_tags)} Links (<a> tags):")
    for i, link in enumerate(link_tags, 1):
        # .get('href') returns the attribute value, or None for an <a> without href
        url = link.get('href')
        text = link.get_text().strip()
        print(f"  {i}. Text: '{text}' | URL: {url}")
    print("-" * 50)


# Entry point: run the parsing demo when this cell/script is executed directly
# (the guard is skipped if the code is imported as a module).
if __name__ == "__main__":
    parse_html_content()

Successfully created BeautifulSoup object.
--------------------------------------------------
Webpage Title: Sports World
--------------------------------------------------
Found 4 Paragraphs (<p> tags):
  1. Your one-stop destination for the latest sports news and videos.
  2. Read about the latest football matches and player news.
  3. Watch highlights from the latest NBA games.
  4. Get the latest updates from the world of Grand Slam tennis.
--------------------------------------------------
Found 3 Links (<a> tags):
  1. Text: 'Football' | URL: #football
  2. Text: 'Basketball' | URL: #basketball
  3. Text: 'Tennis' | URL: #tennis
--------------------------------------------------


In [17]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import random
import re

# Request headers shared by every fetch in this cell. The User-Agent imitates a
# desktop Chrome browser, which helps avoid some basic blocking.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

def fetch_content(url):
    """Safely fetch the text body of ``url``.

    The request is sent with the shared ``HEADERS`` and a 10-second timeout.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The decoded response body as a string (with any leading byte-order
        mark removed), or None when the request fails or the server answers
        with a 4xx/5xx status. Failures are reported on stdout, not raised.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        text = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # Some servers prefix the body with a UTF-8 byte-order mark. It survives
    # decoding as U+FEFF and would otherwise show up as an invisible character
    # at the very start of the printed/parsed text.
    if text.startswith('\ufeff'):
        text = text[1:]
    return text

def exercise_2_robots_txt():
    """Fetch Wikipedia's robots.txt and print it verbatim.

    Nothing but the banner and the closing rule is printed when the fetch
    returns no content (fetch_content reports its own errors).
    """
    print("--- Exercise 2: Scraping Robots.txt From Wikipedia ---")
    target = "https://www.wikipedia.org/robots.txt"
    body = fetch_content(target)
    if body:
        # Heading line (ending in a blank line) followed by the raw file.
        print(f"Content of {target}:\n", body, sep="\n")
    print("-" * 60)

def exercise_3_extract_headers():
    """Print every heading tag (h1-h6) found on the fetched Wikipedia page.

    Each heading is printed as ``<tag>: text``; a fallback message is printed
    when the page has no headings. Nothing but the banner and closing rule is
    printed when the fetch returns no content.

    NOTE(review): the URL used is the www.wikipedia.org portal page; if the
    exercise means en.wikipedia.org/wiki/Main_Page the URL needs changing -
    confirm the intended target.
    """
    print("--- Exercise 3: Extracting Headers From Wikipedia’s Main Page ---")
    wiki_url = "https://www.wikipedia.org/"
    page_source = fetch_content(wiki_url)

    if page_source:
        # Tag-name filter: exactly h1, h2, ... h6.
        heading_pattern = re.compile(r'^h[1-6]$')
        heading_tags = BeautifulSoup(page_source, 'html.parser').find_all(heading_pattern)

        if not heading_tags:
            print("No header tags (h1-h6) found.")
        else:
            print(f"Found {len(heading_tags)} header tags on {wiki_url}:")
            for tag in heading_tags:
                print(f"<{tag.name}>: {tag.get_text().strip()}")
    print("-" * 60)

def exercise_4_check_page_title():
    """Report whether the fetched page has a <title> tag, printing its text if so.

    Nothing but the banner and the closing rule is printed when the fetch
    returns no content.
    """
    print("--- Exercise 4: Checking For Page Title ---")
    test_url = "https://www.wikipedia.org/"
    page_source = fetch_content(test_url)

    if page_source:
        title_node = BeautifulSoup(page_source, 'html.parser').find('title')

        if not title_node:
            print(f"The page '{test_url}' does not contain a <title> tag.")
        else:
            print(f"The page '{test_url}' has a title:")
            print(f"Title: {title_node.get_text().strip()}")
    print("-" * 60)

def exercise_5_analyze_us_cert_alerts():
    """Count advisory/alert entries mentioning the current year on the CISA advisories page.

    CISA is the successor to US-CERT and hosts the alerts. The function fetches
    a single listing page (no pagination is followed), finds h2/h3/p/li/a
    elements whose text contains the current year, and counts those that also
    mention "Advisory" or "Alert". The result is printed, not returned.

    The count is a text-based heuristic: it depends on the current structure
    of the CISA website and on the year being written out in each entry.
    """
    print("--- Exercise 5: Analyzing US-CERT Security Alerts ---")
    # CISA is the successor to US-CERT and hosts the alerts.
    cisa_url = "https://www.cisa.gov/news-events/cybersecurity-advisories"
    current_year = str(datetime.now().year)
    html_content = fetch_content(cisa_url)
    alert_count = 0

    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Elements of the listed types whose .string contains the current year.
        year_matches = soup.find_all(['h2', 'h3', 'p', 'li', 'a'], string=re.compile(current_year))

        # A tag whose only child is another tag shares that child's .string, so
        # markup such as <h3><a>... 2024 ...</a></h3> yields BOTH the <h3> and
        # the <a>. Keep only the outermost match so each entry is counted once.
        matched_ids = {id(tag) for tag in year_matches}

        for item in year_matches:
            if any(id(ancestor) in matched_ids for ancestor in item.parents):
                continue  # same text already represented by an enclosing match

            # The year is guaranteed by the search above; only the keyword
            # still needs checking.
            text_content = item.get_text().strip()
            if "Advisory" in text_content or "Alert" in text_content:
                alert_count += 1

        print(f"Year to check: {current_year}")
        print(f"The total number of security alerts or advisories containing '{current_year}' found on the CISA page is: {alert_count}")
        print("Note: The accuracy of this count depends entirely on the current structure of the CISA website and how explicitly the year is mentioned for each item.")
    print("-" * 60)


def _find_within(parent, name, **attrs):
    """Return ``parent.find(name, **attrs)``, or None when ``parent`` itself is missing."""
    return parent.find(name, **attrs) if parent is not None else None


def exercise_6_scrape_movie_details():
    """Print name, year, and a short summary line for the first 10 rows of the IMDb Top chart.

    The page is expected to contain a ``<tbody class="lister-list">`` whose
    rows hold a ``titleColumn`` cell (title link + year span) and a
    ``ratingColumn imdbRating`` cell. Any piece that is missing from a row is
    reported as "N/A" instead of aborting the whole run. If the table itself
    is not found, a message saying so is printed.

    The "summary" is built from the rank and rating; the real synopsis is not
    scraped. Results are printed, not returned.
    """
    print("--- Exercise 6: Scraping Movie Details (Top 10 from IMDB Chart) ---")
    imdb_url = "https://www.imdb.com/chart/top/"
    html_content = fetch_content(imdb_url)

    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find the main table body containing the movie list
        movie_table_body = soup.find('tbody', class_='lister-list')

        if movie_table_body:
            # Get all movie rows
            movie_rows = movie_table_body.find_all('tr')

            # The exercise asks for "random" movies; for consistency and simplicity
            # the first 10 ranked rows are used instead.
            # For true randomness, use: random_indices = random.sample(range(len(movie_rows)), 10)
            movies_to_scrape = movie_rows[:10]

            print(f"Extracting details for the Top {len(movies_to_scrape)} ranked movies:")

            for i, row in enumerate(movies_to_scrape, 1):
                # 1. Movie Name (Title) - the title cell may be absent on a malformed row
                title_column = row.find('td', class_='titleColumn')
                title_tag = _find_within(title_column, 'a')
                movie_name = title_tag.get_text().strip() if title_tag else "N/A"

                # 2. Year - rendered as "(YYYY)", so strip the parentheses
                year_span = _find_within(title_column, 'span', class_='secondaryInfo')
                movie_year = year_span.get_text().strip('()') if year_span else "N/A"

                # 3. Rating (used as a proxy for "Brief Summary" since the full synopsis
                # would require fetching each movie's own page)
                rating_column = row.find('td', class_='ratingColumn imdbRating')
                rating_strong = _find_within(rating_column, 'strong')
                movie_rating = rating_strong.get_text().strip() if rating_strong else "N/A"

                # Construct a brief summary using the ranking and rating
                brief_summary = f"Rank: {i}, IMDb Rating: {movie_rating}. This is a highly-rated film based on user votes."

                print(f"\n{i}. Movie: {movie_name}")
                print(f"   Year: {movie_year}")
                print(f"   Summary: {brief_summary}")
        else:
            print("Could not find the main movie list table. The website structure may have changed.")
    print("-" * 60)

# Main execution block: runs every exercise in order when executed directly.
# Each exercise goes through fetch_content(), i.e. performs a live HTTP request,
# so the printed output depends on network access and on the current state of
# the target sites.
if __name__ == "__main__":
    print("Starting Web Scraping Toolkit Execution...")

    # Run all exercises (2 through 6), each printing its own banner and closing rule
    exercise_2_robots_txt()
    exercise_3_extract_headers()
    exercise_4_check_page_title()
    exercise_5_analyze_us_cert_alerts()
    exercise_6_scrape_movie_details()

    print("Web Scraping Toolkit Execution Complete.")

Starting Web Scraping Toolkit Execution...
--- Exercise 2: Scraping Robots.txt From Wikipedia ---
Content of https://www.wikipedia.org/robots.txt:

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be t