### Instructions
1.1) Write a function to Get and parse html content from a Wikipedia page

1.2) Write a function to Extract article title

1.3) Write a function to Extract article text for each paragraph with their respective

headings. Map those headings to their respective paragraphs in the dictionary.

1.4) Write a function to collect every link that redirects to another Wikipedia page

1.5) Wrap all the previous functions into a single function that takes as parameters a Wikipedia link

1.6) Test the last function on a Wikipedia page of your choice

In [None]:
pip install requests beautifulsoup4

In [14]:
import requests  
from bs4 import BeautifulSoup  

# Fetch HTML content from the specified URL with error handling  
def get_html_content(url):  
    try:  
        response = requests.get(url)  
        response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)  
        return BeautifulSoup(response.content, 'html.parser')  
    except requests.RequestException as e:  
        print(f"Error fetching {url}: {e}")  
        return None  

# Extract the title of the article  
def extract_article_title(html_content):  
    title = html_content.find('h1').text.strip()  # Strip extra whitespace  
    return title  

# This function extracts headings and paragraphs into a dictionary  
def extract_article_paragraphs(html_content):  
    soup = html_content  
    headings = soup.find_all(['h2', 'h3'])  # h2 and h3 are typically used for headings  
    content_dict = {}  
    
    for heading in headings:  
        heading_name = heading.text.strip()  
        content_dict[heading_name] = []  
        
        # Find the next elements until the next heading  
        for element in heading.find_all_next():  
            if element.name in ['h2', 'h3']:  
                break  # Stop if we hit another heading  
            if element.name == 'p':  
                content_dict[heading_name].append(element.text.strip())  # Append paragraph  

    return content_dict  

# Collect all links that are redirects to Wikipedia pages  
def collect_wikipedia_links(html_content):  
    soup = html_content  
    links = soup.find_all('a', href=True)  
    wiki_links = set()  # Store unique links in a set  
    
    for link in links:  
        if link['href'].startswith('/wiki/') and ':' not in link['href']:  
            wiki_links.add(f"https://en.wikipedia.org{link['href']}")  

    return list(wiki_links)  # Convert back to list for returning  

# Combine above functions into a single function that takes a Wikipedia URL as input  
def scrape_wikipedia_page(url):  
    html_content = get_html_content(url)  
    
    if not html_content:  # Check if content was successfully fetched  
        return None  

    title = extract_article_title(html_content)  
    paragraphs = extract_article_paragraphs(html_content)  
    links = collect_wikipedia_links(html_content)  
    
    return {  
        'title': title,  
        'paragraphs': paragraphs,  
        'links': links  
    }  

# Test the function on the specified Wikipedia page  
if __name__ == "__main__":  
    url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area'  
    result = scrape_wikipedia_page(url)  
    
    if result:  # Check if the result is not None  
        print("Title:", result['title'])  
        print("\nParagraphs:")  
        for heading, paras in result['paragraphs'].items():  
            print(f"\n{heading}:")  
            for para in paras:  
                print(f"- {para}")  

        print("\nWikipedia Links:")  
        for link in result['links']:  
            print(link)

Title: List of countries and dependencies by area

Paragraphs:

Contents:
- 
- 
- This is a list of the world's countries and their dependencies by land, water, and total area, ranked by total area.
- The entries in this list include, but are not limited to, those in the ISO 3166-1 standard, which includes sovereign states and dependent territories. All 193 member states of the United Nations plus the two observer states are given a rank number. Largely unrecognised states not in ISO 3166-1 are included in the list in ranked order. The areas of such largely unrecognised states are in most cases also included in the areas of the more widely recognised states that claim the same territory; see the notes in the "Notes" column for each country for clarification.
- Not included in the list are individual country claims to parts of the continent of Antarctica or entities such as the European Union[a] that have some degree of sovereignty but do not consider themselves to be sovereign countrie