In [None]:
1) Write a function to get and parse the HTML content of a Wikipedia page

2) Write a function to extract the article title

3) Write a function to extract the article text of each paragraph together with its respective heading. Map those headings to their respective paragraphs in a dictionary.

4) Write a function to collect every link that redirects to another Wikipedia page

5) Wrap all the previous functions into a single function that takes a Wikipedia link as its parameter

6) Test the last function on a Wikipedia page of your choice

In [1]:
!pip install streamlit




In [None]:
import requests  # To fetch Wikipedia pages
from bs4 import BeautifulSoup  # To parse HTML
from urllib.parse import urljoin  # To handle relative URLs

def get_html_content(url, timeout=10):
    """
    Fetches and parses HTML content from a Wikipedia page.

    Parameters:
    url (str): The Wikipedia page URL.
    timeout (float): Seconds to wait for the server before giving up.
        Without a timeout, requests waits indefinitely on a stalled connection.

    Returns:
    BeautifulSoup: Parsed HTML content or None if an error occurs
    (connection failure, timeout, invalid URL, or non-2xx HTTP status).
    """
    # Send a descriptive User-Agent instead of the library default; see the
    # Wikimedia User-Agent policy, which asks automated clients to identify
    # themselves.
    headers = {"User-Agent": "wikipedia-scraper-notebook/1.0 (educational use)"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)  # Fetch the page
        response.raise_for_status()  # Raise an error for failed requests
        return BeautifulSoup(response.text, 'html.parser')  # Parse the page
    except requests.exceptions.RequestException as e:
        # Timeouts, connection errors, bad URLs and HTTP errors all derive
        # from RequestException, so every fetch failure ends up here.
        print(f"Error fetching page: {e}")
        return None

def extract_title(soup):
    """
    Returns the article title read from the <h1 id="firstHeading"> element.

    Parameters:
    soup: Parsed page (or None / a falsy value when nothing was fetched).

    Returns:
    str: The stripped heading text, or "No title found" when there is no
    soup or the heading element is missing.
    """
    fallback = "No title found"

    # Nothing to search in.
    if not soup:
        return fallback

    heading = soup.find('h1', {'id': 'firstHeading'})
    if heading:
        return heading.text.strip()
    return fallback

def extract_text(soup):
    """
    Extracts text content from a Wikipedia article, associating headings with their paragraphs.

    Paragraphs that appear before the first heading are stored under
    "Introduction". Only <h2>, <h3> and <p> elements inside the
    <div id="mw-content-text"> container are considered.

    Parameters:
    soup: Parsed page (or None / a falsy value when nothing was fetched).

    Returns:
    dict: A dictionary where keys are headings and values are lists of paragraphs.
    If there is no soup, or the page has no content container, the dictionary
    is {"Error": ["No content available"]}.
    """
    if not soup:
        return {"Error": ["No content available"]}

    body = soup.find('div', {'id': 'mw-content-text'})
    if body is None:
        # Not an article page (or unexpected markup): report it the same way
        # as a missing soup instead of raising AttributeError on None.
        return {"Error": ["No content available"]}

    content = {}  # Dictionary to store headings and their paragraphs
    current_heading = "Introduction"
    content[current_heading] = []

    for element in body.find_all(['h2', 'h3', 'p']):
        if element.name in ['h2', 'h3']:  # If it's a heading
            current_heading = element.text.strip()
            # setdefault keeps paragraphs already collected when the same
            # heading text occurs more than once on the page.
            content.setdefault(current_heading, [])
        elif element.name == 'p':  # If it's a paragraph
            paragraph = element.text.strip()
            if paragraph:  # Skip empty placeholder paragraphs
                content[current_heading].append(paragraph)

    return content

def extract_internal_links(soup, base_url):
    """
    Extracts all internal Wikipedia links.

    A link is treated as internal when its href starts with "/wiki/" and
    contains no ":" (hrefs with a colon, such as "/wiki/File:..." or
    "/wiki/Help:...", are skipped). Fragments ("#section") are removed so
    that several anchors into the same page yield a single link.

    Parameters:
    soup: Parsed page (or None / a falsy value when nothing was fetched).
    base_url (str): URL of the page; relative hrefs are resolved against it.

    Returns:
    list: Absolute URLs without duplicates, in the order they first appear
    on the page (so slices such as links[:10] are reproducible between runs).
    """
    if not soup:
        return []

    # A dict is used as an ordered set: keys keep insertion order, whereas
    # a set of strings is iterated in an order that can change between runs.
    links = {}
    for link in soup.find_all('a', href=True):
        href = link['href'].split('#', 1)[0]  # Drop any "#fragment"
        if href.startswith('/wiki/') and ':' not in href:
            full_url = urljoin(base_url, href)
            links.setdefault(full_url, None)

    return list(links)

def scrape_wikipedia_page(url):
    """
    Runs the whole pipeline (fetch, title, text, links) for one Wikipedia URL.

    Parameters:
    url (str): The Wikipedia page URL.

    Returns:
    dict: {"title": str, "content": dict, "links": list} built from
    extract_title, extract_text and extract_internal_links, or None if the
    page could not be fetched (get_html_content has then already printed
    the error).
    """
    soup = get_html_content(url)
    if not soup:
        return None

    return {
        "title": extract_title(soup),
        "content": extract_text(soup),
        "links": extract_internal_links(soup, url),
    }


# Main execution
if __name__ == "__main__":
    wikipedia_url = input("Enter Wikipedia Article URL: ").strip()

    # Fetch, parse and extract everything in one call
    article = scrape_wikipedia_page(wikipedia_url)

    if article:
        print("\n📌 Title:")
        print(article["title"])

        print("\n📖 Article Content:")
        extracted_text = article["content"]
        for heading, paragraphs in extracted_text.items():
            print(f"\n### {heading} ###")
            for paragraph in paragraphs:
                print(paragraph)

        print("\n🔗 Internal Wikipedia Links:")
        links = article["links"]
        if links:
            print("\n".join(links[:10]))  # Show first 10 links
        else:
            print("No internal links found.")



📌 Title:
United States

📖 Article Content:

### Introduction ###

The United States of America (USA), commonly known as the United States (U.S.) or America, is a country primarily located in North America. It is a federal union of 50 states and Washington, D.C. as its federal capital district. The 48 contiguous states border Canada to the north and Mexico to the south, with the semi-exclavic state of Alaska in the northwest and the archipelagic state of Hawaii in the Pacific Ocean. The U.S. also asserts sovereignty over five major island territories and various uninhabited islands.[k] It is a megadiverse country, with the world's third-largest land area[d] and third-largest population, exceeding 340 million.[l] Its three largest metropolitan areas are New York, Los Angeles, and Chicago, and its three most populous states are California, Texas, and Florida.
Paleo-Indians migrated to North America across the Bering land bridge more than 12,000 years ago, and formed various civilizations