In [1]:
from bs4 import BeautifulSoup
import requests
import json
from urllib.parse import urljoin, urlparse

def get_all_links(base_url, start):
    """Extract all unique links from the base URL."""
    response = requests.get(base_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        links = set()

        # Extract all <a> tags with href attributes
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            # Convert relative URLs to absolute URLs
            full_url = urljoin(base_url, href)
            # Filter links to include only those within the same domain
            if full_url.startswith(start):
                links.add(full_url)

        return list(links)
    else:
        print(f"Failed to retrieve links from {base_url}. Status code: {response.status_code}")
        return []
def scrape_website(url):
    """Scrape the headers, body paragraphs, policy, and description from a single webpage."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove 'related content' section
        for related in soup.find_all('div', class_='related-topics_bg'):
            related.decompose()  # Remove the entire element

        # Remove copyright section
        for copyright_div in soup.find_all('div', class_='master-column3'):
            copyright_div.decompose()

        # Remove Wookieepedia section
        for wookieepedia_div in soup.find_all('div', class_='master-column2'):
            wookieepedia_div.decompose()

        # Extract header paragraphs
        header_paragraphs = []
        for div in soup.find_all('div', class_='textblock'):
            inner_div = div.find('div')
            if inner_div:
                style = inner_div.get('style', '')
                if 'font-size: 24px' in style:  # Adjust condition for headers
                    text = ''.join(inner_div.stripped_strings)
                    if text:  # Ensure it's not empty
                        header_paragraphs.append(text)

        # Extract body paragraphs
        body_paragraphs = []
        for div in soup.find_all('div', class_='textblock'):
            inner_div = div.find('div')
            if inner_div:
                style = inner_div.get('style', '')
                if 'font-size: 24px' not in style:  # Exclude headers
                    text = ''.join(inner_div.stripped_strings)
                    if text:  # Ensure it's not empty
                        body_paragraphs.append(text)

        # Extract policy (bold text)
        policy = []
        for div in soup.find_all('div', style=lambda value: value and 'font-weight: bold' in value):
            span = div.find('span')
            if span and span.text.strip():
                policy.append(span.text.strip())

        # Extract description (normal text)
        description = []
        for div in soup.find_all('div', style=lambda value: value and 'font-weight: normal' in value):
            span = div.find('span')
            if span and span.text.strip():
                description.append(span.text.strip())

        return {
            'header_paragraphs': header_paragraphs,
            'body_paragraphs': body_paragraphs,
            'policy': policy,
            'description': description
        }
    else:
        print(f"Failed to scrape {url}. Status code: {response.status_code}")
        return {
            'header_paragraphs': [],
            'body_paragraphs': [],
            'policy': [],
            'description': []
        }

In [5]:
# Example usage
url = "https://en.wikipedia.org/wiki/Example"
all_links=get_all_links(url, "https://en.wikipedia.org/wiki/Example")
print(len(all_links))


7


In [7]:
def scrape_all_subpages(base_url):
    """Scrape all subpages of the given website."""
    # Get all subpage links
    links = get_all_links(base_url, "https://en.wikipedia.org/wiki/Example")
    all_content = {}

    for link in links:
        print(f"Scraping: {link}")
        page_content = scrape_website(link)
        if page_content:
            # Extract the last part of the URL to use as the dictionary key
            last_part = link.rstrip('/').split('/')[-1]  # Get the last segment of the URL
            page_content = scrape_website(link)
            # Add the scraped content to the dictionary with the last part of the URL as the key
            all_content[last_part] = page_content

    return all_content
web_content=scrape_all_subpages(url)

Scraping: https://en.wikipedia.org/wiki/Example#See_also
Scraping: https://en.wikipedia.org/wiki/Example#bodyContent
Scraping: https://en.wikipedia.org/wiki/Example.com
Scraping: https://en.wikipedia.org/wiki/Example#Arts
Scraping: https://en.wikipedia.org/wiki/Example
Scraping: https://en.wikipedia.org/wiki/Example_(musician)
Scraping: https://en.wikipedia.org/wiki/Example_(album)
