In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import uuid
import re

class WebCrawler:
    def __init__(self):
        self.visited_urls = set()
        self.pages_crawled = 0
        self.output_folder = "crawled_pages"

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

    def crawl(self, seed_url, max_pages=1000):
        self._crawl_page(seed_url, max_pages)

    
    def _crawl_page(self, url, max_pages):
        if self.pages_crawled >= max_pages:
            return
    
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)
    
        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                self._save_page(url, soup)
                self.pages_crawled += 1
                print(f"Crawled {url}. Number of Pages crawled: {self.pages_crawled}")
                for link in soup.find_all("a", href=True):
                    absolute_url = urljoin(url, link["href"])
                    self._crawl_page(absolute_url, max_pages)
        except Exception as e:
            print(f"Error crawling {url}: {e}")
            # Continue crawling other pages even if an error occurs
            for link in soup.find_all("a", href=True):
                absolute_url = urljoin(url, link["href"])
                self._crawl_page(absolute_url, max_pages)


    def _save_page(self, url, soup):
        if not self._is_page_english(soup):
            return
    
        unique_id = str(uuid.uuid4())
    
        title = soup.title.string.strip() if soup.title else ""
        
        # Extract headings
        headings = [heading.text.strip() for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
    
        # Extract a portion of the page body (e.g., first three paragraphs)
        page_body = ""
        for paragraph in soup.find_all("p")[:3]:  # Extract first three paragraphs
            page_body += paragraph.get_text().strip() + "\n"
    
        # Save page
        filename = f"page{self.pages_crawled + 1}.txt"  # Modify filename format
        filepath = os.path.join(self.output_folder, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"URL: {url}\n")
            f.write(f"Title: {title}\n")
            f.write(f"Headings:\n")
            for heading in headings:
                f.write(f"- {heading}\n")
            f.write(f"\nPage Body:\n{page_body}")

    def _is_page_english(self, soup):
        lang_attr = soup.html.get("lang")
        if lang_attr and lang_attr.startswith("en"):
            return True
        return False


In [3]:
if __name__ == "__main__":
    # Create a WebCrawler instance
    crawler = WebCrawler()
    
    # Define the seed URL
    seed_url = "https://en.wikipedia.org/wiki/Plant"
    
    # Specify the maximum number of pages to crawl
    max_pages = 1000
    
    # Start crawling
    crawler.crawl(seed_url, max_pages)
    
    # Print the root directory for crawled pages
    print("Root directory for crawled pages:", os.path.abspath(crawler.output_folder))


Crawled https://en.wikipedia.org/wiki/Plant. Number of Pages crawled: 1
Crawled https://en.wikipedia.org/wiki/Plant#bodyContent. Number of Pages crawled: 2
Crawled https://en.wikipedia.org/wiki/Main_Page. Number of Pages crawled: 3
Crawled https://en.wikipedia.org/wiki/Main_Page#bodyContent. Number of Pages crawled: 4
Crawled https://en.wikipedia.org/wiki/Wikipedia:Contents. Number of Pages crawled: 5
Crawled https://en.wikipedia.org/wiki/Wikipedia:Contents#bodyContent. Number of Pages crawled: 6
Crawled https://en.wikipedia.org/wiki/Portal:Current_events. Number of Pages crawled: 7
Crawled https://en.wikipedia.org/wiki/Portal:Current_events#bodyContent. Number of Pages crawled: 8
Crawled https://en.wikipedia.org/wiki/Special:Random. Number of Pages crawled: 9
Crawled https://en.wikipedia.org/wiki/Special:Random#bodyContent. Number of Pages crawled: 10
Crawled https://en.wikipedia.org/wiki/Wikipedia:About. Number of Pages crawled: 11
Crawled https://en.wikipedia.org/wiki/Wikipedia:Abou



Error crawling https://en.planet.wikimedia.org/rss20.xml: 'NoneType' object has no attribute 'get'
Crawled https://ar.planet.wikimedia.org/. Number of Pages crawled: 510
Crawled https://ar.planet.wikimedia.org/index.html. Number of Pages crawled: 511
Error crawling https://ar.planet.wikimedia.org/rss20.xml: 'NoneType' object has no attribute 'get'
Crawled https://cs.planet.wikimedia.org/. Number of Pages crawled: 512
Crawled https://cs.planet.wikimedia.org/index.html. Number of Pages crawled: 513
Error crawling https://cs.planet.wikimedia.org/rss20.xml: 'NoneType' object has no attribute 'get'
Crawled https://de.planet.wikimedia.org/. Number of Pages crawled: 514
Crawled https://de.planet.wikimedia.org/index.html. Number of Pages crawled: 515
Error crawling https://de.planet.wikimedia.org/rss20.xml: 'NoneType' object has no attribute 'get'
Crawled https://el.planet.wikimedia.org/. Number of Pages crawled: 516
Crawled https://el.planet.wikimedia.org/index.html. Number of Pages crawled: 