In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import time
import hashlib

class WebScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.visited = set()
        self.output_dir = 'scraped_content'

        # Create output directory if it doesn't exist
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

    def scrape(self, url):
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            return ""

    def parse(self, html):
        return BeautifulSoup(html, "html.parser")

    def extract_links(self, soup):
        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(self.base_url, href)
            if full_url.startswith(self.base_url):
                links.add(full_url)
        return links

    def save_content(self, url, content):
        # Create a unique filename based on the URL
        parsed_url = urlparse(url)
        filename = hashlib.md5(parsed_url.path.encode('utf-8')).hexdigest() + '.html'
        filepath = os.path.join(self.output_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Saved: {filepath}")

    def run(self, start_url):
        to_visit = [start_url]
        while to_visit:
            current_url = to_visit.pop()
            if current_url in self.visited:
                continue

            print(f"Visiting: {current_url}")
            html = self.scrape(current_url)
            if not html:
                print(f"{current_url} is not html")
                continue

            soup = self.parse(html)
            self.visited.add(current_url)
            self.save_content(current_url, html)
            links = self.extract_links(soup)
            to_visit.extend(links - self.visited)

            print(f"Websites to visit: {len(to_visit)}")
            print(f"Visited Webiste: {len(self.visited)}")

            # Sleep to avoid overwhelming the server
            time.sleep(1)

        print("Scraping complete. Visited pages:")
        for url in self.visited:
            print(url)

if __name__ == "__main__": 
    base_url = "https://u.ae/en/information-and-services" 
    scraper = WebScraper(base_url)
    scraper.run(base_url)


Visiting: https://u.ae/en/information-and-services
Saved: scraped_content\15c26cb7c7a0bd8dda3820db46cc664d.html
Websites to visit: 23
Visited Webiste: 1
Visiting: https://u.ae/en/information-and-services/finance-and-investment
Saved: scraped_content\677209f0414a9c7aa45577fbfe533d91.html
Websites to visit: 50
Visited Webiste: 2
Visiting: https://u.ae/en/information-and-services/finance-and-investment/financial-markets
Saved: scraped_content\b10ff66237bbb21f262a99421db7ecf2.html
Websites to visit: 61
Visited Webiste: 3
Visiting: https://u.ae/en/information-and-services/business
Saved: scraped_content\cd07c92e9cbbbed52770292ca91fc8d2.html
Websites to visit: 82
Visited Webiste: 4
Visiting: https://u.ae/en/information-and-services/business/financial-credibility-for-individuals-and-companies
Saved: scraped_content\98e5e60705467134d77cfe5fc8d1f39b.html
Websites to visit: 86
Visited Webiste: 5
Visiting: https://u.ae/en/information-and-services/top-government-services
Saved: scraped_content\281