In [5]:
pip install beautifulsoup4


Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 8.4 MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.5-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.5
You should consider upgrading via the '/Users/atatuna/.pyenv/versions/3.10.0/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque

In [17]:

visited_urls = set()  # Initialize the set of visited URLs

def is_gov_uk_domain(url):
    """
    Check if the URL belongs to the gov.uk domain.
    """
    parsed_url = urlparse(url)
    return parsed_url.netloc.endswith('gov.uk')

def save_text_to_file(text, filename):
    """
    Save the extracted text to a text file, removing empty lines.
    """
    with open(filename, 'a', encoding='utf-8') as file:
        lines = text.split('\n')
        # Filter out empty lines
        non_empty_lines = [line.strip() for line in lines if line.strip()]
        cleaned_text = '\n'.join(non_empty_lines)
        file.write(cleaned_text + '\n\n')

def scrape_pages_breadth_first(start_url, max_pages):
    global visited_urls  # Access the global visited_urls variable
    queue = deque([(start_url, 0)])  # Each item in the queue is a tuple (URL, depth)

    while queue and len(visited_urls) < max_pages:
        # Sort the links based on whether they contain "tax" in the URL
        queue = deque(sorted(queue, key=lambda x: 'tax' in x[0], reverse=True))

        url, depth = queue.popleft()  # Dequeue the next URL
        if url in visited_urls or not is_gov_uk_domain(url):
            continue

        print(f"Scraping {url}")
        visited_urls.add(url)

        try:
            response = requests.get(url)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract and save the text content of the current page
            page_text = soup.get_text()
            save_text_to_file(page_text, 'scraped_text.txt')

            if depth < max_pages:
                # Find all links on the page and add them to the queue with increased depth
                links = soup.find_all('a', href=True)
                for link in links:
                    absolute_link = urljoin(url, link['href'])
                    queue.append((absolute_link, depth + 1))

        except requests.exceptions.RequestException:
            continue

# Starting URL
start_url = 'https://www.gov.uk/browse/tax'

# Maximum number of pages to visit
max_pages = 10000

# Run the breadth-first traversal with prioritization
scrape_pages_breadth_first(start_url, max_pages)

# Save the visited URLs to a file
with open('visited_urls.txt', 'w', encoding='utf-8') as visited_file:
    for url in visited_urls:
        visited_file.write(url + '\n')


Scraping https://www.gov.uk/browse/tax
Scraping https://www.gov.uk/browse/tax#content
Scraping https://www.gov.uk/browse/tax/capital-gains
Scraping https://www.gov.uk/browse/tax/court-claims-debt-bankruptcy
Scraping https://www.gov.uk/browse/tax/dealing-with-hmrc
Scraping https://www.gov.uk/browse/tax/income-tax
Scraping https://www.gov.uk/browse/tax/inheritance-tax
Scraping https://www.gov.uk/browse/tax/national-insurance
Scraping https://www.gov.uk/browse/tax/self-assessment
Scraping https://www.gov.uk/browse/tax/vat
Scraping https://www.gov.uk/browse/tax/capital-gains#content
Scraping https://www.gov.uk/capital-gains-tax
Scraping https://www.gov.uk/report-and-pay-your-capital-gains-tax
Scraping https://www.gov.uk/tax-sell-property
Scraping https://www.gov.uk/tax-sell-home
Scraping https://www.gov.uk/tax-relief-selling-home
Scraping https://www.gov.uk/tax-live-abroad-sell-uk-home
Scraping https://www.gov.uk/guidance/capital-gains-tax-for-non-residents-uk-residential-property
Scraping

KeyboardInterrupt: 

In [16]:

(visited_urls)

{'https://www.gov.uk/browse/tax',
 'https://www.gov.uk/browse/tax#content',
 'https://www.gov.uk/browse/tax/capital-gains',
 'https://www.gov.uk/browse/tax/court-claims-debt-bankruptcy',
 'https://www.gov.uk/browse/tax/dealing-with-hmrc',
 'https://www.gov.uk/browse/tax/income-tax',
 'https://www.gov.uk/browse/tax/inheritance-tax',
 'https://www.gov.uk/browse/tax/national-insurance',
 'https://www.gov.uk/browse/tax/self-assessment',
 'https://www.gov.uk/browse/tax/vat'}