In [6]:
pip install requests beautifulsoup4




Libraries

In [19]:
# Import necessary libraries
from queue import Queue
from threading import Thread
from bs4 import BeautifulSoup
import unittest
import time
import requests

In [20]:
# to fetch real-worl websites
def fetch_html(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None


Define producer function as required with multithreading in mind

In [21]:
def marker_producer(url_list, html_queue, max_queue_size=10):
    def fetch_and_enqueue(url):
        try:
            html_content = fetch_html(url)
            while html_queue.qsize() >= max_queue_size: #implemented queue trimming
                try:
                    html_queue.get_nowait()
                except Queue.Empty:
                    continue
            html_queue.put((url, html_content))
        except Exception as e:
            print(f"Error in producer for URL {url}: {e}")

    threads = []
    for url in url_list:
        thread = Thread(target=fetch_and_enqueue, args=(url,))
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    html_queue.put(None)  # Put a None marker to signal that the producer is done


Define consumer function

In [22]:
def marker_consumer(html_queue, output_dict):
    while True:
        item = html_queue.get()
        if item is None:
            break
        url, html_content = item
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            output_dict[url] = links
        except Exception as e:
            print(f"Error in consumer for URL {url}: {e}")


Main execution

In [23]:
# Initialize queue and o/p dictionary
html_queue = Queue()
output_dict = {}

#  list of URLs
url_list = ["https://www.linkedin.com", "https://www.otta.com","https://www.google.com"] #more sites can be added, change queue size if more than 10 sites

# Start Producer thread
producer_thread = Thread(target=marker_producer, args=(url_list, html_queue))
producer_thread.start()

# Start Consumer thread
consumer_thread = Thread(target=marker_consumer, args=(html_queue, output_dict))
consumer_thread.start()

# Wait for producer and consumer to finish
producer_thread.join()
consumer_thread.join()

# Display the output dictionary
print(output_dict)

{'https://www.google.com': ['https://www.google.com/imghp?hl=en&tab=wi', 'https://maps.google.com/maps?hl=en&tab=wl', 'https://play.google.com/?hl=en&tab=w8', 'https://www.youtube.com/?tab=w1', 'https://news.google.com/?tab=wn', 'https://mail.google.com/mail/?tab=wm', 'https://drive.google.com/?tab=wo', 'https://www.google.com/intl/en/about/products?tab=wh', 'http://www.google.com/history/optout?hl=en', '/preferences?hl=en', 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ', '/advanced_search?hl=en&authuser=0', '/intl/en/ads/', '/services/', '/intl/en/about.html', '/intl/en/policies/privacy/', '/intl/en/policies/terms/'], 'https://www.linkedin.com': ['#main-content', '/?trk=guest_homepage-basic_nav-header-logo', 'https://www.linkedin.com/pulse/topics/home/?trk=guest_homepage-basic_guest_nav_menu_articles', 'https://www.linkedin.com/pub/dir/+/+?trk=guest_homepage-basic_guest_nav_menu_people', 'https://www.linkedin.com/learning/searc

In [24]:
#test to check fetch function
def test_fetch_html():
    url1 = "https://www.google.com"
    assert isinstance(fetch_html(url1), str), f"Failed to fetch HTML content from {url1}"

    url2 = "https://unreachable-website.com"
    assert fetch_html(url2) is None, f"Unexpected result when fetching from {url2}"

    print("test_fetch_html passed")

#test for marker_producer function
def test_marker_producer():
    url_list = ["https://www.google.com"]
    test_queue = Queue()
    marker_producer(url_list, test_queue)
    assert not test_queue.empty(), "Queue should not be empty"
    print("test_marker_producer passed")

#test for marker_consumer function
def test_marker_consumer():
    url = "https://www.google.com"
    html_content = "<html><body><a href='/link1'></a><a href='/link2'></a></body></html>"
    test_queue = Queue()
    test_queue.put((url, html_content))
    test_queue.put(None)

    test_output_dict = {}
    marker_consumer(test_queue, test_output_dict)

    assert url in test_output_dict, f"URL {url} not in output dictionary"
    print("test_marker_consumer passed")


In [25]:
# Run the unit tests
test_fetch_html()
test_marker_producer()
test_marker_consumer()

Failed to fetch https://unreachable-website.com: HTTPSConnectionPool(host='unreachable-website.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x796966f830a0>: Failed to resolve 'unreachable-website.com' ([Errno -2] Name or service not known)"))
test_fetch_html passed
test_marker_producer passed
test_marker_consumer passed
