In [21]:
import multiprocessing, requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

# Module-level seed URL. Note that the run cell at the bottom of this notebook
# passes its own literal URL to MTWebCrawler rather than reading this constant.
URL_base = "https://www.tevetron.hr/hr/webshop/ic/16/"
# Alternative seed URL, left disabled:
#URL_base = "https://www.geeksforgeeks.org/"

In [22]:
class MTWebCrawler:
    """Thread-pool based crawler that follows links on the start URL's site.

    URLs waiting to be fetched live in ``crawlQueue``. ``runWebCrawler`` pulls
    them off the queue and submits each unseen one to a ``ThreadPoolExecutor``;
    when a fetch finishes, ``postScrapeCallback`` parses the page for further
    links (which are put back on the queue) and prints its paragraph text.

    Attributes:
        URL_base: The start URL handed to the constructor.
        rootURL: ``scheme://netloc`` of ``URL_base``; links are only followed
            when they resolve to this exact scheme and host.
        pool: Executor that runs the page downloads.
        scrapedPages: Set of URLs that have already been submitted.
        crawlQueue: Queue of URLs waiting to be submitted.
        queue_timeout: Seconds ``runWebCrawler`` waits on an empty queue
            before it returns.
    """

    def __init__(self, URL_base, max_workers=6, queue_timeout=60) -> None:
        """Set up the queue, the worker pool and seed the crawl.

        Args:
            URL_base: Absolute URL the crawl starts from.
            max_workers: Size of the download thread pool.
            queue_timeout: Seconds to wait for a new URL before the crawl
                loop gives up and returns.
        """
        self.URL_base = URL_base
        # Parse once and reuse both components.
        parsed_base = urlparse(self.URL_base)
        self.rootURL = '{}://{}'.format(parsed_base.scheme, parsed_base.netloc)
        self.queue_timeout = queue_timeout
        self.pool = ThreadPoolExecutor(max_workers=max_workers)
        self.scrapedPages = set()
        self.crawlQueue = Queue()
        self.crawlQueue.put(self.URL_base)
        print(f'Root URL initialized to {self.rootURL}')

    def scrapePage(self, url):
        """Download ``url`` and return the response, or ``None`` on failure.

        The timeout tuple is (connect seconds, read seconds). Any
        ``requests.RequestException`` is treated as "page unavailable".
        """
        try:
            return requests.get(url, timeout=(3, 30))
        except requests.RequestException:
            return None

    def runWebCrawler(self):
        """Consume the crawl queue until it stays empty for ``queue_timeout`` s.

        Each URL not yet in ``scrapedPages`` is recorded there and submitted to
        the pool, with ``postScrapeCallback`` attached to the resulting future.
        Unexpected errors are printed and the loop carries on.
        """
        while True:
            try:
                print("\n Name of current process: ", multiprocessing.current_process().name, "\n")
                targetURL = self.crawlQueue.get(timeout=self.queue_timeout)

                if targetURL not in self.scrapedPages:
                    print("Scraping URL: {}".format(targetURL))
                    # Mark before submitting so duplicates already sitting in
                    # the queue are skipped when they are dequeued.
                    self.scrapedPages.add(targetURL)
                    job = self.pool.submit(self.scrapePage, targetURL)
                    job.add_done_callback(self.postScrapeCallback)
            except Empty:
                # Nothing new arrived within the timeout: the crawl is over.
                return
            except Exception as e:
                print(e)
                continue

    def scrapeInfo(self, html):
        """Print the concatenated text of every ``<p>`` in ``html``.

        Paragraphs whose text contains ``'https:'`` are left out. Each kept
        paragraph is stripped of surrounding whitespace and the pieces are
        joined without a separator.
        """
        soup = BeautifulSoup(html, "html5lib")
        text = ''.join(
            str(para.text).strip()
            for para in soup('p')
            if 'https:' not in str(para.text)
        )
        print('\n <-----Text Present in The WebPage is--->\n', text, '\n')
        return

    def _resolveLink(self, href):
        """Turn an ``href`` into a crawlable same-site URL, or ``None``.

        Only hrefs that start with ``/`` or with ``rootURL`` are considered.
        The href is resolved against ``rootURL`` and then rejected unless its
        scheme and host are exactly those of ``rootURL``; this filters out
        protocol-relative links (``//other.host/x``) and look-alike hosts
        (``https://site.org.evil.com``), both of which pass a plain prefix
        test. The fragment is removed because it is never sent to the server,
        so URLs differing only by fragment name the same page.
        """
        href = href.strip()
        if not (href.startswith('/') or href.startswith(self.rootURL)):
            return None

        resolved = urlparse(urljoin(self.rootURL, href))
        root = urlparse(self.rootURL)
        if (resolved.scheme, resolved.netloc) != (root.scheme, root.netloc):
            return None
        return resolved._replace(fragment='').geturl()

    def parseLinks(self, html):
        """Queue every same-site link found in ``html`` that is not yet scraped.

        Each distinct URL is queued at most once per page.
        """
        soup = BeautifulSoup(html, 'html.parser')
        queued_from_page = set()

        for link in soup.find_all('a', href=True):
            url = self._resolveLink(link['href'])
            if url is None or url in queued_from_page:
                continue
            queued_from_page.add(url)

            if url not in self.scrapedPages:
                self.crawlQueue.put(url)

    def postScrapeCallback(self, res):
        """Future callback: process a finished download.

        Args:
            res: The completed future returned by ``pool.submit``; its result
                is whatever ``scrapePage`` returned.

        Pages are processed only when a response was obtained and its status
        code is 200.
        """
        result = res.result()

        if result is not None and result.status_code == 200:
            self.parseLinks(result.text)
            self.scrapeInfo(result.text)

In [23]:
# Run the crawl; runWebCrawler() returns once its queue has stayed empty for
# the crawler's idle timeout.
if __name__ == '__main__':
    cc = MTWebCrawler("https://www.geeksforgeeks.org/")
    cc.runWebCrawler()
    # MTWebCrawler defines no info() method (calling it raised AttributeError),
    # so summarise the run from the scrapedPages set instead.
    print(f'Pages submitted for scraping: {len(cc.scrapedPages)}')


 <-----Text Present in The WebPage is--->
Root URL initialized to https://www.geeksforgeeks.org

 Name of current process:  MainProcess 

Scraping URL: https://www.geeksforgeeks.org/
 A stack is a linear data structure in which the insertion of a new element and removal of an existing element takes place at the same end represented as the top of the stack.To implement the stack, it is required to maintain the pointer to the top of the stack, which is the last element to be inserted because we can access the elements only on the top of the stack.LIFO( Last In First Out ):This strategy states that the element that is inserted last will come out first. You can take a pile of plates kept on top of each other as a real-life example. The plate which we put last is on the top and since we remove the plate that is at the top, we can say that the plate that was put last comes out first.In order to make manipulations in a stack, there are certain operations provided to us.StackAdds an item to t

KeyboardInterrupt: 


 <-----Text Present in The WebPage is--->
 A queue is a linear data structure that is open at both ends and the operations are performed in First In First Out (FIFO) order.We define a queue to be a list in which all additions to the list are made at one end, and all deletions from the list are made at the other end.  The element which is first pushed into the order, the delete operation is first performed on that.FIFO property of queueLike stacks, Queues can also be represented in an array: In this representation, the Queue is implemented using the array. Variables used in this case areArray representation of queue:A queue can also be represented using following entities:There are different types of queues:To learn more about different types of queues, read the article on “Types of Queues“.Some of the basic operations for Queue in Data Structure are:There are a few supporting operations (auxiliary operations):Enqueue() operation in Queue adds (or stores) an element to the end of the q


 <-----Text Present in The WebPage is--->
 When it comes to searching and sorting data, one of the most fundamental data structures is the binary search tree. However, the performance of a binary search tree is highly dependent on its shape, and in the worst case, it can degenerate into a linear structure with a time complexity of O(n). This is where Red Black Trees come in, they are a type of balanced binary search tree that use a specific set of rules to ensure that the tree is always balanced. This balance guarantees that the time complexity for operations such as insertion, deletion, and searching is always O(log n), regardless of the initial shape of the tree.Red Black Trees are self-balancing, meaning that the tree adjusts itself automatically after each insertion or deletion operation. It uses a simple but powerful mechanism to maintain balance, by coloring each node in the tree either red or black.Red Black Tree-Red-Black tree is a binary search tree in which every node is col