# 41443  - Anuj Mutha

In [1]:
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

In [2]:
# Configure the root logger once for the whole notebook: show INFO and above,
# each record rendered as "<timestamp> <LEVEL>:<message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
)

In [3]:
class Crawler:

    def __init__(self, urls=[]):
        self.visited_urls = []
        self.urls_to_visit = urls

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)


In [None]:
# Start a crawl seeded with the IMDb home page. run() keeps going until the
# queue of discovered links is empty, logging each URL as it is crawled.
crawler = Crawler(urls=['https://www.imdb.com/'])
crawler.run()

2022-11-02 11:46:14,917 INFO:Crawling: https://www.imdb.com/
2022-11-02 11:46:17,881 INFO:Crawling: https://www.imdb.com/?ref_=nv_home
2022-11-02 11:46:19,505 INFO:Crawling: https://www.imdb.com/calendar/?ref_=nv_mv_cal
2022-11-02 11:46:23,611 INFO:Crawling: https://www.imdb.com/chart/top/?ref_=nv_mv_250
2022-11-02 11:46:25,549 INFO:Crawling: https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm
2022-11-02 11:46:26,964 INFO:Crawling: https://www.imdb.com/feature/genre/?ref_=nv_ch_gr
2022-11-02 11:46:27,830 INFO:Crawling: https://www.imdb.com/chart/boxoffice/?ref_=nv_ch_cht
2022-11-02 11:46:28,919 INFO:Crawling: https://www.imdb.com/showtimes/?ref_=nv_mv_sh
2022-11-02 11:46:31,067 INFO:Crawling: https://www.imdb.com/news/movie/?ref_=nv_nw_mv
2022-11-02 11:46:32,242 INFO:Crawling: https://www.imdb.com/india/toprated/?ref_=nv_mv_in
2022-11-02 11:46:33,509 INFO:Crawling: https://www.imdb.com/whats-on-tv/?ref_=nv_tv_ontv
2022-11-02 11:46:34,491 INFO:Crawling: https://www.imdb.com/chart/topt

2022-11-02 11:47:50,798 INFO:Crawling: https://www.imdb.com/video/vi1537328409/?listId=ls025720609&ref_=hm_edcio_org_banshees_ots_13_i
2022-11-02 11:47:51,581 INFO:Crawling: https://www.imdb.com/video/vi1537328409/?listId=ls025720609&ref_=hm_edcio_org_banshees_ots_13_t
2022-11-02 11:47:52,711 INFO:Crawling: https://www.imdb.com/video/vi2041365785/?listId=ls025720609&ref_=hm_edcio_og_bq_goodnurse_15_i
2022-11-02 11:47:53,714 INFO:Crawling: https://www.imdb.com/video/vi2041365785/?listId=ls025720609&ref_=hm_edcio_og_bq_goodnurse_15_t
2022-11-02 11:47:55,134 INFO:Crawling: https://www.imdb.com/video/vi1497874969/?listId=ls025720609&ref_=hm_edcio_og_nsp_ada_17_i
2022-11-02 11:47:56,065 INFO:Crawling: https://www.imdb.com/video/vi1497874969/?listId=ls025720609&ref_=hm_edcio_og_nsp_ada_17_t
2022-11-02 11:47:56,949 INFO:Crawling: https://www.imdb.com/video/vi1203356953/?listId=ls025720609&ref_=hm_edcio_og_bq_ba_19_i
2022-11-02 11:47:58,059 INFO:Crawling: https://www.imdb.com/video/vi120335695

2022-11-02 11:49:25,121 INFO:Crawling: https://www.imdb.com/title/tt15548886/?ref_=rlm
2022-11-02 11:49:26,814 INFO:Crawling: https://www.imdb.com/title/tt13304584/?ref_=rlm
2022-11-02 11:49:28,294 INFO:Crawling: https://www.imdb.com/title/tt21627952/?ref_=rlm
2022-11-02 11:49:29,835 INFO:Crawling: https://www.imdb.com/title/tt9114286/?ref_=rlm
2022-11-02 11:49:31,552 INFO:Crawling: https://www.imdb.com/title/tt15309708/?ref_=rlm
2022-11-02 11:49:33,123 INFO:Crawling: https://www.imdb.com/title/tt20838498/?ref_=rlm
2022-11-02 11:49:34,816 INFO:Crawling: https://www.imdb.com/title/tt17736704/?ref_=rlm
2022-11-02 11:49:36,819 INFO:Crawling: https://www.imdb.com/title/tt15710136/?ref_=rlm
2022-11-02 11:49:38,631 INFO:Crawling: https://www.imdb.com/title/tt21386020/?ref_=rlm
2022-11-02 11:49:39,979 INFO:Crawling: https://www.imdb.com/title/tt22033570/?ref_=rlm
2022-11-02 11:49:41,458 INFO:Crawling: https://www.imdb.com/title/tt21874884/?ref_=rlm
2022-11-02 11:49:43,429 INFO:Crawling: https