In [1]:
import datetime
import json
import multiprocessing
import re
import threading
import time
from time import sleep
from urllib.parse import urljoin, urlparse

import graphviz
import requests
from bs4 import BeautifulSoup
from graphviz import Digraph

In [2]:
class SiteMap:
    """Nested map of the text and hyperlinks found on one web page.

    The page at ``url`` is downloaded once, when the instance is created.
    ``get_map`` / ``get_map_hrefs_only`` walk the parsed document and build a
    nested list in which links are ``(name, href)`` tuples and other text is
    plain strings. ``get_graph`` renders that structure with graphviz.

    Attributes set by ``__init__``:
        url: the address passed in; base for resolving relative links.
        page: parsed document returned by ``get_page``.
        sitemap / sitemap_hrefs_only: cached results, ``None`` until built.
        href_list: every href collected by the most recent mapping run.
        counter: running index used to name unlabeled graph branches.
    """

    def __init__(self, url):
        """Download ``url`` and initialise empty result containers.

        Args:
            url: Address of the page to map.
        """
        self._name = self._site_name(url)
        self.url = url
        self.page = self.get_page(url)
        self.sitemap = None
        self.sitemap_hrefs_only = None
        self.counter = 0
        self.href_list = []

    @staticmethod
    def _site_name(url):
        """Return a short name for ``url``: its host without ``www.`` and TLD.

        Falls back to ``url`` itself when no host can be parsed from it.
        """
        host = urlparse(url).hostname
        if not host:
            return url
        if host.startswith('www.'):
            host = host[4:]
        stem, dot, _ = host.rpartition('.')
        # A host without any dot (e.g. "localhost") is used unchanged.
        return stem if dot else host

    def get_page(self, url, timeout=10):
        """Fetch ``url`` and return it parsed as a ``BeautifulSoup`` document.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before ``requests``
                raises, so an unresponsive host cannot hang the caller.
        """
        response = requests.get(url, timeout=timeout)
        # Name the parser explicitly: otherwise bs4 picks whichever parser
        # happens to be installed and emits a warning.
        return BeautifulSoup(response.content, 'html.parser')

    def clean_list(self, array):
        """Return ``array`` without ``None`` / empty entries, at every depth.

        Nested lists are cleaned first, so a branch that becomes empty after
        cleaning is dropped as well. A list left with exactly one element is
        replaced by that element. Non-list values are returned unchanged and
        the input list is not modified.
        """
        if not isinstance(array, list):
            return array
        cleaned = []
        for elem in array:
            elem = self.clean_list(elem)
            if elem is None or (isinstance(elem, list) and not elem):
                continue
            cleaned.append(elem)
        if len(cleaned) == 1:
            return cleaned[0]
        return cleaned

    @staticmethod
    def has_children(elem):
        """Return True if ``elem`` exposes a non-empty ``children`` iterable."""
        try:
            return any(True for _ in elem.children)
        except AttributeError:
            return False

    @staticmethod
    def find_name(elem):
        """Return the first non-blank ``.string`` of ``elem`` or a descendant.

        The text is stripped of surrounding whitespace. Returns ``None`` when
        neither ``elem`` nor any of its descendants carries such text.
        """
        candidates = [elem]
        candidates.extend(getattr(elem, 'descendants', ()))
        for node in candidates:
            try:
                name = node.string.strip()
            except AttributeError:
                # ``string`` is missing or None for this node.
                continue
            if name:
                return name
        return None

    def get_href(self, elem):
        """Return ``(name, absolute_href)`` for a named link, else ``None``.

        ``None`` is returned when ``elem`` has no ``href`` attribute or when
        no text can be found for it. The href is resolved against
        ``self.url`` with ``urljoin``, so relative, root-relative and
        protocol-relative links all become absolute while already absolute
        links are left pointing at their own host.
        """
        try:
            href = elem['href']
        except (KeyError, TypeError, IndexError):
            # No such attribute, or ``elem`` is a text node.
            return None
        if not isinstance(href, str):
            return None
        name = self.find_name(elem)
        if not name:
            return None
        return (name, urljoin(self.url, href))

    def _map(self, elem):
        """Recursively map ``elem`` into links, text and nested lists.

        Returns a ``(name, href)`` tuple for a named link, a stripped string
        for a childless text node, a list of mapped children otherwise, and
        ``None`` for ``script`` / ``style`` elements or empty results. Every
        link found is also appended to ``self.href_list``.
        """
        if getattr(elem, 'name', None) in ('script', 'style'):
            return None
        link = self.get_href(elem)
        if link:
            self.href_list.append(link[1])
            return link
        if elem.string is not None and not self.has_children(elem):
            text = elem.string.strip()
            return text or None
        mapped = [self._map(child) for child in elem.children]
        return mapped or None

    def _map_short(self, elem):
        """Map ``elem`` like ``_map`` but without the text/script checks on it.

        Children are expanded with ``_map``, so the result still contains
        text strings next to the links; ``_graph`` uses a leading string as
        the label of a branch.
        """
        link = self.get_href(elem)
        if link:
            self.href_list.append(link[1])
            return link
        if self.has_children(elem):
            mapped = [self._map(child) for child in elem.children]
            return mapped or None
        return None

    @staticmethod
    def _strip_parens(label):
        """Return ``label`` with every ``(`` and ``)`` removed."""
        return label.replace('(', '').replace(')', '')

    def _graph(self, graph, head, array):
        """Add ``array`` to ``graph`` as edges hanging off the node ``head``.

        A list with more than one element becomes a branch: its first element
        is the branch label when it is a string, otherwise a generated
        ``_<counter>`` label drawn with white font. Strings and
        ``(name, href)`` tuples become leaf edges labelled with the text,
        parentheses removed. Anything else is ignored.
        """
        graph.attr('node', fontcolor='black')
        if isinstance(array, list) and len(array) > 1:
            if isinstance(array[0], str):
                branch, rest = array[0], array[1:]
            else:
                branch, rest = f'_{self.counter}', array
                self.counter += 1
                graph.attr('node', fontcolor='white')
            if branch != head:
                graph.edge(head, branch)
            for item in rest:
                # Recurse through the method itself; the previous call to a
                # bare ``get_graph`` name raised NameError.
                self._graph(graph, branch, item)
        elif isinstance(array, (str, tuple)):
            label = array if isinstance(array, str) else array[0]
            graph.edge(head, self._strip_parens(label))

    def get_map(self):
        """Build, cache and return the full map of the page."""
        # Start from an empty list so repeated runs do not double-count links.
        self.href_list = []
        my_map = self._map(self.page)
        self.sitemap = self.clean_list(my_map)
        return self.sitemap

    def get_map_hrefs_only(self):
        """Build, cache and return the map produced by ``_map_short``."""
        # Start from an empty list so repeated runs do not double-count links.
        self.href_list = []
        my_map = self._map_short(self.page)
        self.sitemap_hrefs_only = self.clean_list(my_map)
        return self.sitemap_hrefs_only

    def get_graph(self):
        """Render the cached map to ``<name>.gv`` and open it in a viewer.

        The map is built first if it has not been computed yet.
        """
        if self.sitemap_hrefs_only is None:
            self.get_map_hrefs_only()
        # Restart the generated-label counter so each render is reproducible.
        self.counter = 0
        # ``Graph`` was never imported by name; reach it through the module.
        g = graphviz.Graph('G', filename=f'{self._name}.gv')
        g.attr(size='8')
        self._graph(g, '', self.sitemap_hrefs_only)
        g = g.unflatten(stagger=25)
        g.view()

    def __call__(self):
        """Calling the instance is shorthand for ``get_graph()``."""
        self.get_graph()

    def url_number(self):
        """Return how many links the most recent mapping run collected."""
        return len(self.href_list)
        
    

In [3]:
%%time
sof = SiteMap(url = 'https://stackoverflow.com')
sof.get_map_hrefs_only()
sof.url_number()

CPU times: user 1.04 s, sys: 23 ms, total: 1.06 s
Wall time: 1.65 s


198

In [4]:
%%time
ya = SiteMap(url='https://yandex.ru')
ya.get_map_hrefs_only()
ya.url_number()

CPU times: user 125 ms, sys: 8.34 ms, total: 133 ms
Wall time: 567 ms


113

In [5]:
%%time
vk = SiteMap(url = 'https://vk.com')
vk.get_map_hrefs_only()
vk.url_number()

CPU times: user 93.4 ms, sys: 7.45 ms, total: 101 ms
Wall time: 445 ms


8

In [6]:
%%time
google = SiteMap(url='http://google.com/')
google.get_map_hrefs_only()
google.url_number()

CPU times: user 25.9 ms, sys: 6.04 ms, total: 31.9 ms
Wall time: 345 ms


18

In [7]:
%%time
crawler = SiteMap(url='http://crawler-test.com/')
crawler.get_map_hrefs_only()
crawler.url_number()

CPU times: user 543 ms, sys: 6.97 ms, total: 550 ms
Wall time: 991 ms


37

In [8]:
# Persist each site's link map as UTF-8 JSON, one file per site.
# `json` is imported in the notebook's top import cell.
sitemaps_by_file = {
    'stackoverflow.json': sof,
    'yandex.json': ya,
    'google.json': google,
    'vk.json': vk,
    'crawler.json': crawler,
}
for filename, site in sitemaps_by_file.items():
    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII link text readable in the file.
        json.dump(site.sitemap_hrefs_only, f, ensure_ascii=False)