In [1]:
import copy
import multiprocessing
import re
import threading
import time
from collections import defaultdict
from time import sleep
from urllib.parse import urljoin, urlparse

import graphviz
import requests
import tqdm
from bs4 import BeautifulSoup
from graphviz import Digraph
from tqdm.notebook import trange, tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
class SiteMap:
    
    def __init__(self, url):
        self._name = self.get_sitename(url)
        self.main_url = url
        self.url = None
        self.main_page = self.get_page(url)
        self.sitemap = None
        self.sitemap_hrefs_only = None
        self.href_list = defaultdict(list)
        self.counter = 0
        self.maps = defaultdict(list)
        self.edges = []
        self.subgraphs = {}
        
    @staticmethod
    def get_sitename(url):
        name = re.sub('http.?://', '', url)
        name = re.sub('\.com', '', name)
        name = re.sub('\.ru', '', name)
        name = re.sub('\/', '', name)
        return name
        
    def get_page(self, url, timeout=10):
        '''
        Download the page at ``url`` and return it as a BeautifulSoup tree
        (the document as a nested data structure).

        Parameters
        ----------
        url : str
            Address of the page to download.
        timeout : float, optional
            Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled server would block the caller indefinitely.

        Returns
        -------
        BeautifulSoup
            Parsed document.

        Raises
        ------
        requests.exceptions.RequestException
            Propagated from ``requests.get`` (connection errors, timeouts, ...).
        '''
        response = requests.get(url, timeout=timeout)
        # Name the parser explicitly so the resulting tree does not depend on
        # which optional parsers happen to be installed.
        return BeautifulSoup(response.content, 'html.parser')
        
    def clean_list(self, array):
        '''
        Избавляет массив от пустых элементов 
        - None или пустых листов - 
        и избавляет массив от излишней вложенности
        '''
        if not isinstance(array, list):
            return array
        for idx, elem in enumerate(array):
            if isinstance(elem, list) and \
            (all(map(lambda x: x is None, elem)) or \
             len(elem) == 0):
                array[idx] = None  
        if not isinstance(array, tuple):
            array = [item for item in array \
                     if item is not None and item != []]
        for idx, elem in enumerate(array):
            if isinstance(elem, list):
                array[idx] = self.clean_list(elem)
        if len(array) == 1:
            array = array[0]
        if not isinstance(array, tuple):
            array = [item for item in array \
                     if item is not None and item != []]
        return array

    @staticmethod
    def has_children(elem):
        '''Проверяет, есть ли у элемента html-страницы "потомки"'''
        try:
            return len(list(elem.children)) > 0
        except AttributeError:
            return False

    @staticmethod    
    def find_name(elem):
        '''
        Проверяет, еcть ли среди потомков элемента 
        дочерний элемент NavigableString
        '''
        elems = [elem] + list(elem.descendants)
        for elem in elems:
            try:
                name = elem.string.strip()
                if len(name) > 0:
                    return name
            except AttributeError:
                pass
        return
      
    def get_href(self, elem):
        '''
        Проверяет, содержит ли элемент ссылку 
        и возвращает найденную ссылку и ее
        текстовое представление
        '''
        try:
            name = self.find_name(elem)
            href = elem['href']
            if not href.startswith('https'):
                href = self.url + href
            return (name, href) if len(name) > 0 else None
        except:
            return
    
    def _map(self, elem):
        '''
        Строит карту сайта, содержащую все текстовые элементы и ссылки
        '''
        try:
            if elem.name == 'script' or elem.name == 'style' :
                return
        except AttributeError:
            pass
        href = self.get_href(elem)
        if href:
            self.href_list[self.url].append(href[1])
            return href
        elif elem.string is not None and not self.has_children(elem):
            name = elem.string.strip()
            return name if len(name) > 0 else None
        my_map = [self._map(el) for el in elem.children]
        return my_map if len(my_map) > 0 else None
      
    def _map_short(self, elem):
        '''
        Строит карту сайта, содержащую только ссылки 
        и их текстовые представления
        '''
        href = self.get_href(elem)
        if href:
            self.href_list[self.url].append(href[1])
            return href
        elif self.has_children(elem):
            my_map = [self._map_short(el) for el in elem.children]
            return my_map if len(my_map) > 0 else None
        
    def make_edge(self, node1, node2, graph):
        '''Связывает два узла в графе'''
        node1 = self.clean_string(node1)
        node2 = self.clean_string(node2)
        if not node1 == node2 and \
        tuple(set([node1, node2])) not in self.edges:
            graph.edge(node1, node2)
            self.edges.append(tuple(set([node1, node2])))
            
    def _subgraph(self, graph, array, head):
        '''Строит под-граф'''
        if head in self.subgraphs:
            name = self.subgraphs[head]
            label = None
        else:
            name = f'cluster_{self.counter}'
            self.subgraphs[head] = name
            self.counter += 1
            label = head
        if label is not None and label.startswith('head_'):
            label = None
        with graph.subgraph(name=name) as c:
            c.attr(label=label)
            self._graph(c, array, head)
            
    @staticmethod
    def clean_string(line):
        """ Очищает строку от ненужных символов """
        line = line.strip()
        line = re.sub('[\)\(=]', '', line)
        return line
        
    def _graph(self, graph, array, head):
        """Строит граф - карту сайта"""
        
        head = self.clean_string(head)
        
        if isinstance(array, list) and len(array) == 1:
            self._graph(graph, array[0], head)
                 
        elif isinstance(array, list) and len(array) > 1:
            if isinstance(array[0], str):
                self.make_edge(head, array[0], graph)
                head_2 = array[0]
            elif isinstance(array[0], tuple):
                node = array[0][0]
                href = array[0][1]
                self.make_edge(head, node, graph)
                head_2 = node
            elif isinstance(array[0], list):
                # Создаем пустые узлы, чтобы объединить в одно 
                # под-дерево группу ссылок
                head_2 = f'head_{self.counter}'
                graph.attr('node', fillcolor = 'white', \
                           fontcolor='white')
                self.make_edge(head, head_2, graph)
                graph.attr('node', style = 'filled', \
                           fillcolor='lightblue2', \
                           fontcolor='black')
                self.counter += 1
            else:
                head_2 = head
            
            for elem in array:
                self._subgraph(graph, elem, head_2)
                
        elif isinstance(array, str):
            self.make_edge(head, array, graph)
                                      
        elif isinstance(array, tuple):
            
            # Не будем записывать в граф сами ссылки, т.к. 
            # они очень длинные
            
            node = array[0]
            href = array[1]
            self.make_edge(head, node, graph)
            
            # Если у нас есть карта сайта для этой ссылки,
            # создадим для нее под-граф
            
            if href in self.maps.keys() and \
            len(self.maps[href]) > 0 and \
            href != self.main_url and \
            node not in self.subgraphs.keys():
                self._subgraph(graph, self.maps[href], node)
            
    def get_map(self, url=None):
        """
        Метод вызова функции для создания 
        карты сайта с текстом и ссылками
        """

        if url is None:
            self.url = self.main_url
            page = self.main_page
        else:
            self.url = url
            page = self.get_page(url) 
            
        my_map = self._map(page)
        my_map = self.clean_list(my_map)
        
        self.maps[url] = my_map
        if url is None:
            self.sitemap = my_map
            
        return self.sitemap
    
    def get_map_hrefs_only(self, url=None):
        """
        Метод вызова функции для создания 
        карты сайта только с ссылками
        """
        if url is None:
            self.url = self.main_url
            page = self.main_page
        else:
            self.url = url
            page = self.get_page(url) 
            
        my_map = self._map_short(page)
        my_map = self.clean_list(my_map)
        
        if url is None:
            url = self.main_url
            self.sitemap_hrefs_only = my_map
        self.maps[url] = my_map
        
        return my_map
    
    def get_graph(self):
        """
        Метод вызова функции для построения графа
        Сохраняет граф в отдельном файле pdf
        """
        if self.sitemap_hrefs_only is None:
            self.get_map_hrefs_only()
            
        g = Digraph('G', filename=f'{self._name}.gv')
        g.attr(size='8',  page="8.5,11", ) 
        g.attr('node', style = 'filled', \
                fillcolor='lightblue2', fontcolor='black')
        
        self._graph(g, self.sitemap_hrefs_only, '')
        g = g.unflatten(stagger=25)
        g.view()
        
    def __call__(self):
        """Calling the instance builds and shows the site graph (see get_graph)."""
        self.get_graph()
        
    def url_number(self):
        """Показывает количество собранных ссылок"""
        urls = [inner for item in self.href_list.values() \
                for inner in item]
        urls = list(set(urls))
        return len(urls)
    
    def make_map(self, url):
        """
        Вспомогательная функция 
        Добавляет карту сайта в словарь
        """
        self.maps[url] = self.get_map_hrefs_only(url)
        
    def make_proc(self, x):
        """Запускает многопоточност"""
        pr1 = threading.Thread(target=self.make_map, args=(x,))
        pr1.start()
        pr1.join()
        return pr1
        
    def recursion(self, depth=1):
        """
        Идет вглубь сайта
        depth - "глубина", на которую нужно опуститься
        """
        def check_url(href):
            var1 = self.main_url.strip('https://')
            var2 = self.main_url.strip('http://')
            return var1 in href or var2 in href
        
        queue = list()
        
        if len(self.href_list) == 0:
            self.get_map_hrefs_only()
        cntr = 0
        for i in trange(depth):
            keys = copy.copy(list(self.href_list.keys()))
            if len(keys) == cntr:
                break
            cntr = len(keys)
            for key in keys:
                for url in self.href_list[key]:
                    if check_url(url) and \
                    url not in self.maps.keys():
                        try:
                            pr = self.make_proc(url)
                            queue.append(pr)
                        except:
                            pass
                        
            # Не будем создавать multiprocessing.Queue, т.к. хотим, чтобы 
            # все ссылки обрабатывались параллельно
            # Вместо этого просто подождем, пока обработаются все ссылки
            # "одного уровня"
            
            while len(queue) > 0:
                queue = [el for el in queue if el.is_alive()]
  

**Построим карту начальной страницы для всех сайтов и посчитаем количество ссылок**

In [3]:
%%time
vk = SiteMap(url = 'https://vk.com')
vk.get_map_hrefs_only()
vk.url_number()

CPU times: user 108 ms, sys: 13.6 ms, total: 122 ms
Wall time: 523 ms


8

In [4]:
vk.get_graph()

In [5]:
vk.sitemap_hrefs_only

[[[[('Установить приложение', 'https://vk.me/?act=dl'),
    [('Забыли пароль?', 'https://static.vk.com/restore'),
     ('Зарегистрироваться', 'https://vk.com/join'),
     ('Войти через Facebook', 'https://vk.com/login?act=fb_sign')]],
   [[('Українська',
      'https://vk.com/settings?act=change_regional&hash=c27b6a83443af39504&lang_id=1'),
     ('English',
      'https://vk.com/settings?act=change_regional&hash=c27b6a83443af39504&lang_id=3'),
     ('all languages »', 'https://vk.com/settings?act=select_lang')],
    ('Версия для компьютера',
     'https://vk.com/fv?to=%2F%3F_fm%3D1%26_fm2%3D1')]]]]

In [6]:
vk.url_number()

8

In [7]:
%%time
sof = SiteMap(url = 'https://stackoverflow.com')
sof.get_map_hrefs_only()
sof.url_number()

CPU times: user 1.59 s, sys: 53.7 ms, total: 1.64 s
Wall time: 2.85 s


164

In [8]:
sof.get_graph()



In [9]:
%%time
ya = SiteMap(url='https://yandex.ru')
ya.get_map_hrefs_only()
ya.url_number()

CPU times: user 180 ms, sys: 18.9 ms, total: 199 ms
Wall time: 592 ms


79

In [10]:
ya.get_graph()



In [11]:
%%time
google = SiteMap(url='http://google.com')
google.get_map_hrefs_only()
google.url_number()

CPU times: user 34.2 ms, sys: 7.95 ms, total: 42.1 ms
Wall time: 432 ms


18

In [12]:
google.get_graph()

In [13]:
%%time
crawler = SiteMap(url='http://crawler-test.com')
crawler.get_map_hrefs_only()
crawler.url_number()

CPU times: user 626 ms, sys: 16.9 ms, total: 643 ms
Wall time: 1.09 s


37

In [14]:
crawler.get_graph()

**Пойдем "вглубь" всех сайтов**

In [15]:
%%time 
vk.recursion(depth = 3)
vk.url_number()

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


CPU times: user 27.9 s, sys: 2.01 s, total: 29.9 s
Wall time: 6min 15s


1240

In [16]:
%%time 
sof.recursion(1)  # We won't go too deep here after all

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Exception in thread Thread-225:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests/models.py", line 379, in prepare_url
    scheme, auth, host, port, path, query, fragment = parse_url(url)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/util/url.py", line 401, in parse_url
    return six.raise_from(LocationParseError(source_url), None)
  File "<string>", line 3, in raise_from
urllib3.exceptions.LocationParseError: Failed to parse: https://stackoverflow.comjavascript:void(0)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<

Exception in thread Thread-237:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connection.py", line 157, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/util/connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
    chunked=chunked,
  Fi


CPU times: user 1min 6s, sys: 634 ms, total: 1min 7s
Wall time: 1min 36s


In [17]:
sof.url_number()

3374

In [18]:
%%time 
google.recursion()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Exception in thread Thread-256:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connection.py", line 157, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/util/connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
    chunked=chunked,
  Fi

Exception in thread Thread-259:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connection.py", line 157, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/util/connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
    chunked=chunked,
  Fi

Exception in thread Thread-268:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connection.py", line 157, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/util/connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
    chunked=chunked,
  Fi


CPU times: user 31.9 s, sys: 350 ms, total: 32.3 s
Wall time: 1min 54s


In [19]:
google.url_number()

438

In [20]:
%%time 
ya.recursion()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


CPU times: user 38.9 s, sys: 576 ms, total: 39.5 s
Wall time: 1min 8s


In [21]:
ya.url_number()

3660

In [25]:
%%time 
crawler.recursion(2)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


CPU times: user 2.79 s, sys: 396 ms, total: 3.19 s
Wall time: 3min 13s


In [26]:
crawler.url_number()

250

**Сохраним результаты**

In [27]:
import json

# Write the collected maps of every site to its own UTF-8 JSON file.
for filename, site in (
    ('stackoverflow.json', sof),
    ('yandex.json', ya),
    ('google.json', google),
    ('vk.json', vk),
    ('crawler.json', crawler),
):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(site.maps, f, ensure_ascii=False)