In [81]:
import ipynbname
import os

def get_this_ipynb():
    """Return the path of the currently running notebook as a string.

    The path is first requested from ``ipynbname``. If that fails for any
    reason, the ``__vsc_ipynb_file__`` global is used instead.

    Returns:
        str: notebook path

    Raises:
        KeyError: if ``ipynbname`` fails and ``__vsc_ipynb_file__`` is not
            defined; the original ``ipynbname`` error is attached as the cause.
    """
    try:
        return str(ipynbname.path())
    except Exception as lookup_error:
        # `Exception` (not a bare except) so Ctrl+C / SystemExit still propagate.
        try:
            return globals()['__vsc_ipynb_file__']
        except KeyError:
            raise KeyError(
                "cannot determine the notebook path: ipynbname failed and "
                "'__vsc_ipynb_file__' is not defined"
            ) from lookup_error

def list_files(directory):
    """Return the names of the regular files located directly in ``directory``.

    Sub-directories are skipped; names come back in ``os.listdir`` order.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

def ensure_directory_exists(directory_path):
    """Create ``directory_path`` (and missing parents) if needed and return it.

    ``exist_ok=True`` makes the call safe when the directory already exists
    or is created by someone else between a check and the creation.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    os.makedirs(directory_path, exist_ok=True)
    return directory_path

def append_to_file(file_path, text, encoding=None):
    """Append ``text`` followed by a newline to a file, creating it if missing.

    :param file_path: Path of the file to append to
    :param text: Line to add (without the trailing newline)
    :param encoding: Text encoding passed to ``open``. ``None`` (the default)
        keeps the platform default, which is what earlier versions always
        used; pass e.g. ``"utf-8"`` to make the file portable across machines.
    """
    # Mode 'a' appends to the end and creates the file when it does not exist.
    with open(file_path, 'a', encoding=encoding) as file:
        file.write(text + '\n')

# Ver. 1
Slow: the crawler fetches pages one at a time in a single thread.

In [2]:
import logging
import requests
import re
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
import networkx as nx
from pyvis.network import Network
from typing import Set, Dict, Any, Optional
from requests_cache import CachedSession
from collections import deque
from colorsys import hsv_to_rgb
from tqdm import tqdm
from multiprocessing import Pool
from sklearn.metrics.pairwise import cosine_similarity
import time
import threading
from concurrent.futures import ThreadPoolExecutor
from queue import Queue, Empty

# Logging setup: DEBUG and above go to website_graph.log (UTF-8) and to the console.
# force=True replaces handlers left by an earlier basicConfig call; without it this
# call is silently ignored once the root logger is configured (e.g. when the cell is
# re-run or another cell configured logging first).
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("website_graph.log", encoding='utf-8'),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

def html_title(html):
    """Parse ``html`` and return a new ``<div>`` tag with the parsed document appended to it."""
    parsed = BeautifulSoup(html, 'html.parser')
    wrapper = parsed.new_tag('div')
    wrapper.append(parsed)
    return wrapper


class WebsiteGraph:
    """Single-threaded crawler that turns a website into a directed link graph.

    Pages are fetched breadth-first through a ``CachedSession``. Every fetched
    page becomes a node of ``self.graph`` and every accepted hyperlink an edge.
    ``visualize`` renders the graph to an HTML file with pyvis.
    """

    def __init__(
        self,
        start_url: str,
        max_depth: int = 2,
        domain_filter: Optional[str] = None,
        path_regex: Optional[str] = None,
        node_size: str = "degree",
        layout: Optional[Dict[str, Any]] = None,
        max_links: int = 10,
        expire_after: int = 3000
    ):
        """Store the crawl settings and open the cached HTTP session.

        Args:
            start_url (str): URL the crawl starts from (normalized before use).
            max_depth (int, optional): Deepest link level whose pages are fetched. Defaults to 2.
            domain_filter (str, optional): Substring a link's host must contain.
                Defaults to the host of ``start_url``.
            path_regex (str, optional): Pattern searched in a link's path; links
                without a match are rejected. Defaults to None (no path filter).
            node_size (str, optional): ``"degree"`` scales nodes by in-degree, any
                other value gives a fixed size. Defaults to "degree".
            layout (Dict[str, Any], optional): Layout settings stored on the instance.
                Defaults to ``{"physics": True, "hierarchical": False}``.
            max_links (int, optional): Maximum number of accepted links per page. Defaults to 10.
            expire_after (int, optional): Cache lifetime handed to ``CachedSession``.
                Defaults to 3000.
        """
        logger.info(f"Инициализация парсера с параметрами: "
                    f"start_url={start_url}, max_depth={max_depth}, "
                    f"domain_filter={domain_filter}, path_regex={path_regex}, "
                    f"node_size={node_size}, max_links={max_links}")

        self.graph = nx.DiGraph()
        self.start_url = self._normalize_url(start_url)
        self.max_depth = max_depth
        self.domain = urlparse(self.start_url).netloc
        self.domain_filter = domain_filter or self.domain
        self.path_regex = re.compile(path_regex) if path_regex else None
        self.node_size = node_size
        self.layout = layout or {"physics": True, "hierarchical": False}
        self.max_links = max_links
        self.expire_after = expire_after
        self.visited = set()
        # One cache per host keeps crawls of different sites separate.
        self.session = CachedSession(
            cache_name=f'cache/{urlparse(self.start_url).netloc}',
            expire_after=self.expire_after,
            allowable_methods=('GET',)  # only GET responses are cached
        )
        # TLS certificate verification is explicitly kept enabled.
        self.session.verify = True

    def _normalize_url(self, url: str) -> str:
        """Return ``url`` without params, fragment and trailing slashes in the path.

        Args:
            url (str): Original URL.

        Returns:
            str: Normalized URL (scheme, host and query string are preserved).
        """
        parsed = urlparse(url)
        return urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path.rstrip('/'),
            '',            # params
            parsed.query,
            ''             # fragment
        ))

    def _get_article_title(self, url: str) -> str:
        """Return a display label for ``url``.

        For ``/wiki/<name>`` URLs this is ``<name>`` with underscores replaced
        by spaces; for anything else it is the host part of the URL.
        """
        if (match := re.search(r'/wiki/([^/]+)', url)) and (title := match.group(1)):
            return title.replace('_', ' ')
        return url.split('//')[-1].split('/')[0]

    def _is_valid_url(self, url: str) -> bool:
        """Check ``url`` against the domain, main-page and path filters.

        Args:
            url (str): URL to check.

        Returns:
            bool: True when the host contains ``domain_filter``, the path does not
            end in ``main_page`` (case-insensitive) and, if ``path_regex`` is set,
            the path matches it.
        """
        parsed = urlparse(url)
        is_main_page = parsed.path.lower().endswith('main_page')
        # bool(...) so callers really get True/False, not a Match object or None.
        valid = bool(
            self.domain_filter in parsed.netloc and
            not is_main_page and
            (not self.path_regex or self.path_regex.search(parsed.path))
        )
        if not valid:
            logger.debug(f"URL отклонен: {url} (домен: {parsed.netloc}, путь: {parsed.path})")
        return valid

    def _extract_links(self, url: str):
        """Fetch ``url`` through the cached session and collect its valid links.

        Args:
            url (str): Page to fetch and parse.

        Returns:
            tuple: ``(links, status)`` where ``links`` is a set of at most
            ``max_links`` normalized URLs and ``status`` is the HTTP status code.
            On an HTTP error the set is empty and ``status`` is the error code;
            on any other failure the result is ``(set(), None)``.
        """
        logger.info(f"Парсинг ссылок с: {url}")
        try:
            response = self.session.get(url, timeout=5)
            response.raise_for_status()  # raises HTTPError for 4xx/5xx
            soup = BeautifulSoup(response.text, "html.parser")
            links = set()
            for link in soup.find_all("a", href=True):
                normalized_url = self._normalize_url(urljoin(url, link["href"]))
                if self._is_valid_url(normalized_url):
                    links.add(normalized_url)
                    if len(links) >= self.max_links:
                        logger.debug(f"Достигнут лимит ссылок ({self.max_links}) для {url}")
                        break
            logger.debug(f"Найдено {len(links)} валидных ссылок на странице")
            return links, response.status_code

        except requests.exceptions.HTTPError as e:
            # A requests Response is falsy for 4xx/5xx, so compare with None
            # instead of relying on truthiness - otherwise the status is lost.
            status_code = e.response.status_code if e.response is not None else None
            logger.error(f"HTTP ошибка {status_code} для {url}: {str(e)}", exc_info=True)
            return set(), status_code

        except requests.exceptions.RequestException as e:
            # Network-level failures (timeout, connection error, ...).
            logger.error(f"Сетевая ошибка для {url}: {str(e)}", exc_info=True)
            return set(), None

        except Exception as e:
            logger.error(f"Непредвиденная ошибка для {url}: {str(e)}", exc_info=True)
            return set(), None

    def _crawl(self, force_reload: bool = False) -> None:
        """Walk the site breadth-first and fill ``self.graph``.

        Pages up to ``max_depth`` are fetched; links found on the deepest level
        are still added as edges but are not fetched themselves.

        Args:
            force_reload (bool): When False and the graph already has nodes, the
                existing graph is reused. When True the graph and the visited
                set are cleared and the site is crawled again.
        """
        if not force_reload and self.graph.nodes:
            logger.info("Используем существующий граф")
            return

        # Start from a clean state; otherwise the start URL is already marked
        # as visited and a forced reload would not fetch anything.
        self.graph.clear()
        self.visited.clear()

        # Timer starts only when a crawl really happens, so a reused graph
        # keeps the timings of the crawl that produced it.
        self.start_crawl_time = time.time()
        logger.info("Начинаем обход сайта")
        queue = deque([(self.start_url, 0)])
        current_depth = 0
        with tqdm(total=self.max_depth, desc="Глубина обхода") as pbar:
            while queue:
                url, depth = queue.popleft()
                logger.debug(f"Обрабатываем URL: {url} (глубина: {depth}/{self.max_depth})")

                if depth > self.max_depth or url in self.visited:
                    logger.debug(f"Пропуск URL: {url} (посещено: {url in self.visited}, глубина: {depth})")
                    continue

                # Advance the progress bar when the BFS reaches a new level.
                if depth > current_depth:
                    pbar.update(depth - current_depth)
                    current_depth = depth
                    logger.info(f"Переход на глубину: {current_depth}")

                self.visited.add(url)
                logger.info(f"Добавление узла: {url} (глубина: {depth})")

                links, status_code = self._extract_links(url)
                self.graph.add_node(
                    url,
                    title=url,
                    label=self._get_article_title(url),
                    size=self._calculate_node_size(url),
                    status_code=status_code or 0,  # 0 marks "no HTTP status available"
                    depth=depth
                )

                for link in links:
                    if link == url:
                        continue  # no self-loops
                    logger.debug(f"Добавление связи: {url} -> {link}")
                    self.graph.add_edge(url, link)
                    # Enqueue each child exactly once, and only if it can still be fetched.
                    if depth < self.max_depth:
                        queue.append((link, depth + 1))
        self.end_crawl_time = time.time()

    def _calculate_node_size(self, node: str) -> int:
        """Return the display size of ``node``.

        Args:
            node (str): Node URL.

        Returns:
            int: ``10 + 3 * in_degree`` when ``node_size == "degree"``, else 10.
            A node that is not in the graph yet counts as in-degree 0.
        """
        in_degree = self.graph.in_degree(node) if node in self.graph else 0
        size = 10 + 3 * in_degree if self.node_size == "degree" else 10
        logger.debug(f"Размер узла {node} рассчитан как {size} (входящих связей: {in_degree})")
        return size

    def _get_node_color(self, node: str) -> str:
        """Map the node's in-degree to a colour between green and orange.

        Relies on ``self.min_degree`` / ``self.max_degree`` set by ``visualize``.

        Args:
            node (str): Node URL.

        Returns:
            str: Hex colour code; a fixed light green when all degrees are equal.
        """
        if self.max_degree == self.min_degree:
            return "#D4EDD4"

        t = (self.graph.in_degree(node) - self.min_degree) / (self.max_degree - self.min_degree)

        # Interpolate hue from green (0.33) to orange (0.08) while raising
        # saturation and brightness with the degree.
        h = 0.33 - 0.25 * t
        s = 0.8 + 0.2 * t
        v = 0.9 + 0.1 * t

        r, g, b = hsv_to_rgb(h, s, v)
        return '#{:02x}{:02x}{:02x}'.format(
            int(r*255), int(g*255), int(b*255)
        )

    def visualize(self, rebuild: bool = False, force_reload: bool = False) -> None:
        """Render the graph with pyvis, save it as HTML and open it in the browser.

        The file is written to ``graphs/graph<N>.html`` next to the notebook,
        where ``N`` is one more than the highest existing graph number. A
        summary line is printed and appended to ``<class name>.txt`` next to
        the notebook. The HTTP session is closed afterwards.

        Args:
            rebuild (bool): Triggers ``_crawl`` (which reuses an existing graph
                unless ``force_reload`` is set).
            force_reload (bool): Forwarded to ``_crawl``.
        """
        if rebuild or force_reload or not self.graph.nodes:
            self._crawl(force_reload=force_reload)
        logger.info("Запуск визуализации графа")

        if not self.graph.nodes:
            logger.warning("Граф пуст - визуализация невозможна")
            return

        # In-degree range used by _get_node_color for the colour gradient.
        self.degree_map = dict(self.graph.in_degree())
        self.min_degree = min(self.degree_map.values()) if self.degree_map else 1
        self.max_degree = max(self.degree_map.values()) if self.degree_map else 1
        logger.debug(f"Градиент границы: min={self.min_degree}, max={self.max_degree}")

        net = Network(
            notebook=False,
            directed=True,
            height="800px",
            width="100%",
            cdn_resources="remote"
        )

        # Highlighting, interaction and physics settings passed to pyvis as JSON.
        options = {
            "edges": {
                "color": {
                    "color": "#2B7CE9",      # default colour
                    "highlight": "#FF0000",  # colour when selected
                    "hover": "#FF0000"       # colour on hover
                },
                "selectionWidth": 3,
                "smooth": False
            },
            "interaction": {
                "hoverConnectedEdges": True,
                "selectConnectedEdges": True,
                "multiselect": True,
                "navigationButtons": True,
                "keyboard": True,
                "hover": True,
                "click": True,
            },
            "physics": {
                "enabled": True,
                "forceAtlas2Based": {
                    "gravitationalConstant": -200,
                    "springLength": 500,
                    "springConstant": 0.001,
                    "damping": 0.3,
                    "avoidOverlap": 1
                },
                "stabilization": {
                    "iterations": 500,
                    "updateInterval": 50
                }
            },
            "nodes": {
                "allow_html": True,
                "shape": "box",
                "font": {"size": 10},
                "color": {
                    "border": "#2B7CE9",
                    "background": "#97C2FC",
                    "highlight": {
                        "border": "#FF0000",
                        "background": "#FFFF00"
                    }
                },
                "chosen": True,
                "style": "cursor: pointer;",
                "shapeProperties": {
                    "allowHtml": True
                }
            },
            "configure": {
                "enabled": False,
                "filter": "nodes,edges",
                "showButton": False
            },
            "version": "9.1.2"
        }

        net.set_options(json.dumps(options))
        for node in self.graph.nodes:
            status_code = self.graph.nodes[node].get('status_code', 0)
            in_degree = self.graph.in_degree(node)
            status_color = "#e6ffe6"  # tooltip background

            tooltip = (
                f"<div style='padding: 8px; background: {status_color}'>"
                f"<b>URL:</b> {node}<br>"
                f"<b>Status:</b> {status_code}<br>"
                f"<b>In-Degree:</b> {in_degree}<br>"
                f"</div>"
            )

            title = self._get_article_title(node)
            # Error pages keep their red colour; it is no longer overwritten
            # by the degree-based gradient.
            if status_code and 400 <= int(status_code) < 600:
                color = "#ffcccc"
            else:
                color = self._get_node_color(node)
            logger.debug(f"Добавление узла в визуализацию: {node} (заголовок: {title})")
            logger.debug(f"Цвет узла {node}: {color} (степень: {self.graph.in_degree(node)})")

            full_url = node if node.startswith(("http://", "https://")) else f"http://{node}"
            escaped_url = full_url.replace("'", "\\'")  # keep the JS string literal intact

            net.add_node(
                node,
                label=title,
                title=tooltip,
                size=self._calculate_node_size(node),
                color=color,
                url=full_url,
                allow_html=True,
                onclick=f"window.open('{escaped_url}', '_blank');",
                shapeProperties={
                    "allowHtml": True
                },
            )

        for edge in self.graph.edges:
            logger.debug(f"Добавление связи в визуализацию: {edge[0]} -> {edge[1]}")
            net.add_edge(edge[0], edge[1])

        # os.path keeps the path handling correct on every platform.
        ipynb_dir = os.path.dirname(get_this_ipynb())
        directory = ensure_directory_exists(os.path.join(ipynb_dir, 'graphs'))

        # Next free number = highest existing graph<N>.html + 1. Unrelated
        # files are ignored and the result does not depend on listing order.
        name_matches = (re.fullmatch(r'graph(\d+)\.html', name) for name in list_files(directory))
        existing_numbers = [int(found.group(1)) for found in name_matches if found]
        graph_num = max(existing_numbers) + 1 if existing_numbers else 0

        logger.info("Сохранение графа в HTML-файл")
        file = os.path.join(directory, f"graph{graph_num}.html")

        text = f'{file} | max_depth: {self.max_depth} | max_links: {self.max_links} | crawl time: {self.end_crawl_time - self.start_crawl_time}'

        net.write_html(file, open_browser=True)
        # Computed outside the f-string: nesting the same quote type inside an
        # f-string is a syntax error before Python 3.12.
        log_name = self.__name__().split('.')[-1]
        append_to_file(os.path.join(ipynb_dir, f'{log_name}.txt'), text)

        logger.info(f"Graph saved as {file} and opened in browser")

        print(text)

        # The cached session is no longer needed once the graph is written.
        self.session.close()

    def __name__(self):
        """Return the name used for the results log file (``'WebsiteGraph'``)."""
        return 'WebsiteGraph'

In [3]:
# Usage example kept inside a string literal, so running this cell does not
# start a crawl; the cell's only output is the string itself.
'''# Пример использования
logging.disable(logging.CRITICAL)
for i in [1,2,3]:
    logger.info("Запуск программы")

    graph = WebsiteGraph(
        start_url="https://en.wikipedia.org/wiki/Data_science",
        max_depth=i,
        max_links=10,
        path_regex=r"^/wiki/[A-Za-z_]+$",
        layout={"physics": True}
    )

    graph._crawl()
    graph.visualize()'''

'# Пример использования\nlogging.disable(logging.CRITICAL)\nfor i in [1,2,3]:\n    logger.info("Запуск программы")\n\n    graph = WebsiteGraph(\n        start_url="https://en.wikipedia.org/wiki/Data_science",\n        max_depth=i,\n        max_links=10,\n        path_regex=r"^/wiki/[A-Za-z_]+$",\n        layout={"physics": True}\n    )\n\n    graph._crawl()\n    graph.visualize()'

results:

c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph0.html | max_depth: 1 | max_links: 10 | crawl time: 1.6000185012817383
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph2.html | max_depth: 2 | max_links: 10 | crawl time: 12.905709743499756
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph4.html | max_depth: 3 | max_links: 10 | crawl time: 84.55941534042358
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph0.html | max_depth: 1 | max_links: 10 | crawl time: 3.058067798614502
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph1.html | max_depth: 2 | max_links: 10 | crawl time: 20.718218088150024
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph2.html | max_depth: 3 | max_links: 10 | crawl time: 123.30372500419617

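# Ver 2
Fast: pages of one depth level are fetched concurrently.
Multithreading (`ThreadPoolExecutor`), not multiprocessing, despite the `MP` suffix.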

In [85]:

import logging
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse, urlunparse
from bs4 import BeautifulSoup
import networkx as nx
from pyvis.network import Network
from requests_cache import CachedSession
from colorsys import hsv_to_rgb
import requests
import numpy as np
import time
import pandas as pd
from statsmodels.api import OLS
from sklearn.preprocessing import PolynomialFeatures

# Logging configuration: INFO and above go to website_graph.log (UTF-8) and the console.
# force=True is required for this to take effect when logging was already configured
# (for example by the DEBUG-level basicConfig call earlier in this notebook);
# otherwise basicConfig is a silent no-op and the old level and handlers stay active.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("website_graph.log", encoding="utf-8"),
        logging.StreamHandler()
    ],
    force=True
)
logger = logging.getLogger(__name__)

class WebsiteGraphMP: # MP for MultiProcessing
    def __init__(self,
                 start_url: str,
                 max_depth: int = 2,
                 max_links: int = 5,
                 path_regex: str = None,
                 workers: int = 10,
                 expire_after: int = 3000,
                 node_size: str = "degree",
                 layout: dict = None,
                 ):
        """Store the crawl settings and load earlier timing results if available.

        Args:
            start_url: URL the crawl starts from (normalized before use).
            max_depth: Deepest link level that is still followed.
            max_links: Maximum number of accepted links per page.
            path_regex: Optional pattern searched in a link's path; ``None``
                disables the path filter.
            workers: Number of threads used by ``crawl``.
            expire_after: Cache lifetime handed to ``CachedSession``.
            node_size: ``"degree"`` scales nodes by in-degree, any other value
                gives a fixed size.
            layout: Layout settings stored on the instance; defaults to
                ``{"physics": True, "hierarchical": False}``.
        """
        self.start_url = self._normalize_url(start_url)
        self.max_depth = max_depth
        self.max_links = max_links
        self.domain = urlparse(self.start_url).netloc
        self.path_regex = re.compile(path_regex) if path_regex else None
        self.workers = workers
        self.expire_after = expire_after
        self.node_size = node_size
        self.layout = layout or {"physics": True, "hierarchical": False}
        self.graph = nx.DiGraph()

        # Timing history is optional (used only for the crawl-time estimate).
        # On failure keep a defined attribute and record why, instead of
        # silently leaving `self.results` missing.
        try:
            self.results = self._prev_results()
        except Exception as exc:
            self.results = None
            logger.debug(f"Previous crawl results unavailable: {exc}")

    def detect_communities(self):
        """Return the greedy-modularity communities of the crawled graph as a list."""
        from networkx.algorithms import community as nx_community

        found = nx_community.greedy_modularity_communities(self.graph)
        return list(found)
     
    def _normalize_url(self, url: str) -> str:
        parsed = urlparse(url)
        normalized = urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path.rstrip('/'),
            "",
            parsed.query,
            ""
        ))
        return normalized

    def _is_valid_url(self, url: str) -> bool:
        parsed = urlparse(url)
        # Avoid main page and require same domain
        is_main_page = parsed.path.lower().endswith("main_page")
        valid = (self.domain in parsed.netloc and not is_main_page and 
                 (not self.path_regex or self.path_regex.search(parsed.path)))
        return valid

    def _get_article_title(self, url: str) -> str:
        match = re.search(r'/wiki/([^/]+)', url)
        if match:
            return match.group(1).replace('_', ' ')
        return url.split('//')[-1].split('/')[0]

    def fetch_page(self, url: str, depth: int, session: CachedSession):
        """Fetch a page, extract valid links, and return node data with found links."""
        logger.info(f"Fetching (depth {depth}): {url}")
        try:
            response = session.get(url, timeout=5)
            response.raise_for_status()
            html = response.text
            soup = BeautifulSoup(html, "html.parser")
            links = set()
            for link in soup.find_all("a", href=True):
                full_url = urljoin(url, link["href"])
                normalized_url = self._normalize_url(full_url)
                if self._is_valid_url(normalized_url):
                    links.add(normalized_url)
                    if len(links) >= self.max_links:
                        break
            node_data = {
                "title": self._get_article_title(url),
                "label": self._get_article_title(url),
                "status_code": response.status_code,
                "depth": depth
            }
            return node_data, links
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code if e.response else None
            logger.error(f"HTTP error {status_code} for {url}: {str(e)}")
        except requests.exceptions.RequestException as e:
            logger.error(f"Network error for {url}: {str(e)}")
        except Exception as e:
            logger.error(f"Unexpected error for {url}: {str(e)}")
        # In case of error, return minimal data with no links.
        node_data = {
            "title": self._get_article_title(url),
            "label": self._get_article_title(url),
            "status_code": None,
            "depth": depth
        }
        return node_data, set()

    def process_url(self, url: str, depth: int, session: CachedSession, visited: set):
        """Process a URL and update the graph. Always return a set of links for further processing."""
        if url in visited:
            return set()
        visited.add(url)
        node_data, links = self.fetch_page(url, depth, session)
        # Remove self-loops and avoid duplicate edges
        valid_links = {link for link in links if link != url}
        self.graph.add_node(url, **node_data)
        for link in valid_links:
            # Avoid duplicate edge creation
            if not self.graph.has_edge(url, link):
                self.graph.add_edge(url, link)
        # Only return links for further processing if within max_depth.
        return valid_links if depth < self.max_depth else set()

    def crawl(self):
        """Crawl the website level by level using a ThreadPoolExecutor.

        All URLs of one depth level are submitted to the pool together; the
        links they return form the next level. ``start_crawl_time`` and
        ``end_crawl_time`` are recorded, and the cached session is closed even
        if a worker raises.
        """
        # The estimate needs earlier timing results; it is purely informational,
        # so any failure is logged and the crawl goes ahead.
        try:
            predicted = self._predict_time(self.max_depth, self.max_links, self.workers)
            print(f'Predicted time for crawling: {predicted}')
        except Exception as exc:
            logger.debug(f"Crawl time prediction unavailable: {exc}")

        self.start_crawl_time = time.time()
        logger.info("Starting crawl")
        session = CachedSession(
            cache_name=f"cache/{self.domain}",
            expire_after=self.expire_after,
            allowable_methods=("GET",)
        )
        session.verify = True

        visited = set()
        frontier = [(self.start_url, 0)]
        try:
            with ThreadPoolExecutor(max_workers=self.workers) as executor:
                while frontier:
                    futures = {}
                    # Several pages of one level often link to the same URL.
                    # Submit each URL once so two workers never fetch it at
                    # the same time.
                    scheduled = set()
                    for url, depth in frontier:
                        if url in visited or url in scheduled:
                            continue
                        scheduled.add(url)
                        future = executor.submit(self.process_url, url, depth, session, visited)
                        futures[future] = depth
                    frontier = []
                    for future, current_depth in futures.items():
                        links = future.result() or set()
                        if current_depth < self.max_depth:
                            frontier.extend(
                                (link, current_depth + 1)
                                for link in links
                                if link not in visited
                            )
        finally:
            self.end_crawl_time = time.time()
            # Release the session (and its cache backend) on errors too.
            session.close()
        logger.info("Crawl complete")
        
    
    def _calculate_node_size(self, node: str) -> int:
        in_degree = self.graph.in_degree(node)
        size = 10 + 3 * in_degree if self.node_size == "degree" else 10
        return size

    def _get_node_color(self, node: str) -> str:
        
        if self.max_degree == self.min_degree:
            return "#D4EDD4"
        t = (self.graph.in_degree(node) - self.min_degree) / (self.max_degree - self.min_degree)
        h = 0.33 - 0.25 * t
        s = 0.8 + 0.2 * t
        v = 0.9 + 0.1 * t
        r, g, b = hsv_to_rgb(h, s, v)
        return '#{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))
    
    def _prev_results(self):
        with open('WebsiteGraphMP.txt') as f:
            lines = f.readlines()

        names = ['name','max_depth', 'max_links','crawl_time','workers']


        results = []
        for i in list(map(lambda x: x.split("|"), lines)):
            if len(i) == len(names):
                results.append([_.split(':')[-1] for _ in i])
        results = pd.DataFrame(results, columns=names)
        results['name'] = results['name'].transform(lambda x: x.split('\\')[-1])
        results['max_depth'] = pd.to_numeric(results['max_depth'])
        results['max_links'] = pd.to_numeric(results['max_links'])
        results['crawl_time'] = pd.to_numeric(results['crawl_time'])
        results['workers'] = pd.to_numeric(results['workers'].transform(lambda x: x[:-1]))
        results.set_index('name', inplace=True)
        results.sort_values('crawl_time', inplace=True)

        return results

    def _predict_time(self,max_depth,max_links,workers):
        """Estimate the crawl time for the given settings from ``self.results``.

        Lookup order:
          1. mean ``crawl_time`` of earlier runs with exactly the same
             (max_depth, max_links, workers);
          2. mean ``crawl_time`` of earlier runs with the same
             (max_depth, max_links), ignoring workers;
          3. otherwise the average of two OLS predictions fitted on degree-2
             polynomial features (see the comments in the ``else`` branch).

        Requires ``self.results`` to be the DataFrame built by ``_prev_results``.
        """
        # Mean crawl time per parameter combination, with and without `workers`.
        gran_mean_times = self.results.groupby(['max_depth','max_links','workers'])[['crawl_time']].agg(['mean'])
        noworkers_mean_times = self.results.groupby(['max_depth','max_links'])[['crawl_time']].agg(['mean'])

        if (max_depth,max_links,workers) in gran_mean_times.index:
            # Exact match on all three parameters.
            return gran_mean_times.loc[max_depth,max_links,workers].iloc[0]

        elif (max_depth,max_links) in noworkers_mean_times.index:
            # Match on depth and link limit only.
            return noworkers_mean_times.loc[max_depth,max_links].iloc[0]

        else:
            # Model 1: OLS of crawl_time on degree-2 polynomial features of
            # (max_depth, max_links, workers), fitted on every recorded run.
            X = self.results[['max_depth','max_links','workers']].values
            y = self.results['crawl_time'].values

            X = PolynomialFeatures(degree=2).fit_transform(X)
            model1 = OLS(y,X)
            model1 = model1.fit()


            # Model 2: OLS of the mean crawl time per max_depth on degree-2
            # polynomial features of max_depth, using only runs recorded with
            # max_links == 10 (hard-coded).
            mean_times = self.results[self.results.max_links==10].groupby('max_depth')[['crawl_time']].agg(['mean'])
            x = PolynomialFeatures(degree=2).fit_transform(mean_times.index.values.reshape(-1,1))

            model2 = OLS(mean_times.values,x)
            model2 = model2.fit()

            # Final estimate: average of the two model predictions.
            return (model1.predict(PolynomialFeatures(degree=2).fit_transform([[max_depth,max_links,workers]]))[0] + model2.predict(PolynomialFeatures(degree=2).fit_transform([[max_depth]]))[0])/2
        
    def visualize(self, rebuild: bool = False, force_reload: bool = False, force_file_name : str = ''):
        """Render the graph to an interactive pyvis HTML page and record the run.

        A crawl is triggered first when ``rebuild`` or ``force_reload`` is set, or
        when the graph has no nodes yet. The page is written into a ``graphs``
        directory next to the notebook and opened in the browser. A one-line
        summary of the run is appended to ``<name>.txt`` next to the notebook
        (``<name>`` is the last dotted component of ``self.__name__()``) and printed.

        Side effects: sets ``self.min_degree`` / ``self.max_degree`` from the
        in-degrees of the current graph.

        Args:
            rebuild: Crawl again before drawing.
            force_reload: Has the same effect as ``rebuild``.
            force_file_name: Base name (without ``.html``) of the output file.
                When empty, the next unused ``graph<N>.html`` is chosen.
        """
        if rebuild or force_reload or not self.graph.nodes:
            self.crawl()
        degrees = [self.graph.in_degree(n) for n in self.graph.nodes()]
        self.min_degree = min(degrees) if degrees else 0
        self.max_degree = max(degrees) if degrees else 1
        logger.info("Starting visualization")

        net = Network(
            notebook=False,
            directed=True,
            height="800px",
            width="100%",
            cdn_resources="remote",
            bgcolor = '#000000',
            font_color = '#ffffff'
        )

        # Custom physics to help prevent node overlap
        net.repulsion(
            node_distance=500,
            central_gravity=0.2,
            spring_length=200,
            spring_strength=0.05,
            damping=0.09
        )

        for node in self.graph.nodes:
            data = self.graph.nodes[node]
            status_code = data.get("status_code", 0)
            in_degree = self.graph.in_degree(node)
            tooltip = (
                f"<div style='padding: 8px; background: #e6ffe6'>"
                f"<b>URL:</b> {node}<br>"
                f"<b>Status:</b> {status_code}<br>"
                f"<b>In-Degree:</b> {in_degree}<br>"
                f"</div>"
            )
            title = data.get("label", node)
            # 4xx/5xx responses get a fixed error colour; everything else is coloured by the helper.
            color = "#ffcccc" if status_code and 400 <= int(status_code) < 600 else self._get_node_color(node)
            full_url = node if node.startswith(("http://", "https://")) else f"http://{node}"
            escaped_url = full_url.replace("'", "\\'")
            # The same value drives both the node size and its label font size.
            node_size = self._calculate_node_size(node)
            net.add_node(
                node,
                label=self._get_article_title(title),
                title=tooltip,
                size=node_size,
                font={"size": node_size},
                color=color,
                url=full_url,
                onclick=f"window.open('{escaped_url}', '_blank');",
                shapeProperties={"allowHtml": True}
            )

        # Skip self-loops and repeated (src, dst) pairs; a set keeps the check O(1) per edge.
        seen_edges = set()
        for src, dst in self.graph.edges:
            if src == dst or (src, dst) in seen_edges:
                continue
            seen_edges.add((src, dst))
            net.add_edge(
                src,
                dst,
                color={
                    "color": "#2B7CE9",
                    "highlight": "#FF0000",
                    "hover": "#FF0000"
                },
                selectionWidth=3,
                smooth=False
            )

        # Build paths with os.path so the code is not tied to the Windows separator.
        ipynb_dir = os.path.dirname(get_this_ipynb())
        directory = ensure_directory_exists(os.path.join(ipynb_dir, 'graphs'))

        if force_file_name:
            file = os.path.join(directory, f"{force_file_name}.html")
        else:
            # Only files whose stem is exactly "graph<digits>" take part in the numbering,
            # so an unrelated name such as "graph_final.html" cannot reset the counter.
            used_numbers = []
            for existing in list_files(directory):
                numbered = re.fullmatch(r'graph(\d+)', existing.split('.')[0])
                if numbered:
                    used_numbers.append(int(numbered.group(1)))
            graph_num = max(used_numbers) + 1 if used_numbers else 0
            file = os.path.join(directory, f"graph{graph_num}.html")
        logger.info("Сохранение графа в HTML-файл")

        # Summary line; the crawl time is left out when no crawl timestamps are available.
        summary_parts = [f'{file}', f'max_depth: {self.max_depth}', f'max_links: {self.max_links}']
        try:
            summary_parts.append(f'crawl time: {self.end_crawl_time - self.start_crawl_time}')
        except (AttributeError, TypeError):
            pass
        summary_parts.append(f'workers: {self.workers}')
        text = ' | '.join(summary_parts)

        net.write_html(file, open_browser=True)
        log_name = self.__name__().split('.')[-1]
        append_to_file(os.path.join(ipynb_dir, f'{log_name}.txt'), text)

        logger.info(f"Graph saved as {file} and opened in browser")

        print(text)
    
        # Magic Methods
    def __str__(self):
        return (f"WebsiteGraphMP(start_url='{self.start_url}', nodes={self.graph.number_of_nodes()}, "
                f"edges={self.graph.number_of_edges()})")

    def __repr__(self):
        return (f"WebsiteGraphMP(start_url='{self.start_url}', max_depth={self.max_depth}, "
                f"max_links={self.max_links}, workers={self.workers})")

    def __eq__(self, other):
        if not isinstance(other, WebsiteGraphMP):
            return NotImplemented
        # Compare start_url and basic graph structure (nodes and edges)
        return (self.start_url == other.start_url and
                nx.is_isomorphic(self.graph, other.graph))

    def __ne__(self, other):
        return not self.__eq__(other)
    
    def __add__(self, other):
            if not isinstance(other, WebsiteGraphMP):
                return NotImplemented

            # Use NetworkX's compose to get the union of the two graphs.
            # (Nodes present in both graphs will be merged; edge sets are united.)
            new_graph = nx.compose(self.graph, other.graph)

            # Invent union logic for parameters:
            # For example, use the start_url from self, and pick the more permissive (max) values.
            new_max_depth = max(self.max_depth, other.max_depth)
            new_max_links = max(self.max_links, other.max_links)
            new_workers = max(self.workers, other.workers)
            new_expire_after = max(self.expire_after, other.expire_after)

            # For regex pattern, choose self's pattern if available, otherwise other.
            new_path_regex = None
            if self.path_regex and other.path_regex:
                new_path_regex = self.path_regex.pattern if len(self.path_regex.pattern) >= len(other.path_regex.pattern) else other.path_regex.pattern
            elif self.path_regex:
                new_path_regex = self.path_regex.pattern
            elif other.path_regex:
                new_path_regex = other.path_regex.pattern

            # Create a new WebsiteGraph instance with unioned parameters.
            new_instance = WebsiteGraphMP(
                start_url=self.start_url,  # you can decide which one to use
                max_depth=new_max_depth,
                max_links=new_max_links,
                path_regex=new_path_regex,
                workers=new_workers,
                expire_after=new_expire_after,
                layout=self.layout  # or merge layouts if needed
            )
            # Set the union graph
            new_instance.graph = new_graph

            return new_instance
    
    def __iadd__(self, other):
        # In-place union: merge other's graph into self
        if not isinstance(other, WebsiteGraphMP):
            return NotImplemented
        self.graph = nx.compose(self.graph, other.graph)
        self.max_depth = max(self.max_depth, other.max_depth)
        self.max_links = max(self.max_links, other.max_links)
        self.workers = max(self.workers, other.workers)
        self.expire_after = max(self.expire_after, other.expire_after)
        # For path_regex and layout, you can choose to keep self's parameters.
        return self

    def __sub__(self, other):
        if not isinstance(other, WebsiteGraphMP):
            return NotImplemented
        # Subtract nodes found in the other graph from self.graph
        new_instance = WebsiteGraphMP(
            start_url=self.start_url,
            max_depth=self.max_depth,
            max_links=self.max_links,
            path_regex=self.path_regex.pattern if self.path_regex else None,
            workers=self.workers,
            expire_after=self.expire_after,
            layout=self.layout
        )
        new_instance.graph = self.graph.copy()
        for node in other.graph.nodes():
            if node in new_instance.graph:
                new_instance.graph.remove_node(node)
        return new_instance

    def __iter__(self):
        # Iterate over nodes as (node, attributes) tuples.
        return iter(self.graph.nodes(data=True))

    def __len__(self):
        return self.graph.number_of_nodes()

    def __getitem__(self, key):
        # Allow indexing by node ID to get node attributes.
        return self.graph.nodes[key]

    def __contains__(self, key):
        return key in self.graph

    def __bool__(self):
        return self.graph.number_of_nodes() > 0

    def __call__(self):
        # Calling the instance triggers a re-crawl.
        self.crawl()
        return self        
    
    def __name__(self):
        return 'WebsiteGraphMP'

In [87]:

logger.info("Starting program")

# Crawl settings: start page, depth/link limits, accepted link paths
# (/wiki/ followed by letters or underscores only), worker count and layout.
crawl_settings = dict(
    start_url="https://en.wikipedia.org/wiki/Data_science",
    max_depth=1,
    max_links=10,
    path_regex=r"^/wiki/[A-Za-z_]+$",
    workers=10,
    layout={"physics": True},
)
# Calling the instance runs crawl() and returns the instance itself.
graph1 = WebsiteGraphMP(**crawl_settings)()

print(graph1)  # __str__
print(repr(graph1))  # __repr__
# __iter__ yields (node, attributes) pairs
for entry in graph1:
    print(*entry)
# __len__
print("Total nodes:", len(graph1))
# __contains__
print("Contains 'https://en.wikipedia.org/wiki/Data_science':", "https://en.wikipedia.org/wiki/Data_science" in graph1)

# graph1.visualize()

2025-03-25 11:01:09,920 - INFO - Starting program
2025-03-25 11:01:09,931 - INFO - Starting crawl
2025-03-25 11:01:09,938 - INFO - Fetching (depth 0): https://en.wikipedia.org/wiki/Data_science
2025-03-25 11:01:10,020 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Information_science
2025-03-25 11:01:10,021 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Scientific_method
2025-03-25 11:01:10,027 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Interdisciplinary
2025-03-25 11:01:10,070 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Comet_NEOWISE


Predicted time for crawling: 2.0742762088775635


2025-03-25 11:01:10,102 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Scientific_computing
2025-03-25 11:01:10,508 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Computer_science
2025-03-25 11:01:10,586 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Astronomical_survey
2025-03-25 11:01:10,647 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Statistics
2025-03-25 11:01:10,874 - INFO - Fetching (depth 1): https://en.wikipedia.org/wiki/Space_telescope
2025-03-25 11:01:11,881 - INFO - Crawl complete


WebsiteGraphMP(start_url='https://en.wikipedia.org/wiki/Data_science', nodes=86, edges=90)
WebsiteGraphMP(start_url='https://en.wikipedia.org/wiki/Data_science', max_depth=1, max_links=10, workers=10)
https://en.wikipedia.org/wiki/Data_science {'title': 'Data science', 'label': 'Data science', 'status_code': 200, 'depth': 0}
https://en.wikipedia.org/wiki/Information_science {'title': 'Information science', 'label': 'Information science', 'status_code': 200, 'depth': 1}
https://en.wikipedia.org/wiki/Scientific_method {'title': 'Scientific method', 'label': 'Scientific method', 'status_code': 200, 'depth': 1}
https://en.wikipedia.org/wiki/Interdisciplinary {'title': 'Interdisciplinary', 'label': 'Interdisciplinary', 'status_code': 200, 'depth': 1}
https://en.wikipedia.org/wiki/Comet_NEOWISE {'title': 'Comet NEOWISE', 'label': 'Comet NEOWISE', 'status_code': 200, 'depth': 1}
https://en.wikipedia.org/wiki/Scientific_computing {'title': 'Scientific computing', 'label': 'Scientific computing

In [6]:
# Disabled experiment: the code is wrapped in a string literal so that it does not run.
# The cell therefore only evaluates to that string. It describes a depth-4 crawl
# saved under the fixed file name '4d.10l.10w'.
'''graph1 = WebsiteGraphMP(
    start_url="https://en.wikipedia.org/wiki/Data_science",
    max_depth=4,
    max_links=10,
    path_regex=r"^/wiki/[A-Za-z_]+$",
    workers=10,
    layout={"physics": True}
)
graph1()

graph1.visualize(force_file_name = '4d.10l.10w')'''

'graph1 = WebsiteGraphMP(\n    start_url="https://en.wikipedia.org/wiki/Data_science",\n    max_depth=4,\n    max_links=10,\n    path_regex=r"^/wiki/[A-Za-z_]+$",\n    workers=10,\n    layout={"physics": True}\n)\ngraph1()\n\ngraph1.visualize(force_file_name = \'4d.10l.10w\')'

results:

c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph3.html | max_depth: 1 | max_links: 10 | crawl time: 1.1849431991577148 | workers: 10
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph4.html | max_depth: 1 | max_links: 10 | crawl time: 1.4319779872894287 | workers: 20
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph5.html | max_depth: 2 | max_links: 10 | crawl time: 12.662982702255249 | workers: 10
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph6.html | max_depth: 2 | max_links: 10 | crawl time: 13.79235053062439 | workers: 20
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph7.html | max_depth: 3 | max_links: 10 | crawl time: 66.65576791763306 | workers: 10
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph8.html | max_depth: 3 | max_links: 10 | crawl time: 62.48543572425842 | workers: 20
c:\Users\ivant\Desktop\proj\WebsiteGraph\graphs\graph9.html | max_depth: 4 | max_links: 10 | crawl time: 264.98015880584717 | workers: 10

`max_depth=4,
max_links=10`
<br>
С этими параметрами визуализация проблематична, но граф строится относительно быстро, и самой структурой как графом можно спокойно пользоваться.

## Union graphs

In [7]:
# Disabled example: the code is wrapped in a string literal so that it does not run.
# It crawls two start pages, merges the results with `+` (WebsiteGraphMP.__add__)
# and visualizes the union.
'''graph1 = WebsiteGraphMP(start_url="https://en.wikipedia.org/wiki/Data_science", max_depth=1, max_links=10, path_regex=r"^/wiki/[A-Za-z_]+$")
graph1()

graph2 = WebsiteGraphMP(start_url="https://en.wikipedia.org/wiki/Artificial_intelligence", max_depth=1, max_links=10, path_regex=r"^/wiki/[A-Za-z_]+$")
graph2()

union_graph = graph1 + graph2
union_graph.visualize()'''

'graph1 = WebsiteGraphMP(start_url="https://en.wikipedia.org/wiki/Data_science", max_depth=1, max_links=10, path_regex=r"^/wiki/[A-Za-z_]+$")\ngraph1()\n\ngraph2 = WebsiteGraphMP(start_url="https://en.wikipedia.org/wiki/Artificial_intelligence", max_depth=1, max_links=10, path_regex=r"^/wiki/[A-Za-z_]+$")\ngraph2()\n\nunion_graph = graph1 + graph2\nunion_graph.visualize()'

In [89]:
# NOTE(review): _prev_results() presumably loads the logged runs as a DataFrame - confirm.
results = graph1._prev_results()

# Mean crawl time for every (max_depth, max_links, workers) combination.
grouped_runs = results.groupby(['max_depth','max_links','workers'])
gran_mean_times = grouped_runs[['crawl_time']].agg(['mean'])

# Rows for max_depth=4, max_links=10, averaged over the remaining index level (workers).
depth4_links10 = gran_mean_times.loc[4,10]
depth4_links10.mean()

crawl_time  mean    286.954203
dtype: float64

In [1]:


from statsmodels.api import OLS
from sklearn.preprocessing import PolynomialFeatures

# Design matrix: degree-2 polynomial expansion of the three crawl parameters.
# The fitted transformer is kept so the prediction input is expanded the same way.
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(results[['max_depth','max_links','workers']].values)
y = results['crawl_time'].values

# Ordinary least squares of crawl time on the expanded features.
model = OLS(y, X).fit()
# Only the last expression of a cell is rendered, so the summary is printed explicitly.
print(model.summary())

# Predicted crawl time for max_depth=10, max_links=10, workers=10.
model.predict(poly.transform([[10,10,10]]))[0]

Unnamed: 0_level_0,max_depth,max_links,crawl_time,workers
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
wtf.html,0,5,0.053846,10
graph11.html,0,5,0.061337,10
graph10.html,0,5,0.067775,10
graph10.html,0,10,0.072039,10
graph12.html,0,5,0.108873,10
graph10.html,0,5,0.119081,10
graph10.html,0,5,0.378627,10
graph3.html,1,10,1.184943,10
graph4.html,1,10,1.431978,20
graph10.html,1,10,3.873815,10
