```python
## \file /sandbox/davidka/crawler.py
# -*- coding: utf-8 -*-
#! .pyenv/bin/python3
```

Модуль для сбора данных со страниц различных сайтов
=====================================================


```rst
.. module:: sandbox.davidka.crawler
```


In [1]:
import ipdb # <- трасировка и точки останова
import asyncio, random
from pathlib import Path
from types import SimpleNamespace


import header
from header import __root__
from src import gs
from src.webdriver.llm_driver.simple_driver import SimpleDriver
from src.utils.jjson import j_loads, j_loads_ns, j_dumps
from src.utils.file import read_text_file, save_text_file, get_filenames_from_directory 
from src.utils.printer import pprint as print
from src.logger.logger import logger

 response.status_code=404
2025-04-27 21:54:10,029 - INFO - Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.
2025-04-27 21:54:10,046 - INFO - ℹ️ Config Gemini: Status=active, Model=gemini-1.5-flash-latest, Key Present=True
2025-04-27 21:54:10,047 - INFO - ℹ️ Config OpenAI: Status=disabled, Model=gpt-4o, Key Present=True


## Конфигурация

In [2]:
class Config(SimpleNamespace):
    """Static settings for the crawler sandbox.

    Every attribute is class-level and is evaluated once, while the class
    body executes: the directory listing is taken and both instruction
    files are read from disk at definition time.
    """

    # Root directory of this sandbox experiment.
    ENDPOINT: Path = __root__ / 'SANDBOX' / 'davidka'
    # Directory with the mined data files.
    mining_data_path: Path = ENDPOINT / 'random_urls'
    # Result of listing `mining_data_path` with the 'json' pattern.
    crawl_files_list: list = get_filenames_from_directory(mining_data_path, 'json')
    # Instruction text; `<product category name>` and `<num of links>` are
    # substituted into it by get_products_by_category().
    generate_product_links_instruction: str = (
        ENDPOINT / 'instructions' / 'generate_product_links.md'
    ).read_text(encoding='utf-8')
    # Instruction text; `<URL>` is substituted into it by grab_product_page().
    grab_product_page_instruction: str = (
        ENDPOINT / 'instructions' / 'grab_product_page.md'
    ).read_text(encoding='utf-8')


# Module-level SimpleDriver instance, created once when this cell runs;
# the async task functions in this file call `driver.simple_process_task_async`.
driver:SimpleDriver = SimpleDriver()

2025-04-27 21:54:19,379 - INFO - ℹ️ Инициализация Gemini: Model=gemini-1.5-flash-latest
2025-04-27 21:54:19,379 - DEBUG - 🐛 Установлена переменная окружения GOOGLE_API_KEY.
2025-04-27 21:54:19,395 - INFO - ℹ️ Gemini LLM инициализирован.
2025-04-27 21:54:22,084 - INFO - ℹ️ Добавлен инструмент WebSearchAPI (SerpApi).
2025-04-27 21:54:22,100 - INFO - ℹ️ Итоговый список инструментов: ['WebSearchAPI']


## Функции

In [3]:
def get_products_urls_list_from_files(crawl_files_list:list = []) -> list:
    """Collect product URLs from the mined files and return them shuffled.

    Each file is loaded with `j_loads`; the `product_url` of every entry
    under its `products` key is appended to one flat list, which is
    shuffled in place before it is returned.

    Args:
        crawl_files_list: File names, resolved against
            `Config.mining_data_path`. When empty, `Config.crawl_files_list`
            is used instead. The default list is only read, never mutated.

    Returns:
        list: Product URLs in random order; empty if nothing could be read.
    """
    products_urls_list: list = []
    for filename in crawl_files_list or Config.crawl_files_list:
        try:
            crawl_data = j_loads(Config.mining_data_path / filename)['products']
            # Append one by one (not extend + comprehension) so that URLs read
            # before a malformed entry in the same file are still kept.
            for product in crawl_data:
                products_urls_list.append(product['product_url'])
        except Exception as ex:
            # Best effort: a broken file is logged and the remaining files are
            # still processed. The message previously contained a literal
            # '/n' where a newline escape was intended.
            logger.error(f'Ошибка при обработке файла\n {filename=}\n', ex)
    random.shuffle(products_urls_list)
    return products_urls_list

def yield_product_urls_from_files(directory: Path = Config.mining_data_path, pattern: str = 'json'):
    """Lazily yield product URLs, one at a time, from files in a directory.

    Generator counterpart of the list-building loader, intended for large
    data sets: files are loaded one after another and no combined list is
    built.

    Args:
        directory: Directory to scan. The default is bound to
            `Config.mining_data_path` when the function is defined.
        pattern: Pattern passed to `get_filenames_from_directory`.

    Yields:
        The `product_url` value of each entry under a file's `products` key.
    """
    for filename in get_filenames_from_directory(directory, pattern):
        try:
            entries = j_loads(directory / filename)['products']
            for entry in entries:
                yield entry['product_url']
        except Exception as ex:
            # A failing file is logged; iteration moves on to the next file.
            logger.error(f'Ошибка при обработке файла {filename=}', ex)
            
def get_categories(crawl_files_list:list = []) -> list:
    """Return every distinct category found in the mined files, shuffled.

    For each entry under a file's `products` key, the values of
    `parent_category` and `category_name` are collected when present.
    Duplicates are removed and the result is shuffled in place.

    Args:
        crawl_files_list: File names, resolved against
            `Config.mining_data_path`. When empty, `Config.crawl_files_list`
            is used instead. The default list is only read, never mutated.

    Returns:
        list: Unique category values in random order.
    """
    categories_list: list = []
    for filename in crawl_files_list or Config.crawl_files_list:
        try:
            crawl_data = j_loads(Config.mining_data_path / filename)['products']
            for product in crawl_data:
                for key in ('parent_category', 'category_name'):
                    try:
                        categories_list.append(product[key])
                    except (KeyError, TypeError):
                        # The key is optional, and an entry may not be a
                        # mapping at all; skip it rather than drop the file.
                        continue
        except Exception as ex:
            # Log and move on to the next file. Previously the function
            # returned here, handing back a partial list that was neither
            # de-duplicated nor shuffled, and the message contained a
            # literal '/n' instead of a newline escape.
            logger.error(f'Ошибка при обработке файла\n {filename=}\n', ex)
    categories_list = list(set(categories_list))
    random.shuffle(categories_list)
    return categories_list

### Для готового списка URL товаров

In [4]:
# Через генератор для совсем больших данных
# for product_url in yield_product_urls_from_files():

async def grab_product_page():
    """Run the page-grabbing instruction for every collected product URL.

    URLs come from `get_products_urls_list_from_files()`. For each one the
    `<URL>` placeholder in `Config.grab_product_page_instruction` is replaced
    with the URL and the resulting task is awaited on the module-level
    `driver`. The result is printed; nothing is returned.

    A failure on one URL is logged and the loop continues with the next.
    """
    product_urls = get_products_urls_list_from_files()
    for product_url in product_urls:
        try:
            logger.info(f'Обработка URL: {product_url}')
            template = Config.grab_product_page_instruction
            task = template.replace('<URL>', product_url)

            extracted_data = await driver.simple_process_task_async(task)
            # Interactive breakpoint: execution pauses here after each page.
            ipdb.set_trace()
            print(f'\n\n{extracted_data=}')
        except Exception as ex:
            logger.error(f'Ошибка при обработке {product_url=}', ex)

# await grab_product_page()

### Для поиска по категории

In [None]:
async def get_products_by_category(category:str, num_of_links:str = 'две', instruction:str = ''):
    """Ask the driver for product links in a category and print the result.

    Args:
        category: Category name substituted for `<product category name>`.
        num_of_links: Text substituted for `<num of links>`.
        instruction: Ready-made task text. When non-empty it is sent as is
            and neither placeholder substitution is performed.

    Any exception is logged; the function returns nothing.
    """
    try:
        logger.info(f'Обработка {category=}')
        if instruction:
            task = instruction
        else:
            template = Config.generate_product_links_instruction
            task = template.replace('<product category name>', category)
            task = task.replace('<num of links>', num_of_links)
        extracted_data = await driver.simple_process_task_async(task)
        print(extracted_data)
    except Exception as ex:
        logger.error(f'Ошибка при обработке {category=}', ex)


# for category in get_categories():
#     instruction = Path(Config.ENDPOINT/ 'instructions'/ 'generate_product_links.md').read_text(encoding='utf-8')
#     await get_products_by_category(category, instruction = instruction)
#     break