In [None]:
import json

import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from exceptions import NoItemsError, AttributeExtractionError

import random

In [7]:
# Read the User-Agent pool from the JSON config that sits next to the notebook.
with open('agents.json', mode='r', encoding='utf-8') as file:
    agents = json.loads(file.read())

# Last expression of the cell: show the loaded User-Agent strings.
agents['USER_AGENTS']

['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36']

In [None]:
# NOTE: this binds the entire USER_AGENTS list (several strings), not a single
# User-Agent; one entry has to be chosen from it before it is used as a header value.
user_agent = agents['USER_AGENTS']

def scrape_site_for_links(url, item_selector):
    '''
    Open ``url`` in headless Chrome and collect the ``href`` of the first
    ``<a>`` found inside every element matching ``item_selector``.

    Args:
        url (str): page to load.
        item_selector (str): CSS selector of the blocks that wrap a link.

    Returns:
        list[str]: one href per matched block, in the order Selenium
            returned the blocks.

    Raises:
        NoItemsError: nothing on the page matches ``item_selector``.
        AttributeExtractionError: a matched block's ``<a>`` has no ``href``.
        Exception: any Selenium error (page load failure, block without an
            ``<a>``) propagates unchanged.
    '''
    # The module-level ``user_agent`` may hold the whole USER_AGENTS list.
    # Interpolating a list into the argument would send its repr as the
    # header, so pick exactly one entry per call; a plain string is used as-is.
    if isinstance(user_agent, (list, tuple)):
        chosen_agent = random.choice(user_agent) if user_agent else None
    else:
        chosen_agent = user_agent

    options = Options()
    options.add_argument("--headless")
    if chosen_agent:
        options.add_argument(f"user-agent={chosen_agent}")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=options)

    # A single ``finally`` closes the browser exactly once on every path
    # (success, domain errors, Selenium errors).
    try:
        driver.get(url)
        # Randomised pause: lets the page render and avoids a fixed cadence.
        time.sleep(random.uniform(1, 2))

        items = driver.find_elements(By.CSS_SELECTOR, item_selector)
        if not items:
            raise NoItemsError

        link_list = []
        for element in items:
            href = element.find_element(By.TAG_NAME, "a").get_attribute("href")
            if not href:
                raise AttributeExtractionError
            link_list.append(href)
        return link_list
    finally:
        driver.quit()

In [None]:
def scrape_and_save(urls, item_selector, output_file="somename.txt", render_delay=2):
    """
    Open each URL in Selenium, find blocks by CSS selector, take the href of
    the first <a> inside each block and write the hrefs to a file, one per line.

    Pages are processed best-effort: a page that raises or has no matching
    blocks is reported on stdout and skipped; blocks without a usable link
    are skipped silently.

    Args:
        urls (list[str]): pages that list vacancies
        item_selector (str): CSS selector of a vacancy block (e.g. "div.vacancies-section__item")
        output_file (str): name of the file to save to (opened in "w" mode, so it is overwritten)
        render_delay (float): seconds to wait after loading each page before querying it
    """
    # --- User-Agent stub (static) ---
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    )

    options = Options()
    options.add_argument(f"user-agent={user_agent}")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=options)

    # try/finally: the browser must be closed even when the output file
    # cannot be opened or a write fails, otherwise the Chrome process leaks.
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            for url in urls:
                try:
                    driver.get(url)
                    time.sleep(render_delay)  # give the page time to render

                    items = driver.find_elements(By.CSS_SELECTOR, item_selector)
                    if not items:
                        print(f"[WARN] нет элементов на {url}")
                        continue

                    saved = 0
                    for el in items:
                        # find_elements returns an empty list instead of raising,
                        # so one block without a link does not abort the whole page.
                        anchors = el.find_elements(By.TAG_NAME, "a")
                        href = anchors[0].get_attribute("href") if anchors else None
                        if href:
                            f.write(f"{href}\n")
                            saved += 1

                    # Report the number of links actually written, not the number of blocks.
                    print(f"[OK] {url} -> {saved} ссылок")

                except Exception as e:
                    print(f"[ERR] {url} -> {e}")
    finally:
        driver.quit()

    print(f"Результат сохранён в {output_file}")


# ==== example invocation ====
if __name__ == "__main__":
    # Single listing page; the scraped links go to the default output file.
    urls = ["https://career.avito.com/vacancies/data-science/"]
    scrape_and_save(
        urls,
        item_selector="div.vacancies-section__item",
    )


[OK] https://career.avito.com/vacancies/data-science/ -> 17 ссылок
Результат сохранён в somename.txt
