In [2]:
import json
import logging
import random
import re
import time
from collections import Counter
from urllib.parse import quote_plus

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# Make sure the NLTK resources used below are installed, downloading them on
# first use. nltk.data.find() signals a missing resource with LookupError, so
# that is the exception that has to trigger the download.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Site root; search-result URLs and relative vacancy links are built from it.
BASE_URL_HH = "https://hh.ru"
# Search text, sent URL-encoded as the `text` query parameter.
TARGET_JOB_SEARCH_QUERY = "Python разработчик"
# Sent as the `area` query parameter.
# NOTE(review): presumably hh.ru's region id for Russia — confirm.
TARGET_AREA_ID = "113"
# Requested number of postings; the script entry point caps it with
# MAX_SCRAPE_LIMIT via min().
NUM_JOBS_TO_SCRAPE = 1000
MAX_SCRAPE_LIMIT = 1050
# Upper bound on the number of search-result pages the scraper will request.
MAX_PAGES_TO_CHECK = 100

# Lower-case skill keywords (English and Russian) that
# process_job_data_for_json() counts in the lower-cased description and
# skill-tag text to build the "top_skills" ranking.
# A few entries appear twice ('sql', 'linux', 'kubernetes'); the list is only
# used for matching, so the repeats do not change the counts.
SKILL_KEYWORDS_RU = [
    'python', 'java', 'c++', 'c#', 'javascript', 'sql', 'nosql', 'mongodb', 'react', 'angular', 'vue', 'node.js',
    'django', 'flask', 'spring', 'api', 'rest', 'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'git', 'linux',
    'machine learning', 'deep learning', 'data analysis', 'data science', 'pandas', 'numpy', 'scikit-learn',
    'tensorflow', 'pytorch', 'nlp', 'computer vision', 'big data', 'hadoop', 'spark', 'kafka', 'rabbitmq',
    'html', 'css', 'typescript', 'php', 'laravel', 'symfony', 'ruby', 'swift', 'kotlin', 'ios', 'android',
    'sql', 'postgresql', 'mysql', 'oracle', 'ms sql', 'redis', 'memcached', 'clickhouse',
    'agile', 'scrum', 'jira', 'английский', 'коммуникабельность', 'ответственность', 'команда', 'аналитический',
    'системное мышление', 'разработка', 'тестирование', 'автоматизация', 'алгоритмы', 'структуры данных',
    'микросервисы', 'devops', 'ci/cd', 'сети', 'системный администратор', 'базы данных', 'проектирование',
    'управление проектами', 'product management', 'ux', 'ui', 'дизайн', 'аналитика', 'отчетность', 'bi',
    '1с', 'битрикс', 'ооп', 'asyncio', 'fastapi', 'celery', 'airflow', 'etl', 'data warehouse', 'dwh',
    'высшее образование', 'техническое образование', 'математика', 'статистика', 'linux', 'bash', 'unix',
    'go', 'golang', 'scala', 'rust', 'perl', 'delphi', 'objective-c', 'swiftui', 'kotlin multiplatform',
    'kubernetes', 'openshift', 'terraform', 'ansible', 'jenkins', 'gitlab ci', 'teamcity', 'prometheus', 'grafana',
    'zabbix', 'elk', 'http', 'tcp/ip', 'dns', 'vpn', 'сетевое администрирование', 'информационная безопасность',
    'рефакторинг', 'архитектура по', 'опыт работы', 'управление командой'
]

# Union of NLTK's Russian and English stop-word lists (despite the _RU suffix);
# tokens in this set are dropped before skills and terms are counted.
STOP_WORDS_RU = set(stopwords.words('russian')) | set(stopwords.words('english'))
# Generic job-ad vocabulary that would otherwise dominate the "top_terms"
# ranking. It is applied to the general term counts only, not to skill
# matching, which is why words such as 'разработка' and 'команда' can appear
# both here and in SKILL_KEYWORDS_RU.
COMMON_NOISE_RU = {
    'работа', 'вакансия', 'компания', 'требования', 'обязанности', 'условия', 'опыт', 'разработка',
    'сотрудник', 'проект', 'задачи', 'навыки', 'уровень', 'москва', 'россия', 'ищем', 'специалист',
    'команда', 'развитие', 'заработная', 'плата', 'оформление', 'тк', 'рф', 'график', 'полный', 'день',
    'офис', 'возможность', 'рост', 'дружный', 'коллектив', 'приглашаем', 'присоединиться', 'нашей',
    'нашу', 'вас', 'мы', 'год', 'лет', 'также', 'наличие', 'знание', 'умение', 'работы', 'задача',
    'понимание', 'принципы', 'процесс', 'желание', 'участие', 'поддержка', 'обеспечение', 'создание',
    'анализ', 'контроль', 'оптимизация', 'будет', 'плюсом', 'необходимо', 'от', 'до', 'руб', 'это',
    'или', 'навык', 'владение'
}

def get_soup_with_retries(url, retries=3, delay=5):
    """Fetch *url* and return the page parsed with BeautifulSoup.

    The request is sent with browser-like headers and a 30 second timeout.
    Failed attempts are retried with a linearly growing pause
    (``delay``, ``2 * delay``, ...) between them.

    Args:
        url: Absolute URL to download.
        retries: Maximum number of attempts.
        delay: Base pause in seconds between attempts.

    Returns:
        A ``BeautifulSoup`` object for an HTTP 200 response, or ``None`` when
        every attempt failed. The reason for giving up is logged as a warning.
    """
    # No explicit 'Accept-Encoding' here on purpose: requests advertises only
    # the encodings it can actually decode. Hard-coding 'br' would yield an
    # undecoded body whenever no brotli decoder is installed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Sec-Ch-Ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
    }
    last_error = None
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            last_error = err
            status = err.response.status_code if err.response is not None else None
            # A missing or removed page will not come back; skip the back-off.
            if status in (404, 410):
                break
        except requests.exceptions.RequestException as err:
            last_error = err
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            # Other non-error statuses (e.g. 204) carry no page to parse.
            last_error = f"unexpected HTTP status {response.status_code}"
        if attempt < retries - 1:
            time.sleep(delay * (attempt + 1))

    logging.getLogger(__name__).warning("Could not fetch %s: %s", url, last_error)
    return None

def extract_job_links_hh(list_page_soup):
    """Return the unique vacancy URLs found on one search-results page.

    The results container and the vacancy cards are each looked up with a
    primary selector and a fallback selector. Query strings are stripped from
    every link, relative links are prefixed with ``BASE_URL_HH``, and only
    links containing ``/vacancy/`` are kept. Order of first appearance is
    preserved.

    Args:
        list_page_soup: Parsed search-results page.

    Returns:
        List of absolute vacancy URLs (empty when no container is found).
    """
    container = (
        list_page_soup.find('div', {'data-qa': 'vacancy-serp__results'})
        or list_page_soup.find('div', {'id': 'a11y-main-content'})
    )
    if not container:
        return []

    cards = (
        container.select('div[data-qa^="vacancy-serp__vacancy"]')
        or container.select('div:has(> h2 a[data-qa="serp-item__title"])')
    )

    # A dict keeps first-seen order while discarding repeated links.
    unique_links = {}
    for card in cards:
        anchor = (
            card.select_one('a[data-qa="serp-item__title"]')
            or card.select_one('a.serp-item__title')
        )
        if not anchor or not anchor.get('href'):
            continue

        # Drop the query string before de-duplicating.
        path = anchor['href'].split('?')[0]
        if '/vacancy/' not in path:
            continue

        absolute = path if path.startswith('http') else BASE_URL_HH + path
        unique_links.setdefault(absolute, None)

    return list(unique_links)

def extract_single_job_details_hh(job_page_soup, job_url):
    """Pull the fields of interest out of one parsed vacancy page.

    Every field falls back to the placeholder ``"N/A"`` when its element is
    not present on the page.

    Args:
        job_page_soup: Parsed vacancy page.
        job_url: URL the page was fetched from; stored under ``'link'``.

    Returns:
        Dict with the keys ``title``, ``salary_info``, ``company_name``,
        ``location``, ``experience``, ``employment_mode``, ``skills_text``
        (comma-separated skill tags) and ``description``, plus ``link``.
    """
    missing = "N/A"

    def qa_text(tag_name, qa_value):
        # Stripped text of the first <tag_name data-qa=qa_value>, else "N/A".
        node = job_page_soup.find(tag_name, {'data-qa': qa_value})
        return node.get_text(strip=True) if node else missing

    def class_has(fragment):
        # Predicate for class_=...: true when a class value contains `fragment`.
        return lambda css: css and fragment in css

    title = qa_text('h1', 'vacancy-title')
    company_name = qa_text('a', 'vacancy-company-name')
    experience = qa_text('span', 'vacancy-experience')
    employment_mode = qa_text('p', 'vacancy-view-employment-mode')

    # Salary: prefer the dedicated compensation span, else the whole block.
    salary_info = missing
    salary_block = job_page_soup.find('div', {'data-qa': 'vacancy-salary'})
    if salary_block:
        amount_span = salary_block.find('span', class_=class_has('compensation-text'))
        salary_source = amount_span or salary_block
        # U+202F (narrow no-break space) is normalised to a plain space.
        salary_info = salary_source.get_text(separator=' ', strip=True).replace('\u202f', ' ')

    # Location: the first of these candidates that exists on the page wins.
    location = missing
    location_candidates = (
        ('span', 'vacancy-view-location'),
        ('p', 'vacancy-view-location'),
        ('div', 'vacancy-address'),
    )
    for tag_name, qa_value in location_candidates:
        node = job_page_soup.find(tag_name, {'data-qa': qa_value})
        if node:
            location = node.get_text(strip=True)
            break

    # Description: data-qa lookup first, then a class-based fallback that
    # prefers an inner "content" div when there is one.
    description_node = job_page_soup.find('div', {'data-qa': 'vacancy-description'})
    if not description_node:
        outer = job_page_soup.find('div', class_=class_has('vacancy-description'))
        if outer:
            description_node = outer.find('div', class_=class_has('content')) or outer
    if description_node:
        description = description_node.get_text(separator='\n', strip=True)
    else:
        description = missing

    # Skill tags: match by data-qa first, by CSS class if that finds nothing.
    skills_list = []
    tag_list = job_page_soup.find('div', class_=class_has('bloko-tag-list'))
    if tag_list:
        def tagged_by_data_qa(tag):
            return (tag.name in ['span', 'div'] and tag.has_attr('data-qa')
                    and 'bloko-tag__text' in tag['data-qa'])

        def tagged_by_class(tag):
            return tag.name in ['span', 'div'] and 'bloko-tag__text' in tag.get('class', [])

        skill_nodes = tag_list.find_all(tagged_by_data_qa) or tag_list.find_all(tagged_by_class)
        skills_list = [node.get_text(strip=True) for node in skill_nodes]

    return {
        'title': title,
        'salary_info': salary_info,
        'company_name': company_name,
        'location': location,
        'experience': experience,
        'employment_mode': employment_mode,
        'skills_text': ", ".join(skills_list) if skills_list else missing,
        'description': description,
        'link': job_url
    }

def scrape_hh_jobs_data(search_query, area_id, num_postings_to_scrape=1000):
    """Scrape vacancy postings for a search query and return them as a table.

    Search-result pages are requested one after another (at most
    ``MAX_PAGES_TO_CHECK`` of them). Every vacancy link not seen before is
    fetched and parsed, with randomised pauses between requests. Scraping
    stops when enough postings were collected, a result page cannot be
    fetched, a page has no links, or a page contains no new links.

    Args:
        search_query: Free-text query; URL-encoded into the ``text`` parameter.
        area_id: Value of the ``area`` query parameter.
        num_postings_to_scrape: Number of postings to collect at most.

    Returns:
        ``pandas.DataFrame`` with one row per successfully parsed posting
        (empty when nothing was collected).
    """
    log = logging.getLogger(__name__)
    all_jobs_data = []
    scraped_job_links = set()
    # Only the page number changes between requests, so encode the query once.
    search_url = f"{BASE_URL_HH}/search/vacancy?text={quote_plus(search_query)}&area={area_id}"

    pbar = tqdm(total=num_postings_to_scrape, desc="Scraping jobs", unit="job")
    try:
        for page_num in range(MAX_PAGES_TO_CHECK):
            if len(all_jobs_data) >= num_postings_to_scrape:
                break

            list_page_soup = get_soup_with_retries(f"{search_url}&page={page_num}")
            if not list_page_soup:
                log.warning("Stopping: result page %d could not be fetched", page_num)
                break

            job_links_on_page = extract_job_links_hh(list_page_soup)
            if not job_links_on_page:
                if page_num == 0:
                    print("No job links found on first page. Check query/area/selectors.")
                break

            new_links = [link for link in job_links_on_page if link not in scraped_job_links]
            if not new_links:
                # A page made up solely of already-seen links means the
                # listing has started repeating itself.
                break
            scraped_job_links.update(new_links)

            for job_link in new_links:
                if len(all_jobs_data) >= num_postings_to_scrape:
                    break

                # Randomised pause between vacancy requests.
                time.sleep(random.uniform(0.8, 1.8))

                single_job_soup = get_soup_with_retries(job_link)
                if not single_job_soup:
                    continue
                try:
                    job_details = extract_single_job_details_hh(single_job_soup, job_link)
                except Exception:
                    # Best effort: one malformed page must not abort the run,
                    # but leave a trace instead of dropping it silently.
                    log.warning("Skipping %s: vacancy page could not be parsed",
                                job_link, exc_info=True)
                    continue
                all_jobs_data.append(job_details)
                pbar.update(1)

            if len(all_jobs_data) < num_postings_to_scrape:
                # Pause before moving on to the next result page.
                time.sleep(random.uniform(1.0, 2.5))
    finally:
        # Close the bar even if the run is interrupted or fails part-way.
        pbar.close()

    return pd.DataFrame(all_jobs_data)

def _count_skill_mentions(text, keywords, stop_words):
    """Count standalone occurrences of each keyword in *text*.

    Keywords containing a non-word character (space, '.', '+', '#', '/', '-')
    are matched by regex, longest first, and removed from the text so their
    parts are not counted again. The remaining plain keywords are matched
    against the tokens of what is left.
    """
    counts = Counter()
    # Replace punctuation with a space (deleting it would fuse neighbouring
    # words), keeping only the characters that occur inside keywords.
    text = re.sub(r'[^\w\s.+#/-]', ' ', text.lower())

    unique_keywords = list(dict.fromkeys(keywords))
    compound = sorted(
        (kw for kw in unique_keywords if re.search(r'\W', kw)),
        key=len,
        reverse=True,
    )
    for keyword in compound:
        # Lookarounds instead of \b: \b cannot match after '+' or '#' when
        # whitespace follows, so 'c++' and 'c#' would never be found.
        # Words inside a keyword may be separated by any whitespace run.
        body = r'\s+'.join(re.escape(part) for part in keyword.split())
        pattern = r'(?<!\w)' + body + r'(?!\w)'
        text, hits = re.subn(pattern, ' ', text, flags=re.IGNORECASE)
        if hits:
            counts[keyword] += hits

    simple = set(unique_keywords) - set(compound)
    tokens = word_tokenize(re.sub(r'[^\w\s]', ' ', text))
    counts.update(
        token for token in tokens
        if len(token) > 1 and token in simple and token not in stop_words
    )
    return counts


def _classify_salary(salary_text):
    """Map a raw salary string to one of the salary-type category labels.

    A lower/upper bound is recognised only when 'от'/'до' stands alone and is
    followed by a digit, so the same letters inside another word, or 'до' in
    a phrase such as 'до вычета налогов', are not mistaken for a bound.
    """
    lowered = str(salary_text).lower()
    if 'не указана' in lowered or salary_text == 'N/A':
        return 'Не указана'
    has_lower_bound = re.search(r'(?<!\w)от\s*\d', lowered) is not None
    has_upper_bound = re.search(r'(?<!\w)до\s*\d', lowered) is not None
    if has_lower_bound and has_upper_bound:
        return 'От/До (Диапазон)'
    if has_lower_bound:
        return 'От (Минимум)'
    if has_upper_bound:
        return 'До (Максимум)'
    return 'Другое/Фиксированная'


def process_job_data_for_json(df):
    """Aggregate scraped postings into the structure written to the JSON file.

    Args:
        df: DataFrame of postings. The columns ``description``,
            ``skills_text``, ``experience`` and ``salary_info`` are each
            optional; a missing column leaves its section empty. ``df`` is
            not modified.

    Returns:
        Dict with four lists of ``{"name": str, "value": int}`` entries:
        ``top_skills`` (20 most frequent skill keywords), ``top_terms``
        (20 most frequent other words in the descriptions),
        ``experience_levels`` and ``salary_type_distribution``.
    """
    if df.empty:
        return {"top_skills": [], "top_terms": [], "experience_levels": [], "salary_type_distribution": []}

    # Skills are searched in the descriptions and in the skill tags.
    text_parts = []
    for column in ('description', 'skills_text'):
        if column in df.columns:
            text_parts.extend(df[column].dropna().astype(str).tolist())
    skill_counts = Counter()
    if text_parts:
        skill_counts = _count_skill_mentions(' '.join(text_parts), SKILL_KEYWORDS_RU, STOP_WORDS_RU)

    # General vocabulary comes from the descriptions only.
    term_counts = Counter()
    if 'description' in df.columns:
        descriptions = ' '.join(df['description'].dropna().astype(str)).lower()
        tokens = word_tokenize(re.sub(r'[^\w\s]', ' ', descriptions))
        term_counts.update(
            token for token in tokens
            if len(token) > 2
            and token not in STOP_WORDS_RU
            and token not in COMMON_NOISE_RU
        )

    experience_levels_list = []
    if 'experience' in df.columns:
        exp_counts = df['experience'].fillna('Не указан').value_counts()
        experience_levels_list = [{"name": str(name), "value": int(value)} for name, value in exp_counts.items()]

    salary_types_list = []
    if 'salary_info' in df.columns:
        # Classify into a temporary Series; the caller's frame gets no column.
        salary_counts = df['salary_info'].apply(_classify_salary).value_counts()
        salary_types_list = [{"name": str(name), "value": int(value)} for name, value in salary_counts.items()]

    return {
        "top_skills": [{"name": str(skill), "value": int(count)} for skill, count in skill_counts.most_common(20)],
        "top_terms": [{"name": str(term), "value": int(count)} for term, count in term_counts.most_common(20)],
        "experience_levels": experience_levels_list,
        "salary_type_distribution": salary_types_list
    }

if __name__ == "__main__":
    # Script entry point: scrape, aggregate, write data.json, print a summary.
    # The requested amount is capped by MAX_SCRAPE_LIMIT.
    actual_num_to_scrape = min(NUM_JOBS_TO_SCRAPE, MAX_SCRAPE_LIMIT)
    scraped_df = scrape_hh_jobs_data(
        TARGET_JOB_SEARCH_QUERY,
        TARGET_AREA_ID,
        num_postings_to_scrape=actual_num_to_scrape,
    )

    if scraped_df.empty:
        print("No job postings were scraped.")
    else:
        # The aggregation step receives a copy of the scraped frame.
        dashboard_data = process_job_data_for_json(scraped_df.copy())

        output_json_filename = "data.json"
        with open(output_json_filename, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Cyrillic labels readable in the file.
            f.write(json.dumps(dashboard_data, indent=2, ensure_ascii=False))

        print(f"\nScraped {len(scraped_df)} job postings successfully.")
        print(f"Dashboard data saved to {output_json_filename}")

        print("\nSummary of processed data:")
        print(f"  Top skills found: {len(dashboard_data.get('top_skills', []))}")
        print(f"  Top terms found: {len(dashboard_data.get('top_terms', []))}")
        print(f"  Experience level categories: {len(dashboard_data.get('experience_levels', []))}")
        print(f"  Salary type categories: {len(dashboard_data.get('salary_type_distribution', []))}")

    print("\nScript finished.")

  from .autonotebook import tqdm as notebook_tqdm
Scraping jobs:  79%|███████▊  | 787/1000 [33:20<09:01,  2.54s/job]



Scraped 787 job postings successfully.
Dashboard data saved to data.json

Summary of processed data:
  Top skills found: 20
  Top terms found: 20
  Experience level categories: 4
  Salary type categories: 5

Script finished.


### hh.ru limits scraping: I tried four times with different approaches, and the result was always 787 postings.