In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import random
from typing import List, Dict, Optional

In [2]:
# Site root; used to turn vacancy hrefs into absolute URLs.
BASE_URL = "https://hh.ru"
# Search query the crawl starts from; page and area parameters are appended per request.
SEARCH_URL = "https://hh.ru/search/vacancy?text=Программист&professional_role=96&hhtmFrom&"
# Browser-like headers sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
MAX_PAGES = 40 # Maximum number of search result pages to scrape per area
# Pause in seconds between requests, to avoid being blocked.
# NOTE: random.uniform() is evaluated once, here, so every sleep in this
# session uses the same value rather than a fresh random delay per request.
DELAY = random.uniform(1, 3)

In [3]:
def get_page(url: str, timeout: float = 10.0) -> Optional[BeautifulSoup]:
    """
    Fetch a webpage and return its BeautifulSoup object.

    Args:
        url: URL of the page to fetch
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        BeautifulSoup object if successful, None otherwise (the error is
        printed, including timeouts and non-2xx responses)
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they take the same path as network errors
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {url}: {e}")
        return None

In [4]:
def extract_vacancy_links_from_page(soup: BeautifulSoup) -> List[str]:
    """
    Collect the vacancy URLs listed on one search results page.

    Args:
        soup: BeautifulSoup object of the search results page

    Returns:
        List of absolute vacancy URLs with their query strings removed,
        in page order
    """
    def is_magritte_link(css_class):
        # The class name carries a generated suffix, so match on the stable prefix only
        return css_class and 'magritte-link___' in css_class

    vacancy_urls = []

    for card in soup.find_all('div', class_='vacancy-info--ieHKDTkezpEj0Gsx'):
        anchor = card.find('a', {
            'data-qa': 'serp-item__title',
            'class': is_magritte_link
        })
        if not anchor:
            continue

        href = anchor.get('href')
        if not href:
            continue

        # Drop the query string, then resolve against the site root
        vacancy_urls.append(urljoin(BASE_URL, href.split('?')[0]))

    return vacancy_urls

In [5]:
def get_all_vacancy_links() -> List[str]:
    """
    Get all vacancy links from all search result pages.

    Walks up to MAX_PAGES result pages for each area. Pagination for an area
    stops at the first page that yields no links; a page that fails to load
    is skipped and the next page is tried.

    Returns:
        List of all vacancy URLs found, in crawl order (not de-duplicated)
    """
    all_links = []
    # Area ids for Moscow, Saint Petersburg and Kazan (presumably in that order)
    for area in [1, 2, 88]:
        print(f"Searching area {area}...")
        for page in range(MAX_PAGES):
            if page % 10 == 0:
                print(f"Processing search page {page + 1} of {MAX_PAGES}")
            # Each parameter is its own key=value pair; `area` must carry
            # only the numeric id, otherwise the area filter is not a valid value.
            page_url = f"{SEARCH_URL}&page={page}&area={area}"
            soup = get_page(page_url)

            # Respectful delay after every request, including failed ones,
            # so errors do not turn into a burst of back-to-back retries.
            time.sleep(DELAY)

            if not soup:
                continue

            page_links = extract_vacancy_links_from_page(soup)
            if not page_links:
                # An empty page means we ran past the last page of results
                break

            all_links.extend(page_links)

    return all_links

In [6]:
# Step 1: crawl the search result pages and collect vacancy URLs.
# This performs one HTTP request per search page, so it is slow.
print("\nStep 1: Collecting vacancy links from search results...")
vacancy_links = get_all_vacancy_links()


Step 1: Collecting vacancy links from search results...
Searching area 1...
Processing search page 1 of 40
Processing search page 11 of 40
Processing search page 21 of 40
Processing search page 31 of 40
Searching area 2...
Processing search page 1 of 40
Processing search page 11 of 40
Processing search page 21 of 40
Processing search page 31 of 40
Searching area 88...
Processing search page 1 of 40
Processing search page 11 of 40
Processing search page 21 of 40
Processing search page 31 of 40


In [None]:
# How many vacancy links step 1 collected.
print(len(vacancy_links))

In [None]:
# Run once to save the collected links; left commented out so a re-run does not overwrite links.txt

# with open('links.txt', 'w') as f:
#     for link in vacancy_links:
#         f.write(f"{link}\n")

In [8]:
def extract_key_skills(soup):
    """
    Pull the key-skill labels out of a parsed job posting.

    Args:
        soup: Parsed job posting page (BeautifulSoup object)

    Returns:
        list: Skill names in page order; empty if the posting has no skills list
    """
    # The skills live in a single <ul> identified by its CSS class
    skills_section = soup.find('ul', class_='vacancy-skill-list--JsTYRZ5o6dsoavK7')
    if not skills_section:
        return []

    skill_items = skills_section.find_all('li', attrs={'data-qa': 'skills-element'})

    # Look up each item's label <div> lazily; items without one are skipped
    label_divs = (
        item.find('div', class_='magritte-tag__label___YHV-o_3-1-13')
        for item in skill_items
    )
    return [label.get_text(strip=True) for label in label_divs if label]


def process_vacancy_links(vacancy_links: List[str]) -> List[List[str]]:
    """
    Process each vacancy link to extract key skills.

    Args:
        vacancy_links: List of vacancy URLs to process

    Returns:
        2D array of skills (list of lists), where each inner list contains
        skills for one vacancy. A vacancy whose page could not be fetched
        contributes an empty list. Skipped ad links contribute no entry, so
        the result can be shorter than the input.
    """
    all_skills = []

    for i, link in enumerate(vacancy_links, 1):
        if i % 10 == 0:
            print(f"Prossesing {i} link...")
        # Presumably an ad click-tracker rather than a vacancy page: with the
        # query string stripped it collapses to this bare URL, so skip it.
        if link == "https://adsrv.hh.ru/click":
            continue

        soup = get_page(link)

        # Respectful delay after every request, including failed ones,
        # so errors do not turn into a burst of back-to-back requests.
        time.sleep(DELAY)

        if not soup:
            all_skills.append([])  # Keep a placeholder for the failed vacancy
            continue

        all_skills.append(extract_key_skills(soup))

    return all_skills

In [9]:
# Number of links that step 2 will iterate over (one HTTP request per vacancy page).
print(len(vacancy_links))

2400


In [10]:
# Step 2: fetch every vacancy page and extract its key skills.
# Progress is printed every 10 links by process_vacancy_links.
print("\nStep 2: Extracting key skills from each vacancy...")
vacancy_skills = process_vacancy_links(vacancy_links)


Step 2: Extracting key skills from each vacancy...
Prossesing 10 link...
Prossesing 20 link...
Prossesing 30 link...
Prossesing 40 link...
Prossesing 50 link...
Prossesing 60 link...
Prossesing 70 link...
Prossesing 80 link...
Prossesing 90 link...
Prossesing 100 link...
Prossesing 110 link...
Prossesing 120 link...
Prossesing 130 link...
Prossesing 140 link...
Prossesing 150 link...
Prossesing 160 link...
Prossesing 170 link...
Prossesing 180 link...
Prossesing 190 link...
Prossesing 200 link...
Prossesing 210 link...
Prossesing 220 link...
Prossesing 230 link...
Prossesing 240 link...
Prossesing 250 link...
Prossesing 260 link...
Prossesing 270 link...
Prossesing 280 link...
Prossesing 290 link...
Prossesing 300 link...
Prossesing 310 link...
Prossesing 320 link...
Prossesing 330 link...
Prossesing 340 link...
Prossesing 350 link...
Prossesing 360 link...
Prossesing 370 link...
Prossesing 380 link...
Prossesing 390 link...
Prossesing 400 link...
Prossesing 410 link...
Prossesing 420

In [11]:
# Raw result: one list of skills per processed vacancy
# (empty when the page had no skills list or could not be fetched).
print(vacancy_skills)

[[], ['PHP', 'PHP 7.4-8', 'Nginx', 'MySQL', 'PostgreSQL', 'REST API'], [], ['Администрирование серверов Windows', 'Linux', 'Ремонт ПК', 'Администрирование сетевого оборудования', 'Windows 7'], ['Точность и внимательность к деталям', 'Cистемы управления базами данных', 'MS Dos', 'Компьютер', 'Знание компьютера'], ['Умение работать в команде', 'Анализ данных', 'Техническая документация', 'Навыки составления отчетности'], [], ['PHP', 'Git', 'jQuery', 'CSS3', 'HTML5', 'Ajax', '1С-Битрикс'], [], [], [], ['1C: ERP', 'Оптимизация кода', 'Написание инструкций', 'ERP-системы на базе 1С'], ['ООП', 'MS SQL', 'Visual Studio C#', 'Design Patterns', 'С#'], ['Работа в команде'], ['SQL', 'Delphi', 'C#'], ['IDE Lazarus', 'C++', 'Разработка программного обеспечения для Linux', 'Разработка программного обеспечения для Windows', 'Python', 'Английский\xa0— A2 — Элементарный'], ['C++', 'Python', 'HTML', 'MS Visual Studio', 'Luo'], [], ['SQL', 'JavaScript', 'Кроссбраузерное тестирование', 'Sanity-тестировани

In [12]:
# Keep only the vacancies that listed at least one skill
vacancy_skills = [skill_list for skill_list in vacancy_skills if skill_list]
print(vacancy_skills)
print(f"There are {len(vacancy_skills)} pages with skills needed info")

[['PHP', 'PHP 7.4-8', 'Nginx', 'MySQL', 'PostgreSQL', 'REST API'], ['Администрирование серверов Windows', 'Linux', 'Ремонт ПК', 'Администрирование сетевого оборудования', 'Windows 7'], ['Точность и внимательность к деталям', 'Cистемы управления базами данных', 'MS Dos', 'Компьютер', 'Знание компьютера'], ['Умение работать в команде', 'Анализ данных', 'Техническая документация', 'Навыки составления отчетности'], ['PHP', 'Git', 'jQuery', 'CSS3', 'HTML5', 'Ajax', '1С-Битрикс'], ['1C: ERP', 'Оптимизация кода', 'Написание инструкций', 'ERP-системы на базе 1С'], ['ООП', 'MS SQL', 'Visual Studio C#', 'Design Patterns', 'С#'], ['Работа в команде'], ['SQL', 'Delphi', 'C#'], ['IDE Lazarus', 'C++', 'Разработка программного обеспечения для Linux', 'Разработка программного обеспечения для Windows', 'Python', 'Английский\xa0— A2 — Элементарный'], ['C++', 'Python', 'HTML', 'MS Visual Studio', 'Luo'], ['SQL', 'JavaScript', 'Кроссбраузерное тестирование', 'Sanity-тестирование', 'Рефакторинг кода', 'Yii

In [None]:
# Run once to save the extracted skills; left commented out so a re-run does not overwrite skills.txt

# with open('skills.txt', 'w') as f:
#     for skills in vacancy_skills:
#         f.write(f"{skills}\n")

In [13]:
from collections import defaultdict

def count_skills(skills_lists):
    """
    Tally how many times each skill occurs across all vacancies.

    Args:
        skills_lists: Iterable of skill lists, one list per vacancy

    Returns:
        dict: Cleaned skill name -> number of occurrences, keyed in first-seen order
    """
    tally = {}

    for vacancy in skills_lists:
        for raw_name in vacancy:
            # Trim surrounding whitespace, then turn non-breaking spaces into ordinary ones
            name = raw_name.strip().replace('\xa0', ' ')
            tally[name] = tally.get(name, 0) + 1

    return tally

skill_counts = count_skills(vacancy_skills)

# Most frequent skills first; the sort is stable, so ties keep first-seen order
for skill, count in sorted(skill_counts.items(), key=lambda pair: -pair[1]):
    print(f"{skill}: {count}")

1С программирование: 423
Git: 272
1С: Предприятие 8: 226
PostgreSQL: 202
SQL: 190
PHP: 186
1С: Бухгалтерия: 178
JavaScript: 168
C++: 148
Linux: 145
1С: Управление Торговлей: 133
MySQL: 128
Обновление конфигурации 1С: 127
Python: 126
HTML: 105
1С: Зарплата и управление персоналом: 101
1С: Документооборот: 97
Docker: 95
Java: 93
ERP-системы на базе 1С: 88
TypeScript: 80
REST API: 71
CSS: 71
ООП: 70
1С: Управление Производственным Предприятием: 69
Автоматизация процессов: 67
MS SQL: 66
Разработка ПО: 63
React: 63
1C: ERP: 61
C#: 58
АСУ ТП: 58
C/C++: 58
1С: Предприятие: 57
Создание конфигурации 1С: 56
1С: Зарплата и кадры: 54
Английский — B1 — Средний: 54
1С: 53
1С-Битрикс: 52
Работа в команде: 52
1С: Комплексная автоматизация: 51
Laravel: 51
Английский язык: 51
Оптимизация кода: 50
Redis: 49
RabbitMQ: 49
Qt: 49
Golang: 48
CSS3: 47
Node.js: 47
HTML5: 46
Работа с базами данных: 42
jQuery: 41
SCADA: 40
Английский — B2 — Средне-продвинутый: 38
VueJS: 36
Разработка технических заданий: 36
REST