In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import random
from typing import List, Dict, Optional

In [2]:
BASE_URL = "https://hh.ru"
SEARCH_URL = "https://hh.ru/search/vacancy?text=Программист&professional_role=96&hhtmFrom&"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
MAX_PAGES = 40 # Number of search result pages to scrape
DELAY = random.uniform(1, 3)  # Delay between requests to avoid being blocked

In [3]:
def get_page(url: str) -> Optional[BeautifulSoup]:
    """
    Fetch a webpage and return its BeautifulSoup object.
    
    Args:
        url: URL of the page to fetch
        
    Returns:
        BeautifulSoup object if successful, None otherwise
    """
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {url}: {e}")
        return None

In [4]:
def extract_vacancy_links_from_page(soup: BeautifulSoup) -> List[str]:
    """
    Extract vacancy links from a single search results page.
    
    Args:
        soup: BeautifulSoup object of the search results page
        
    Returns:
        List of vacancy URLs
    """
    links = []
    vacancy_blocks = soup.find_all('div', class_='vacancy-info--ieHKDTkezpEj0Gsx')
    
    for block in vacancy_blocks:
        link_tag = block.find('a', {
            'data-qa': 'serp-item__title',
            'class': lambda x: x and 'magritte-link___' in x
        })
        if link_tag and link_tag.get('href'):
            # Ensure the URL is absolute
            absolute_url = urljoin(BASE_URL, link_tag['href'].split('?')[0])
            links.append(absolute_url)
    
    return links

In [5]:
def get_all_vacancy_links() -> List[str]:
    """
    Get all vacancy links from all search result pages.
    
    Returns:
        List of all vacancy URLs found
    """
    all_links = []
    # Search from Moscow, St-Petesburg and Kazan
    for area in [1, 2, 88]:
        print(f"Searching area {area}...")
        for page in range(MAX_PAGES):
            if page%10 == 0:
                print(f"Processing search page {page + 1} of {MAX_PAGES}")
            page_url = f"{SEARCH_URL}&page={page}&area={area}=vacancy_search_list"
            soup = get_page(page_url)
            
            if not soup:
                # print(f"Failed to process page {page + 1}, skipping...")
                continue
            
            page_links = extract_vacancy_links_from_page(soup)
            if not page_links:
                # print(f"No links found on page {page + 1}, stopping pagination.")
                break
                
            all_links.extend(page_links)
            # print(f"Found {len(page_links)} links on this page (total: {len(all_links)})")
            
            # Respectful delay between requests
            time.sleep(DELAY)
    
    return all_links

In [6]:
print("\nStep 1: Collecting vacancy links from search results...")
vacancy_links = get_all_vacancy_links()


Step 1: Collecting vacancy links from search results...
Searching area 1...
Processing search page 1 of 40
Processing search page 11 of 40
Processing search page 21 of 40
Processing search page 31 of 40
Searching area 2...
Processing search page 1 of 40
Processing search page 11 of 40
Processing search page 21 of 40
Processing search page 31 of 40
Searching area 88...
Processing search page 1 of 40
Processing search page 11 of 40
Processing search page 21 of 40
Processing search page 31 of 40


In [None]:
print(len(vacancy_links))

In [None]:
# Should be written only once

# with open('links.txt', 'w') as f:
#     for link in vacancy_links:
#         f.write(f"{link}\n")

In [8]:
def extract_key_skills(soup):
    """
    Extracts the list of key skills from the HTML content of a job posting.
    
    Args:
        html_content (str): HTML content of the job posting
        
    Returns:
        list: List of key skills found in the posting
    """
    # soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the skills section by looking for the unordered list with the specific class
    skills_section = soup.find('ul', class_='vacancy-skill-list--JsTYRZ5o6dsoavK7')
    
    if not skills_section:
        return []
    
    # Extract all skill items
    skill_items = skills_section.find_all('li', attrs={'data-qa': 'skills-element'})
    
    skills = []
    for item in skill_items:
        # Find the div with the skill label
        label_div = item.find('div', class_='magritte-tag__label___YHV-o_3-1-13')
        if label_div:
            skills.append(label_div.get_text(strip=True))
    
    return skills


def process_vacancy_links(vacancy_links: List[str]) -> List[List[str]]:
    """
    Process each vacancy link to extract key skills.
    
    Args:
        vacancy_links: List of vacancy URLs to process
        
    Returns:
        2D array of skills (list of lists), where each inner list contains skills for one vacancy
    """
    all_skills = []
    
    for i, link in enumerate(vacancy_links, 1):
        if i % 10 == 0:
            print(f"Prossesing {i} link...")
        # print(f"Processing vacancy {i} of {len(vacancy_links)}: {link}")
        if link == "https://adsrv.hh.ru/click":
            continue

        soup = get_page(link)
        
        if not soup:
            # print(f"Failed to process vacancy {link}, skipping...")
            all_skills.append([])  # Add empty list for consistency
            continue
        
        skills = extract_key_skills(soup)
        all_skills.append(skills)
        # print(f"Found {len(skills)} skills for this vacancy")
        
        # Respectful delay between requests
        time.sleep(DELAY)
    
    return all_skills

In [9]:
print(len(vacancy_links))

2400


In [10]:
print("\nStep 2: Extracting key skills from each vacancy...")
vacancy_skills = process_vacancy_links(vacancy_links)


Step 2: Extracting key skills from each vacancy...
Prossesing 10 link...
Prossesing 20 link...
Prossesing 30 link...
Prossesing 40 link...
Prossesing 50 link...
Prossesing 60 link...
Prossesing 70 link...
Prossesing 80 link...
Prossesing 90 link...
Prossesing 100 link...
Prossesing 110 link...
Prossesing 120 link...
Prossesing 130 link...
Prossesing 140 link...
Prossesing 150 link...
Prossesing 160 link...
Prossesing 170 link...
Prossesing 180 link...
Prossesing 190 link...
Prossesing 200 link...
Prossesing 210 link...
Prossesing 220 link...
Prossesing 230 link...
Prossesing 240 link...
Prossesing 250 link...
Prossesing 260 link...
Prossesing 270 link...
Prossesing 280 link...
Prossesing 290 link...
Prossesing 300 link...
Prossesing 310 link...
Prossesing 320 link...
Prossesing 330 link...
Prossesing 340 link...
Prossesing 350 link...
Prossesing 360 link...
Prossesing 370 link...
Prossesing 380 link...
Prossesing 390 link...
Prossesing 400 link...
Prossesing 410 link...
Prossesing 420

In [11]:
print(vacancy_skills)

[[], ['PHP', 'PHP 7.4-8', 'Nginx', 'MySQL', 'PostgreSQL', 'REST API'], [], ['Администрирование серверов Windows', 'Linux', 'Ремонт ПК', 'Администрирование сетевого оборудования', 'Windows 7'], ['Точность и внимательность к деталям', 'Cистемы управления базами данных', 'MS Dos', 'Компьютер', 'Знание компьютера'], ['Умение работать в команде', 'Анализ данных', 'Техническая документация', 'Навыки составления отчетности'], [], ['PHP', 'Git', 'jQuery', 'CSS3', 'HTML5', 'Ajax', '1С-Битрикс'], [], [], [], ['1C: ERP', 'Оптимизация кода', 'Написание инструкций', 'ERP-системы на базе 1С'], ['ООП', 'MS SQL', 'Visual Studio C#', 'Design Patterns', 'С#'], ['Работа в команде'], ['SQL', 'Delphi', 'C#'], ['IDE Lazarus', 'C++', 'Разработка программного обеспечения для Linux', 'Разработка программного обеспечения для Windows', 'Python', 'Английский\xa0— A2 — Элементарный'], ['C++', 'Python', 'HTML', 'MS Visual Studio', 'Luo'], [], ['SQL', 'JavaScript', 'Кроссбраузерное тестирование', 'Sanity-тестировани

In [12]:
vacancy_skills = [skills for skills in vacancy_skills if skills != []]
print(vacancy_skills)
print(f"There are {len(vacancy_skills)} pages with skills needed info")

[['PHP', 'PHP 7.4-8', 'Nginx', 'MySQL', 'PostgreSQL', 'REST API'], ['Администрирование серверов Windows', 'Linux', 'Ремонт ПК', 'Администрирование сетевого оборудования', 'Windows 7'], ['Точность и внимательность к деталям', 'Cистемы управления базами данных', 'MS Dos', 'Компьютер', 'Знание компьютера'], ['Умение работать в команде', 'Анализ данных', 'Техническая документация', 'Навыки составления отчетности'], ['PHP', 'Git', 'jQuery', 'CSS3', 'HTML5', 'Ajax', '1С-Битрикс'], ['1C: ERP', 'Оптимизация кода', 'Написание инструкций', 'ERP-системы на базе 1С'], ['ООП', 'MS SQL', 'Visual Studio C#', 'Design Patterns', 'С#'], ['Работа в команде'], ['SQL', 'Delphi', 'C#'], ['IDE Lazarus', 'C++', 'Разработка программного обеспечения для Linux', 'Разработка программного обеспечения для Windows', 'Python', 'Английский\xa0— A2 — Элементарный'], ['C++', 'Python', 'HTML', 'MS Visual Studio', 'Luo'], ['SQL', 'JavaScript', 'Кроссбраузерное тестирование', 'Sanity-тестирование', 'Рефакторинг кода', 'Yii

In [None]:
# Should be executed only once

# with open('skills.txt', 'w') as f:
#     for skills in vacancy_skills:
#         f.write(f"{skills}\n")

In [12]:
import ast

vacancy_skills = []
with open('skills.txt', 'r') as f:
    for line in f:
        # Safely convert string-represented lists to real lists
        vacancy_skills.append(ast.literal_eval(line.strip()))

print(vacancy_skills)

[['PHP', 'PHP 7.4-8', 'Nginx', 'MySQL', 'PostgreSQL', 'REST API'], ['Администрирование серверов Windows', 'Linux', 'Ремонт ПК', 'Администрирование сетевого оборудования', 'Windows 7'], ['Точность и внимательность к деталям', 'Cистемы управления базами данных', 'MS Dos', 'Компьютер', 'Знание компьютера'], ['Умение работать в команде', 'Анализ данных', 'Техническая документация', 'Навыки составления отчетности'], ['PHP', 'Git', 'jQuery', 'CSS3', 'HTML5', 'Ajax', '1С-Битрикс'], ['1C: ERP', 'Оптимизация кода', 'Написание инструкций', 'ERP-системы на базе 1С'], ['ООП', 'MS SQL', 'Visual Studio C#', 'Design Patterns', 'С#'], ['Работа в команде'], ['SQL', 'Delphi', 'C#'], ['IDE Lazarus', 'C++', 'Разработка программного обеспечения для Linux', 'Разработка программного обеспечения для Windows', 'Python', 'Английский\xa0— A2 — Элементарный'], ['C++', 'Python', 'HTML', 'MS Visual Studio', 'Luo'], ['SQL', 'JavaScript', 'Кроссбраузерное тестирование', 'Sanity-тестирование', 'Рефакторинг кода', 'Yii

In [13]:
import re
from collections import defaultdict

def normalize_skills(job_postings):
    # Define skill normalization mappings
    skill_mappings = {
        # Programming languages and frameworks
        r'python\s*[\d\.]*[x]?': 'Python',
        r'c\/c\+\+|\bc\+\+\s*\d*\w*|\bc\b(?![\+\+])': 'C/C++',
        r'java(?!script)': 'Java',
        r'javascript': 'JavaScript',
        r'typescript': 'TypeScript',
        r'golang|go\b': 'Go',
        r'ruby(?:\s*on\s*rails)?': 'Ruby on Rails',
        r'php\s*[\d\.]*': 'PHP',
        r'c#|с#': 'C#',
        r'\.net\s*(core|framework)?': '.NET',
        r'node\.?js': 'Node.js',
        r'react(?:\.js)?': 'React',
        r'vue\.?js': 'Vue.js',
        r'angular\s*\d*': 'Angular',
        r'django(?:\s*framework)?': 'Django',
        r'laravel': 'Laravel',
        r'symfony': 'Symfony',
        r'qt\s*\d*': 'Qt',
        
        # Databases
        r'mysql': 'MySQL',
        r'postgresql|postgres': 'PostgreSQL',
        r'ms\s*sql|microsoft\s*sql|sql\s*server': 'MS SQL',
        r'(?i).*\boracle\b.*': 'Oracle',
        r'mongodb': 'MongoDB',
        r'redis': 'Redis',
        r'sql(?!\w)': 'SQL',
        
        # 1C related - all normalized to just "1C"
        r'(?i).*\b(1[сc][: ]?.*)\b.*': '1C',
        r'1 c': '1C',
        
        # Web technologies
        r'html\s*\d*': 'HTML',
        r'css\s*\d*': 'CSS',
        r'rest\s*api': 'REST API',
        r'json\s*api': 'JSON API',
        r'soap': 'SOAP',
        r'web\s*services': 'Web Services',
        
        # DevOps and tools
        r'git*': 'Git',
        r'docker': 'Docker',
        r'kubernetes': 'Kubernetes',
        r'ci/cd': 'CI/CD',
        r'jenkins': 'Jenkins',
        r'(?i).*\b(vs[ -]?code|visual[ -]?studio[ -]?code|vscode|ms[ -]?visual[ -]?studio|visual[ -]?studio[ -]?debugger)\b.*': 'VSCode',
        
        # Operating systems
        r'linux': 'Linux',
        r'windows\s*\d*': 'Windows',
        
        # Other common skills
        r'ооп': 'ООП',
        r'английский\s*(?:язык)?\s*(?:—\s*[a-z]\d*\s*—\s*[а-яё]*)?': 'Английский язык',
        r'работа\s*с\s*базами\s*данных': 'Работа с базами данных',
        r'работа\s*в\s*команде': 'Работа в команде',
        r'автоматизация\s*процессов': 'Автоматизация процессов',
        r'оптимизация\s*кода': 'Оптимизация кода',
        r'Ray*': 'Ray',
        r'kafka(?:\s*\w*)*': 'Kafka',  # Any mention of Kafka
        r'psi(?:\s*\w*)*': 'PSI',      # Any mention of PSI
        r'git(?:\s*\w*)*': 'Git',
        r'бухгалтер\w*': 'Бухгалтерский учет',
        r'техническ\w* поддерж\w*': 'Техническая поддержка',
        r'баз\w* данных|бд\w*': 'Базы данных',
        r'коммуник\w*|комуник\w*': 'Коммуникабельность',
        r'ответственн\w*|ответст\w*': 'Ответственность',
    }
    
    # Compile regex patterns
    compiled_mappings = [(re.compile(pattern, re.IGNORECASE), replacement) 
                        for pattern, replacement in skill_mappings.items()]
    
    normalized_postings = []
    skill_counts = defaultdict(int)
    
    for posting in job_postings:
        normalized_posting = []
        for skill in posting:
            # Apply all regex patterns to find a match
            normalized_skill = None
            for pattern, replacement in compiled_mappings:
                if pattern.search(skill):
                    normalized_skill = replacement
                    break
            
            # If no pattern matched, keep the original but clean it up
            if normalized_skill is None:
                normalized_skill = skill.strip().replace('\xa0', ' ').capitalize()
            
            normalized_posting.append(normalized_skill)
            skill_counts[normalized_skill] += 1
        
        # Remove duplicates while preserving order
        seen = set()
        unique_posting = []
        for skill in normalized_posting:
            if skill not in seen:
                seen.add(skill)
                unique_posting.append(skill)
        
        normalized_postings.append(unique_posting)
    
    return normalized_postings, skill_counts

In [14]:
normalized_postings, skill_counts = normalize_skills(vacancy_skills)

# for i in range(5):
#     print(f"Original: {vacancy_skills[i]}")
#     print(f"Normalized: {normalized_postings[i]}")
#     print()

skills = sorted(skill_counts.items(), key=lambda x: x[1], reverse=True)

top_skills = skills[:20]
print("\nTop 20 skills:")
for skill, count in top_skills:
    print(f"{skill}: {count}")


Top 20 skills:
1C: 2205
Git: 350
C/C++: 299
SQL: 214
PHP: 205
PostgreSQL: 204
Английский язык: 188
JavaScript: 168
Linux: 164
Ray: 164
HTML: 157
MySQL: 131
Python: 127
CSS: 124
Java: 109
Docker: 103
MS SQL: 88
Go: 87
.NET: 81
TypeScript: 80


In [15]:
top_skills = skills[:80]
printskill = list()
print("\nTop 20 skills:")
for skill, count in top_skills:
    printskill.append(skill)

print(printskill)


Top 20 skills:
['1C', 'Git', 'C/C++', 'SQL', 'PHP', 'PostgreSQL', 'Английский язык', 'JavaScript', 'Linux', 'Ray', 'HTML', 'MySQL', 'Python', 'CSS', 'Java', 'Docker', 'MS SQL', 'Go', '.NET', 'TypeScript', 'React', 'REST API', 'ООП', 'Автоматизация процессов', 'Базы данных', 'Разработка по', 'Асу тп', 'Qt', 'Работа в команде', 'Laravel', 'Оптимизация кода', 'Vue.js', 'Redis', 'Техническая поддержка', 'Node.js', 'Работа с базами данных', 'Jquery', 'Scada', 'Kafka', 'Бухгалтерский учет', 'Разработка технических заданий', 'Rest', 'VSCode', 'MongoDB', 'Kotlin', 'Angular', 'Xml', 'Oracle', 'Api', 'CI/CD', 'Codesys', 'Работа с большим объемом информации', 'Системная интеграция', 'Скд', 'Kubernetes', 'Веб-программирование', 'Ответственность', 'Windows', 'Android', 'Erp', 'Delphi', 'Json', 'Stl', 'Информационные технологии', 'Symfony', 'Рефакторинг кода', 'Solid', 'Битрикс24', 'Ios', 'Arm', 'Конфигурационное тестирование', 'Fastapi', 'Обучение и развитие', 'Ethernet', 'Алгоритмы и структуры да

In [16]:
import json
from collections import defaultdict

output_data = dict()

output_data = {
    "top_20_skills": [
        {"skill": skill, "count": count} for skill, count in top_skills
    ]
}

# Write to JSON file (preserving Unicode characters)
with open('top_skills.json', 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

In [23]:
def categorize_skills(normalized_postings):
    # Define categories and their patterns
    categories = {
    "Programming Languages": [
        "C/C++", "Java", "Python", "JavaScript", "TypeScript", 
        "PHP", "Go", "Kotlin", "Swift", "SQL", "Delphi"
    ],
    "Web Dev": [
        "HTML", "CSS", "JavaScript", "TypeScript", "React", 
        "Vue.js", "Angular", "Node.js", "Jquery", "Ajax", 
        "REST API", "Rest", "Fastapi", "Redux", "Веб-программирование"
    ],
    "Frameworks": [
        ".NET", "Laravel", "Django", "Symfony", "Spring", 
        "Qt", "Fastapi", "1C", "Битрикс24", "Ray", 
        "Scada", "Codesys"
    ],
    "Tools": [
        "Git", "Docker", "Kubernetes", "CI/CD", "VSCode", 
        "Linux", "Windows", "Kafka", "Ethernet", "Redis", 
        "PostgreSQL", "MySQL", "MongoDB", "MS SQL", "Oracle", 
        "Xml", "Json", "Stl", "Arm"
    ],
    "Databases": [
        "SQL", "MySQL", "PostgreSQL", "MongoDB", "MS SQL", 
        "Oracle", "Redis", "Работа с базами данных", "Базы данных"
    ],
    "Software Engineering": [
        "ООП", "Solid", "Рефакторинг кода", "Оптимизация кода", 
        "Алгоритмы и структуры данных", "Разработка технических заданий",
        "Системная интеграция", "Автоматизация процессов", "Пуско-наладочные работы",
        "Конфигурационное тестирование", "Информационные технологии", "Erp", 
        "Асу тп", "Скд", "Техническая поддержка"
    ],
    "Soft Skills": [
        "Английский язык", "Работа в команде", "Ответственность", 
        "Обучение и развитие", "Работа в условиях многозадачности", 
        "Работа с большим объемом информации", "Бухгалтерский учет"
    ]
}
    # Compile all patterns
    compiled_categories = {}
    for category, patterns in categories.items():
        compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
        compiled_categories[category] = compiled

    categorized_skills = defaultdict(lambda: defaultdict(int))
    category_mapping = {}

    for posting in normalized_postings:
        for skill in posting:
            # Skip if already categorized
            if skill in category_mapping:
                categorized_skills[category_mapping[skill]][skill] += 1
                continue

            # Try to categorize
            categorized = False
            for category, patterns in compiled_categories.items():
                for pattern in patterns:
                    if pattern.search(skill):
                        categorized_skills[category][skill] += 1
                        category_mapping[skill] = category
                        categorized = True
                        break
                if categorized:
                    break
    return categorized_skills, category_mapping

In [24]:
categorized_skills, category_mapping = categorize_skills(normalized_postings)

# Print results by category
for category, skills in categorized_skills.items():
    print(f"\n{category} ({sum(skills.values())} skills, {len(skills)} unique):")
    for skill, count in sorted(skills.items(), key=lambda x: x[1], reverse=True)[:20]:
        print(f"  {skill}: {count}")


Programming Languages (1753 skills, 21 unique):
  C/C++: 263
  PostgreSQL: 203
  SQL: 202
  PHP: 190
  JavaScript: 168
  MySQL: 131
  Python: 126
  Java: 101
  MS SQL: 81
  TypeScript: 80
  Go: 76
  MongoDB: 35
  Kotlin: 35
  Delphi: 22
  Swift: 17
  Sqlalchemy: 8
  Sqlite: 7
  Уверенное владение современным delphi: 3
  Borland delphi: 2
  Google docs: 2

Tools (875 skills, 18 unique):
  Git: 305
  Linux: 156
  Docker: 97
  Redis: 49
  Kafka: 39
  VSCode: 35
  Xml: 34
  CI/CD: 32
  Windows: 22
  Json: 22
  Oracle: 22
  Stl: 22
  Arm: 21
  JSON API: 8
  Geojson: 3
  Микроконтроллеров семейств arm cortex mx: 3
  Buildroot yocto tcp/ip bmc arm cortex системное программирование: 3
  Arm-микроконтроллеры: 2

Web Dev (708 skills, 16 unique):
  HTML: 146
  CSS: 121
  React: 78
  REST API: 71
  Vue.js: 48
  Node.js: 47
  Jquery: 41
  Rest: 36
  Angular: 35
  Веб-программирование: 26
  Fastapi: 20
  Redux: 17
  Ajax: 16
  Crestron: 3
  State management (redux, mobx, effector и др.): 2
  Restfu

In [25]:
import json

output_data = {
    category: {
        "total_skills": sum(skills.values()),
        "unique_skills": len(skills),
        "skills": {skill: count for skill, count in sorted(skills.items(), key=lambda x: x[1], reverse=True)[:20]}
    }
    for category, skills in categorized_skills.items()
}

with open('skills_by_category.json', 'w') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

In [26]:
from collections import defaultdict
from itertools import combinations
import heapq

def find_top_skill_pairings(normalized_postings, top_n=20):
    # Count all co-occurrences of skills in postings
    co_occurrence = defaultdict(int)
    
    for posting in normalized_postings:
        # Generate all combinations of 2-4 skills for each posting
        for r in range(2, 5):
            for combo in combinations(sorted(posting), r):
                co_occurrence[combo] += 1
    
    # Convert to a list and sort by frequency (descending)
    sorted_combos = sorted(co_occurrence.items(), key=lambda x: (-x[1], -len(x[0])))
    sorted_combos = [(combo, freq) for (combo, freq) in sorted_combos if freq > 20]

    return sorted_combos

# Example usage
top_pairings = find_top_skill_pairings(normalized_postings)

In [27]:
from collections import defaultdict

def filter_redundant_pairings(pairings):
    # Create frequency dictionary
    freq_dict = defaultdict(list)
    for combo, freq in pairings:
        freq_dict[freq].append(combo)
    
    # Sort frequencies in descending order
    sorted_freqs = sorted(freq_dict.keys(), reverse=True)
    
    # Process each frequency level
    final_results = []
    seen_combinations = set()
    
    for freq in sorted_freqs:
        # Sort combinations by size (descending) to process larger ones first
        combos = sorted(freq_dict[freq], key=lambda x: (-len(x), x))
        
        for combo in combos:
            # Check if this combo is already covered by a superset
            is_redundant = False
            for existing in seen_combinations:
                if set(combo).issubset(set(existing)):
                    is_redundant = True
                    break
            
            if not is_redundant:
                final_results.append((combo, freq))
                seen_combinations.add(combo)
    
    return final_results

# Example usage
filtered_pairings = filter_redundant_pairings(top_pairings)

# Sort the final results by frequency (descending) and size (descending)
filtered_pairings.sort(key=lambda x: (-x[1], -len(x[0])))

# Print top 20 results
print("Rank\tFrequency\tNum Skills\tSkills")
print("-" * 60)
for i, (combo, freq) in enumerate(filtered_pairings[:20], 1):
    num_skills = len(combo)
    skills = ", ".join(combo)
    print(f"{i}\t{freq}\t\t{num_skills}\t\t{skills}")

Rank	Frequency	Num Skills	Skills
------------------------------------------------------------
1	112		2		CSS, HTML
2	99		2		MySQL, PHP
3	96		2		C/C++, Linux
4	82		2		Git, PHP
5	81		2		PostgreSQL, SQL
6	80		2		C/C++, Git
7	77		2		Git, PostgreSQL
8	75		2		HTML, JavaScript
9	72		2		CSS, JavaScript
10	70		2		Git, JavaScript
11	67		2		Git, MySQL
12	66		3		CSS, HTML, JavaScript
13	65		2		1C, Автоматизация процессов
14	64		2		Git, Linux
15	64		2		Git, SQL
16	60		2		HTML, PHP
17	58		2		C/C++, Английский язык
18	55		2		Git, HTML
19	54		2		Git, Английский язык
20	53		2		JavaScript, PHP


In [28]:
import json

pairings_list = [
    {"skills": list(skills), "count": count}
    for skills, count in filtered_pairings[:20]
]

# Write to JSON file
with open('filtered_pairings.json', 'w', encoding='utf-8') as f:
    json.dump(pairings_list, f, ensure_ascii=False, indent=4)