In [4]:
import aiohttp
import asyncio
import pandas as pd
import random
import time
from datetime import datetime, timedelta, timezone

# === Settings ===
# HTTP headers passed to every aiohttp.ClientSession created in this notebook.
header = {'User-Agent': 'HH-Data-Coll/v1.0 (contact 135861v@mail.ru)'}
# Value sent as the `area` search parameter (presumably the HH region id for Russia, per the name).
area_id_russia = 113
# Value sent as the `per_page` search parameter.
per_page = 100
# Upper bound on the number of pages requested for a single query.
max_pages = 100
# Bounds how many vacancy requests may be inside `session.get` at the same time.
semaphore = asyncio.Semaphore(5)

# Experience filter values; one query is issued per value.
experiences = ['noExperience', 'between1And3', 'between3And6', 'moreThan6']

# Period: the last 30 days in steps of 7 days (by default).
def get_date_ranges(days=30, step=7):
    """Split the last ``days`` days into consecutive date windows.

    The period ends at today's date in UTC and starts ``days`` days earlier.
    Each window is at most ``step`` days long; the final window is clipped
    so that it never extends past today.

    Args:
        days: Length of the look-back period in days. A value <= 0 yields
            an empty list.
        step: Window length in days. Must be positive.

    Returns:
        A list of ``(date_from, date_to)`` tuples of ISO ``YYYY-MM-DD``
        strings, in chronological order. Adjacent windows share their
        boundary date (the end of one equals the start of the next).

    Raises:
        ValueError: If ``step`` is not positive. Without this guard the
            loop below would never advance and would spin forever.
    """
    if step <= 0:
        raise ValueError(f"step must be a positive number of days, got {step!r}")

    today = datetime.now(timezone.utc).date()
    stride = timedelta(days=step)
    ranges = []
    current = today - timedelta(days=days)
    while current < today:
        # Clip the last window so it never reaches into the future.
        end = min(current + stride, today)
        ranges.append((current.isoformat(), end.isoformat()))
        # NOTE(review): the next window starts on the same date this one ends;
        # if the API treats both bounds as inclusive, boundary-day vacancies
        # may be returned twice - confirm and de-duplicate downstream if so.
        current = end
    return ranges

# Request throttle shared by the page fetcher.
class RateLimiter:
    """Space out requests and insert a long pause every N requests.

    ``wait()`` holds an ``asyncio.Lock`` for its whole duration (including
    the sleeps), so concurrent callers are released one at a time.

    Args:
        max_rps: Target maximum number of requests per second; the minimum
            gap enforced between two calls is ``1 / max_rps`` seconds.
        long_pause_every: A long pause is taken each time the request
            counter reaches a multiple of this value.
        long_pause_duration: Length of the long pause in seconds.
    """

    def __init__(self, max_rps=3, long_pause_every=500, long_pause_duration=60):
        self.lock = asyncio.Lock()
        self.request_count = 0
        self.last_request = None  # monotonic timestamp of the previous call, if any
        self.max_rps = max_rps
        self.long_pause_every = long_pause_every
        self.long_pause_duration = long_pause_duration

    async def wait(self):
        """Block until the caller is allowed to issue its next request."""
        async with self.lock:
            previous = self.last_request
            if previous is not None:
                # Time still owed to keep the minimum gap between requests.
                remaining = 1.0 / self.max_rps - (time.monotonic() - previous)
                if remaining > 0:
                    await asyncio.sleep(remaining)

            self.last_request = time.monotonic()
            self.request_count += 1

            if self.request_count % self.long_pause_every != 0:
                return
            print(f"–î–æ–ª–≥–∞—è –ø–∞—É–∑–∞ {self.long_pause_duration} —Å–µ–∫ –ø–æ—Å–ª–µ {self.request_count} –∑–∞–ø—Ä–æ—Å–æ–≤...")
            await asyncio.sleep(self.long_pause_duration)


# Single limiter instance used by fetch_page.
limiter = RateLimiter()

# Fetch the IT professional roles.
async def fetch_it_roles():
    """Download the professional-role catalogue and return the IT roles.

    Looks for the category whose id is 11 and returns its roles as a list
    of ``{'id': int, 'name': str}`` dicts, skipping role ids 12, 25, 34 and
    155 (NOTE(review): the reason these four are excluded is not visible
    here). Returns an empty list if the category is not present.

    Raises whatever ``response.raise_for_status()`` raises on an HTTP error.
    """
    it_category_id = 11
    skipped_role_ids = [12, 25, 34, 155]
    endpoint = 'https://api.hh.ru/professional_roles'

    async with aiohttp.ClientSession(headers=header) as session:
        async with session.get(endpoint) as response:
            response.raise_for_status()
            catalogue = await response.json()
            for category in catalogue['categories']:
                if int(category['id']) != it_category_id:
                    continue
                roles = []
                for entry in category['roles']:
                    if int(entry['id']) in skipped_role_ids:
                        continue
                    roles.append({'id': int(entry['id']), 'name': entry['name']})
                print(f'üîπ –ù–∞–π–¥–µ–Ω–æ {len(roles)} IT-—Ä–æ–ª–µ–π')
                return roles
            return []

# Load a single page of search results.
async def fetch_page(session, params, page, desc):
    """Request one page of vacancies and return the decoded JSON.

    Args:
        session: An open ``aiohttp.ClientSession``.
        params: Query parameters for the search. The dict is not modified;
            the page number is added to a per-request copy.
        page: Zero-based page index to request.
        desc: Human-readable description of the query, used in log output.

    Returns:
        The parsed JSON response, or ``None`` if the server answered 403
        or any error occurred while making or decoding the request.
    """
    # Copy instead of assigning params['page'], so the caller's dict is
    # never mutated as a side effect of fetching a page.
    request_params = {**params, 'page': page}
    await limiter.wait()
    try:
        async with semaphore:
            async with session.get('https://api.hh.ru/vacancies', params=request_params) as response:
                if response.status == 403:
                    print(f"üö´ –û—à–∏–±–∫–∞ 403 –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page} ‚Äî {desc}")
                    return None
                response.raise_for_status()
                return await response.json()
    except Exception as e:
        # Deliberate best effort: report the failure, back off briefly and
        # let the caller decide what to do with the missing page.
        print(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page} ‚Äî {desc}: {e}")
        await asyncio.sleep(random.uniform(1.0, 2.0))
        return None

# Collect vacancies for one combination of role, experience level and date window.
async def fetch_vacancies(session, role_id, role_name, experience, date_from, date_to):
    """Page through the search results for a single query and return all items.

    Paging stops at the first of: a failed page (``fetch_page`` returned
    ``None``), a page with no items, the last page reported by the
    response's ``pages`` field, or ``max_pages`` pages fetched. A short
    random pause is taken between pages.

    Returns:
        The list of raw vacancy dicts gathered so far (possibly empty).
    """
    desc = f"—Ä–æ–ª—å '{role_name}', –æ–ø—ã—Ç '{experience}', {date_from}‚Äì{date_to}"
    params = dict(
        professional_role=role_id,
        area=area_id_russia,
        experience=experience,
        date_from=date_from,
        date_to=date_to,
        per_page=per_page,
    )

    vacancies = []
    for page in range(max_pages):
        data = await fetch_page(session, params, page, desc)
        # A failed page and an empty page both end the pagination.
        items = [] if data is None else data.get('items', [])
        if not items:
            break
        vacancies += items
        reached_last_page = page >= data.get('pages', 0) - 1
        if reached_last_page:
            break
        await asyncio.sleep(random.uniform(0.3, 0.6))

    print(f"‚úÖ –°–æ–±—Ä–∞–Ω–æ {len(vacancies)} –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî {desc}")
    return vacancies

# Main data collection.
async def collect_vacancies_async():
    """Fetch vacancies for every role x experience x date-window combination.

    Queries are issued one after another over a single shared session, in
    the order: role, then experience value, then date window.

    Returns:
        One flat list with the raw vacancy dicts of all queries.
    """
    date_ranges = get_date_ranges()
    roles = await fetch_it_roles()
    all_vacancies = []

    async with aiohttp.ClientSession(headers=header) as session:
        for role in roles:
            print(f"\n== –†–æ–ª—å: {role['name']} (ID {role['id']}) ==")
            queries = [
                (exp, date_from, date_to)
                for exp in experiences
                for date_from, date_to in date_ranges
            ]
            for exp, date_from, date_to in queries:
                all_vacancies += await fetch_vacancies(
                    session, role['id'], role['name'], exp, date_from, date_to
                )

    print(f"\nüöÄ –í—Å–µ–≥–æ —Å–æ–±—Ä–∞–Ω–æ {len(all_vacancies)} –≤–∞–∫–∞–Ω—Å–∏–π")
    return all_vacancies

# Convert raw vacancy records into a DataFrame.
def extract_vacancy_data(raw_vacancies):
    """Flatten raw vacancy dicts into a tabular ``pandas.DataFrame``.

    Nested objects (``area``, ``employer``, ``salary``, ``experience``,
    ``schedule``, ``employment``) are read defensively: a key that is
    missing *or* present with a ``None`` value produces ``None`` in the
    corresponding column(s) instead of raising ``AttributeError``.

    Args:
        raw_vacancies: Iterable of vacancy dicts.

    Returns:
        A DataFrame with one row per vacancy and a fixed set of columns.
        For empty input the frame has zero rows but still carries the
        full column schema.
    """
    columns = [
        'id', 'name', 'area', 'area_id', 'employer',
        'published_at', 'created_at', 'closed_at', 'archived', 'url',
        'salary_from', 'salary_to', 'currency',
        'experience', 'schedule', 'employment',
    ]

    def nested(record, key):
        # `.get(key, {})` only covers a missing key; `or {}` also covers
        # a key that is present with an explicit null value.
        return record.get(key) or {}

    extracted = []
    for v in raw_vacancies:
        area_info = nested(v, 'area')
        salary = nested(v, 'salary')
        extracted.append({
            'id': v.get('id'),
            'name': v.get('name'),
            'area': area_info.get('name'),
            'area_id': area_info.get('id'),
            'employer': nested(v, 'employer').get('name'),
            'published_at': v.get('published_at'),
            'created_at': v.get('created_at'),
            'closed_at': v.get('closed_at'),
            'archived': v.get('archived'),
            'url': v.get('alternate_url'),
            'salary_from': salary.get('from'),
            'salary_to': salary.get('to'),
            'currency': salary.get('currency'),
            'experience': nested(v, 'experience').get('name'),
            'schedule': nested(v, 'schedule').get('name'),
            'employment': nested(v, 'employment').get('name'),
        })
    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(extracted, columns=columns)


In [3]:
# Run the full collection, then flatten the raw records into a DataFrame.
# NOTE(review): top-level `await` only works in an async-aware shell such as
# IPython/Jupyter; a plain script would need asyncio.run(...) instead.
raw_vacancies = await collect_vacancies_async()
df = extract_vacancy_data(raw_vacancies)

üîπ –ù–∞–π–¥–µ–Ω–æ 21 IT-—Ä–æ–ª–µ–π

== –†–æ–ª—å: BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö (ID 156) ==
‚úÖ –°–æ–±—Ä–∞–Ω–æ 15 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-06-15‚Äì2025-06-22
‚úÖ –°–æ–±—Ä–∞–Ω–æ 33 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-06-22‚Äì2025-06-29
‚úÖ –°–æ–±—Ä–∞–Ω–æ 21 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-06-29‚Äì2025-07-06
‚úÖ –°–æ–±—Ä–∞–Ω–æ 61 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-07-06‚Äì2025-07-13
‚úÖ –°–æ–±—Ä–∞–Ω–æ 35 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-07-13‚Äì2025-07-15
‚úÖ –°–æ–±—Ä–∞–Ω–æ 96 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—

In [5]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,id,name,area,area_id,employer,published_at,created_at,closed_at,archived,url,salary_from,salary_to,currency,experience,schedule,employment
0,120752510,BI-–ê–Ω–∞–ª–∏—Ç–∏–∫,–°–∞—Ä–∞—Ç–æ–≤,79,–ë–∞–Ω–∫ –í–¢–ë (–ü–ê–û),2025-06-16T12:35:43+0300,2025-06-16T12:35:43+0300,,False,https://hh.ru/vacancy/120752510,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
1,120892345,–ê–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö,–ù–∞–±–µ—Ä–µ–∂–Ω—ã–µ –ß–µ–ª–Ω—ã,1641,NINEL,2025-06-20T16:17:48+0300,2025-06-20T16:17:48+0300,,False,https://hh.ru/vacancy/120892345,50000.0,80000.0,RUR,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
2,121680073,–ú–ª–∞–¥—à–∏–π –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö –ö–•–î (DWH),–ú–æ—Å–∫–≤–∞,1,–ú–°–ü –ë–∞–Ω–∫,2025-06-16T14:16:10+0300,2025-06-16T14:16:10+0300,,False,https://hh.ru/vacancy/121680073,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
3,121658033,–ú–ª–∞–¥—à–∏–π –∞–Ω–∞–ª–∏—Ç–∏–∫ BI,–ú–æ—Å–∫–≤–∞,1,ANCOR,2025-06-16T10:26:39+0300,2025-06-16T10:26:39+0300,,False,https://hh.ru/vacancy/121658033,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
4,119855847,–°–ø–µ—Ü–∏–∞–ª–∏—Å—Ç (–æ–ø–µ—Ä–∞—Ç–æ—Ä Excel –∏ –±–∞–∑ –¥–∞–Ω–Ω—ã—Ö),–ú–æ—Å–∫–≤–∞,1,–ü—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ –ú–æ—Å–∫–≤—ã,2025-06-18T15:31:53+0300,2025-06-18T15:31:53+0300,,False,https://hh.ru/vacancy/119855847,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å


In [6]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54116 entries, 0 to 54115
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            54116 non-null  object 
 1   name          54116 non-null  object 
 2   area          54116 non-null  object 
 3   area_id       54116 non-null  object 
 4   employer      54116 non-null  object 
 5   published_at  54116 non-null  object 
 6   created_at    54116 non-null  object 
 7   closed_at     0 non-null      object 
 8   archived      54116 non-null  bool   
 9   url           54116 non-null  object 
 10  salary_from   21533 non-null  float64
 11  salary_to     13239 non-null  float64
 12  currency      24578 non-null  object 
 13  experience    54116 non-null  object 
 14  schedule      54116 non-null  object 
 15  employment    54116 non-null  object 
dtypes: bool(1), float64(2), object(13)
memory usage: 6.2+ MB
