In [2]:
import aiohttp
import asyncio
import pandas as pd
import random
import time
from datetime import datetime, timedelta, timezone
import logging

In [3]:
# === Settings ===
# Headers passed to every aiohttp.ClientSession created in this notebook.
header = {'User-Agent': 'HH-Data-Coll/v1.0 (contact 135861v@mail.ru)'}
# Sent as the `area` query parameter of the vacancy search.
# NOTE(review): presumably the hh.ru area id for Russia (per the name) - confirm.
area_id_russia = 113
# Sent as the `per_page` query parameter of the vacancy search.
per_page = 100
# Upper bound on the number of pages fetch_vacancies requests for one query.
max_pages = 100
# Acquired around each HTTP request in fetch_page, so at most 5 run at once.
semaphore = asyncio.Semaphore(5)

# Experience values; collect_vacancies_async issues one search per value.
experiences = ['noExperience', 'between1And3', 'between3And6', 'moreThan6']

# Logging settings
# The log file name carries the local time at which this cell was executed.
log_filename = f"hh_parser_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"

# Log to both the file above and the stream handler (notebook output).
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers, so re-running this cell in the same kernel keeps writing to the
# first run's log file - confirm this is acceptable.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",  
    handlers=[
        logging.FileHandler(log_filename, encoding="utf-8"), 
        logging.StreamHandler()
    ]
)

def log_elapsed_time(start, task="Code"):
    """Log how much time has passed since `start` as hours:minutes:seconds.

    Args:
        start: A reading previously taken with time.perf_counter().
        task: Label placed in parentheses inside the log message.

    The message is written at INFO level through the root logger; nothing is
    returned.
    """
    # Split the elapsed seconds into whole minutes + leftover seconds first,
    # then split the whole minutes into hours + leftover minutes.
    total_mins, secs = divmod(time.perf_counter() - start, 60)
    hrs, mins = divmod(total_mins, 60)
    message = f"‚è± –í—Ä–µ–º—è –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è ({task}): {int(hrs):02d}:{int(mins):02d}:{secs:.2f} —Å–µ–∫—É–Ω–¥"
    logging.info(message)


# Period: the last 30 days in 7-day steps
def get_date_ranges(days=30, step=7):
    """Split the last `days` days (ending today, UTC) into consecutive windows.

    Args:
        days: How many days back from today's UTC date the first window starts.
        step: Maximum length of one window in days. Must be positive.

    Returns:
        A list of (start, end) tuples of ISO dates (YYYY-MM-DD), oldest first.
        Each window starts on the date the previous one ended, and the last
        window is shortened so that it ends today. The list is empty when
        `days` is zero or negative.

    Raises:
        ValueError: If `step` is zero or negative. Such a step never moves the
            window forward, so the loop below would never terminate.
    """
    if step <= 0:
        raise ValueError(f"step must be a positive number of days, got {step!r}")

    today = datetime.now(timezone.utc).date()
    window = timedelta(days=step)

    ranges = []
    current = today - timedelta(days=days)
    while current < today:
        # Clamp the final window so it never reaches past today.
        end = min(current + window, today)
        ranges.append((current.isoformat(), end.isoformat()))
        current = end
    return ranges

# RateLimiter
class RateLimiter:
    """Asynchronous throttle that spaces out calls to `wait()`.

    Callers are serialised through an asyncio.Lock. Each call sleeps, if
    needed, until at least 1 / max_rps seconds have passed since the previous
    call was recorded. After every `long_pause_every` recorded calls it also
    sleeps `long_pause_duration` seconds; that sleep happens while the lock is
    still held, so other callers wait as well.
    """

    def __init__(self, max_rps=3, long_pause_every=500, long_pause_duration=60):
        # Throttle settings.
        self.max_rps = max_rps
        self.long_pause_every = long_pause_every
        self.long_pause_duration = long_pause_duration
        # Mutable state, only touched while `lock` is held.
        self.lock = asyncio.Lock()
        self.last_request = None  # time.monotonic() reading of the latest call
        self.request_count = 0    # number of calls recorded so far

    async def wait(self):
        """Block until the caller may proceed, then record the call."""
        async with self.lock:
            previous = self.last_request
            if previous is not None:
                # Time still missing to honour the minimum spacing.
                remaining = 1.0 / self.max_rps - (time.monotonic() - previous)
                if remaining > 0:
                    await asyncio.sleep(remaining)

            self.last_request = time.monotonic()
            self.request_count += 1

            if self.request_count % self.long_pause_every != 0:
                return
            logging.info(f"–î–æ–ª–≥–∞—è –ø–∞—É–∑–∞ {self.long_pause_duration} —Å–µ–∫ –ø–æ—Å–ª–µ {self.request_count} –∑–∞–ø—Ä–æ—Å–æ–≤...")
            await asyncio.sleep(self.long_pause_duration)

# Shared limiter with the default settings; fetch_page awaits limiter.wait()
# before every request attempt.
limiter = RateLimiter()


In [12]:
#=== Vacancy collection functions ===
# Fetch the IT roles
async def fetch_it_roles():
    """Download the professional-roles catalogue and return the selected roles.

    Requests https://api.hh.ru/professional_roles in a session of its own,
    finds the first category whose id equals 11 and returns its roles as a list
    of {'id': int, 'name': str} dicts, leaving out role ids 12, 25, 34 and 155.
    Returns an empty list when no category has id 11. An HTTP error status is
    raised by response.raise_for_status().
    """
    url = 'https://api.hh.ru/professional_roles'
    excluded_role_ids = [12, 25, 34, 155]

    async with aiohttp.ClientSession(headers=header) as session, session.get(url) as response:
        response.raise_for_status()
        payload = await response.json()

        # First category with id 11, or None when there is none.
        it_category = next(
            (cat for cat in payload['categories'] if int(cat['id']) == 11),
            None,
        )
        if it_category is None:
            return []

        roles = []
        for entry in it_category['roles']:
            if int(entry['id']) in excluded_role_ids:
                continue
            roles.append({'id': int(entry['id']), 'name': entry['name']})

        logging.info(f'üîπ –ù–∞–π–¥–µ–Ω–æ {len(roles)} IT-—Ä–æ–ª–µ–π')
        return roles

# Load a single page, retrying on failure
async def fetch_page(session, params, page, desc, max_retries=3):
    """Request one page of the vacancy search, retrying on any error.

    Args:
        session: Object whose `get(url, params=...)` returns an async context
            manager yielding the response (an aiohttp.ClientSession in this
            notebook).
        params: Query parameters of the search. Not modified: the page number
            is added to a copy.
        page: Zero-based page number to request.
        desc: Human-readable description of the query, used in log messages.
        max_retries: Maximum number of attempts.

    Returns:
        The decoded JSON body of the first successful response.

    Raises:
        RuntimeError: When every attempt failed. The last underlying exception
            is attached as `__cause__`.
    """
    # Work on a copy so the caller's dict does not silently gain a 'page' key.
    request_params = {**params, 'page': page}
    last_error = None

    for attempt in range(1, max_retries + 1):
        await limiter.wait()
        try:
            async with semaphore:
                async with session.get('https://api.hh.ru/vacancies', params=request_params) as response:
                    if response.status == 403:
                        logging.warning(f"üö´ –û—à–∏–±–∫–∞ 403 –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page} ‚Äî {desc} (–ø–æ–ø—ã—Ç–∫–∞ {attempt})")
                    elif response.status >= 400:
                        logging.warning(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ {response.status} –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page} ‚Äî {desc} (–ø–æ–ø—ã—Ç–∫–∞ {attempt})")
                    # Turns an error status into an exception handled below.
                    response.raise_for_status()
                    return await response.json()
        except Exception as e:
            last_error = e
            logging.warning(f"‚ö†Ô∏è –ò—Å–∫–ª—é—á–µ–Ω–∏–µ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ —Å—Ç—Ä–∞–Ω–∏—Ü—ã {page} ‚Äî {desc} (–ø–æ–ø—ã—Ç–∫–∞ {attempt}): {e}")
            # Back off only when another attempt follows; sleeping right
            # before giving up would just delay the error.
            if attempt < max_retries:
                await asyncio.sleep(random.uniform(1.0, 2.0))

    # All attempts exhausted - critical error
    error_msg = f"‚ùå –û—à–∏–±–∫–∞: –Ω–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å —Å—Ç—Ä–∞–Ω–∏—Ü—É {page} ‚Äî {desc} –ø–æ—Å–ª–µ {max_retries} –ø–æ–ø—ã—Ç–æ–∫. –û—Å—Ç–∞–Ω–æ–≤–∫–∞."
    logging.error(error_msg)
    raise RuntimeError(error_msg) from last_error

# Collect vacancies for one combination of role, experience and dates
async def fetch_vacancies(session, role_id, role_name, experience, date_from, date_to):
    """Page through one vacancy search and return every item received.

    Args:
        session: Session object handed through to fetch_page.
        role_id: Value of the `professional_role` query parameter.
        role_name: Role name, used only in log messages.
        experience: Value of the `experience` query parameter.
        date_from: Value of the `date_from` query parameter.
        date_to: Value of the `date_to` query parameter.

    Returns:
        A list with the `items` of all pages fetched. Paging stops at the
        first empty page, at the last page reported by the response's `pages`
        field, or after `max_pages` pages, whichever comes first.
    """
    desc = f"—Ä–æ–ª—å '{role_name}', –æ–ø—ã—Ç '{experience}', {date_from}‚Äì{date_to}"
    params = {
        'professional_role': role_id,
        'area': area_id_russia,
        'experience': experience,
        'date_from': date_from,
        'date_to': date_to,
        'per_page': per_page,
    }

    vacancies = []
    page = 0
    while page < max_pages:
        data = await fetch_page(session, params, page, desc)
        items = data.get('items', []) if data is not None else []
        if not items:
            break
        vacancies += items

        # `pages` is the total page count reported by the response.
        last_page_index = data.get('pages', 0) - 1
        if page >= last_page_index:
            break

        # Short random pause before asking for the next page.
        await asyncio.sleep(random.uniform(0.3, 0.6))
        page += 1

    logging.info(f"‚úÖ –°–æ–±—Ä–∞–Ω–æ {len(vacancies)} –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî {desc}")
    return vacancies

# Main data collection
async def collect_vacancies_async():
    """Collect vacancies for every role x experience x date-range combination.

    Date ranges come from get_date_ranges() and roles from fetch_it_roles().
    All searches share one aiohttp session and run one after another, in the
    order role -> experience -> date range.

    Returns:
        One flat list with the raw vacancy items of every search.

    Raises:
        RuntimeError: Logged at CRITICAL level and re-raised when a search
            fails; the items collected up to that point are not returned.
    """
    date_ranges = get_date_ranges()
    roles = await fetch_it_roles()

    # Every (experience, date_from, date_to) combination queried for a role.
    search_grid = [
        (exp, date_from, date_to)
        for exp in experiences
        for date_from, date_to in date_ranges
    ]

    all_vacancies = []
    try:
        async with aiohttp.ClientSession(headers=header) as session:
            for role in roles:
                logging.info(f"\n== –†–æ–ª—å: {role['name']} (ID {role['id']}) ==")
                for exp, date_from, date_to in search_grid:
                    batch = await fetch_vacancies(
                        session, role['id'], role['name'], exp, date_from, date_to
                    )
                    all_vacancies += batch
    except RuntimeError as critical_error:
        logging.critical(f"üö® –ü–∞—Ä—Å–∏–Ω–≥ –ø—Ä–µ—Ä–≤–∞–Ω: {critical_error}")
        raise  # so that execution stops completely in Jupyter / a script

    logging.info(f"\nüöÄ –í—Å–µ–≥–æ —Å–æ–±—Ä–∞–Ω–æ {len(all_vacancies)} –≤–∞–∫–∞–Ω—Å–∏–π")
    return all_vacancies

# Convert raw vacancies to a DataFrame
def extract_vacancy_data(raw_vacancies):
    """Flatten raw vacancy dicts into a pandas DataFrame.

    Args:
        raw_vacancies: Iterable of vacancy dicts as returned by the search.

    Returns:
        A DataFrame with one row per vacancy and the columns listed in
        `columns` below, in that order. An empty input gives an empty frame
        that still has those columns. `load_datetime` is the local time at
        which this function was called, identical for every row of the call.

    Nested objects (`area`, `employer`, `salary`, `experience`, `schedule`,
    `employment`) may be missing or explicitly None; the fields derived from
    them are then None.
    """
    columns = [
        'load_datetime', 'id', 'name', 'area', 'area_id', 'employer',
        'published_at', 'created_at', 'closed_at', 'archived', 'url',
        'salary_from', 'salary_to', 'currency',
        'experience', 'schedule', 'employment',
    ]
    # One timestamp for the whole batch instead of a fresh one per row.
    load_datetime = datetime.now()

    extracted = []
    for v in raw_vacancies:
        # `v.get(key, {})` only covers a missing key; a key that is present
        # with a None value would make the chained .get() raise
        # AttributeError, so fall back with `or {}` instead.
        area_info = v.get('area') or {}
        salary_info = v.get('salary') or {}
        extracted.append({
            'load_datetime': load_datetime,
            'id': v.get('id'),
            'name': v.get('name'),
            'area': area_info.get('name'),
            'area_id': area_info.get('id'),
            'employer': (v.get('employer') or {}).get('name'),
            'published_at': v.get('published_at'),
            'created_at': v.get('created_at'),
            'closed_at': v.get('closed_at'),
            'archived': v.get('archived'),
            'url': v.get('alternate_url'),
            'salary_from': salary_info.get('from'),
            'salary_to': salary_info.get('to'),
            'currency': salary_info.get('currency'),
            'experience': (v.get('experience') or {}).get('name'),
            'schedule': (v.get('schedule') or {}).get('name'),
            'employment': (v.get('employment') or {}).get('name'),
        })
    return pd.DataFrame(extracted, columns=columns)


In [None]:
# Run the vacancy collection
start = time.perf_counter()
# Top-level `await` is not valid in a plain Python script.
# NOTE(review): presumably this relies on the notebook's autoawait support - confirm.
raw_vacancies = await collect_vacancies_async()
# Flatten the raw items into a DataFrame for inspection in the cells below.
df = extract_vacancy_data(raw_vacancies)
log_elapsed_time(start, "hh_parser")

In [5]:
# Preview the first rows of df.
# NOTE(review): the saved output of this cell has an "Unnamed: 0" column and no
# "load_datetime" column, so it does not appear to come from the frame built by
# extract_vacancy_data in this notebook - re-run the notebook to refresh it.
df.head()

Unnamed: 0,id,name,area,area_id,employer,published_at,created_at,closed_at,archived,url,salary_from,salary_to,currency,experience,schedule,employment
0,120865772,–ú–ª–∞–¥—à–∏–π –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,2,Topface Media,2025-06-24T16:59:10+0300,2025-06-24T16:59:10+0300,,False,https://hh.ru/vacancy/120865772,30000.0,40000.0,RUR,–ù–µ—Ç –æ–ø—ã—Ç–∞,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
1,122059039,–ú–ª–∞–¥—à–∏–π –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö,–ú–æ—Å–∫–≤–∞,1,–ú–ö–ö –§–∏–Ω–º–æ–ª–ª,2025-06-25T14:52:10+0300,2025-06-25T14:52:10+0300,,False,https://hh.ru/vacancy/122059039,90000.0,,RUR,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
2,121943475,–ê–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö,–¢–æ–º—Å–∫,90,–ó–≤–æ–Ω–∞—Ä—å,2025-06-23T12:13:10+0300,2025-06-23T12:13:10+0300,,False,https://hh.ru/vacancy/121943475,60000.0,80000.0,RUR,–ù–µ—Ç –æ–ø—ã—Ç–∞,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
3,121987787,–ê–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö,–ë—Ä—è–Ω—Å–∫,19,–ë–∞–Ω–Ω–µ—Ä –°—Ç–∞—Ç,2025-06-24T09:39:55+0300,2025-06-24T09:39:55+0300,,False,https://hh.ru/vacancy/121987787,50000.0,,RUR,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
4,121972260,–ê–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö,–†—è–∑–∞–Ω—å,77,–ë–∞–Ω–∫ –ü—Ä–∏–æ-–í–Ω–µ—à—Ç–æ—Ä–≥–±–∞–Ω–∫,2025-06-27T12:20:47+0300,2025-06-27T12:20:47+0300,,False,https://hh.ru/vacancy/121972260,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å


In [6]:
# Column dtypes and non-null counts of df.
# NOTE(review): in the saved output closed_at has 0 non-null values - confirm
# whether this column is worth keeping.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57239 entries, 0 to 57238
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            57239 non-null  object 
 1   name          57239 non-null  object 
 2   area          57239 non-null  object 
 3   area_id       57239 non-null  object 
 4   employer      57239 non-null  object 
 5   published_at  57239 non-null  object 
 6   created_at    57239 non-null  object 
 7   closed_at     0 non-null      object 
 8   archived      57239 non-null  bool   
 9   url           57239 non-null  object 
 10  salary_from   22433 non-null  float64
 11  salary_to     13903 non-null  float64
 12  currency      25771 non-null  object 
 13  experience    57239 non-null  object 
 14  schedule      57239 non-null  object 
 15  employment    57239 non-null  object 
dtypes: bool(1), float64(2), object(13)
memory usage: 6.6+ MB
