In [16]:
import aiohttp
import asyncio
import pandas as pd
import random
import time
from datetime import datetime, timedelta, timezone
import logging

In [17]:
# === –ù–∞—Å—Ç—Ä–æ–π–∫–∏ ===
header = {'User-Agent': 'HH-Data-Coll/v1.0 (contact 135861v@mail.ru)'}
area_id_russia = 113
per_page = 100
max_pages = 100
semaphore = asyncio.Semaphore(5)

# –û–ø—ã—Ç
experiences = ['noExperience', 'between1And3', 'between3And6', 'moreThan6']

# –ù–∞—Å—Ç—Ä–æ–π–∫–∏ –ª–æ–≥–≥–∏—Ä–æ–≤–∞–Ω–∏—è
log_filename = f"hh_parser_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",  
    handlers=[
        logging.FileHandler(log_filename, encoding="utf-8"), 
        logging.StreamHandler()
    ]
)

def log_elapsed_time(start, task="Code"):
    end = time.perf_counter()
    elapsed = end - start
    mins, secs = divmod(elapsed, 60)
    hrs, mins = divmod(mins, 60)
    logging.info(f"‚è± –í—Ä–µ–º—è –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è ({task}): {int(hrs):02d}:{int(mins):02d}:{secs:.2f} —Å–µ–∫—É–Ω–¥")


# –ü–µ—Ä–∏–æ–¥: –ø–æ—Å–ª–µ–¥–Ω–∏–µ 30 –¥–Ω–µ–π —Å —à–∞–≥–æ–º 7 –¥–Ω–µ–π
def get_date_ranges(days=30, step=7):
    today = datetime.now(timezone.utc).date()
    start_date = today - timedelta(days=days)
    ranges = []
    current = start_date
    while current < today:
        end = min(current + timedelta(days=step), today)
        ranges.append((current.isoformat(), end.isoformat()))
        current = end
    return ranges

# –ó–∞–≥—Ä—É–∑–∫–∞ —Å–ø—Ä–∞–≤–æ—á–Ω–∏–∫–æ–≤ –¥–ª—è –≤–∞–ª–∏–¥–∞—Ü–∏–∏ –¥–∞–Ω–Ω—ã—Ö
async def fetch_reference_data():
    async with aiohttp.ClientSession(headers=header) as session:
        async with session.get('https://api.hh.ru/dictionaries') as resp:
            dict_data = await resp.json()

        async with session.get('https://api.hh.ru/areas') as resp:
            area_data = await resp.json()
            area_ids = set()

            def extract_area_ids(areas):
                for a in areas:
                    area_ids.add(str(a['id']))
                    if 'areas' in a and a['areas']:
                        extract_area_ids(a['areas'])

            extract_area_ids(area_data)

        return {
            'experience': {e['name'] for e in dict_data['experience']},
            'employment': {e['name'] for e in dict_data['employment']},
            'schedule': {e['name'] for e in dict_data['schedule']},
            'area_ids': area_ids
        }

# RateLimiter
class RateLimiter:
    def __init__(self, max_rps=3, long_pause_every=500, long_pause_duration=60):
        self.max_rps = max_rps
        self.long_pause_every = long_pause_every
        self.long_pause_duration = long_pause_duration
        self.last_request = None
        self.request_count = 0
        self.lock = asyncio.Lock()

    async def wait(self):
        async with self.lock:
            now = time.monotonic()
            if self.last_request is not None:
                elapsed = now - self.last_request
                delay = max(0, 1.0 / self.max_rps - elapsed)
                if delay > 0:
                    await asyncio.sleep(delay)
            self.last_request = time.monotonic()
            self.request_count += 1
            if self.request_count % self.long_pause_every == 0:
                logging.info(f"–î–æ–ª–≥–∞—è –ø–∞—É–∑–∞ {self.long_pause_duration} —Å–µ–∫ –ø–æ—Å–ª–µ {self.request_count} –∑–∞–ø—Ä–æ—Å–æ–≤...")
                await asyncio.sleep(self.long_pause_duration)

limiter = RateLimiter()


In [18]:
#=== –§—É–Ω–∫—Ü–∏–∏ —Å–±–æ—Ä–∞ –≤–∞–∫–∞–Ω—Å–∏–π ===
# –ü–æ–ª—É—á–µ–Ω–∏–µ IT-—Ä–æ–ª–µ–π
async def fetch_it_roles():
    url = 'https://api.hh.ru/professional_roles'
    async with aiohttp.ClientSession(headers=header) as session:
        async with session.get(url) as response:
            response.raise_for_status()
            data = await response.json()
            for cat in data['categories']:
                if int(cat['id']) == 11:
                    roles = [
                        {'id': int(role['id']), 'name': role['name']}
                        for role in cat['roles']
                        if int(role['id']) not in [12, 25, 34, 155]
                    ]
                    logging.info(f'üîπ –ù–∞–π–¥–µ–Ω–æ {len(roles)} IT-—Ä–æ–ª–µ–π')
                    return roles
            return []

# –ó–∞–≥—Ä—É–∑–∫–∞ –æ–¥–Ω–æ–π —Å—Ç—Ä–∞–Ω–∏—Ü—ã —Å –ø–æ–≤—Ç–æ—Ä–Ω—ã–º–∏ –ø–æ–ø—ã—Ç–∫–∞–º–∏
async def fetch_page(session, params, page, desc, max_retries=3):
    params['page'] = page
    for attempt in range(1, max_retries + 1):
        await limiter.wait()
        try:
            async with semaphore:
                async with session.get('https://api.hh.ru/vacancies', params=params) as response:
                    if response.status == 403:
                        logging.warning(f"üö´ –û—à–∏–±–∫–∞ 403 –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page} ‚Äî {desc} (–ø–æ–ø—ã—Ç–∫–∞ {attempt})")
                    elif response.status >= 400:
                        logging.warning(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ {response.status} –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page} ‚Äî {desc} (–ø–æ–ø—ã—Ç–∫–∞ {attempt})")
                    response.raise_for_status()
                    return await response.json()
        except Exception as e:
            logging.warning(f"‚ö†Ô∏è –ò—Å–∫–ª—é—á–µ–Ω–∏–µ –ø—Ä–∏ –∑–∞–≥—Ä—É–∑–∫–µ —Å—Ç—Ä–∞–Ω–∏—Ü—ã {page} ‚Äî {desc} (–ø–æ–ø—ã—Ç–∫–∞ {attempt}): {e}")
            await asyncio.sleep(random.uniform(1.0, 2.0))

    error_msg = f"‚ùå –û—à–∏–±–∫–∞: –Ω–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å —Å—Ç—Ä–∞–Ω–∏—Ü—É {page} ‚Äî {desc} –ø–æ—Å–ª–µ {max_retries} –ø–æ–ø—ã—Ç–æ–∫. –û—Å—Ç–∞–Ω–æ–≤–∫–∞."
    logging.error(error_msg)
    raise RuntimeError(error_msg)

# –°–±–æ—Ä –≤–∞–∫–∞–Ω—Å–∏–π –ø–æ –∫–æ–º–±–∏–Ω–∞—Ü–∏–∏ —Ä–æ–ª–µ–π, –æ–ø—ã—Ç–∞ –∏ –¥–∞—Ç
async def fetch_vacancies(session, role_id, role_name, experience, date_from, date_to):
    desc = f"—Ä–æ–ª—å '{role_name}', –æ–ø—ã—Ç '{experience}', {date_from}‚Äì{date_to}"
    vacancies = []
    params = {
        'professional_role': role_id,
        'area': area_id_russia,
        'experience': experience,
        'date_from': date_from,
        'date_to': date_to,
        'per_page': per_page,
    }

    for page in range(max_pages):
        data = await fetch_page(session, params, page, desc)
        if data is None:
            break
        items = data.get('items', [])
        if not items:
            break
        vacancies.extend(items)
        if page >= data.get('pages', 0) - 1:
            break
        await asyncio.sleep(random.uniform(0.3, 0.6))

    logging.info(f"‚úÖ –°–æ–±—Ä–∞–Ω–æ {len(vacancies)} –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî {desc}")
    return vacancies

# –û—Å–Ω–æ–≤–Ω–æ–π —Å–±–æ—Ä –¥–∞–Ω–Ω—ã—Ö
async def collect_vacancies_async():
    all_vacancies = []
    date_ranges = get_date_ranges()
    roles = await fetch_it_roles()

    try:
        async with aiohttp.ClientSession(headers=header) as session:
            for role in roles:
                logging.info(f"\n== –†–æ–ª—å: {role['name']} (ID {role['id']}) ==")
                for exp in experiences:
                    for date_from, date_to in date_ranges:
                        batch = await fetch_vacancies(session, role['id'], role['name'], exp, date_from, date_to)
                        all_vacancies.extend(batch)

    except RuntimeError as critical_error:
        logging.critical(f"üö® –ü–∞—Ä—Å–∏–Ω–≥ –ø—Ä–µ—Ä–≤–∞–Ω: {critical_error}")
        raise  

    # logging.info(f"\nüöÄ –í—Å–µ–≥–æ —Å–æ–±—Ä–∞–Ω–æ {len(all_vacancies)} –≤–∞–∫–∞–Ω—Å–∏–π")
    return all_vacancies

# –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –≤–∞–∫–∞–Ω—Å–∏–π –≤ DataFrame
def extract_vacancy_data(raw_vacancies):
    extracted = []
    for v in raw_vacancies:
        area_info = v.get('area', {})
        extracted.append({
            'load_datetime': datetime.now(),
            'id': v.get('id'),
            'name': v.get('name'),
            'area': area_info.get('name'),
            'area_id': area_info.get('id'),
            'employer': v.get('employer', {}).get('name'),
            'published_at': v.get('published_at'),
            'created_at': v.get('created_at'),
            'closed_at': v.get('closed_at'),
            'archived': v.get('archived'),
            'url': v.get('alternate_url'),
            'salary_from': v.get('salary', {}).get('from') if v.get('salary') else None,
            'salary_to': v.get('salary', {}).get('to') if v.get('salary') else None,
            'currency': v.get('salary', {}).get('currency') if v.get('salary') else None,
            'experience': v.get('experience', {}).get('name'),
            'schedule': v.get('schedule', {}).get('name'),
            'employment': v.get('employment', {}).get('name'),
        })
    return pd.DataFrame(extracted)

# –í–∞–ª–∏–¥–∞—Ü–∏—è –¥–∞–Ω–Ω—ã—Ö
def validate_vacancies(df: pd.DataFrame, dicts: dict):
    required_fields = ['id', 'name', 'area', 'area_id', 'published_at']
    errors = []

    def is_empty(val):
        return val is None or val == "" or (isinstance(val, list) and len(val) == 0)

    invalid_rows = []

    for i, row in df.iterrows():
        row_errors = []

        # –ü—Ä–æ–≤–µ—Ä–∫–∞ –æ–±—è–∑–∞—Ç–µ–ª—å–Ω—ã—Ö –ø–æ–ª–µ–π –Ω–∞ –ø—É—Å—Ç–æ—Ç—É
        for field in required_fields:
            if field not in row or is_empty(row[field]):
                row_errors.append(f"–ü—É—Å—Ç–æ–µ –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ–µ –ø–æ–ª–µ '{field}'")

        # –ü—Ä–æ–≤–µ—Ä–∫–∞ –∑–Ω–∞—á–µ–Ω–∏–π –∏–∑ —Å–ø—Ä–∞–≤–æ—á–Ω–∏–∫–æ–≤
        if 'experience' in row and row['experience'] not in dicts['experience'] and not is_empty(row['experience']):
            row_errors.append(f"–ù–µ–∏–∑–≤–µ—Å—Ç–Ω–æ–µ experience: {row['experience']}")
        if 'employment' in row and row['employment'] not in dicts['employment'] and not is_empty(row['employment']):
            row_errors.append(f"–ù–µ–∏–∑–≤–µ—Å—Ç–Ω–æ–µ employment: {row['employment']}")
        if 'schedule' in row and row['schedule'] not in dicts['schedule'] and not is_empty(row['schedule']):
            row_errors.append(f"–ù–µ–∏–∑–≤–µ—Å—Ç–Ω–æ–µ schedule: {row['schedule']}")
        if 'area_id' in row and str(row['area_id']) not in dicts['area_ids'] and not is_empty(row['area_id']):
            row_errors.append(f"–ù–µ–∏–∑–≤–µ—Å—Ç–Ω—ã–π area_id: {row['area_id']}")

        if row_errors:
            errors.append((i, row_errors))
            invalid_rows.append(i)

    if errors:
        for i, errs in errors:
            logging.warning(f"–°—Ç—Ä–æ–∫–∞ {i} –æ—Ç–∫–ª–æ–Ω–µ–Ω–∞: {', '.join(errs)}")

    reject_df = df.loc[invalid_rows].copy()
    valid_df = df.drop(index=invalid_rows).reset_index(drop=True)

    return valid_df.reset_index(drop=True), reject_df.reset_index(drop=True)


In [20]:
# –ó–∞–ø—É—Å–∫ —Å–±–æ—Ä–∞ –≤–∞–∫–∞–Ω—Å–∏–π
start = time.perf_counter()

raw_vacancies = await collect_vacancies_async()
df = extract_vacancy_data(raw_vacancies)

reference_data = await fetch_reference_data()
valid_df, reject_df = validate_vacancies(df, reference_data)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
reject_filename = f"rejected_vacancies_{timestamp}_reject.parquet"
if not reject_df.empty:
    reject_df.to_csv(reject_filename, index=False)
    logging.info(f"\nüö´ {len(reject_df)} –≤–∞–∫–∞–Ω—Å–∏–π –æ—Ç–∫–ª–æ–Ω–µ–Ω–æ –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–æ –≤ —Ñ–∞–π–ª {reject_filename}")
else:
    logging.info("\n‚úÖ –í—Å–µ –≤–∞–∫–∞–Ω—Å–∏–∏ –ø—Ä–æ—à–ª–∏ –≤–∞–ª–∏–¥–∞—Ü–∏—é. Reject-—Ñ–∞–π–ª –Ω–µ —Å–æ–∑–¥–∞–Ω.")


logging.info(f"\nüöÄ –í—Å–µ–≥–æ —Å–æ–±—Ä–∞–Ω–æ –≤–∞–∫–∞–Ω—Å–∏–π: {len(raw_vacancies)}")
logging.info(f"üì¶ –ó–∞–≥—Ä—É–∂–µ–Ω–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö id: {valid_df['id'].nunique()}")

log_elapsed_time(start, "hh_parser")


2025-07-28 19:56:14,326 [INFO] üîπ –ù–∞–π–¥–µ–Ω–æ 21 IT-—Ä–æ–ª–µ–π
2025-07-28 19:56:14,327 [INFO] 
== –†–æ–ª—å: BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö (ID 156) ==
2025-07-28 19:56:14,707 [INFO] ‚úÖ –°–æ–±—Ä–∞–Ω–æ 12 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-06-28‚Äì2025-07-05
2025-07-28 19:56:15,130 [INFO] ‚úÖ –°–æ–±—Ä–∞–Ω–æ 40 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-07-05‚Äì2025-07-12
2025-07-28 19:56:15,552 [INFO] ‚úÖ –°–æ–±—Ä–∞–Ω–æ 37 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-07-12‚Äì2025-07-19
2025-07-28 19:56:15,962 [INFO] ‚úÖ –°–æ–±—Ä–∞–Ω–æ 41 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª—å 'BI-–∞–Ω–∞–ª–∏—Ç–∏–∫, –∞–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö', –æ–ø—ã—Ç 'noExperience', 2025-07-19‚Äì2025-07-26
2025-07-28 19:56:16,226 [INFO] ‚úÖ –°–æ–±—Ä–∞–Ω–æ 15 –≤–∞–∫–∞–Ω—Å–∏–π ‚Äî —Ä–æ–ª

In [21]:
valid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49969 entries, 0 to 49968
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   load_datetime  49969 non-null  datetime64[ns]
 1   id             49969 non-null  object        
 2   name           49969 non-null  object        
 3   area           49969 non-null  object        
 4   area_id        49969 non-null  object        
 5   employer       49969 non-null  object        
 6   published_at   49969 non-null  object        
 7   created_at     49969 non-null  object        
 8   closed_at      0 non-null      object        
 9   archived       49969 non-null  bool          
 10  url            49969 non-null  object        
 11  salary_from    19515 non-null  float64       
 12  salary_to      11958 non-null  float64       
 13  currency       22193 non-null  object        
 14  experience     49969 non-null  object        
 15  schedule       4996

In [22]:
valid_df.head()

Unnamed: 0,load_datetime,id,name,area,area_id,employer,published_at,created_at,closed_at,archived,url,salary_from,salary_to,currency,experience,schedule,employment
0,2025-07-28 20:06:43.536101,122307065,Data Engineer,–û—Ä–µ–Ω–±—É—Ä–≥,70,–ú–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ —Ü–∏—Ñ—Ä–æ–≤–æ–≥–æ —Ä–∞–∑–≤–∏—Ç–∏—è –∏ —Å–≤—è–∑–∏ –û—Ä–µ–Ω–±—É...,2025-07-02T13:43:25+0300,2025-07-02T13:43:25+0300,,False,https://hh.ru/vacancy/122307065,71000.0,,RUR,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
1,2025-07-28 20:06:43.536115,122301031,–°—Ç–∞–∂–µ—Ä —Å–∏—Å—Ç–µ–º–Ω—ã–π –∞–Ω–∞–ª–∏—Ç–∏–∫,–ú–æ—Å–∫–≤–∞,1,–°–±–µ—Ä–ú–æ–±–∞–π–ª,2025-07-02T11:55:06+0300,2025-07-02T11:55:06+0300,,False,https://hh.ru/vacancy/122301031,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ì–∏–±–∫–∏–π –≥—Ä–∞—Ñ–∏–∫,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
2,2025-07-28 20:06:43.536119,121374239,–ê–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö (Junior),–ú–æ—Å–∫–≤–∞,1,–ë–∞–Ω–∫ –í–¢–ë (–ü–ê–û),2025-07-03T14:13:24+0300,2025-07-03T14:13:24+0300,,False,https://hh.ru/vacancy/121374239,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
3,2025-07-28 20:06:43.536123,122249561,–ê–Ω–∞–ª–∏—Ç–∏–∫ –¥–∞–Ω–Ω—ã—Ö,–ú–æ—Å–∫–≤–∞,1,–°—ã–∫—Ç—ã–≤–∫–∞—Ä –¢–∏—Å—Å—å—é –ì—Ä—É–ø,2025-07-01T12:39:43+0300,2025-07-01T12:39:43+0300,,False,https://hh.ru/vacancy/122249561,100000.0,,RUR,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
4,2025-07-28 20:06:43.536127,122425111,–ê—Å—Å–∏—Å—Ç–µ–Ω—Ç –ø–æ —Ä–∞–±–æ—Ç–µ —Å –¥–∞–Ω–Ω—ã–º–∏ (–ê—Å—Å–∏—Å—Ç–µ–Ω—Ç –∞–Ω–∞–ª–∏...,–ú–æ—Å–∫–≤–∞,1,"–ì—Ä—É–ø–ø–∞ –∫–æ–º–ø–∞–Ω–∏–π ¬´Group4Media¬ª, –£–ø—Ä–∞–≤–ª—è—é—â–∞—è –∫–æ–º...",2025-07-04T16:02:08+0300,2025-07-04T16:02:08+0300,,False,https://hh.ru/vacancy/122425111,,,,–ù–µ—Ç –æ–ø—ã—Ç–∞,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å
