In [2]:
import os
from time import sleep
from dotenv import dotenv_values

import json
import requests

import numpy as np
import pandas as pd 
from tqdm import tqdm

from geopy.geocoders import Nominatim

In [4]:
# Render every DataFrame column instead of truncating wide frames.
pd.options.display.max_columns = None

In [5]:
def get_vacancies_id(header:dict, param:dict=None, URL:str='https://api.hh.ru/vacancies', per_page:int=100, time_delay:float=0, timeout:float=10)->dict:
    '''Send one GET request and return the decoded JSON body.

    Args:
        header: HTTP headers sent with the request.
        param: query-string parameters, or None to send none.
        URL: address to request.
        per_page: accepted for backward compatibility only; the function never
            reads it. The page size must be supplied through ``param``.
        time_delay: seconds to sleep before the request is sent.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request failed or timed out.
    '''
    sleep(time_delay)
    response = requests.get(url=URL, headers=header, params=param, timeout=timeout)
    # Fail loudly on an error status instead of handing an error body to the
    # caller as if it were the requested data.
    response.raise_for_status()
    return response.json()

In [6]:
# --- Search settings ---------------------------------------------------------
JOB_TITLE = 'Аналитик данных'
BASE_URL = 'https://api.hh.ru/vacancies'
PER_PAGE = 100

# --- Machine-specific settings, read from the local .env file ----------------
_env = dotenv_values('.env')
USER_AGENT = _env['USER_AGENT']                  # sent as the User-Agent request header
CACHE_FILE = _env['CACHE_FILE']                  # file used by load_cache / save_cache
PATH_TO_YANDEXDISC = _env['PATH_TO_YANDEXDISC']  # folder that receives the Excel export

# --- Request headers and query parameters of the vacancy search --------------
header = {'User-Agent': USER_AGENT}
param = dict(
    text=JOB_TITLE,
    search_field='name',
    page=0,
    per_page=PER_PAGE,
    only_with_salary=True,
    locale='RU',
)

In [9]:
# Probe request: read the totals reported by the search before paging through it.
resp = get_vacancies_id(header, param, per_page=PER_PAGE)

VAC_CNT, PAGES_CNT, current_page = (resp[key] for key in ('found', 'pages', 'page'))
VAC_CNT, PAGES_CNT, current_page

(203, 3, 0)

In [10]:
# Collect the ids of all vacancies matching `param`, one request per result page.
vac_ids = []
current_page = 0
PAGES_CNT = 1  # placeholder so the loop runs once; replaced by the first response

while current_page < PAGES_CNT:
    # Send a copy carrying the page number: the shared `param` dict is never
    # mutated, so re-running this cell always starts again from page 0.
    resp = get_vacancies_id(header, {**param, 'page': current_page}, per_page=PER_PAGE)

    PAGES_CNT = resp['pages']
    # Refresh the expected total from the same responses the ids come from;
    # the listing can change between the probe request and this loop.
    VAC_CNT = resp['found']

    vac_ids.extend(item['id'] for item in resp['items'])
    current_page += 1

# A vacancy may appear on two pages if the listing shifts while paging;
# drop repeats while keeping the original order.
vac_ids = list(dict.fromkeys(vac_ids))

print("It's all OK" if len(set(vac_ids)) == VAC_CNT else f"Smt went WRONG\n{len(set(vac_ids))}___{VAC_CNT}")

Smt went WRONG
204___203


In [11]:
def get_skills_str(key_skills:list)->str:
    '''Join the names of the required skills into a single "; "-separated string.

    Args:
        key_skills: list of dicts, each expected to hold the skill under the
            ``'name'`` key. ``None`` or an empty list is accepted.

    Returns:
        The skill names joined with ``'; '``. Entries without a name are
        skipped; an empty string is returned when there is nothing to join.
    '''
    names = (skill.get('name') for skill in key_skills or [])
    # A missing name would make str.join raise TypeError, so filter those out.
    return '; '.join(name for name in names if name)

In [12]:
CURRENCY_CACHE = {}        # currency code -> rate (feed 'Value' divided by 'Nominal')
updated_currency = False   # True once the rates were refreshed from the network in this session

# Dedicated name: the module-level CACHE_FILE refers to a different cache file.
CURRENCY_CACHE_FILE = r'resources/valute_cache.json'
CBR_RATES_URL = "https://www.cbr-xml-daily.ru/daily_json.js"
# Currency codes that HH spells differently from the rates feed.
HH_CURRENCY_ALIASES = {'BYR': 'BYN'}


def _load_rates_from_disk() -> None:
    '''Seed the empty in-memory cache from the cache file, if a readable one exists.'''
    if CURRENCY_CACHE or not os.path.exists(CURRENCY_CACHE_FILE):
        return
    try:
        with open(CURRENCY_CACHE_FILE, encoding='utf-8') as f:
            CURRENCY_CACHE.update(json.load(f))
    except (OSError, ValueError, TypeError) as e:
        # A damaged cache file must not break the conversion; just ignore it.
        print(f'Error: Не удалось прочитать кэш курсов валют ({e})')


def _save_rates_to_disk() -> None:
    '''Write the in-memory cache to the cache file, creating its folder if needed.'''
    try:
        os.makedirs(os.path.dirname(CURRENCY_CACHE_FILE) or '.', exist_ok=True)
        with open(CURRENCY_CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(CURRENCY_CACHE, f, ensure_ascii=False, indent=2)
    except OSError as e:
        print(f'Error: Не удалось сохранить кэш курсов валют ({e})')


def get_curse_valute(valute: str) -> float:
    '''Return the cached exchange rate for a currency code.

    The rates are downloaded once per session. Until a download succeeds, the
    rates stored in the cache file (if any) are used as a fallback and the
    download is retried on the next call.

    Args:
        valute: currency code in any letter case; HH-specific spellings
            listed in HH_CURRENCY_ALIASES are translated first.

    Returns:
        The rate for one unit of the currency, or 1 when the currency is
        unknown (an error line is printed in that case).
    '''
    global updated_currency

    valute = valute.upper()
    valute = HH_CURRENCY_ALIASES.get(valute, valute)

    if not updated_currency:
        _load_rates_from_disk()
        try:
            res = requests.get(CBR_RATES_URL, timeout=3)
            res.raise_for_status()
            valutes = res.json().get('Valute', {})

            # Update in place: the cache object is shared module state and is
            # never rebound, so rates survive between calls.
            for code, data in valutes.items():
                nominal = data.get('Nominal', 1)
                value = data.get('Value')
                if nominal and value:
                    CURRENCY_CACHE[code] = value / nominal

            updated_currency = True
        except Exception as e:
            print(f'Error: Не удалось обновить курсы валют ({e})')
        else:
            # A failed write is reported separately and does not trigger
            # another download on the next call.
            _save_rates_to_disk()

    if valute in CURRENCY_CACHE:
        return CURRENCY_CACHE[valute]

    print(f'Error: Не удалось определить курс валюты {valute}')
    return 1

In [17]:
# --- Normalisation settings --------------------------------------------------
HOURS_PER_MONTH = 160       # working hours per month used for hourly salaries
NET_SHARE_OF_GROSS = 0.87   # multiplier applied to gross salaries (presumably 13 % income tax - confirm)
DEFAULT_PAY_MODE = 'За месяц'
DEFAULT_GRADE = 'Middle'

# Ordered (grade, keywords) pairs. The first grade with a keyword contained in
# the lower-cased vacancy title wins, so the order is significant.
# NOTE(review): this is plain substring matching, so e.g. 'intern' also matches
# 'international' - confirm whether that is acceptable.
GRADE_KEYWORDS = (
    ('Intern', ('стажер', 'стажёр', 'интерн', 'помощник', 'intern', 'trainee')),
    ('Junior', ('младший', 'junior', 'джуниор', 'начинающий')),
    ('Middle', ('мидл', 'middle', 'средний', 'mid-level')),
    ('Senior', ('сеньор', 'senior', 'старший', 'ведущий', 'опытный')),
    ('Team Lead', ('тимлид', 'teamlead', 'руководитель', 'lead', 'главный')),
)


def _section(payload: dict, key: str) -> dict:
    '''Return payload[key], or an empty dict when the key is missing or holds null.'''
    return payload.get(key) or {}


def _detect_grade(title: str) -> str:
    '''Derive the grade from keywords in the vacancy title (DEFAULT_GRADE if none match).'''
    title = (title or '').lower()
    for grade, keywords in GRADE_KEYWORDS:
        if any(word in title for word in keywords):
            return grade
    return DEFAULT_GRADE


def _monthly_net_salary(salary: dict, mode_name: str) -> tuple:
    '''Return (salary_from, salary_to) scaled to a month, net of tax and in rubles.

    Hourly amounts are multiplied by HOURS_PER_MONTH and yearly ones divided
    by 12; gross amounts are multiplied by NET_SHARE_OF_GROSS; amounts in a
    currency other than 'RUR' are converted with get_curse_valute and rounded.
    A missing bound stays None.
    '''
    if not salary:
        return None, None

    mode_name = (mode_name or DEFAULT_PAY_MODE).replace('\xa0', ' ')
    if 'час' in mode_name:
        coef = HOURS_PER_MONTH
    elif 'год' in mode_name:
        coef = 1/12
    else:
        coef = 1

    conversion_rate = NET_SHARE_OF_GROSS if salary.get('gross', True) else 1.0

    currency = salary.get('currency')
    rate = get_curse_valute(currency) if currency and currency != 'RUR' else None

    def convert(raw):
        if raw is None:
            return None
        amount = raw * coef * conversion_rate
        return amount if rate is None else round(amount * rate)

    return convert(salary.get('from')), convert(salary.get('to'))


# --- Download and flatten every vacancy --------------------------------------
data_list = []
failed_ids = []

for vac_id in tqdm(vac_ids, desc='Получение описания вакансий...'):
    try:
        vac_annote = get_vacancies_id(header=header, URL=BASE_URL+f'/{vac_id}', time_delay=0.4)

        # A payload without an id is not a vacancy (presumably an API error
        # body); skip it instead of appending a row of empty values.
        if not vac_annote.get('id'):
            raise ValueError(f"response has no vacancy id: {vac_annote.get('errors')}")

        # `or {}` / `or []` also covers keys that are present but hold null,
        # which a `.get(key, {})` default does not.
        employer = _section(vac_annote, 'employer')
        address = _section(vac_annote, 'address')
        salary = _section(vac_annote, 'salary')
        mode = _section(_section(vac_annote, 'salary_range'), 'mode')
        prof_roles = vac_annote.get('professional_roles') or []
        key_skills = vac_annote.get('key_skills')

        salary_from, salary_to = _monthly_net_salary(salary, mode.get('name'))

        # Coordinates are stored as one "[lat, lng]" string, only when both are known.
        lat, lng = address.get('lat'), address.get('lng')
        geo = f"[{lat}, {lng}]" if lat is not None and lng is not None else None

        row = {
            'vac_id': vac_annote.get('id'),
            'vac_name': vac_annote.get('name'),
            'grade': _detect_grade(vac_annote.get('name')),
            'city': _section(vac_annote, 'area').get('name'),
            'geo': geo,
            'published_at': vac_annote.get('published_at'),
            'archived': vac_annote.get('archived'),
            'employer_id': employer.get('id'),
            'emp_name': _section(vac_annote, 'employment').get('name'),
            'addres': address.get('raw'),
            'is_accredited': employer.get('accredited_it_employer'),
            'is_trusted': employer.get('trusted'),
            'salary_from' : salary_from,
            'salary_to' : salary_to,
            'currency': salary.get('currency'),
            'gross': salary.get('gross'),
            'mode_name': mode.get('name'),
            'frequency': mode.get('frequency'),
            'prof_role': prof_roles[0].get('name') if prof_roles else None,
            'schedule_name': _section(vac_annote, 'schedule').get('name'),
            'insider_interview': vac_annote.get('insider_interview'),
            'response_letter_required': vac_annote.get('response_letter_required'),
            'experience': _section(vac_annote, 'experience').get('name'),
            'key_skills': get_skills_str(key_skills) if key_skills else None,
            'has_test': vac_annote.get('has_test'),
            'description': vac_annote.get('description'),
            'url':vac_annote.get('alternate_url')
        }
        data_list.append(row)

    except Exception as e:
        failed_ids.append(vac_id)
        print(f'Error processing vac_id {vac_id}: {e}')

# Build the DataFrame in a single call
full_df = pd.DataFrame(data_list)
if failed_ids:
    print(f'Warning: {len(failed_ids)} of {len(vac_ids)} vacancies could not be parsed')
print('Success: Parsing done')

Получение описания вакансий...:   0%|          | 0/204 [00:00<?, ?it/s]

Получение описания вакансий...: 100%|██████████| 204/204 [01:50<00:00,  1.84it/s]

Success: Parsing done





In [19]:
# Vacancies paid per hour; the label stored in mode_name contains a non-breaking space.
full_df.loc[full_df['mode_name'].eq('За\xa0час')]

Unnamed: 0,vac_id,vac_name,grade,city,geo,published_at,archived,employer_id,emp_name,addres,is_accredited,is_trusted,salary_from,salary_to,currency,gross,mode_name,frequency,prof_role,schedule_name,insider_interview,response_letter_required,experience,key_skills,has_test,description,url
141,122429339,Аналитик 1с / миграция анализ учёт и хранение ...,Middle,Санкт-Петербург,"[55.864505, 37.544449]",2025-07-04T17:31:30+0300,False,1217056,Полная занятость,"Москва, Коровинское шоссе, 1А",False,True,,159840.0,RUR,False,За час,,Другое,Удаленная работа,,False,От 3 до 6 лет,1С-Битрикс; Аналитическое мышление; 1С: Предпр...,False,<p><strong>Обязанности</strong>:</p> <p>• Поша...,https://hh.ru/vacancy/122429339


In [59]:
# Shared Nominatim geocoder client used by get_geopoints (10-second request timeout).
geolocator = Nominatim(timeout=10, user_agent="geoapi")

def load_cache(path=None):
    '''Load the geocoding cache from a JSON file.

    Args:
        path: file to read; defaults to the module-level CACHE_FILE.

    Returns:
        The cached dict, or an empty dict when the file is missing, cannot be
        read, or does not contain a JSON object. A warning is printed in each
        of those cases.
    '''
    path = CACHE_FILE if path is None else path

    if not os.path.exists(path):
        print('⚠️ Warning: Cache file not found')
        return {}

    try:
        with open(path, 'r', encoding='utf-8') as f:
            cache = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes. Report it
        # instead of silently swallowing every exception.
        print(f'⚠️ Warning: Cache file could not be read ({e})')
        return {}

    if not isinstance(cache, dict):
        print('⚠️ Warning: Cache file does not contain a JSON object')
        return {}
    return cache


def get_geopoints(city: str, cache: dict) -> 'str | None':
    '''Return the coordinates of a city as a "[lat, lng]" string, using a cache.

    Args:
        city: name passed to the geocoder; empty or NaN values give None.
        cache: dict mapping a city to its result. It is updated in place with
            every answer received from the geocoder, including "not found"
            (stored as None).

    Returns:
        The "[latitude, longitude]" string, or None when the city is empty,
        unknown to the geocoder, or the lookup failed.
    '''
    if not city or pd.isna(city):
        return None

    # Serve from the cache when possible
    if city in cache:
        return cache[city]

    # Not cached yet: ask the geocoder
    try:
        location = geolocator.geocode(f"{city}")
    except Exception as e:
        # A failed request says nothing about the city itself, so the failure
        # is not cached and the lookup is retried next time.
        print(f'Error: Geocoding failed for {city} ({e})')
        return None

    if location:
        result = f'[{str(location.latitude)}, {str(location.longitude)}]'
    else:
        print(f'Error: Went wrong with {city}')
        result = None

    # Remember the geocoder's answer
    cache[city] = result
    return result


def save_cache(cache, path=None):
    '''Write the geocoding cache to a JSON file.

    The data is written to a temporary file next to the target and then moved
    over it, so an interrupted write cannot leave a truncated cache behind.

    Args:
        cache: JSON-serialisable dict to store.
        path: file to write; defaults to the module-level CACHE_FILE.
    '''
    path = CACHE_FILE if path is None else path
    tmp_path = f'{path}.tmp'

    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(cache, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)

In [60]:
# Fill the missing coordinates by geocoding the vacancy's city.
cache = load_cache()
mask = full_df.geo.isna()

try:
    full_df.loc[mask, 'geo'] = full_df.loc[mask, 'city'].apply(lambda x: get_geopoints(x, cache))
finally:
    # Persist whatever was resolved even if the cell is interrupted, so the
    # lookups already made are not repeated on the next run.
    save_cache(cache)

--- 

In [61]:
# Keep a local CSV copy of the parsed vacancies (the index is written as well).
csv_path = 'resources/full_df.csv'
full_df.to_csv(path_or_buf=csv_path)
print('✔ Success: DataFrame saved')

✔ Success: DataFrame saved


In [48]:
# Export to the Yandex Disk folder. os.path.join picks the separator of the
# current OS; a hard-coded backslash only forms a valid path on Windows.
full_df.to_excel(os.path.join(PATH_TO_YANDEXDISC, 'my.xlsx'))
print('✔ Success: DataFrame saved to YandexDisc')

✔ Success: DataFrame saved to YandexDisc
