In [None]:
# %pip install tqdm

In [1]:
import json
import requests
import numpy as np
import pandas as pd 
from tqdm import tqdm
from time import sleep
from dotenv import dotenv_values

In [2]:
def get_vacancies_id(header:dict, param:dict=None, URL:str='https://api.hh.ru/vacancies', per_page:int=100, time_delay:float=0, timeout:float=30)->dict:
    """Send one GET request and return the decoded JSON body.

    Args:
        header: HTTP headers sent with the request.
        param: Query-string parameters. ``None`` sends no query string at all.
        URL: Endpoint to call.
        per_page: Fallback page size. It is added to the query only when
            ``param`` is given and has no 'per_page' key of its own; an
            explicit value inside ``param`` always wins.
        time_delay: Seconds to sleep before sending the request.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error instead of blocking indefinitely.

    Returns:
        Whatever ``Response.json()`` decodes from the response body. The HTTP
        status code is not checked, so an error payload is returned as-is.

    Raises:
        Any exception raised by ``requests.get`` (connection failure, timeout)
        or by ``Response.json()`` when the body is not valid JSON.
    """
    if param is not None and 'per_page' not in param:
        # Build a new dict so the caller's `param` is never modified.
        param = {**param, 'per_page': per_page}

    sleep(time_delay)
    return requests.get(url=URL, headers=header, params=param, timeout=timeout).json()

In [3]:
# --- Search configuration -------------------------------------------------
JOB_TITLE = 'Аналитик данных'   # text searched for in the vacancy name
PER_PAGE = 100                  # page size requested from the API
BASE_URL = 'https://api.hh.ru/vacancies'

# The User-Agent value is read from the local .env file rather than hardcoded.
USER_AGENT = dotenv_values('.env')['USER_AGENT']

# Request headers and query parameters shared by the search requests.
header = dict([('User-Agent', USER_AGENT)])
param = dict(
    text=JOB_TITLE,
    search_field='name',
    page=0,
    per_page=PER_PAGE,
    only_with_salary=True,
    locale='RU',
)

In [4]:
# Probe request with the shared search parameters; the counters below are
# read from the 'found', 'pages' and 'page' fields of the response.
resp = get_vacancies_id(header, param, per_page=PER_PAGE)

VAC_CNT, PAGES_CNT, current_page = resp['found'], resp['pages'], resp['page']

# Last expression of the cell: display the three counters.
(VAC_CNT, PAGES_CNT, current_page)

(200, 2, 0)

In [5]:
# Collect the ids of all vacancies matching the search, one request per page.
vac_ids = []
current_page = 0
PAGES_CNT = 1  # provisional value; replaced by the response's 'pages' field after the first request

while current_page < PAGES_CNT:
    # Use a per-request copy so the shared `param` dict is left untouched
    # and this cell gives the same result when it is run again.
    page_param = {**param, 'page': current_page}
    resp = get_vacancies_id(header, page_param, per_page=PER_PAGE)
    PAGES_CNT = resp['pages']

    vac_ids.extend(item['id'] for item in resp['items'])

    current_page += 1


# Sanity check: the number of distinct ids collected should equal VAC_CNT.
print("It's all OK" if len(set(vac_ids)) == VAC_CNT else f"Smt went WRONG\n{len(set(vac_ids))}___{VAC_CNT}")

It's all OK


In [6]:
def get_skills_str(key_skills:list)->str:
    '''Join the 'name' value of every skill dict in the list into one '; '-separated string.'''
    return '; '.join(entry.get('name') for entry in key_skills)

In [7]:
# Column layouts of the four result tables. The names are kept exactly as
# they were, since they become the CSV headers.
VAC_COLUMNS = [
    'vac_id',
    'vac_name',
    'city',
    'published_at',
    'archived',
    'experience',
    'url',
    'employer_id',
    'schedule_name',
    'employment_name',
    'prof_role'
]

EMP_COLUMNS = [
    'vac_id',
    'emp_id',
    'emp_name',
    'addres',
    'emp_url',
    'is_accredited',
    'is_trusted'
]

SAL_COLUMNS = [
    'vac_id',
    'salary_from',
    'salary_to',
    'currency',
    'gross',
    'mode_name',
    'frequency'
]

REQ_COLUMNS = [
    'vac_id',
    'insider_interview',
    'response_letter_required',
    'experience',
    'key_skills',
    'has_test'
]


def _dig(data, *keys):
    """Follow `keys` through nested dicts; return None as soon as a level is missing or is not a dict."""
    for key in keys:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data


# Rows are collected in plain lists and turned into DataFrames once, after the loop.
vac_rows, emp_rows, sal_rows, req_rows = [], [], [], []

for vac_id in tqdm(vac_ids, desc= 'Getting annotation...:'):
    try:
        vac_annote = get_vacancies_id(header=header, URL=BASE_URL+f'/{vac_id}', time_delay=0.4)
    except Exception as e:
        # Skip this vacancy entirely: without `continue` the code below would
        # reuse the previous iteration's response and record it a second time.
        print('request', vac_id, e)
        continue

    if not isinstance(vac_annote, dict) or vac_annote.get('id') is None:
        # A body without an 'id' is not a vacancy description; record nothing for it.
        print('request', vac_id, 'unexpected response:', vac_annote)
        continue

    # Values used by more than one table, or needing a guard before indexing.
    experience_name = _dig(vac_annote, 'experience', 'name')
    roles = vac_annote.get('professional_roles')
    first_role = roles[0] if isinstance(roles, list) and roles else None
    skills = vac_annote.get('key_skills')
    try:
        skills_str = get_skills_str(skills) if skills else None
    except Exception as e:
        print('req_df', vac_id, e)
        skills_str = None

    # Vacancy information. Note that the 'city' column receives the area *id*.
    vac_rows.append([
        vac_annote.get('id'),
        vac_annote.get('name'),
        _dig(vac_annote, 'area', 'id'),
        vac_annote.get('published_at'),
        vac_annote.get('archived'),
        experience_name,
        vac_annote.get('alternate_url'),
        _dig(vac_annote, 'employer', 'id'),
        _dig(vac_annote, 'schedule', 'name'),
        _dig(vac_annote, 'employment', 'name'),
        _dig(first_role, 'name'),
    ])

    # Employer information taken from the vacancy.
    emp_rows.append([
        vac_annote.get('id'),
        _dig(vac_annote, 'employer', 'id'),
        _dig(vac_annote, 'employer', 'name'),
        _dig(vac_annote, 'address', 'raw'),
        _dig(vac_annote, 'employer', 'alternate_url'),
        _dig(vac_annote, 'employer', 'accredited_it_employer'),
        _dig(vac_annote, 'employer', 'trusted'),
    ])

    # Salary information; a missing 'salary_range' no longer discards the salary itself.
    sal_rows.append([
        vac_annote.get('id'),
        _dig(vac_annote, 'salary', 'from'),
        _dig(vac_annote, 'salary', 'to'),
        _dig(vac_annote, 'salary', 'currency'),
        _dig(vac_annote, 'salary', 'gross'),
        _dig(vac_annote, 'salary_range', 'mode', 'name'),
        _dig(vac_annote, 'salary_range', 'mode', 'frequency'),
    ])

    # Requirements for the applicant.
    req_rows.append([
        vac_annote.get('id'),
        vac_annote.get('insider_interview'),
        vac_annote.get('response_letter_required'),
        experience_name,
        skills_str,
        vac_annote.get('has_test'),
    ])

# dtype=object stores every value as received, so an integer column that also
# holds None is not upcast to float.
vac_df = pd.DataFrame(vac_rows, columns=VAC_COLUMNS, dtype=object)
emp_df = pd.DataFrame(emp_rows, columns=EMP_COLUMNS, dtype=object)
sal_df = pd.DataFrame(sal_rows, columns=SAL_COLUMNS, dtype=object)
req_df = pd.DataFrame(req_rows, columns=REQ_COLUMNS, dtype=object)

print('Done')

Getting annotation...:: 100%|██████████| 200/200 [01:58<00:00,  1.69it/s]

Done





In [9]:
# Write each result table to its own CSV file in the working directory.
for frame, csv_path in (
    (vac_df, 'vac_df.csv'),
    (req_df, 'req_df.csv'),
    (sal_df, 'sal_df.csv'),
    (emp_df, 'emp_df.csv'),
):
    frame.to_csv(csv_path)