In [None]:
# %pip install tqdm

In [1]:
import json
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from time import sleep
from dotenv import dotenv_values

In [2]:
def get_vacancies_id(header:dict, param:dict=None, URL:str='https://api.hh.ru/vacancies', per_page:int=100, time_dalay:float=0)->json:
    sleep(time_dalay)
    return requests.get(url=URL, headers=header, params=param).json()

In [3]:
JOB_TITLE = 'Аналитик данных'

USER_AGENT = dotenv_values('.env')['USER_AGENT']
BASE_URL='https://api.hh.ru/vacancies'

PER_PAGE = 100

header = {'User-Agent':USER_AGENT}
param = {'text': JOB_TITLE,
          'search_field': 'name',
          'page': 0,
          'per_page': PER_PAGE,
          'only_with_salary': True,
          'locale': 'RU'}

In [4]:
resp = get_vacancies_id(header, param, per_page=PER_PAGE)

VAC_CNT = resp['found']
PAGES_CNT = resp['pages']
current_page = resp['page']
VAC_CNT, PAGES_CNT, current_page

(240, 3, 0)

In [5]:
vac_ids = []
current_page = 0
PAGES_CNT = 999

while current_page <= PAGES_CNT-1:
    resp = get_vacancies_id(header, param, per_page=PER_PAGE)
    current_page = resp['page']
    PAGES_CNT = resp['pages']

    for item in resp['items']:
        vac_ids.append(item['id'])

    param['page'] = current_page + 1
    resp = get_vacancies_id(header, param, per_page=PER_PAGE)
    current_page = resp['page']


print("It's all OK" if len(set(vac_ids)) == VAC_CNT else f"Smt went WRONG\n{len(set(vac_ids))}___{VAC_CNT}")

It's all OK


In [27]:
def get_key_skills(res:str)->str:
    '''
    Функция для формирования строки требующихся скиллов
    Param:
        res:str - ответ от api.hh
    Return:
        str - строка с навыками
    '''
    result = []
    for items in res.get('key_skills',{}):
        for k,v in items.items():
            result.append(v)
    return '; '.join(result)

In [45]:
def create_dfs()->pd.DataFrame:
    vacancies_df    =   pd.DataFrame(columns=[
                        'vac_id',
                        'vac_name',
                        'city',
                        'published_at',
                        'archived',
                        'expirience',
                        'alternate_url',
                        'employer_id',
                        'employer_name',
                        'schedule_name',
                        'employment_name',
                        'prof_role',
                        'work_format',
                        'work_hour',
                        'work_schedule_by_days'
                        ])

    salary_df       =   pd.DataFrame(columns=[
                            'vac_id',
                            'salary_from',
                            'salary_to',
                            'currency',
                            'gross',
                            'range_mode_name',
                            'frequency'
                        ])

    requerments_df  = pd.DataFrame(columns=[
                        'vac_id',
                        'response_letter',
                        'has_test',
                        'descriptions',
                        'key_skills'
                        ])
    
    return vacancies_df, salary_df, requerments_df

In [9]:
#создаем датафреймы
vacancies_df, salary_df, requerments_df = create_dfs()

for vac_id in tqdm(vac_ids,desc='GET vacancies annotate'):
    vac_resp = get_vacancies_id(header, URL=BASE_URL+'/'+vac_id, time_dalay=0.5)
    
    vacancies_df.loc[len(vacancies_df)] = [vac_resp['id'], \
                                           vac_resp['name'], \
                                           vac_resp['area']['name'], \
                                           vac_resp['published_at'], \
                                           vac_resp['archived'], \
                                           vac_resp['experience']['name'], \
                                           vac_resp['alternate_url'], \
                                           vac_resp['employer']['id'], \
                                           vac_resp['employer']['name'], \
                                           vac_resp['schedule']['name'], \
                                           vac_resp['employment']['name'], \
                                           vac_resp['professional_roles'][0]['name'], \
                                           vac_resp['work_format'][0]['name'], \
                                           vac_resp['working_hours'][0]['name'], \
                                           vac_resp['work_schedule_by_days'][0]['name']]

    salary_df.loc[len(salary_df)] = [vac_resp['id'], \
                                     vac_resp['salary']['from'], \
                                     vac_resp['salary']['to'], \
                                     vac_resp['salary']['currency'], \
                                     vac_resp['salary']['gross'], \
                                     vac_resp['salary_range']['mode']['name'], \
                                     vac_resp['salary_range']['frequency']['name']
                                    ]

    requerments_df.loc[len(requerments_df)] = [vac_resp['id'], \
                                           vac_resp['response_letter_required'], \
                                           vac_resp['has_test'],
                                           vac_resp['description'],\
                                           get_key_skills(vac_resp)
                                        ]

GET vacancies annotate:   0%|          | 0/240 [00:00<?, ?it/s]

GET vacancies annotate:   1%|▏         | 3/240 [00:02<03:26,  1.15it/s]


IndexError: list index out of range

In [46]:
# Создаем датафреймы
vacancies_df, salary_df, requerments_df = create_dfs()

for vac_id in tqdm(vac_ids, desc='GET vacancies annotate'):
    # try:
    vac_resp = get_vacancies_id(header, URL=BASE_URL+'/'+vac_id, time_dalay=0.3)
    
    # Обработка данных для vacancies_df с проверками
    vacancies_df.loc[len(vacancies_df)] = [
        vac_resp.get('id'),
        vac_resp.get('name', ''),
        vac_resp.get('area', {}).get('name', ''),
        vac_resp.get('published_at', ''),
        vac_resp.get('archived', False),
        vac_resp.get('experience', {}).get('name', ''),
        vac_resp.get('alternate_url', ''),
        vac_resp.get('employer', {}).get('id', ''),
        vac_resp.get('employer', {}).get('name', ''),
        vac_resp.get('schedule', {}).get('name', ''),
        vac_resp.get('employment', {}).get('name', ''),
        vac_resp.get('professional_roles', [{}])[0].get('name', ''),
        vac_resp.get('work_format', [{}])[0].get('name', ''),
        vac_resp.get('working_hours', [{}])[0].get('name', ''),
        vac_resp.get('work_schedule_by_days', [{}])[0].get('name', '')
    ]

    salary_df.loc[len(salary_df)] = [
        vac_resp.get('id'),
        vac_resp.get('salary',{}).get('from',''),
        vac_resp.get('salary',{}).get('to',''),
        vac_resp.get('salary',{}).get('currency'),
        vac_resp.get('salary',{}).get('gross'),
        vac_resp.get('salary_range').get('mode',{}).get('name',''),
        vac_resp.get('salary_range').get('frequency',{}).get('name','')
    ]

    # Обработка данных для requerments_df с проверками
    requerments_df.loc[len(requerments_df)] = [
        vac_resp.get('id'),
        vac_resp.get('response_letter_required', False),
        vac_resp.get('has_test', False),
        vac_resp.get('description', ''),
        get_key_skills(vac_resp)
    ]
    # except Exception as e:
    #     print(f"Ошибка обработки вакансии {vac_id}: {str(e)}")
    #     continue

GET vacancies annotate:   1%|▏         | 3/240 [00:01<02:23,  1.65it/s]


IndexError: list index out of range

In [None]:
# r = get_vacancies_id(header, param, BASE_URL+'/121329518')

In [47]:
vac_resp.get('work_format')

[]

In [48]:
vac_resp

{'id': '121133654',
 'premium': False,
 'billing_type': {'id': 'standard_plus', 'name': 'Стандарт плюс'},
 'relations': [],
 'name': 'Аналитик данных',
 'insider_interview': None,
 'response_letter_required': False,
 'area': {'id': '2759',
  'name': 'Ташкент',
  'url': 'https://api.hh.ru/areas/2759'},
 'salary': {'from': 900, 'to': None, 'currency': 'USD', 'gross': True},
 'salary_range': {'from': 900,
  'to': None,
  'currency': 'USD',
  'gross': True,
  'mode': {'id': 'MONTH', 'name': 'За\xa0месяц'},
  'frequency': None},
 'type': {'id': 'open', 'name': 'Открытая'},
 'address': {'city': 'Ташкент',
  'street': 'улица Бабура',
  'building': '34',
  'lat': 41.288809,
  'lng': 69.251431,
  'description': None,
  'raw': 'Ташкент, улица Бабура, 34',
  'metro': None,
  'metro_stations': []},
 'allow_messages': True,
 'experience': {'id': 'between1And3', 'name': 'От 1 года до 3 лет'},
 'schedule': {'id': 'fullDay', 'name': 'Полный день'},
 'employment': {'id': 'full', 'name': 'Полная занятос