In [3]:
import json
from pathlib import Path
from time import sleep

import numpy as np
import pandas as pd
import requests
from dotenv import dotenv_values
from tqdm import tqdm

In [4]:
def get_vacancies_id(header:dict, param:dict=None, URL:str='https://api.hh.ru/vacancies', per_page:int=100, time_delay:float=0, timeout:float=30)->dict:
    """Send a GET request to ``URL`` and return the decoded JSON body.

    Args:
        header: HTTP headers sent with the request.
        param: Query-string parameters, or None to send none.
        URL: Endpoint to request.
        per_page: Not used by this function; kept so existing calls that pass it
            keep working. The page size has to be supplied inside ``param``.
        time_delay: Seconds to sleep before the request is sent.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    sleep(time_delay)
    response = requests.get(url=URL, headers=header, params=param, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload back as data.
    response.raise_for_status()
    return response.json()

In [5]:
# Search settings for the vacancies endpoint.
BASE_URL = 'https://api.hh.ru/vacancies'
JOB_TITLE = 'Аналитик данных'
PER_PAGE = 100

# The User-Agent value is read from the local .env file instead of being written here.
USER_AGENT = dotenv_values('.env')['USER_AGENT']

# Request headers and query parameters shared by the search requests below.
header = {'User-Agent': USER_AGENT}
param = dict(
    text=JOB_TITLE,
    search_field='name',
    page=0,
    per_page=PER_PAGE,
    only_with_salary=True,
    locale='RU',
)

In [6]:
# Probe request with the current `param`: read the search totals reported by the API.
resp = get_vacancies_id(header, param, per_page=PER_PAGE)

VAC_CNT, PAGES_CNT, current_page = (resp[key] for key in ('found', 'pages', 'page'))
(VAC_CNT, PAGES_CNT, current_page)

(193, 2, 0)

In [7]:
# Collect the ids of all vacancies matching `param`, requesting each results page exactly once.
vac_ids = []
current_page = 0
PAGES_CNT = 1  # placeholder so the loop runs once; replaced by the page count from the first response

while current_page < PAGES_CNT:
    # Send a per-request copy so the shared `param` dict keeps its configured page
    # and this cell produces the same crawl when it is re-run.
    resp = get_vacancies_id(header, {**param, 'page': current_page}, per_page=PER_PAGE)
    PAGES_CNT = resp['pages']
    # Take the expected total from this crawl so the check below is not made against a stale count.
    VAC_CNT = resp['found']

    vac_ids.extend(item['id'] for item in resp['items'])
    current_page += 1

# Drop repeated ids while keeping first-seen order, so no vacancy is fetched twice later on.
vac_ids = list(dict.fromkeys(vac_ids))

print("It's all OK" if len(set(vac_ids)) == VAC_CNT else f"Smt went WRONG\n{len(set(vac_ids))}___{VAC_CNT}")

Smt went WRONG
194___193


In [8]:
def get_skills_str(key_skills:list)->str:
    '''Join the names of the required skills into one "; "-separated string.

    Args:
        key_skills: List of skill dicts, each expected to carry a 'name' key.
            None or an empty list yields an empty string.

    Returns:
        The skill names joined with "; ". Entries whose name is missing or
        empty are skipped, so one incomplete entry does not raise TypeError
        in str.join and discard every other skill.
    '''
    names = (skill.get('name') for skill in key_skills or [])
    return '; '.join(name for name in names if name)

In [None]:
# (grade label, substrings of the lower-cased vacancy title that select it).
# Groups are checked in this order and the first group with a hit wins.
_GRADE_KEYWORDS = (
    ('Intern', ('стажер', 'стажёр', 'интерн', 'помощник', 'intern', 'trainee')),
    ('Junior', ('младший', 'junior', 'джуниор', 'начинающий')),
    ('Middle', ('мидл', 'middle', 'средний', 'mid-level')),
    ('Senior', ('сеньор', 'senior', 'старший', 'ведущий', 'опытный')),
    ('Team Lead', ('тимлид', 'teamlead', 'руководитель', 'lead', 'главный')),
)
_DEFAULT_GRADE = 'Middle'  # used when no keyword occurs in the title


def _detect_grade(title):
    """Return the grade label for a vacancy title by substring keyword matching."""
    lowered = (title or '').lower()  # a missing or null title is treated as empty
    for grade_label, keywords in _GRADE_KEYWORDS:
        if any(keyword in lowered for keyword in keywords):
            return grade_label
    return _DEFAULT_GRADE


def _name_of(payload, key):
    """Return payload[key]['name'], or None when the key is absent or its value is null."""
    return (payload.get(key) or {}).get('name')


data_list = []

for vac_id in tqdm(vac_ids, desc='Получение описания вакансий...'):
    try:
        vac_annote = get_vacancies_id(header=header, URL=BASE_URL+f'/{vac_id}', time_delay=0.4)

        if vac_annote.get('id') is None:
            # Without an id the payload is not a usable vacancy record; report it through
            # the handler below instead of storing a row of empty values.
            raise ValueError(f'no vacancy id in response (keys: {list(vac_annote)})')

        # `or {}` / `or []` also covers keys that are present with an explicit null value,
        # which a `.get(key, default)` fallback alone does not.
        address = vac_annote.get('address')
        employer = vac_annote.get('employer') or {}
        salary = vac_annote.get('salary') or {}
        mode = (vac_annote.get('salary_range') or {}).get('mode') or {}
        prof_roles = vac_annote.get('professional_roles') or []
        key_skills = vac_annote.get('key_skills')

        row = {
            'vac_id': vac_annote.get('id'),
            'vac_name': vac_annote.get('name'),
            'grade': _detect_grade(vac_annote.get('name')),
            'city': _name_of(vac_annote, 'area'),
            'geo': f"[{address.get('lat')}, {address.get('lng')}]" if address else None,
            'published_at': vac_annote.get('published_at'),
            'archived': vac_annote.get('archived'),
            'employer_id': employer.get('id'),
            'emp_name': _name_of(vac_annote, 'employment'),
            'addres': address.get('raw') if address else None,
            'is_accredited': employer.get('accredited_it_employer'),
            'is_trusted': employer.get('trusted'),
            'salary_from': salary.get('from'),
            'salary_to': salary.get('to'),
            'currency': salary.get('currency'),
            'gross': salary.get('gross'),
            'mode_name': mode.get('name'),
            'frequency': mode.get('frequency'),
            'prof_role': prof_roles[0].get('name') if prof_roles else None,
            'schedule_name': _name_of(vac_annote, 'schedule'),
            'insider_interview': vac_annote.get('insider_interview'),
            'response_letter_required': vac_annote.get('response_letter_required'),
            'experience': _name_of(vac_annote, 'experience'),
            'key_skills': get_skills_str(key_skills) if key_skills else None,
            'has_test': vac_annote.get('has_test'),
            'description': vac_annote.get('description'),
            'url': vac_annote.get('alternate_url')
        }
        data_list.append(row)

    except Exception as e:
        # Best effort: one failing vacancy is reported and skipped, the crawl continues.
        print(f'Error processing vac_id {vac_id}: {e}')

# Build the DataFrame with a single call from the collected rows
full_df = pd.DataFrame(data_list)
print('Success: Parsing done')

Получение описания вакансий...: 100%|██████████| 194/194 [01:53<00:00,  1.71it/s]

Success: Parsing done





--- 

In [13]:
# Write the collected vacancies to CSV.
csv_path = Path('resources/full_df.csv')
# to_csv does not create missing folders, so make sure the target directory exists first.
csv_path.parent.mkdir(parents=True, exist_ok=True)
full_df.to_csv(csv_path)

print('Success: DataFrame saved')

Success: DataFrame saved


In [16]:
%pip install gsheets

Collecting gsheets
  Downloading gsheets-0.6.1-py3-none-any.whl.metadata (9.6 kB)
Collecting google-api-python-client (from gsheets)
  Downloading google_api_python_client-2.176.0-py3-none-any.whl.metadata (7.0 kB)
Collecting oauth2client>=1.5.0 (from gsheets)
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting httplib2>=0.9.1 (from oauth2client>=1.5.0->gsheets)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pyasn1>=0.1.7 (from oauth2client>=1.5.0->gsheets)
  Downloading pyasn1-0.6.1-py3-none-any.whl.metadata (8.4 kB)
Collecting pyasn1-modules>=0.0.5 (from oauth2client>=1.5.0->gsheets)
  Downloading pyasn1_modules-0.4.2-py3-none-any.whl.metadata (3.5 kB)
Collecting rsa>=3.1.4 (from oauth2client>=1.5.0->gsheets)
  Downloading rsa-4.9.1-py3-none-any.whl.metadata (5.6 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0,>=1.32.0 (from google-api-python-client->gsheets)
  Downloading google_auth-2.40.3-py2.py3-none-any.whl.metadata 


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# Export the same DataFrame as an Excel workbook into the local folder below.
path_to_gdisk = r'E:\Obsidian\YandexDisk'

full_df.to_excel(f'{path_to_gdisk}\\my.xlsx')

