In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import lxml
import re

In [2]:
def pars_page_hh(url):

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36'}
    link = 'https://izhevsk.hh.ru/vacancies/programmist?page=0'
    html = requests.get(link, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    vac_page = soup.find_all('div', {'class': 'vacancy-serp-item'})

    names = []
    company = []
    salary_min = []
    salary_max = []
    salary_in = []
    url = []
    all_vac = []

    for vac in vac_page:
        vac_text = vac.find('div', {'class': 'vacancy-serp-item__info'})

        # Парсинг наименования вакансии
        name_block = vac_text.find('a', {'data-qa': 'vacancy-serp__vacancy-title'})
        if name_block is not None:
            vac_name = name_block.text
        else:
            vac_name = ''
        names.append(vac_name)

        # Парсинг наименования компании
        vac_name = vac.find('div', {'class': 'vacancy-serp-item__meta-info-company'})
        if vac_name is not None:
            comp_name = vac_name.text.replace('\xa0', '')
        else:
            comp_name = ''
        company.append(comp_name)

        # Парсинг данных по зарплате
        vac_salary = vac.find('div', {'class': 'vacancy-serp-item__sidebar'})
        if vac_salary is not None:
            vac_sal = vac_salary.text.replace('\u202f', '')
        else:
            vac_sal = ''

        if vac_sal is None:
            salary_min.append(None)
            salary_max.append(None)
        elif '–' in vac_sal:
            temp_list = vac_sal.split('–')
            salary_min.append(int(temp_list[0].split(' ')[0]))
            salary_max.append(int(temp_list[1].split(' ')[1]))
        elif 'от' in vac_sal:
            salary_min.append(int(vac_sal.split(' ')[1]))
            salary_max.append(None)
        elif 'до' in vac_sal:
            salary_min.append(None)
            salary_max.append(int(vac_sal.split(' ')[1]))  # то же что и с 'от'
        else:
            salary_min.append(None)
            salary_max.append(None)

        currency = vac_sal.split(' ')[-1]
        if currency is None:
            salary_in.append(None)
        else: salary_in.append(currency)

        # Парсинг ссылки на вакансию
        vac_link = vac_text.find('a', {'data-qa': 'vacancy-serp__vacancy-title'})
        if vac_link is not None:
            vac_url = vac_link.get('href')
        else:
            vac_url = ''
        url.append(vac_url)

    # Сборка всех вакансий на странице
    for i in range(1, len(names)):
        vac = {'name': names[i],
               'salary_min': salary_min[i],
               'salary_max': salary_max[i],
               'salary_in': salary_in[i],
               'company_name': company[i],
               'link': url[i],
               'website': 'https://izhevsk.hh.ru'}
        all_vac.append(vac)

    return all_vac

all_vac_hh = []
for i in range(0, 9):
    url = 'https://izhevsk.hh.ru/vacancies/programmist?page={}'.format(i)
    vac_temp = pars_page_hh(url)
    all_vac_hh += vac_temp

In [3]:
def superjob(vacancy_name='Программист'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36'
    }
    params = {
        'keywords': vacancy_name
    }
    base_link = 'https://izhevsk.superjob.ru/'
    get_link = base_link + 'vacancy/search/'
    response = requests.get(get_link, headers=headers, params=params)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    vac_page = soup.find_all('div', {'class': 'f-test-search-result-item'})

    all_vac_sj = []

    for vac in vac_page:
        # Парсинг наименования вакансии
        vac_text = vac.find('div', {'class': '_1h3Zg _2rfUm _2hCDz _21a7u'})
        if vac_text is not None:
            vac_name = vac_text.text
        else:
            vac_name = None

        # Парсинг названия компании
        comp_name = vac.find('span', {
            'class': '_1h3Zg _3Fsn4 f-test-text-vacancy-item-company-name e5P5i _2hCDz _2ZsgW _2SvHc'})
        if comp_name is not None:
            comp_n = comp_name.text
        else:
            comp_n = None

        # Парсинг ссылки на вакансию
        vac_url = vac.find('div', {'class': '_1h3Zg _2rfUm _2hCDz _21a7u'})
        if vac_url is not None:
            vac_url = vac_url.a['href']

        # Парсинг вилки зарплаты
        if vac.find('span', {'class': '_1h3Zg _2Wp8I _2rfUm _2hCDz _2ZsgW'}) is not None:
            vac_sal = vac.find('span', {'class': '_1h3Zg _2Wp8I _2rfUm _2hCDz _2ZsgW'}).text
            salary = vac_sal.replace('\xa0', '')
            if '—' in salary:
                salary_min = salary.split('—')[0]
                salary_min = re.sub(r'[^0-9]', '', salary_min)
                salary_max = salary.split('—')[1]
                salary_max = re.sub(r'[^0-9]', '', salary_max)
                salary_min = int(salary_min)
                salary_max = int(salary_max)
            elif 'от' in salary:
                salary_min = salary[2:]
                salary_min = re.sub(r'[^0-9]', '', salary_min)
                salary_min = int(salary_min)
                salary_max = None
            elif 'договорённости' in salary:
                salary_min = None
                salary_max = None
            elif 'до' in salary:
                salary_min = None
                salary_max = salary[2:]
                salary_max = re.sub(r'[^0-9]', '', salary_max)
                salary_max = int(salary_max)
            else:
                salary_min = int(re.sub(r'[^0-9]', '', salary))
                salary_max = int(re.sub(r'[^0-9]', '', salary))

            all_vac_sj.append(
                {
                    'name': vac_name,
                    'salary_min': salary_min,
                    'salary_max': salary_max,
                    'salary_in': 'руб.',
                    'company_name': comp_n,
                    'link': base_link + vac_url,
                    'website': base_link
                })

    return all_vac_sj

In [8]:
all_vacancies = []
all_vacancies.extend(all_vac_hh)
all_vac_sj = superjob()
all_vacancies.extend(all_vac_sj)

df = pd.DataFrame(all_vacancies)
df[430:448]

Unnamed: 0,name,salary_min,salary_max,salary_in,company_name,link,website
430,Backend-разработчик (Laravel/Lumen/SQL / Middl...,90000.0,,руб.,Натнэт,https://izhevsk.hh.ru/vacancy/36861738?query=%...,https://izhevsk.hh.ru
431,Асессор-разработчик,,,,Яндекс,https://izhevsk.hh.ru/vacancy/47034755?query=%...,https://izhevsk.hh.ru
432,Web-разработчик / Web developer,,,,ОООCifrasoft,https://izhevsk.hh.ru/vacancy/47194334?query=%...,https://izhevsk.hh.ru
433,Backend разработчик (Java),120000.0,180000.0,руб.,BFG Group,https://izhevsk.hh.ru/vacancy/47809711?query=%...,https://izhevsk.hh.ru
434,PHP-разработчик,,90000.0,руб.,ОООАстарус,https://izhevsk.hh.ru/vacancy/47279160?query=%...,https://izhevsk.hh.ru
435,Программист-стажер,,,,ОООQ-Digital,https://izhevsk.hh.ru/vacancy/47784178?query=%...,https://izhevsk.hh.ru
436,Backend - разработчик (Python),120000.0,220000.0,руб.,BFG Group,https://izhevsk.hh.ru/vacancy/43615645?query=%...,https://izhevsk.hh.ru
437,Backend-разработчик (PHP/Laravel / Middle+ — S...,90000.0,,руб.,Натнэт,https://izhevsk.hh.ru/vacancy/35287896?query=%...,https://izhevsk.hh.ru
438,Laravel-разработчик,,60000.0,руб.,WM Studio,https://izhevsk.hh.ru/vacancy/47477969?query=%...,https://izhevsk.hh.ru
439,Программист-разработчик (Junior),,,,ООООТР,https://izhevsk.hh.ru/vacancy/47803697?query=%...,https://izhevsk.hh.ru
