In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:


def scrape_headhunter(search_query="программист", pages=1):
    base_url = "https://hh.ru/search/vacancy"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    vacancies = []

    for page in range(pages):
        params = {
            "text": search_query,
            "page": page
        }
        response = requests.get(base_url, headers=headers, params=params)
        if response.status_code != 200:
            print(f"Ошибка {response.status_code} при запросе страницы {page}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        vacancy_cards = soup.find_all("div", class_="vacancy-serp-item")

        for card in vacancy_cards:
            title_tag = card.find("a", class_="serp-item__title")
            title = title_tag.text.strip() if title_tag else None
            link = title_tag["href"] if title_tag else None

            salary_tag = card.find("span", class_="bloko-header-section-3")
            salary = salary_tag.text.strip() if salary_tag else None

            company_tag = card.find("a", class_="bloko-link bloko-link_kind-tertiary")
            company = company_tag.text.strip() if company_tag else None

            city_tag = card.find("div", {"data-qa": "vacancy-serp__vacancy-address"})
            city = city_tag.text.strip() if city_tag else None

            experience_tag = card.find("div", {"data-qa": "vacancy-serp__vacancy-work-experience"})
            experience = experience_tag.text.strip() if experience_tag else None

            remote = "удаленная работа" in card.text.lower()

            vacancies.append({
                "Название вакансии": title,
                "Мин ЗП": None,
                "Макс ЗП": None,
                "Зарплата (строка)": salary,
                "Компания": company,
                "Город": city,
                "Мин опыт работы": experience,
                "Удаленная работа": remote,
                "Ссылка": link
            })

    return pd.DataFrame(vacancies)

# Скрапим данные и сохраняем в CSV
data = scrape_headhunter(pages=2)


In [5]:
data

In [7]:
def get_url_page(page: int) -> str:
    return f"https://bishkek.headhunter.kg/search/vacancy?resume=056a0788ff0e1708f00039ed1f4e4b64466153&page={page}&searchSessionId=5485e1b6-490f-4c2c-be2e-7a01be3e2f98"

In [41]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}
url = get_url_page(1)
response = requests.get(url, headers=headers).text
soup = BeautifulSoup(response, 'html.parser')

In [None]:
import re

def extract_min_experience(experience_str):
    
    if "без опыта" in experience_str.lower():
        return 0

    match = re.search(r"(\d+)-(\d+)", experience_str)
    if match:
        return int(match.group(1))
    
    match_more_than = re.search(r"более (\d+)", experience_str)
    if match_more_than:
        return int(match_more_than.group(1)) 

    if "лет" in experience_str:
        return int(re.search(r"\d+", experience_str).group(0))

    return None


In [45]:
cards = soup.find_all('div', class_='vacancy-card--n77Dj8TY8VIUF0yM font-inter')


In [59]:
def parse_salary(salary_str):
    if not salary_str:
        return {
            "salary_from": None,
            "salary_to": None,
            "currency": None,
            "is_net": None  # True - "на руки", False - "до вычета налогов"
        }

    salary_str = salary_str.text

    salary_str = salary_str.replace("\u202f", "").strip()

    is_net = "на руки" in salary_str
    is_gross = "до вычета налогов" in salary_str

    currency_match = re.search(r"[\₽\$€₸Br]", salary_str)
    currency = currency_match.group(0) if currency_match else None

    range_match = re.search(r"(\d+)[^\d]*(\d+)?", salary_str)
    if "от" in salary_str:
        salary_from = int(range_match.group(1))
        salary_to = None
    elif "–" in salary_str or "—" in salary_str:
        salary_from = int(range_match.group(1))
        salary_to = int(range_match.group(2))
    else:
        salary_from = salary_to = int(range_match.group(1))

    if salary_to is None:
        salary_to = salary_from

    return {
        "salary_from": salary_from,
        "salary_to": salary_to,
        "currency": currency,
        "is_net": is_net  # True - "на руки", False - "до вычета налогов"
    }

In [107]:
def extract(page_start = 1, page_end = 1) -> pd.DataFrame:

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    df = pd.DataFrame(columns=['title', 'company', 'salary_from', 'salary_to', 'currency', 'is_net', 'year_experience', 'link'])

    for page in range(page_start, page_end + 1):
        url = get_url_page(page)
        response = requests.get(url, headers=headers).text
        soup = BeautifulSoup(response, 'html.parser')

        cards = soup.find_all('div', class_='vacancy-card--n77Dj8TY8VIUF0yM font-inter')

        for card in cards:
            title = card.find('span', class_='magritte-text___tkzIl_4-3-14').text
            link_tag = card.find('a', class_='magritte-link___b4rEM_4-3-14 magritte-link_style_neutral___iqoW0_4-3-14 magritte-link_enable-visited___Biyib_4-3-14')
            link = link_tag['href']

            subtitle = card.find('span', class_='magritte-text___pbpft_3-0-20 magritte-text_style-primary___AQ7MW_3-0-20 magritte-text_typography-label-1-regular___pi3R-_3-0-20')
            handled_salary = parse_salary(subtitle)
            salary_from = handled_salary['salary_from']
            salary_to = handled_salary['salary_to']
            currency = handled_salary['currency']
            is_net = handled_salary['is_net']

            tags = card.find_all('div', class_='compensation-labels--vwum2s12fQUurc2J compensation-labels_magritte--pbBIkJ7Ww24ZILKz')

            year_experience = None

            for tag in tags:
                tag_solo = tag.find('div', class_='magritte-tag___WdGxk_3-0-23 magritte-tag_style-neutral___cw1Bt_3-0-23 magritte-tag_size-medium___Splpy_3-0-23')
                if tag_solo is None or 'опыт' not in tag_solo.text.lower():
                    continue
            
                year_experience = extract_min_experience(tag_solo.text)

            company = card.find('a', class_='magritte-link___b4rEM_4-3-14 magritte-link_style_neutral___iqoW0_4-3-14')
            if (company is not None):
                company = company.text
            else:
                company = None


            df.loc[len(df)] = [title, company, salary_from, salary_to, currency, is_net, year_experience, link]
            
    return df


In [108]:
df = extract(1, 10)

In [109]:
df

Unnamed: 0,title,company,salary_from,salary_to,currency,is_net,year_experience,link
0,Водитель персональный,ООО Медико-фармацевтический дистрибьютор,120000,120000,₽,False,6,https://bishkek.headhunter.kg/vacancy/11420954...
1,Frontend-разработчик,ООО ВайсВеб,,,,,1,https://bishkek.headhunter.kg/vacancy/11428342...
2,Бухгалтер в одном лице,ИП bai.food,100000,200000,₸,True,3,https://bishkek.headhunter.kg/vacancy/11398579...
3,Семейный водитель,АО Транспортная Компания Логистик-Центр,120000,120000,₽,True,3,https://bishkek.headhunter.kg/vacancy/11429403...
4,Офис-менеджер (администратор),УП АвалонСтройГрупп,800,1100,B,True,0,https://bishkek.headhunter.kg/vacancy/11391510...
...,...,...,...,...,...,...,...,...
175,Администратор кабинета платных услуг,ГАУЗ Оренбургский Областной Клинический Онколо...,40000,40000,₽,True,1,https://bishkek.headhunter.kg/vacancy/11417439...
176,Водитель,ООО Семаргл,90000,90000,₽,True,3,https://bishkek.headhunter.kg/vacancy/11418413...
177,Ведущий специалист по работе с обращениями гра...,КГАУ ДПО Камчатская школа управления,70000,80000,₽,True,0,https://bishkek.headhunter.kg/vacancy/11423381...
178,Специалист по учету (табельщик),ТОО RW constructor,200000,230000,₸,True,0,https://bishkek.headhunter.kg/vacancy/11385812...


In [132]:
def extract(page_start = 1, page_end = 1) -> pd.DataFrame:

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    df = pd.DataFrame(columns=['title', 'company', 'salary_from', 'salary_to', 'currency', 'is_net', 'year_experience', 'link'])

    for page in range(page_start, page_end + 1):

        url = get_url_page(page)
        response = requests.get(url, headers=headers).text
        soup = BeautifulSoup(response, 'html.parser')

        cards = soup.find_all('div', class_='vacancy-card--n77Dj8TY8VIUF0yM font-inter')

        for card in cards:
            title = card.find('span', class_='magritte-text___tkzIl_4-3-14').text
            link_tag = card.find('a', class_='magritte-link___b4rEM_4-3-14 magritte-link_style_neutral___iqoW0_4-3-14 magritte-link_enable-visited___Biyib_4-3-14')
            link = link_tag['href']

            subtitle = card.find('span', class_='magritte-text___pbpft_3-0-20 magritte-text_style-primary___AQ7MW_3-0-20 magritte-text_typography-label-1-regular___pi3R-_3-0-20')
            handled_salary = parse_salary(subtitle)
            salary_from = handled_salary['salary_from']
            salary_to = handled_salary['salary_to']
            currency = handled_salary['currency']
            is_net = handled_salary['is_net']

            tags = card.find_all('div', class_='compensation-labels--vwum2s12fQUurc2J compensation-labels_magritte--pbBIkJ7Ww24ZILKz')

            year_experience = None
            for tag in tags:
                tag_solo = tag.find('div', class_='magritte-tag___WdGxk_3-0-23 magritte-tag_style-neutral___cw1Bt_3-0-23 magritte-tag_size-medium___Splpy_3-0-23')
                if tag_solo is None or 'опыт' not in tag_solo.text.lower():
                    continue
            
                year_experience = extract_min_experience(tag_solo.text)
                break

            company = card.find('a', class_='magritte-link___b4rEM_4-3-14 magritte-link_style_neutral___iqoW0_4-3-14')
            company = company.text if company is not None else None

            if all([title, company, salary_from, salary_to, currency, is_net, year_experience, link]):
                df.loc[len(df)] = [title, company, salary_from, salary_to, currency, is_net, year_experience, link]

            
            
    return df


In [133]:
df = extract(1, 5)

In [134]:
df1 = df.copy()

In [135]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, 0 to 42
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            43 non-null     object
 1   company          43 non-null     object
 2   salary_from      43 non-null     int64 
 3   salary_to        43 non-null     int64 
 4   currency         43 non-null     object
 5   is_net           43 non-null     bool  
 6   year_experience  43 non-null     int64 
 7   link             43 non-null     object
dtypes: bool(1), int64(3), object(4)
memory usage: 2.7+ KB


In [136]:
def transform(df: pd.DataFrame) -> pd.DataFrame:
    df['salary_from'] = df['salary_from'].astype(int)
    df['salary_to'] = df['salary_to'].astype(int)
    df['year_experience'] = df['year_experience'].astype(int)
    df['is_net'] = df['is_net'].astype(bool)
    return df

In [137]:
transform(df1)

Unnamed: 0,title,company,salary_from,salary_to,currency,is_net,year_experience,link
0,Бухгалтер в одном лице,ИП bai.food,100000,200000,₸,True,3,https://bishkek.headhunter.kg/vacancy/11398579...
1,Исполнительный директор,Work&Wolf,400000,400000,₽,True,3,https://bishkek.headhunter.kg/vacancy/11429872...
2,Семейный водитель,АО Транспортная Компания Логистик-Центр,120000,120000,₽,True,3,https://bishkek.headhunter.kg/vacancy/11429403...
3,Офисный водитель на автомобиль организации,ТОО QUIP PRO,250000,350000,₸,True,3,https://bishkek.headhunter.kg/vacancy/11384291...
4,Оператор 1С реализация,ООО Интерлюкс,100000,100000,₽,True,1,https://bishkek.headhunter.kg/vacancy/11427760...
5,Руководитель направления,ООО БиЭмТи Трейдинг,5000,5000,B,True,1,https://bishkek.headhunter.kg/vacancy/11429140...
6,Личный водитель,Магидова Татьяна Юрьевна,150000,150000,₽,True,1,https://bishkek.headhunter.kg/vacancy/11427991...
7,Специалист по сдуванию пены с пива,ООО Заречная Пивоварня,200000,250000,₽,True,1,https://bishkek.headhunter.kg/vacancy/11385649...
8,Охранник,Евразия,150000,150000,₽,True,3,https://bishkek.headhunter.kg/vacancy/11418320...
9,Водитель категории В на микроавтобус,ИП Демещенко Николай Валерьевич,2000,3000,B,True,1,https://bishkek.headhunter.kg/vacancy/11429069...


In [138]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, 0 to 42
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            43 non-null     object
 1   company          43 non-null     object
 2   salary_from      43 non-null     int32 
 3   salary_to        43 non-null     int32 
 4   currency         43 non-null     object
 5   is_net           43 non-null     bool  
 6   year_experience  43 non-null     int32 
 7   link             43 non-null     object
dtypes: bool(1), int32(3), object(4)
memory usage: 2.2+ KB


In [139]:
import sqlite3

In [142]:
def load(df: pd.DataFrame, db_name: str, table_name: str) -> None:
    path = f"data/{db_name}"

    with sqlite3.connect(path) as conn:
        df.to_sql(table_name, conn, if_exists='replace', index=False)

In [143]:
load(df1, 'vacancies.db', 'vacancies')