In [1]:

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_autoinstaller

# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# set path to chromedriver as per your configuration
chromedriver_autoinstaller.install()

# set the target URL
url = "put-url-here-to-scrape"

# set up the webdriver
driver = webdriver.Chrome(options=chrome_options)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [33]:
import datetime as dt

import typing as tp
from pydantic import BaseModel
import locale

SEPARATOR = "|"

month_mapping = {
    "января": 1,
    "февраля": 2,
    "марта": 3,
    "апреля": 4,
    "мая": 5,
    "июня": 6,
    "июля": 7,
    "августа": 8,
    "сентября": 9,
    "октября": 10,
    "ноября": 11,
    "декабря": 12,
}

en_month_mapping = {
    "january": 1,
    "february": 2,
    "march": 3,
    "april": 4,
    "may": 5,
    "june": 6,
    "july": 7,
    "august": 8,
    "september": 9,
    "october": 10,
    "november": 11,
    "december": 12,
}
def parse_birth_date(date_str):
    # Split the date string into components
    if date_str is None:
        return dt.datetime(1997, 2, 7)
    day, month_name, year = date_str.split()

    # Convert day and year to integers
    day = int(day)
    year = int(year)

    # Convert month name to lower case and trim any whitespace
    month_name = month_name.lower().strip()

    # Get the month number from the mapping
    month = month_mapping.get(month_name) or en_month_mapping.get(month_name)

    if month is None:
        raise ValueError(f"Invalid month name: {month_name}")

    # Create a datetime object
    return dt.datetime(year, month, day)


class Candidate(BaseModel):
    dob: dt.datetime = dt.datetime(1997, 2, 7)
    specialization: str = ""
    education: str = ""
    skills: str = ""
    about: str = ""
    experience: str = ""
    link: str = ""
    src: str = "hh.ru"
    raw: tp.Optional[dict[str, tp.Any]] = None

    @staticmethod
    def from_hh(data: dict[str, tp.Any]) -> "Candidate":
        # check if all string values don't containe any \xa0
        for key, value in data.items():
            if isinstance(value, str) and "\xa0" in value:
                data[key] = value.replace("\xa0", " ")

        locale.setlocale(locale.LC_TIME, 'ru_RU.UTF-8')
        # transfors birth_data to datetime from "7 февраля 1997"
        dob = parse_birth_date(data["birth_date"])

        specialization = SEPARATOR.join([entry["name"] for entry in data["specialization"]])
        education = SEPARATOR.join([data["education_level"]] +[
            str(level["year"]) + " " + level["name"]
            for level in data["education"]
        ])
        skills = SEPARATOR.join([
                f"{lang['name']} ({lang['level']})"
                for lang in data["language"]
            ])
        experience = SEPARATOR.join([
            f"{job['start']}:{job['end']}:{job['position']}: {job['description']}"
            for job in data["experience"]
        ])
        try:
            link = data["link"]
        except:
            link = "hh.ru"
        return Candidate(
            dob=dob,
            specialization=specialization,
            education=education,
            skills=skills,
            experience=experience,
            link=link,
            src="hh.ru",
            raw=data,
            about=data["skills"]
        )

In [3]:
import re
from parse_hh_data import download, parse
import json
from tqdm import tqdm

file_data = {}
final_resumes:list[Candidate] = []
counter = 0
for i in tqdm(range(0,5,1)):
    response = driver.get(f"https://hh.ru/resumes/razrabotchik?page={i}")
    resumes = driver.find_elements(By.CLASS_NAME, "magritte-link___b4rEM_4-3-2")
    links = [resume.get_attribute("href") for resume in resumes]
    links = [x for x in links if x is not None]
    for link in tqdm(links):
        try:
            tmp = re.search(r"https://hh.ru/resume/(.*?)\?query", str(link)).group(1)
            url = f"https://hh.ru/resume/{tmp}"
            resume = download.resume(tmp)
            resume_json = parse.resume(resume)
            resume_json["link"] = url
            final_resumes.append(Candidate.from_hh(resume_json)) 
        except:
            pass

driver.quit()

  0%|          | 0/5 [00:00<?, ?it/s]Timeout error occurred: HTTPSConnectionPool(host='hh.ru', port=443): Read timed out. (read timeout=10)


A second request to the https://hh.ru/resume/4b224b610008199b0f0039ed1f484252676e44 will be sent in 10 seconds


100%|██████████| 21/21 [00:41<00:00,  1.98s/it]
100%|██████████| 21/21 [00:25<00:00,  1.20s/it]
100%|██████████| 21/21 [00:17<00:00,  1.19it/s]
100%|██████████| 21/21 [00:18<00:00,  1.14it/s]
100%|██████████| 21/21 [00:17<00:00,  1.17it/s]
100%|██████████| 5/5 [02:46<00:00, 33.38s/it]


In [4]:
final_resumes

[]

In [8]:
import json

with open("data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)


In [28]:
en_month_mapping.get("may")

5

In [34]:
result = [
    Candidate.from_hh(test_data[record])
    for record in test_data.keys()
]

In [35]:
for record in result:
    print(record)

dob=datetime.datetime(1997, 2, 7, 0, 0) specialization='Программист, разработчик' education='Высшее образование (Магистр)|2021 Российский государственный университет нефти и газа им. И.М. Губкина, Москва|2019 Российский государственный университет нефти и газа им. И.М. Губкина, Москва' skills='Русский (Родной)|Английский (B1 - Средний)' about='Личные навыки: Имею желание развиваться и работать, нацелен на \nрезультат.\nСпокоен, вежлив, грамотная речь.\nКомпьютерные навыки: Владение Microsoft Office, LabView, Multisim, \nMatlab + Simulink\nПрофессиональные интересы: Организация локальных вычислительных \nсетей, создание виртуальных приборов и систем сборов данных с \nиспользованием National Instruments LabVIEW, моделирование и расчеты с \nиспользованием Matlab/Simulink.\n\n' experience='01-10-2021:None:: Администрирования, разработка, оптимизация программного кода и запросов 1С Документооборот.\nСоздание новых объектов, обработок, отчетов.\nТестирование доработок. ' link='hh.ru' src='hh

In [45]:
import asyncpg

con_str = "postgresql://postgres:postgres-password@localhost:5432/dev"

async def insert_candidates(candidates: tp.List[Candidate]):
    async with asyncpg.create_pool(con_str) as pool:
        async with pool.acquire() as conn:
            await conn.executemany(
                """
                INSERT INTO candidates (
                    dob, spezialization, education, description, experience, cv_url, raw_json, src
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                """,
                [
                    (
                        candidate.dob,
                        candidate.specialization,
                        candidate.education,
                        candidate.about,
                        candidate.experience,
                        candidate.link,
                        json.dumps(candidate.raw),
                        candidate.src,
                    )
                    for candidate in candidates
                ],
            )

import asyncio

await insert_candidates(result)

In [37]:
# save Candites to csv
df = pd.DataFrame(data=[dict(record) for record in result],columns=list(dict(result[0]).keys()))
df.to_csv("candidates.csv", sep="*", index=False)