In [10]:
!pip install webdriver_manager selenium_stealth

Collecting selenium_stealth
  Downloading selenium_stealth-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading selenium_stealth-1.0.6-py3-none-any.whl (32 kB)
Installing collected packages: selenium_stealth
Successfully installed selenium_stealth-1.0.6


In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

def scrape_hh_jobs(search_term="Data Scientist", num_posts=1000):
    base_url = "https://hh.ru"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    
    job_data = []
    page = 0
    
    while len(job_data) < num_posts:
        url = f"{base_url}/search/vacancy?text={search_term}&page={page}&items_on_page=20"
        response = requests.get(url, headers=headers)
        
        if response.status_code != 200:
            print(f"Failed to fetch page {page}")
            page += 1
            continue
            
        soup = BeautifulSoup(response.text, 'html.parser')
        job_cards = soup.find_all('div', class_='magritte-redesign')
        
        if not job_cards:
            break
            
        for card in job_cards:
            try:
                header = card.find('h2', class_='bloko-header-section-2')
                href = header.find('a')['href']
                job_data.append(href)
            except Exception as e:
                print(f"Error parsing job card: {e}")
                continue

        print(len(job_data), "jobs found so far")
                
        page += 1
        time.sleep(random.uniform(0.1, 0.3))  # Delay between pages
        
    return pd.DataFrame(job_data)

# Run the scraper
df = scrape_hh_jobs("ML")
df.to_csv('hh_data_scientist_jobs.csv', index=False)
df

21 jobs found so far
42 jobs found so far
63 jobs found so far
84 jobs found so far
105 jobs found so far
126 jobs found so far
147 jobs found so far
168 jobs found so far
189 jobs found so far
210 jobs found so far
231 jobs found so far
252 jobs found so far
273 jobs found so far
294 jobs found so far
315 jobs found so far
336 jobs found so far
357 jobs found so far
378 jobs found so far
399 jobs found so far
420 jobs found so far
441 jobs found so far
462 jobs found so far
483 jobs found so far
504 jobs found so far
525 jobs found so far
546 jobs found so far
567 jobs found so far
588 jobs found so far
609 jobs found so far
630 jobs found so far
651 jobs found so far
672 jobs found so far
693 jobs found so far
714 jobs found so far
735 jobs found so far
756 jobs found so far
777 jobs found so far
798 jobs found so far
819 jobs found so far
840 jobs found so far
861 jobs found so far
882 jobs found so far
903 jobs found so far
924 jobs found so far
945 jobs found so far
966 jobs found

Unnamed: 0,0
0,https://hh.ru/vacancy/120338693?query=ML&hhtmF...
1,https://hh.ru/vacancy/120338693?query=ML&hhtmF...
2,https://hh.ru/vacancy/120338679?query=ML&hhtmF...
3,https://hh.ru/vacancy/80321762?query=ML&hhtmFr...
4,https://hh.ru/vacancy/118354500?query=ML&hhtmF...
...,...
1003,https://hh.ru/vacancy/119010102?query=ML&hhtmF...
1004,https://hh.ru/vacancy/119700961?query=ML&hhtmF...
1005,https://hh.ru/vacancy/115496430?query=ML&hhtmF...
1006,https://hh.ru/vacancy/120123071?query=ML&hhtmF...


In [81]:
# Create DataFrame to store job information
df_info = pd.DataFrame(columns=['url', 'experience', 'work_format', 'salary', 'employment_type', 'skills'])
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

for i in range(len(df)):
    url = df.iloc[i, 0]
    print(f"Processing {i+1}/{len(df)}: {url}")
    
    try:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch job details for {url}")
            continue
            
        soup = BeautifulSoup(response.text, 'html.parser')

        job_data = {'url': url}

        try:
            title = soup.find('div', class_='vacancy-title')
            salary_element = title.find('span', class_='magritte-text___pbpft_3-0-32 magritte-text_style-primary___AQ7MW_3-0-32 magritte-text_typography-label-1-regular___pi3R-_3-0-32')
            job_data['salary'] = salary_element.text.strip() if salary_element else None
        except Exception as e:
            print(f"Error extracting salary: {e}")
            job_data['salary'] = None
        
        try:
            experience_element = soup.select_one('[data-qa="vacancy-experience"]')
            job_data['experience'] = experience_element.text.strip() if experience_element else None
        except Exception as e:
            print(f"Error extracting experience: {e}")
            job_data['experience'] = None

        try:
            work_format_element = soup.select_one('[data-qa="work-formats-text"]')
            job_data['work_format'] = work_format_element.text.strip() if work_format_element else None
        except Exception as e:
            print(f"Error extracting work format: {e}")
            job_data['work_format'] = None
            
        try:
            employment_type_element = soup.select_one('[data-qa="common-employment-text"]')
            job_data['employment_type'] = employment_type_element.text.strip() if employment_type_element else None
        except Exception as e:
            print(f"Error extracting employment type: {e}")
            job_data['employment_type'] = None

        try:
            skills_elements = soup.select('[data-qa="skills-element"]')
            skills = [skill.text.strip() for skill in skills_elements]
            job_data['skills'] = ', '.join(skills) if skills else None
        except Exception as e:
            print(f"Error extracting skills: {e}")
            job_data['skills'] = None

        df_info = pd.concat([df_info, pd.DataFrame([job_data])], ignore_index=True)

        time.sleep(random.uniform(0.1, 0.3))
        
    except Exception as e:
        print(f"Error processing {url}: {e}")
        continue

# Save the results
df_info.to_csv('hh_job_details.csv', index=False)
print(f"Extracted details for {len(df_info)} jobs out of {len(df)}")

Processing 1/1008: https://hh.ru/vacancy/120338693?query=ML&hhtmFrom=vacancy_search_list
Processing 2/1008: https://hh.ru/vacancy/120338693?query=ML&hhtmFrom=vacancy_search_list
Processing 3/1008: https://hh.ru/vacancy/120338679?query=ML&hhtmFrom=vacancy_search_list
Processing 4/1008: https://hh.ru/vacancy/80321762?query=ML&hhtmFrom=vacancy_search_list
Processing 5/1008: https://hh.ru/vacancy/118354500?query=ML&hhtmFrom=vacancy_search_list
Processing 6/1008: https://hh.ru/vacancy/118354543?query=ML&hhtmFrom=vacancy_search_list
Processing 7/1008: https://hh.ru/vacancy/120321719?query=ML&hhtmFrom=vacancy_search_list
Processing 8/1008: https://hh.ru/vacancy/117558803?query=ML&hhtmFrom=vacancy_search_list
Processing 9/1008: https://hh.ru/vacancy/119958507?query=ML&hhtmFrom=vacancy_search_list
Processing 10/1008: https://hh.ru/vacancy/120027646?query=ML&hhtmFrom=vacancy_search_list
Processing 11/1008: https://hh.ru/vacancy/119383740?query=ML&hhtmFrom=vacancy_search_list
Processing 12/1008: 

In [82]:
df_info

Unnamed: 0,url,experience,work_format,salary,employment_type,skills
0,https://hh.ru/vacancy/120338693?query=ML&hhtmF...,3–6 лет,Формат работы: удалённо,Уровень дохода не указан,Полная занятость,"Python, Docker, Machine Learning, Английский —..."
1,https://hh.ru/vacancy/120338693?query=ML&hhtmF...,3–6 лет,Формат работы: удалённо,Уровень дохода не указан,Полная занятость,"Python, Docker, Machine Learning, Английский —..."
2,https://hh.ru/vacancy/120338679?query=ML&hhtmF...,3–6 лет,Формат работы: удалённо,Уровень дохода не указан,Полная занятость,"Python, Docker, Machine Learning, Английский —..."
3,https://hh.ru/vacancy/80321762?query=ML&hhtmFr...,3–6 лет,,Уровень дохода не указан,Полная занятость,
4,https://hh.ru/vacancy/118354500?query=ML&hhtmF...,3–6 лет,Формат работы: удалённо или гибрид,Уровень дохода не указан,Полная занятость,
...,...,...,...,...,...,...
1003,https://hh.ru/vacancy/119010102?query=ML&hhtmF...,более 6 лет,Формат работы: на месте работодателя,Уровень дохода не указан,Полная занятость,"Управление командой, Управление процессами, Ин..."
1004,https://hh.ru/vacancy/119700961?query=ML&hhtmF...,3–6 лет,Формат работы: гибрид,Уровень дохода не указан,Полная занятость,
1005,https://hh.ru/vacancy/115496430?query=ML&hhtmF...,3–6 лет,,Уровень дохода не указан,Полная занятость,"IT, поиск, Python, ML, Hadoop, Spark, Yarn, Hi..."
1006,https://hh.ru/vacancy/120123071?query=ML&hhtmF...,3–6 лет,Формат работы: удалённо,Уровень дохода не указан,Полная занятость,"Английский язык, Python, SQL, Tableau, Английс..."
