# In [None]:
import logging
import re
import time
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

class EnhancedJobScraper:
    """Scraper for the vacancy listing and vacancy pages of getmatch.ru.

    The listing pages are walked to collect vacancy URLs, every vacancy page
    is parsed into a flat dict, and the collected rows are written to a
    timestamped CSV or JSON file.
    """

    # "от" / "до" as standalone words: they must not be glued to other
    # letters (so "до" inside "долларов" is ignored), but may be glued to
    # digits (so "от250 000" still counts).
    _FROM_RE = re.compile(r'(?<![^\W\d])от(?![^\W\d])')
    _TO_RE = re.compile(r'(?<![^\W\d])до(?![^\W\d])')

    def __init__(self, timeout: float = 10.0):
        """Set up request defaults and logging.

        Args:
            timeout: Seconds ``requests`` may wait on the server before the
                request is aborted. Without a timeout a single stalled
                connection would block the crawl indefinitely.
        """
        self.base_url = "https://getmatch.ru/vacancies"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.timeout = timeout
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _fetch_soup(self, url: str, params: Optional[Dict] = None) -> "BeautifulSoup":
        """Download *url* and return its parsed HTML; raises on HTTP/network errors."""
        response = requests.get(
            url, params=params, headers=self.headers, timeout=self.timeout
        )
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def parse_salary(self, salary_text: str) -> tuple:
        """Parse a salary string into a ``(salary_from, salary_to)`` pair.

        Examples of accepted input:
        - "250 000 — 300 000 ₽/мес на руки"  -> (250000, 300000)
        - "от 250 000 ₽/мес на руки"         -> (250000, None)
        - "до 300 000 ₽/мес на руки"         -> (None, 300000)

        A single number without "от"/"до" is returned as both bounds.

        Args:
            salary_text: Raw salary text; may be empty or ``None``.

        Returns:
            A 2-tuple of ``int`` or ``None`` values; ``(None, None)`` when no
            number can be found.
        """
        if not salary_text:
            return None, None

        text = salary_text.lower().strip()

        # Drop the "₽/мес на руки" style suffix so it cannot affect parsing.
        text = re.sub(r'₽/мес.*$', '', text)

        # Look for the bound markers as whole words only.
        has_from = self._FROM_RE.search(text) is not None
        has_to = self._TO_RE.search(text) is not None

        # Join thousands groups ("250 000" -> "250000"); \s also covers the
        # non-breaking spaces commonly used inside numbers.
        text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)
        numbers = [int(chunk) for chunk in re.findall(r'\d+', text)]

        if len(numbers) >= 2:
            return numbers[0], numbers[1]
        if len(numbers) == 1:
            if has_from:
                return numbers[0], None
            if has_to:
                return None, numbers[0]
            return numbers[0], numbers[0]
        return None, None

    def get_job_description(self, job_url: str) -> Dict:
        """Fetch one vacancy page and extract its fields.

        Args:
            job_url: Absolute URL of the vacancy page.

        Returns:
            A dict with the vacancy fields (missing elements stay ``None`` or
            empty). On any failure the error is logged and
            ``{'url': job_url, 'error': <message>}`` is returned instead.
        """
        try:
            soup = self._fetch_soup(job_url)

            # Key order defines the column order of the exported table.
            job_data = {
                'url': job_url,
                'title': None,
                'company_name': None,
                'salary_text': None,
                'salary_from': None,
                'salary_to': None,
                'location': None,
                'work_format': None,
                'specialization': None,
                'level': None,
                'company_logo_url': None,
                'description_text': '',
                'skills': [],
                'posted_date': None
            }

            # Vacancy title
            title_elem = soup.find('h1')
            if title_elem:
                job_data['title'] = title_elem.text.strip()

            # Company name (link inside the first <h2>)
            company_elem = soup.find('h2')
            company_link = company_elem.find('a') if company_elem else None
            if company_link:
                job_data['company_name'] = company_link.text.strip()

            # Salary (first <h3>)
            salary_elem = soup.find('h3')
            if salary_elem:
                salary_text = salary_elem.text.strip()
                job_data['salary_text'] = salary_text
                job_data['salary_from'], job_data['salary_to'] = self.parse_salary(salary_text)

            # Location and work format: the label carrying the pin emoji is
            # the location, any other label is treated as the work format.
            location_container = soup.find('div', class_='b-vacancy-locations')
            if location_container:
                for loc in location_container.find_all('span', class_='g-label'):
                    if '📍' in loc.text:
                        job_data['location'] = loc.text.replace('📍', '').strip()
                    else:
                        job_data['work_format'] = loc.text.strip()

            # Specialization and level
            specs_container = soup.find('div', class_='b-specs')
            if specs_container:
                for row in specs_container.find_all('div', class_='row'):
                    term = row.find('div', class_='b-term')
                    value = row.find('div', class_='b-value')
                    if not (term and value):
                        continue
                    term_text = term.text.strip().lower()
                    if 'специализация' in term_text:
                        job_data['specialization'] = value.text.strip()
                    elif 'уровень' in term_text:
                        job_data['level'] = value.text.strip()

            # Company logo: first <img> whose alt text mentions "logo"
            logo_elem = soup.find('img', {'alt': lambda x: x and 'logo' in x.lower()})
            if logo_elem:
                job_data['company_logo_url'] = logo_elem.get('src')

            # Description: short section first, then the full one, one text
            # fragment per line.
            description_parts: List[str] = []
            for class_name in ('b-vacancy-short-description', 'b-vacancy-description'):
                section = soup.find('section', class_=class_name)
                if section:
                    description_parts.extend(section.stripped_strings)
            job_data['description_text'] = '\n'.join(description_parts)

            # Technologies / skills
            stack_container = soup.find('div', class_='b-vacancy-stack-container')
            if stack_container:
                job_data['skills'] = [
                    skill.text.strip()
                    for skill in stack_container.find_all('span', class_='g-label')
                ]

            return job_data

        except Exception as e:
            # Best-effort boundary: one broken page must not abort the crawl.
            self.logger.error(f"Ошибка при получении описания вакансии {job_url}: {e}")
            return {'url': job_url, 'error': str(e)}

    def get_job_urls(self, page: int = 1) -> List[str]:
        """Return the vacancy URLs found on one listing page.

        Args:
            page: Listing page number, sent as the ``p`` query parameter.

        Returns:
            Absolute vacancy URLs; an empty list when the page cannot be
            fetched or parsed (the error is logged).
        """
        try:
            soup = self._fetch_soup(self.base_url, params={"p": page})

            job_urls = []
            for card in soup.find_all('div', class_='b-vacancy-card'):
                title_elem = card.find('h3')
                link_elem = title_elem.find('a') if title_elem else None
                href = link_elem.get('href') if link_elem else None
                if href:
                    # urljoin handles root-relative, relative and already
                    # absolute hrefs alike.
                    job_urls.append(urljoin(self.base_url, href))

            return job_urls
        except Exception as e:
            self.logger.error(f"Ошибка при получении списка вакансий со страницы {page}: {e}")
            return []

    def scrape_to_csv(self, num_pages: int = 5, output_format: str = 'csv',
                      request_delay: float = 1.0, page_delay: float = 2.0):
        """Scrape vacancies from several listing pages and save them to a file.

        Args:
            num_pages: Number of listing pages to process, starting at page 1.
            output_format: ``'csv'`` or ``'json'``.
            request_delay: Seconds to sleep after each vacancy request.
            page_delay: Seconds to sleep after each listing page.

        Raises:
            ValueError: If ``output_format`` is not supported. Checked before
                any request is made so a long crawl is never thrown away.
        """
        if output_format not in ('csv', 'json'):
            raise ValueError(
                f"Unsupported output_format {output_format!r}; expected 'csv' or 'json'"
            )

        all_jobs = []

        for page in range(1, num_pages + 1):
            self.logger.info(f"Обработка страницы {page}...")

            for url in self.get_job_urls(page):
                self.logger.info(f"Получение описания вакансии: {url}")
                job_data = self.get_job_description(url)
                if job_data and 'error' not in job_data:
                    all_jobs.append(job_data)
                time.sleep(request_delay)  # pause between vacancy requests

            time.sleep(page_delay)  # pause between listing pages

        if not all_jobs:
            self.logger.error("Не удалось собрать данные о вакансиях")
            return

        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f'job_descriptions_{timestamp}.{output_format}'
        df = pd.DataFrame(all_jobs)

        if output_format == 'csv':
            df.to_csv(filename, index=False, encoding='utf-8')
        else:
            df.to_json(filename, orient='records', force_ascii=False, indent=2)

        self.logger.info(f"Сохранено {len(all_jobs)} описаний вакансий в файл {filename}")

if __name__ == "__main__":
    # Script entry point: build a scraper and crawl a fixed number of
    # listing pages, leaving every other option at its default.
    pages_to_scrape = 69
    scraper = EnhancedJobScraper()
    scraper.scrape_to_csv(num_pages=pages_to_scrape)