In [None]:
import os
import re
import requests
from lxml import etree
from openpyxl import Workbook, load_workbook
from urllib.parse import urlencode, urljoin

In [None]:
def get_info(tree):
    """Collect the link and the address note of every offer snippet on a listing page.

    Args:
        tree: parsed listing page; any object exposing an lxml-style
            ``xpath(query)`` method that returns a list.

    Returns:
        A list with one ``{"link": ..., "extra_info": ...}`` dict per offer
        snippet, or the string "На странице не найдено предложений." when the
        page contains no offer snippets.

        ``link`` is the stripped ``href`` of the snippet link, or None when the
        snippet has no matching link. ``extra_info`` is derived from the last
        text node of the address block: the second-to-last whitespace-separated
        token when the text starts with "до центра", the integer 0 when it
        starts with "В центре", the text unchanged otherwise, and "" when the
        address block has no text.
    """
    snippets = tree.xpath("//div[contains(@class, 'ClOfferSnippet') and @data-test='offer']")
    print(f"Найдено {len(snippets)} предложений на странице.")

    def _describe(snippet):
        # One record per snippet: address note first, then the offer link.
        address_texts = snippet.xpath(".//span[contains(@class, 'ClClickableAddress__link') and not(@data-test)]//text()")
        note = address_texts[-1] if address_texts else ""
        if note.startswith("до центра"):
            # Presumably "до центра <N> <unit>"; keep the token before the unit.
            note = note.split()[-2]
        elif note.startswith("В центре"):
            note = 0

        hrefs = snippet.xpath(".//a[contains(@class, 'LinkSnippet LinkSnippet_fullWidth LinkSnippet_hover')]/@href")
        return {
            "link": hrefs[0].strip() if hrefs else None,
            "extra_info": note,
        }

    collected = [_describe(snippet) for snippet in snippets]
    return collected if collected else "На странице не найдено предложений."

In [None]:
def fetch_list_page(url, timeout=30):
    """Download one listing page and extract its offers.

    Args:
        url: address of the listing page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection,
            which would hang the whole crawl.

    Returns:
        * the result of ``get_info`` for the parsed page (a list of offer
          dicts, or its "no offers" message string);
        * "Ошибка: Страница не найдена" when the page contains an element
          whose class includes ``ClEmptySearch``;
        * an error message string when the download or the parsing fails.

        Errors are reported through the return value; nothing is raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        parser = etree.HTMLParser()
        tree = etree.fromstring(response.content, parser=parser)

        # The caller compares against this exact string to stop paginating.
        if tree.xpath("//*[contains(@class, 'ClEmptySearch')]"):
            return "Ошибка: Страница не найдена"

        return get_info(tree)

    except requests.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        return f"Ошибка при получении URL с XPath: {e}"
    except Exception as ex:
        return f"Ошибка при разборе HTML с XPath: {ex}"

In [None]:
def parse_offer_page(url, timeout=30):
    """Download a single offer page and extract its fields.

    Args:
        url: address of the offer page.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        On success, a dict with the keys:

        * ``details`` - non-empty stripped text nodes of the info cells;
        * ``additional_data`` - non-empty stripped text nodes of the
          additional flat data items;
        * ``description`` - non-empty stripped text nodes of the description;
        * ``address`` - address text joined with spaces, or
          "Адрес не найден" when the address element has no text nodes;
        * ``price`` - first price text with spaces and non-breaking spaces
          removed, or "Цена не найдена";
        * ``route_info`` - list of dicts with ``station_name``,
          ``distance_or_time`` and ``by_foot`` for every route card.

        On failure, ``{"error": <message>}``. Nothing is raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        parser = etree.HTMLParser()
        tree = etree.fromstring(response.content, parser=parser)

        offer_details = tree.xpath("//div[contains(@class, 'DescriptionCell') and contains(@class, 'OfferCard__infoCell')]//text()")
        details_data = [detail.strip() for detail in offer_details if detail.strip()]

        additional_data = tree.xpath("//div[contains(@class, 'OfferCard__additionalDataInfoFlatItem')]//text()")
        additional_data_clean = [item.strip() for item in additional_data if item.strip()]

        expandable_description = tree.xpath("//div[contains(@class, 'fonts-module__primary___73abfc') and contains(@class, 'ExpandableDescription__description')]//text()")
        description_clean = [desc.strip() for desc in expandable_description if desc.strip()]

        address_parts = tree.xpath("//a[contains(@class, 'ClClickableAddress__link') and @data-test='offer-address']//text()")
        address_clean = ' '.join(part.strip() for part in address_parts if part.strip()) if address_parts else "Адрес не найден"

        price = tree.xpath("//span[@itemprop='price' and @data-test='offer-price']//text()")
        if price:
            # chr(160) is the non-breaking space; both kinds of space are removed.
            price_clean = price[0].strip().replace(chr(160), "").replace(" ", "")
        else:
            price_clean = "Цена не найдена"

        route_cards = tree.xpath("//div[contains(@class, 'OfferRouteCard') and contains(@class, 'OfferRouteCardList__item')]")
        route_info_list = []

        for card in route_cards:
            station_name = card.xpath(".//a[contains(@class, 'SubwayStation__link')]//text()")
            station_name = station_name[0].strip() if station_name else "Станция не найдена"

            distance_or_time = card.xpath(".//div[contains(@class, 'colors-named-module__secondary___eb0c51') and contains(@class, 'fonts-module__primary___73abfc')]//text()")
            distance_or_time = distance_or_time[0].strip() if distance_or_time else "Информация о расстоянии/времени не найдена"

            # The transport mode is recognised by a fragment of the icon's SVG
            # path data: 1 for the first known icon (presumably "on foot"),
            # 0 for the second (presumably transport), -1 for an unknown icon,
            # and a message string when the card has no icon at all.
            icon_element = card.xpath(".//div[contains(@class, 'OfferRouteTimeCard__icon')]/*[name()='svg']")
            if icon_element:
                svg_content = etree.tostring(icon_element[0], method='html', encoding='unicode')
                if 'M5.76127 11.1185L6.93706 12.9845L6.03002 15.4335' in svg_content:
                    by_foot = 1
                elif 'M14.4305 9.23685C14.4338 9.31745 14.4366 9.39787' in svg_content:
                    by_foot = 0
                else:
                    by_foot = -1
            else:
                by_foot = 'Информация о транспорте не найдена'

            route_info_list.append({
                'station_name': station_name,
                'distance_or_time': distance_or_time,
                'by_foot': by_foot
            })

        return {
            "details": details_data,
            "additional_data": additional_data_clean,
            "description": description_clean,
            "address": address_clean,
            "price": price_clean,
            "route_info": route_info_list
        }

    except requests.RequestException as e:
        # Covers connection errors, HTTP error statuses, invalid URLs and timeouts.
        return {"error": f"Ошибка при получении URL: {e}"}
    except Exception as ex:
        return {"error": f"Ошибка при разборе HTML: {ex}"}

In [None]:
def fetch_list_pages(config):
    """Crawl the m2.ru Moscow flat listings page by page and store every offer.

    For each listing page (page numbers 1 to 500) the offers are fetched with
    ``fetch_list_page``; every offer page is then parsed with
    ``parse_offer_page`` and flattened into one row. Crawling stops early when
    ``fetch_list_page`` reports "Ошибка: Страница не найдена"; any other
    non-list result is printed and the next page is tried.

    Args:
        config: dict with the optional keys
            * ``"room"`` - room filter; its string form selects the URL path
              segment ("0"-"4", "5+" or "free"). Missing or unknown values
              add no room segment. The raw value is also stored in the row.
            * ``"totalAreaMin"`` / ``"totalAreaMax"`` - passed through as
              query parameters of the same name when not None.
            * ``"save_to_xlsx"`` - when truthy, rows are appended to
              ``data/<filters>.xlsx`` (created with a header row if missing)
              and the workbook is saved after every row. When falsy the crawl
              still runs and logs, but nothing is written.

    Returns:
        None.
    """
    base_url = "https://m2.ru/moskva/nedvizhimost/kupit-kvartiru/"
    room = config.get("room")
    save_to_xlsx = config.get("save_to_xlsx", False)
    totalAreaMin = config.get("totalAreaMin")
    totalAreaMax = config.get("totalAreaMax")

    file_parts = []
    if room is not None:
        file_parts.append(f"{room}-rooms")
    if totalAreaMin is not None:
        file_parts.append(f"min-{totalAreaMin}")
    if totalAreaMax is not None:
        file_parts.append(f"max-{totalAreaMax}")
    # With no filters the joined name is empty, which used to yield the hidden
    # file "data/.xlsx"; fall back to a readable name instead.
    excel_file_name = ('_'.join(file_parts) or 'all') + '.xlsx'
    excel_file = os.path.join('data', excel_file_name)

    room_count = {
        "0": "studiya/",
        "1": "1-komnata/",
        "2": "2-komnaty/",
        "3": "3-komnaty/",
        "4": "4-komnaty/",
        "5+": "5-komnat_i_bolee/",
        "free": "svobodnaya-planirovka/"
    }
    # str(None) is "None", which is not a key, so a missing room maps to "".
    url_path = room_count.get(str(room), "")

    field_name_mapping = {
        'link': 'ссылка',
        'extra_info': 'до центра',
        'description': 'описание',
        'price': 'цена',
        'full_address': 'адрес',
        'room': 'комнат'
    }
    subkey_mapping = {
        'station_name': 'станция',
        'by_foot': 'пешком',
        'distance_or_time': 'время_до_станции'
    }
    route_pattern = re.compile(r'route(\d+)_(.*)')

    def map_field_name(key):
        """Translate an internal field name into its spreadsheet column title."""
        if key in field_name_mapping:
            return field_name_mapping[key]
        m = route_pattern.match(key)
        if m:
            i, subkey = m.groups()
            if subkey in subkey_mapping:
                return f"{subkey_mapping[subkey]}{i}"
        # Fields scraped from the page keep their on-page label.
        return key

    # The workbook objects exist only when saving was requested; every use
    # below is guarded by save_to_xlsx (previously a falsy flag led to a
    # NameError on the first parsed offer).
    wb = ws = None
    existing_columns = []
    if save_to_xlsx:
        os.makedirs('data', exist_ok=True)
        if os.path.exists(excel_file):
            wb = load_workbook(excel_file)
            ws = wb.active
            existing_columns = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))]
        else:
            wb = Workbook()
            ws = wb.active
            initial_columns = ['link', 'extra_info', 'room']
            existing_columns = [map_field_name(col) for col in initial_columns]
            ws.append(existing_columns)
            wb.save(excel_file)

    # Loop-invariant parts of the listing URL.
    list_url = urljoin(base_url, url_path)
    area_params = {}
    if totalAreaMax is not None:
        area_params['totalAreaMax'] = totalAreaMax
    if totalAreaMin is not None:
        area_params['totalAreaMin'] = totalAreaMin

    for i in range(1, 501):
        query_params = {**area_params, 'pageNumber': i}
        full_url = list_url + '?' + urlencode(query_params)
        print(f"Загружаем страницу {i}: {full_url}")

        result = fetch_list_page(full_url)
        if result == "Ошибка: Страница не найдена":
            print("Больше нет страниц для загрузки.")
            break

        if not isinstance(result, list):
            # Any other string is an error or "no offers" message: log it and
            # move on to the next page.
            print(result)
            continue

        for offer in result:
            link_url = offer.get('link')
            extra_info = offer.get('extra_info', '')
            data_dict = {'link': link_url, 'extra_info': extra_info, 'room': room}

            offer_details = parse_offer_page(link_url)
            if 'error' in offer_details:
                print(f"Ошибка при парсинге страницы предложения {link_url}: {offer_details['error']}")
                continue

            # Text nodes are treated as alternating label/value pairs.
            details_data = offer_details.get('details', [])
            details_dict = dict(zip(details_data[::2], details_data[1::2]))
            # The room count is already stored under 'room'.
            details_dict.pop('Комнатность', None)
            data_dict.update(details_dict)

            additional_data = offer_details.get('additional_data', [])
            data_dict.update(dict(zip(additional_data[::2], additional_data[1::2])))

            data_dict['description'] = ' '.join(offer_details.get('description', [])).strip()
            data_dict['price'] = offer_details.get('price', '')
            data_dict['full_address'] = offer_details.get('address', '')

            # Flatten route cards into route<N>_<field> keys, N starting at 1.
            for idx, route in enumerate(offer_details.get('route_info', []), start=1):
                for key, value in route.items():
                    data_dict[f'route{idx}_{key}'] = value

            # Drop the trailing three characters of area values
            # (presumably a unit suffix such as " м²" - TODO confirm).
            for area_key in ['Площадь квартиры', 'Жилая площадь', 'Площадь кухни']:
                if data_dict.get(area_key):
                    data_dict[area_key] = data_dict[area_key][:-3]

            # Drop the trailing two characters (presumably " м" - TODO confirm).
            if data_dict.get('Высота потолков'):
                data_dict['Высота потолков'] = data_dict['Высота потолков'][:-2]

            # "<floor> из <total>" is split into two separate columns.
            if data_dict.get('Этаж'):
                parts = data_dict['Этаж'].split(' из ')
                if len(parts) == 2:
                    data_dict['Этаж'] = parts[0].strip()
                    data_dict['Этажей в доме'] = parts[1].strip()

            mapped_data_dict = {map_field_name(key): value for key, value in data_dict.items()}

            if not save_to_xlsx:
                continue

            # Extend the header row with columns seen for the first time.
            new_keys = [key for key in mapped_data_dict if key not in existing_columns]
            for column, key in enumerate(new_keys, start=len(existing_columns) + 1):
                ws.cell(row=1, column=column, value=key)
            existing_columns.extend(new_keys)

            ws.append([mapped_data_dict.get(col, '') for col in existing_columns])

            # Saved after every row so an interrupted crawl keeps its progress.
            wb.save(excel_file)

In [None]:
# Run parameters: room filter 1, total area filter 35-39 (sent to the site as
# totalAreaMin/totalAreaMax), rows appended to an .xlsx file under data/.
config = dict(
    room=1,
    save_to_xlsx=True,
    totalAreaMin=35,
    totalAreaMax=39,
)

fetch_list_pages(config)