In [10]:
import os
import re
import pandas as pd
import numpy as np

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.table import Table, TableStyleInfo


import requests
from LxmlSoup import LxmlSoup
from bs4 import BeautifulSoup
import time
import random


In [None]:
# Парсинг HomesOverseas

def parse_homesoverseas(country_url_part: str, has_info=True, max_pages=20, timeout=30):
    """Scrape raw apartment listings for one country from homesoverseas.ru.

    Result pages are requested starting at ``p=0``. The loop stops when a
    request fails, a page answers with a non-200 status, a page contains
    neither names nor prices, or ``max_pages`` pages have been fetched.

    Args:
        country_url_part: Country slug inserted into the URL path.
        has_info: When true, the listing "options" block is collected too and
            returned as the ``info`` column.
        max_pages: Maximum number of pages to request.
        timeout: Seconds to wait for each HTTP response; without it a stalled
            connection would block the whole run indefinitely.

    Returns:
        DataFrame of raw text with columns ``name``, ``price`` and, if
        ``has_info`` is true, ``info``. Values are paired by their position
        on the page.
    """
    columns = ["name", "price", "info"] if has_info else ["name", "price"]
    rows = []
    page = 0
    while True:
        url = f"https://www.homesoverseas.ru/country/{country_url_part}/apartments?p={page}&s=50"
        print(f"[{country_url_part}] p={page}: {url}")
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of aborting the whole run.
            print(f"Ошибка запроса ({exc}), стоп.")
            break
        if resp.status_code != 200:
            print(f"HTTP {resp.status_code}, стоп.")
            break

        soup = LxmlSoup(resp.text)
        names = soup.find_all("div", class_="name_mess")
        prices = soup.find_all("div", class_="price_tr")

        if len(names) == 0 and len(prices) == 0:
            print("Пустая страница => завершение.")
            break

        fields = [names, prices]
        if has_info:
            fields.append(soup.find_all("div", class_="line mrg5T options"))

        # zip() stops at the shortest list, so a page with mismatched element
        # counts never produces a partially filled row.
        for elements in zip(*fields):
            rows.append([element.text() for element in elements])

        page += 1
        if page >= max_pages:
            print(f"max_pages={max_pages} => stop.")
            break

    return pd.DataFrame(rows, columns=columns)


# Преобразование

def transform_homesoverseas_data(df: pd.DataFrame, has_info=True)-> pd.DataFrame:
    """Clean raw HomesOverseas listings and derive numeric columns.

    Steps:
      1. collapse runs of whitespace in every text column;
      2. parse ``price`` into a float and drop rows without a usable price;
      3. when ``has_info`` is true and an ``info`` column exists, extract the
         area and the number of bedrooms from it (otherwise both stay NaN);
      4. compute the price per square metre where the area is positive;
      5. number the rows from 1 and rename the columns to their Russian
         report headers.

    Args:
        df: Raw frame with ``name``, ``price`` and optionally ``info``.
        has_info: Whether ``info`` should be parsed for area and rooms.

    Returns:
        New DataFrame with columns ``№``, ``Название``, ``Цена``,
        ``Площадь (м²)``, ``Комнаты``, ``Цена за м²``. The input is not
        modified.
    """
    df = df.copy()
    for col in df.columns:
        # Cover both the classic object dtype and pandas' dedicated string
        # dtype, so text columns are normalised regardless of pandas version.
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
            df[col] = (
                df[col].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
            )

    def clean_price(p):
        if not isinstance(p, str):
            return np.nan
        # "price on request" carries no number; compare without whitespace
        # because the text was normalised to single spaces above.
        if "ценапозапросу" in re.sub(r"\s+", "", p).lower():
            return np.nan
        # Dropping everything except digits and separators also removes the
        # "от " prefix, the euro sign and non-breaking spaces.
        digits = re.sub(r"[^0-9.,]", "", p).replace(",", ".")
        if not digits:
            return np.nan
        try:
            return float(digits)
        except ValueError:
            # e.g. several separators in one number
            return np.nan

    df["price"] = df["price"].apply(clean_price)
    df = df.dropna(subset=["price"]).copy()

    df["square"] = np.nan
    df["rooms"] = np.nan
    if has_info and "info" in df.columns:
        # Tried in order: an explicit "площадь <number>" label first, then
        # any number followed by the metre abbreviation.
        square_patterns = (
            (r"площадь\s+([\d.,]+)", re.IGNORECASE),
            (r"([\d.,]+)\s*м", 0),
        )

        def extract_square(txt):
            for pattern, flags in square_patterns:
                m = re.search(pattern, txt, flags)
                if not m:
                    continue
                try:
                    return float(m.group(1).replace(",", "."))
                except ValueError:
                    continue
            return np.nan

        def extract_rooms(txt):
            m = re.search(r"(\d+)\s*спал", txt, re.IGNORECASE)
            return float(m.group(1)) if m else np.nan

        df["square"] = df["info"].apply(extract_square)
        df["rooms"] = df["info"].apply(extract_rooms)

    df["price_per_sq"] = np.where(
        df["price"].notna() & (df["square"] > 0),
        df["price"] / df["square"],
        np.nan,
    )

    df.insert(0, "№", range(1, len(df) + 1))

    df = df.rename(columns={
        "name": "Название",
        "price": "Цена",
        "square": "Площадь (м²)",
        "rooms": "Комнаты",
        "price_per_sq": "Цена за м²",
    })

    wanted = ["№", "Название", "Цена", "Площадь (м²)", "Комнаты", "Цена за м²"]
    return df[[c for c in wanted if c in df.columns]]


# Сохранение в Excel

def save_homesoverseas_excel(df: pd.DataFrame, file_path:str):
    """Write cleaned HomesOverseas listings to an .xlsx workbook.

    Sheet layout (sheet name ``Data``):
      * row 2 holds the column headers, rows 3..(2 + len(df)) hold the data;
      * the header and data rows are registered as the Excel table
        ``Table1`` with ``totalsRowShown`` enabled;
      * ``H1:I2`` holds a small summary with the averages of the ``Цена`` and
        ``Цена за м²`` columns.

    An empty DataFrame is written as-is through ``DataFrame.to_excel`` and
    none of the layout above is applied.
    """
    from openpyxl.utils import get_column_letter
    from openpyxl.worksheet.table import Table, TableStyleInfo
    from openpyxl import Workbook

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Data"

    if df.empty:
        df.to_excel(file_path, index=False)
        return

    euro_format = '#,##0.00 "€"'
    averaged_headers = ("Цена", "Цена за м²")

    headers = df.columns.to_list()
    row_total = len(df)
    header_row = 2
    first_data_row = header_row + 1
    last_data_row = header_row + row_total

    # Header cells.
    for col_idx, header in enumerate(headers, start=1):
        sheet.cell(row=header_row, column=col_idx, value=header)

    # Data cells, row by row.
    for row_offset in range(row_total):
        for col_offset in range(len(headers)):
            sheet.cell(
                row=first_data_row + row_offset,
                column=col_offset + 1,
                value=df.iat[row_offset, col_offset],
            )

    # The table spans the header row plus every data row.
    table = Table(
        displayName="Table1",
        ref=f"A{header_row}:{get_column_letter(len(headers))}{last_data_row}",
    )
    table.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9",
        showRowStripes=True,
        showColumnStripes=False,
    )
    table.totalsRowShown = True

    # Totals configuration: label on the name column, average on money columns.
    # NOTE(review): confirm that ``tableColumns`` is already populated here;
    # if openpyxl only fills it while saving, this loop does nothing.
    for position, column in enumerate(table.tableColumns):
        header = headers[position]
        if header == "Название":
            column.totalsRowLabel = "СРЕДНЕЕ:"
        column.totalsRowFunction = "average" if header in averaged_headers else None

    sheet.add_table(table)

    # Column widths and the euro number format on the money columns.
    for col_idx, header in enumerate(headers, start=1):
        sheet.column_dimensions[get_column_letter(col_idx)].width = 15
        if header not in averaged_headers:
            continue
        for row_idx in range(first_data_row, last_data_row + 1):
            sheet.cell(row=row_idx, column=col_idx).number_format = euro_format

    # Mini summary in H1:I2 (SUBTOTAL 101 = average).
    summary = (
        ("H", "Средняя цена", "=SUBTOTAL(101,Table1[[#All],[Цена]])"),
        ("I", "Средняя цена за м²", "=SUBTOTAL(101,Table1[[#All],[Цена за м²]])"),
    )
    for letter, caption, formula in summary:
        sheet[f"{letter}1"] = caption
        sheet[f"{letter}2"] = formula
        sheet[f"{letter}2"].number_format = euro_format
        sheet.column_dimensions[letter].width = 20

    workbook.save(file_path)

def main():
    """Scrape, clean and export HomesOverseas listings for every country.

    Each entry of ``countries`` is ``(Russian name, URL slug, has_info)``.
    The Russian name becomes the workbook file name; the flag is passed to
    both the parser and the transformer. One ``<country>.xlsx`` file is
    written per entry into ``base_path``.
    """
    countries = [
        ("Австралия", "Avstralija", False),
        ("Австрия", "Austria", True),
        ("Андорра", "Andorra", True),
        ("Болгария", "Bolgarija", True),
        ("Венгрия", "Vengrija", True),
        ("Германия", "Germanija", True),
        ("Греция", "Grecija", True),
        ("Грузия", "Gruzija", True),
        ("Израиль", "Izrail", True),
        ("Испания", "Spain", True),
        ("Италия", "Italija", True),
        ("Канада", "Kanada", True),
        ("Кипр", "Kipr", True),
        ("Латвия", "Latvija", True),
        ("Люксембург", "Luxemburg", True),
        ("Монако", "Monako", True),
        ("ОАЭ", "OAE", True),
        ("Португалия", "Portugalija", True),
        ("Словакия", "Slovakija", True),
        ("Великобритания", "Velikobritanija", True),
        ("США", "SShA", True),
        ("Таиланд", "Tailand", True),
        ("Турция", "Turcija", True),
        ("Финляндия", "Finljandija", True),
        ("Франция", "Francija", True),
        ("Хорватия", "Horvatija", True),
        ("Черногория", "Chernogorija", True),
        ("Швейцария", "Shvejcarija", True),
        ("Эстония", "Estonija", True),
    ]

    base_path = r"C:\Users\Алина\Desktop\Недвижимость (HomesOverseas)"
    # Create the output folder up front: otherwise a missing directory only
    # surfaces as a save error after the first country has been scraped.
    os.makedirs(base_path, exist_ok=True)

    for country_rus, url_part, info_flag in countries:
        print(f"\n=== HomesOverseas: {country_rus} => {url_part} ===")
        df_raw = parse_homesoverseas(url_part, has_info=info_flag, max_pages=20)
        df_clean = transform_homesoverseas_data(df_raw, has_info=info_flag)

        file_path = os.path.join(base_path, f"{country_rus}.xlsx")
        save_homesoverseas_excel(df_clean, file_path)
        print(f"[{country_rus}] => {file_path}")

    print("\nГотово!")

if __name__=="__main__":
    main()



=== HomesOverseas: Австралия => Avstralija ===
[Avstralija] p=0: https://www.homesoverseas.ru/country/Avstralija/apartments?p=0&s=50
Пустая страница => завершение.
[Австралия] => C:\Users\Алина\Desktop\Недвижимость (HomesOverseas)\Австралия.xlsx

=== HomesOverseas: Австрия => Austria ===
[Austria] p=0: https://www.homesoverseas.ru/country/Austria/apartments?p=0&s=50
[Austria] p=1: https://www.homesoverseas.ru/country/Austria/apartments?p=1&s=50
[Austria] p=2: https://www.homesoverseas.ru/country/Austria/apartments?p=2&s=50
Пустая страница => завершение.
[Австрия] => C:\Users\Алина\Desktop\Недвижимость (HomesOverseas)\Австрия.xlsx

=== HomesOverseas: Андорра => Andorra ===
[Andorra] p=0: https://www.homesoverseas.ru/country/Andorra/apartments?p=0&s=50
[Andorra] p=1: https://www.homesoverseas.ru/country/Andorra/apartments?p=1&s=50
Пустая страница => завершение.
[Андорра] => C:\Users\Алина\Desktop\Недвижимость (HomesOverseas)\Андорра.xlsx

=== HomesOverseas: Болгария => Bolgarija ===
[Bo

In [None]:
# Парсинг Tranio

def parse_tranio_country(country_url_part: str, max_pages=20, timeout=30):
    """Scrape raw apartment listings for one country from tranio.ru.

    The first page is requested with ``?order=rank`` and later pages add
    ``&page=N``. Scraping stops when a request fails, a page answers with a
    non-200 status, a page has no listing snippets, a page adds no new
    entries, or ``max_pages`` pages have been fetched.

    Args:
        country_url_part: Country slug inserted into the URL path.
        max_pages: Maximum number of pages to request.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        DataFrame with text columns ``name``, ``price`` and ``info``
        containing each distinct ``(name, price, info)`` triple once, in the
        order it was first seen. Missing elements are filled with
        ``"No name"``, ``"No price"`` and ``"No info"``.
    """
    first_page_url = f"https://tranio.ru/{country_url_part}/apartments/?order=rank"
    parts = (
        ("snippet-title", "No name"),
        ("snippet-price", "No price"),
        ("snippet-features", "No info"),
    )

    seen = set()
    # The set only answers "was this entry seen?". Rows are kept in a list so
    # the result order follows the site ranking instead of set iteration
    # order, which is arbitrary.
    ordered_entries = []
    page = 1

    while True:
        url = first_page_url if page == 1 else f"{first_page_url}&page={page}"

        print(f"Парсим страницу {page}: {url}")
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of aborting the whole run.
            print(f"Ошибка запроса ({exc}). Останавливаемся.")
            break
        if resp.status_code != 200:
            print(f"Ответ {resp.status_code}. Останавливаемся.")
            break

        soup = BeautifulSoup(resp.text, 'html.parser')
        snippets = soup.find_all("div", class_="snippet slide")

        if not snippets:
            print("Нет объявлений на странице. Останавливаемся.")
            break

        found_new = False
        for snippet in snippets:
            values = []
            for css_class, placeholder in parts:
                element = snippet.find("div", class_=css_class)
                values.append(element.get_text(strip=True) if element else placeholder)

            entry = tuple(values)
            if entry not in seen:
                seen.add(entry)
                ordered_entries.append(entry)
                found_new = True

        if not found_new:
            print(f"Страница {page} полностью дублирует предыдущие. Остановка.")
            break

        page += 1
        if page > max_pages:
            print(f"Достигнут лимит max_pages={max_pages}. Остановка.")
            break

    return pd.DataFrame(ordered_entries, columns=["name", "price", "info"])


# Преобразование

def transform_tranio_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean raw Tranio listings and derive numeric columns.

    Steps:
      1. drop residential-complex entries (rows whose ``info`` mentions
         "Срок сдачи") and rows whose ``info`` is empty after removing the
         "Всего ... квартир" fragment;
      2. strip label words from ``info`` and extract the area and the number
         of bedrooms;
      3. parse ``price`` and convert it to euros with the fixed rates below;
         rows without a usable price are dropped;
      4. compute the price per square metre where the area is positive;
      5. number the rows from 1 and rename the columns to their Russian
         report headers.

    Args:
        df: Raw frame with ``name``, ``price`` and ``info`` columns.

    Returns:
        New DataFrame with columns ``№``, ``Название``, ``Цена``,
        ``Площадь (м²)``, ``Комнаты``, ``Цена за м²``. The input is not
        modified.
    """
    # Hard-coded conversion rates to EUR; unknown symbols are treated as EUR.
    rates = {
        "€": 1.0,
        "$": 0.96,   # US dollar
        "£": 1.20,   # pound sterling
        "C$": 0.67,  # Canadian dollar
    }
    # Optional leading currency, the number (digits, separators, spaces),
    # optional trailing currency. Only the first match is used, which handles
    # doubled texts such as 'C$609 000C$609 000'.
    price_pattern = re.compile(r"(C\$|\$|£)?[\d.,\s]+(€|\$|£|C\$)?")

    df = df.copy()
    df["info"] = df["info"].astype(str)

    # 1) Remove residential complexes.
    df["info"] = df["info"].str.replace(r"Всего.*?квартир", "", case=False, regex=True)
    mask = ~df["info"].str.contains("Срок сдачи", case=False, regex=True)
    df = df[mask].copy()

    df["info"] = df["info"].str.strip()
    df = df[df["info"] != ""].copy()

    # 2) Clean up info.
    # NOTE(review): "м?" looks like a mis-encoded "м²" - confirm against the
    # scraped text before changing it.
    df["info"] = df["info"].str.replace("м?", " ", regex=False)
    for pat in ["площадь", "общая", "зеи:", "\xa0"]:
        df["info"] = df["info"].str.replace(pat, "", regex=False)
    df["info"] = df["info"].str.replace(r"\s+", " ", regex=True).str.strip()

    def extract_square(txt: str):
        match = re.search(r"([\d.,]+)\s*м", txt)
        if not match:
            return np.nan
        try:
            return float(match.group(1).replace(",", "."))
        except ValueError:
            return np.nan

    def extract_rooms(txt: str):
        match = re.search(r"(\d+)\s*спал", txt, re.IGNORECASE)
        return float(match.group(1)) if match else np.nan

    df["square"] = df["info"].apply(extract_square)
    df["rooms"] = df["info"].apply(extract_rooms)

    # 3) Parse the price and convert it to euros.
    def clean_price(p: str):
        if not isinstance(p, str):
            return np.nan
        p = p.replace("От", "").replace("от", "").replace("\xa0", " ").strip()

        match = price_pattern.search(p)
        if not match:
            return np.nan

        # A leading symbol wins over a trailing one; default is euro.
        currency_symbol = match.group(1) or match.group(2) or "€"

        # Keep digits and separators only; this also strips the currency
        # symbols and the spaces used as thousands separators.
        numeric_part = re.sub(r"[^0-9.,]", "", match.group(0)).replace(",", ".")
        if not numeric_part:
            return np.nan

        try:
            val = float(numeric_part)
        except ValueError:
            return np.nan

        return val * rates.get(currency_symbol, 1.0)

    df["price"] = df["price"].apply(clean_price)
    # Copy so the column assignments below act on an independent frame, like
    # after every other filter in this function.
    df = df.dropna(subset=["price"]).copy()

    # 4) Price per square metre.
    df["price_per_sq"] = np.where(
        df["price"].notna() & (df["square"] > 0),
        df["price"] / df["square"],
        np.nan,
    )

    # 5) Row numbers.
    df.insert(0, "№", range(1, len(df) + 1))

    df = df.rename(columns={
        "name": "Название",
        "price": "Цена",
        "square": "Площадь (м²)",
        "rooms": "Комнаты",
        "price_per_sq": "Цена за м²",
    })

    desired_cols = ["№", "Название", "Цена", "Площадь (м²)", "Комнаты", "Цена за м²"]
    return df[[c for c in desired_cols if c in df.columns]]


# Сохранение

def save_tranio_excel(df: pd.DataFrame, file_path: str):
    """Write cleaned Tranio listings to an .xlsx workbook.

    Headers go to row 2 and data to rows 3..(2 + len(df)) of sheet ``Data``.
    That range is registered as the Excel table ``Table1``, and ``H1:I2``
    receives the average price and the average price per square metre.
    An empty DataFrame is written unchanged through ``DataFrame.to_excel``.
    """
    book = Workbook()
    ws = book.active
    ws.title = "Data"

    if df.empty:
        df.to_excel(file_path, index=False)
        return

    money_format = '#,##0.00 "€"'
    money_columns = ("Цена", "Цена за м²")

    titles = df.columns.to_list()
    n_rows, n_cols = len(df), len(titles)
    header_row = 2
    first_row = header_row + 1
    last_row = first_row + n_rows - 1

    # Header row followed by the data rows.
    for col_no, title in enumerate(titles, start=1):
        ws.cell(row=header_row, column=col_no, value=title)
    for r in range(n_rows):
        for c in range(n_cols):
            ws.cell(row=first_row + r, column=c + 1, value=df.iat[r, c])

    table_ref = f"A{header_row}:{get_column_letter(n_cols)}{last_row}"
    table = Table(displayName="Table1", ref=table_ref)
    table.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9", showRowStripes=True, showColumnStripes=False
    )
    table.totalsRowShown = True

    # Totals: caption under the name column, average under the money columns.
    # NOTE(review): confirm ``tableColumns`` is populated before the table is
    # saved; if it is still empty here this loop has no effect.
    for idx, table_column in enumerate(table.tableColumns):
        title = titles[idx]
        if title == "Название":
            table_column.totalsRowLabel = "СРЕДНЕЕ:"
        table_column.totalsRowFunction = "average" if title in money_columns else None

    ws.add_table(table)

    # Widths for every column, euro format for the money columns.
    for col_no, title in enumerate(titles, start=1):
        ws.column_dimensions[get_column_letter(col_no)].width = 15
        if title in money_columns:
            for row_no in range(first_row, last_row + 1):
                ws.cell(row=row_no, column=col_no).number_format = money_format

    # Mini summary in H1:I2 (SUBTOTAL 101 = average).
    ws["H1"], ws["I1"] = "Средняя цена", "Средняя цена за м²"
    ws["H2"] = "=SUBTOTAL(101,Table1[[#All],[Цена]])"
    ws["I2"] = "=SUBTOTAL(101,Table1[[#All],[Цена за м²]])"
    for address in ("H2", "I2"):
        ws[address].number_format = money_format
    for letter in ("H", "I"):
        ws.column_dimensions[letter].width = 20

    book.save(file_path)

def main():
    """Scrape, clean and export Tranio listings for every configured country.

    Each entry of ``countries`` is ``(Russian name, URL slug)``. The Russian
    name becomes the workbook file name, and one ``<country>.xlsx`` file is
    written per entry into ``base_path``.
    """
    countries = [
        ("Австралия",       "australia"),
        ("Австрия",         "austria"),
        ("Андорра",         "andorra"),
        ("Болгария",        "bulgaria"),
        ("Великобритания",  "united-kingdom"),
        ("Венгрия",         "hungary"),
        ("Германия",        "germany"),
        ("Греция",          "greece"),
        ("Грузия",          "georgia"),
        ("Израиль",         "israel"),
        ("Испания",         "spain"),
        ("Италия",          "italy"),
        ("Канада",          "canada"),
        ("Кипр",            "cyprus"),
        ("Латвия",          "latvia"),
        ("Люксембург",      "luxembourg"),
        ("Монако",          "monaco"),
        ("ОАЭ",             "uae"),
        ("Португалия",      "portugal"),
        ("Словакия",        "slovakia"),
        ("США",             "usa"),
        ("Таиланд",         "thailand"),
        ("Турция",          "turkey"),
        ("Финляндия",       "finland"),
        ("Франция",         "france"),
        ("Хорватия",        "croatia"),
        ("Черногория",      "montenegro"),
        ("Чехия",           "czech-republic"),
        ("Швейцария",       "switzerland"),
        ("Швеция",          "sweden"),
        ("Эстония",         "estonia"),
    ]
    base_path = r"C:\Users\Алина\Desktop\Недвижимость (Tranio)"
    # Create the output folder up front: otherwise a missing directory only
    # surfaces as a save error after the first country has been scraped.
    os.makedirs(base_path, exist_ok=True)

    for country_rus, url_part in countries:
        print(f"\n=== Tranio: {country_rus} => {url_part} ===")
        df_raw = parse_tranio_country(url_part, max_pages=20)
        df_clean = transform_tranio_data(df_raw)

        file_path = os.path.join(base_path, f"{country_rus}.xlsx")
        save_tranio_excel(df_clean, file_path)
        print(f"[{country_rus}] => {file_path}")

    print("\nГотово!")

if __name__=="__main__":
    main()

In [None]:
# Парсинг Prian

def extract_data(url, timeout=30):
    """Download one prian.ru listing page and extract its listings.

    On HTTP 429 the request is repeated once after a 30 second pause. Any
    response that is still not a 200 after that, and any request error,
    makes the page count as empty.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        Four parallel lists ``(names, prices, squares, rooms)``. Names and
        prices are stripped text; squares and rooms are floats or NaN when
        the corresponding icon is missing. All four lists are empty when the
        page could not be fetched.
    """
    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/110.0.0.0 Safari/537.36"
        )
    }

    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        if resp.status_code == 429:
            print(f"[extract_data] HTTP 429 => Ждём 30 сек и повторяем: {url}")
            time.sleep(30)
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
            if resp.status_code == 429:
                print(f"[extract_data] Снова 429. Пропускаем страницу: {url}")
                return [], [], [], []
    except requests.RequestException as exc:
        print(f"[extract_data] Ошибка запроса ({exc}). Пропускаем: {url}")
        return [], [], [], []

    # Checked after the retry as well, so an error page returned by the
    # second attempt (e.g. a 503) is not parsed as if it were a listing page.
    if resp.status_code != 200:
        print(f"[extract_data] HTTP {resp.status_code}. Пропускаем: {url}")
        return [], [], [], []

    soup = LxmlSoup(resp.text)

    names = soup.find_all("div", class_="b-title")
    prices = soup.find_all("div", class_="price")
    icons = soup.find_all("div", class_="b-icon")

    l_name = []
    l_price = []
    l_square = []
    l_rooms = []

    # Elements are paired by position; zip() stops at the shortest list.
    for name_el, price_el, icon_el in zip(names, prices, icons):
        name_str = name_el.text().strip() if name_el else "No name"
        price_str = price_el.text().strip() if price_el else "No price"

        sq_val = np.nan
        rm_val = np.nan
        for sp in icon_el.find_all("span"):
            # str(sp) is used to obtain the span's HTML; the icon reference
            # inside it tells which value the span text carries.
            sp_str = str(sp)
            number = re.search(r"\d+", sp.text().strip())  # e.g. "39 м2" or "1"

            if "#sheme-square" in sp_str:
                if number:
                    sq_val = float(number.group())
            elif "#sheme-room" in sp_str:
                if number:
                    rm_val = float(number.group())

        l_name.append(name_str)
        l_price.append(price_str)
        l_square.append(sq_val)
        l_rooms.append(rm_val)

    return l_name, l_price, l_square, l_rooms

def parse_prian_country(country_url_part: str, max_pages=20, step=30):
    """Collect raw apartment listings for one country from prian.ru.

    The first page has no query string; later pages use ``?next=<offset>``
    with the offset growing by ``step``. A random 1-2 second pause precedes
    every request. Scraping stops when a page yields no listings, a page
    yields only listings seen before, or ``max_pages`` pages have been
    fetched.

    Returns:
        DataFrame with columns ``name``, ``price``, ``square``, ``rooms``
        holding each distinct listing once, in first-seen order.
    """
    listing_url = f"https://prian.ru/{country_url_part}/apartments/"
    seen = set()
    kept = []  # unique (name, price, square, rooms) tuples in first-seen order

    offset = 0
    page_count = 1

    while True:
        url = listing_url if offset == 0 else f"{listing_url}?next={offset}"

        print(f"[parse_prian_country] page={page_count}, offset={offset}: {url}")
        time.sleep(random.uniform(1, 2))

        names, prices, squares, rooms = extract_data(url)
        # zip() truncates to the shortest of the four lists.
        page_entries = list(zip(names, prices, squares, rooms))
        if not page_entries:
            print("Нет (новых) объявлений. Остановка.")
            break

        fresh = []
        for entry in page_entries:
            if entry in seen:
                continue
            seen.add(entry)
            fresh.append(entry)

        if not fresh:
            print("Все объявления этой страницы уже были. Остановка.")
            break
        kept.extend(fresh)

        offset += step
        page_count += 1
        if page_count > max_pages:
            print(f"max_pages={max_pages} достигнут. Остановка.")
            break

    return pd.DataFrame({
        "name":   [entry[0] for entry in kept],
        "price":  [entry[1] for entry in kept],
        "square": [entry[2] for entry in kept],
        "rooms":  [entry[3] for entry in kept],
    })


# Преобразование

def transform_prian_data(df: pd.DataFrame, country_rus: str)-> pd.DataFrame:
    """Clean raw Prian listings for one country and derive the price per m².

    Steps:
      1. keep only rows whose ``name`` ends with ``", <country_rus>"``;
      2. parse ``price`` into a float and drop rows without a usable price;
      3. drop rows whose name contains one of the excluded phrases;
      4. compute the price per square metre where the area is known and
         non-zero;
      5. number the rows from 1 and rename the columns to their Russian
         report headers.

    Args:
        df: Raw frame with ``name``, ``price``, ``square``, ``rooms``.
        country_rus: Russian country name expected at the end of each name.

    Returns:
        DataFrame with columns ``№``, ``Название``, ``Цена``,
        ``Площадь (м²)``, ``Комнаты``, ``Цена за м²``. When no rows survive
        a step, an empty frame with exactly these columns is returned. The
        input is not modified.
    """
    columns_needed = ["№","Название","Цена","Площадь (м²)","Комнаты","Цена за м²"]
    df_final = pd.DataFrame(columns=columns_needed)

    if df.empty:
        print("[transform_prian_data] DF пуст => возвращаем пустую таблицу.")
        return df_final

    # (1) Keep listings that belong to the requested country.
    suffix = ", " + country_rus
    df = df[df["name"].str.endswith(suffix, na=False)].copy()
    if df.empty:
        print("[transform_prian_data] После фильтрации по стране => пусто.")
        return df_final

    # (2) price => float, rows without a number are dropped.
    def clean_price(p: str):
        if not isinstance(p, str):
            return np.nan
        s = p.replace("от ","").replace("€","").replace("\xa0","").replace(" ","")
        if s.lower() == "ценапозапросу":
            return np.nan
        try:
            return float(s)
        except ValueError:
            return np.nan

    df["price"] = df["price"].apply(clean_price)
    df = df.dropna(subset=["price"]).copy()
    if df.empty:
        print("[transform_prian_data] Все price=NaN => пусто.")
        return df_final

    # (3) Remove entries that are not apartments. The phrases are literal
    # text, so they are matched without regex interpretation.
    bad_vals = ["Земля в Пафосе, Кипр", "Магазин"]
    for val in bad_vals:
        unwanted = df["name"].str.contains(val, case=False, na=False, regex=False)
        df = df[~unwanted].copy()
    if df.empty:
        print("[transform_prian_data] После удаления bad_vals => пусто.")
        return df_final

    # (4) Price per square metre; NaN when the area is missing or zero.
    has_area = df["square"].notna() & (df["square"] != 0)
    df["price_per_sq"] = np.where(has_area, df["price"] / df["square"], np.nan)

    # (5) Row numbers and report headers.
    df.insert(0, "№", range(1, len(df)+1))
    df = df.rename(columns={
        "name": "Название",
        "price": "Цена",
        "square": "Площадь (м²)",
        "rooms": "Комнаты",
        "price_per_sq": "Цена за м²",
    })

    return df[columns_needed]


# Сохранение

def save_prian_excel(df: pd.DataFrame, file_path: str):
    """Write cleaned Prian listings to an .xlsx workbook.

    Sheet ``Data`` gets the headers in row 2 and the data below them; the
    block is registered as the Excel table ``Table1``. Cells ``H1:I2`` hold
    the average price and the average price per square metre. An empty
    DataFrame is written unchanged through ``DataFrame.to_excel``.
    """
    wb = Workbook()
    sheet = wb.active
    sheet.title = "Data"

    if df.empty:
        df.to_excel(file_path, index=False)
        return

    eur = '#,##0.00 "€"'
    avg_headers = ("Цена", "Цена за м²")
    header_row = 2
    data_start = 3

    column_names = df.columns.tolist()
    total_rows = len(df)
    total_cols = len(column_names)
    data_end = data_start + total_rows - 1

    # Headers in row 2.
    for position, label in enumerate(column_names):
        sheet.cell(row=header_row, column=position + 1, value=label)

    # Data from row 3 downwards.
    for row_pos in range(total_rows):
        target_row = data_start + row_pos
        for col_pos in range(total_cols):
            sheet.cell(row=target_row, column=col_pos + 1, value=df.iat[row_pos, col_pos])

    last_letter = get_column_letter(total_cols)
    listing_table = Table(displayName="Table1", ref=f"A{header_row}:{last_letter}{data_end}")
    listing_table.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9",
        showRowStripes=True,
        showColumnStripes=False,
    )
    listing_table.totalsRowShown = True

    # "Название" gets the totals caption, the money columns get an average.
    # NOTE(review): confirm ``tableColumns`` is populated at this point; if
    # it is only filled when the workbook is saved, nothing is configured.
    for position, col_obj in enumerate(listing_table.tableColumns):
        label = column_names[position]
        col_obj.totalsRowFunction = "average" if label in avg_headers else None
        if label == "Название":
            col_obj.totalsRowLabel = "СРЕДНЕЕ:"

    sheet.add_table(listing_table)

    # Column widths plus the euro format on the money columns.
    for col_no, label in enumerate(column_names, 1):
        sheet.column_dimensions[get_column_letter(col_no)].width = 15
        if label not in avg_headers:
            continue
        for row_no in range(data_start, data_end + 1):
            sheet.cell(row=row_no, column=col_no).number_format = eur

    # Mini summary in H1:I2 (SUBTOTAL 101 = average).
    summary_cells = {
        "H1": "Средняя цена",
        "I1": "Средняя цена за м²",
        "H2": "=SUBTOTAL(101,Table1[[#All],[Цена]])",
        "I2": "=SUBTOTAL(101,Table1[[#All],[Цена за м²]])",
    }
    for address, content in summary_cells.items():
        sheet[address] = content
    sheet["H2"].number_format = eur
    sheet["I2"].number_format = eur
    sheet.column_dimensions["H"].width = 20
    sheet.column_dimensions["I"].width = 20

    wb.save(file_path)
def main():
    """Scrape, clean and export Prian listings for every configured country.

    Each entry of ``countries`` is ``(Russian name, URL slug)``. The Russian
    name is used both to filter listings by country and as the workbook file
    name. One ``<country>.xlsx`` file is written per entry into ``base_path``.
    """
    countries = [
        ("Австралия",        "australia"),
        ("Австрия",          "austria"),
        ("Андорра",          "andorra"),
        ("Болгария",         "bulgaria"),
        ("Венгрия",          "hungary"),
        ("Германия",         "germany"),
        ("Греция",           "greece"),
        ("Грузия",           "georgia"),
        ("Израиль",          "israel"),
        ("Испания",          "spain"),
        ("Италия",           "italy"),
        ("Кипр",             "cyprus"),
        ("Латвия",           "latvia"),
        ("Литва",            "lithuania"),
        ("Люксембург",       "luxembourg"),
        ("Монако",           "monaco"),
        ("Нидерланды",       "netherlands"),
        ("ОАЭ",              "united-arab-emirates"),
        ("Португалия",       "portugal"),
        ("Великобритания",   "great-britain"),
        ("США",              "usa"),
        ("Таиланд",          "thailand"),
        ("Турция",           "turkey"),
        ("Финляндия",        "finland"),
        ("Франция",          "france"),
        ("Хорватия",         "croatia"),
        ("Черногория",       "montenegro"),
        ("Чехия",            "czech"),
        ("Швейцария",        "switzerland"),
        ("Швеция",           "sweden"),
        ("Эстония",          "estonia"),
    ]
    base_path = r"C:\Users\Алина\Desktop\Недвижимость (Prian)"
    # Create the output folder up front: otherwise a missing directory only
    # surfaces as a save error after the first country has been scraped.
    os.makedirs(base_path, exist_ok=True)

    for country_rus, cpart in countries:
        print(f"\n=== Prian: {country_rus} => {cpart} ===")
        df_raw = parse_prian_country(cpart, max_pages=20, step=30)
        df_clean = transform_prian_data(df_raw, country_rus)

        file_path = os.path.join(base_path, f"{country_rus}.xlsx")
        save_prian_excel(df_clean, file_path)
        print(f"[{country_rus}] => {file_path}")

    print("\nГотово!")

if __name__=="__main__":
    main()

In [None]:
# Building the combined table

# Russian country names (33 entries). The per-site workbooks are saved as
# "<country name>.xlsx", so these names presumably select which files are
# loaded for the combined report - verify against the code that iterates
# this list.
COUNTRIES_33 = [
    "Австралия","Австрия","Андорра","Болгария","Венгрия","Германия","Греция","Грузия",
    "Израиль","Испания","Италия","Канада","Кипр","Латвия","Литва","Люксембург",
    "Монако","Нидерланды","ОАЭ","Португалия","Словакия","Великобритания","США","Таиланд",
    "Турция","Финляндия","Франция","Хорватия","Черногория","Чехия","Швейцария","Швеция","Эстония"
]

# Folders holding the per-country workbooks of each site; the same paths are
# used as ``base_path`` by the corresponding scraper ``main`` functions.
HOMESOVERSEAS_DIR = r"C:\Users\Алина\Desktop\Недвижимость (HomesOverseas)"
PRIAN_DIR          = r"C:\Users\Алина\Desktop\Недвижимость (Prian)"
TRANIO_DIR         = r"C:\Users\Алина\Desktop\Недвижимость (Tranio)"

# Presumably the path of the combined output workbook - its use is not
# visible in this part of the file.
OUTPUT_EXCEL       = r"C:\Users\Алина\Desktop\Недвижимость.xlsx"


# Функции


def load_data_only(site_dir: str, country: str, site_name:str) -> pd.DataFrame:
    """Load one site's workbook for a country and tag its rows with the site.

    The file ``<site_dir>/<country>.xlsx`` is read with the second sheet row
    as the header. Only the six report columns are kept (missing ones are
    created as NaN), rows that are empty in all of them are dropped, and a
    ``Сайт`` column holding ``site_name`` is appended.

    Returns:
        DataFrame with the six report columns plus ``Сайт``. It is empty,
        with the same columns, when the file does not exist or cannot be read.
    """
    base_cols = ["№","Название","Цена","Площадь (м²)","Комнаты","Цена за м²"]
    needed = base_cols + ["Сайт"]

    path = os.path.join(site_dir, f"{country}.xlsx")
    if not os.path.isfile(path):
        print(f"[load_data_only] {site_name}: Нет файла: {path} => пуст.")
        return pd.DataFrame(columns=needed)

    print(f"[load_data_only] {site_name}, {country} => {path}")
    try:
        # header=1 => the second sheet row carries the column headers
        raw = pd.read_excel(path, header=1)
    except Exception as e:
        print(f"  Ошибка чтения: {e}")
        return pd.DataFrame(columns=needed)

    # Make sure all six report columns exist.
    for absent in [name for name in base_cols if name not in raw.columns]:
        raw[absent] = np.nan

    # Keep the report columns and discard rows that are empty in all of them.
    result = raw[base_cols].copy().dropna(how="all")
    result["Сайт"] = site_name

    result = result[needed]
    print(f"  shape={result.shape}")
    return result


def write_df_as_table(
    ws,
    df: pd.DataFrame,
    start_col: int,
    start_row: int,
    table_name: str,
    table_width: int=14,
    fill_if_empty=None
):
    """
    Write ``df`` into ``ws`` as an Excel table with its header at
    (start_row, start_col).

    The header row is followed by one row per record. Missing values
    (NaN/NaT/NA) are written as empty cells. The table columns carry the
    totals-row configuration: "Название" gets the label "СРЕДНЕЕ:", while
    "Цена" and "Цена за м²" get the "average" function; other columns have
    none. The table range does not reserve a totals row and
    ``totalsRowCount`` is left unset, so only the configuration is stored.

    Args:
        ws: openpyxl worksheet to write into.
        df: data to write; its column labels become the table header.
        start_col: 1-based column index of the header's first cell.
        start_row: 1-based row index of the header.
        table_name: display name of the table (must be unique in the workbook).
        table_width: width applied to every column the table spans.
        fill_if_empty: optional RGB hex colour used to fill the header cells
            when ``df`` has no rows.

    Returns:
        ``(end_row, end_col)`` - the last row and column covered by the table.
        For an empty ``df`` this includes one blank data row.
    """
    # Not imported at the top of the file, so bring them in locally.
    from openpyxl.worksheet.filters import AutoFilter
    from openpyxl.worksheet.table import TableColumn

    price_headers = ("Цена", "Цена за м²")
    col_list = df.columns.tolist()
    nrows = len(df)
    ncols = len(col_list)
    end_col = start_col + ncols - 1

    # Header row
    for j, col_name in enumerate(col_list):
        ws.cell(row=start_row, column=start_col + j, value=col_name)

    if nrows == 0:
        if fill_if_empty:
            fill = PatternFill(start_color=fill_if_empty, end_color=fill_if_empty, fill_type="solid")
            for j in range(ncols):
                ws.cell(row=start_row, column=start_col + j).fill = fill
        # An Excel table needs at least one data row below its header, so an
        # empty frame still reserves a single blank row.
        end_row = start_row + 1
    else:
        for i in range(nrows):
            for j in range(ncols):
                val = df.iat[i, j]
                # NaN has no Excel representation; leave the cell empty.
                if pd.api.types.is_scalar(val) and pd.isna(val):
                    val = None
                ws.cell(row=start_row + 1 + i, column=start_col + j, value=val)
        end_row = start_row + nrows

    ref = f"{get_column_letter(start_col)}{start_row}:{get_column_letter(end_col)}{end_row}"
    ex_table = Table(displayName=table_name, ref=ref)
    ex_table.tableStyleInfo = TableStyleInfo(
        name="TableStyleMedium9", showRowStripes=True, showColumnStripes=False
    )
    ex_table.totalsRowShown = True

    # A fresh Table has no column objects (openpyxl creates them only while
    # saving), so build them here; otherwise there is nothing to attach the
    # totals label/function to.
    table_columns = []
    for j, hdr in enumerate(col_list):
        column = TableColumn(id=j + 1, name=str(hdr))
        if hdr == "Название":
            column.totalsRowLabel = "СРЕДНЕЕ:"
        elif hdr in price_headers:
            column.totalsRowFunction = "average"
        table_columns.append(column)
    ex_table.tableColumns = table_columns
    # openpyxl adds this filter itself only when it generates the columns.
    ex_table.autoFilter = AutoFilter(ref=ref)

    ws.add_table(ex_table)

    # Column widths
    for c in range(start_col, start_col + ncols):
        ws.column_dimensions[get_column_letter(c)].width = table_width

    # Currency format for the price columns (data rows only)
    for j, hdr in enumerate(col_list):
        if hdr in price_headers:
            for r_i in range(start_row + 1, end_row + 1):
                ws.cell(row=r_i, column=start_col + j).number_format = '#,##0.00 "€"'

    return (end_row, end_col)


def main():
    """
    Build the combined workbook: one sheet per country in COUNTRIES_33.

    For every country the three per-site files are loaded with
    load_data_only(), concatenated, renumbered and written as a table whose
    header starts at B8. A small two-column table in B2:C3 holds SUBTOTAL
    averages over the "Цена" and "Цена за м²" columns of that table. The
    workbook is finally saved to OUTPUT_EXCEL.
    """
    empty_header_color = "99CCFF"  # header fill for countries without data

    wb = Workbook()
    wb.remove(wb.active)  # drop the default sheet; only country sheets are kept

    for country in COUNTRIES_33:
        print(f"\n=== {country} ===")
        # Excel limits sheet titles to 31 characters
        ws = wb.create_sheet(title=country[:31])

        # 1) Load the three sites (six listing columns + "Сайт" each)
        df_homes = load_data_only(HOMESOVERSEAS_DIR, country, "HomesOverseas")
        df_prian = load_data_only(PRIAN_DIR,          country, "Prian")
        df_tran  = load_data_only(TRANIO_DIR,         country, "Tranio")

        # 2) Merge them into one frame
        df_all = pd.concat([df_homes, df_prian, df_tran], ignore_index=True)
        print(f"[{country}] Объединённый shape={df_all.shape}")

        # 3) Renumber "№" so the combined table has one continuous sequence
        if not df_all.empty:
            df_all["№"] = range(1, len(df_all) + 1)

        # 4) Write the data table: header at B8, data from B9 down
        table_name_all = f"All_{country[:8]}".replace(" ","_")
        end_row, end_col = write_df_as_table(
            ws, df_all,
            start_col=2,   # B
            start_row=8,
            table_name=table_name_all,
            fill_if_empty=empty_header_color
        )

        # 5) Summary block in B2:C3, averaging over the data table's columns
        ws["B2"] = "Средняя цена"
        ws["C2"] = "Средняя цена за м²"
        ws["B3"] = f"=SUBTOTAL(101,{table_name_all}[[#All],[Цена]])"
        ws["C3"] = f"=SUBTOTAL(101,{table_name_all}[[#All],[Цена за м²]])"
        ws["B3"].number_format='#,##0.00 "€"'
        ws["C3"].number_format='#,##0.00 "€"'

        # Present the summary as a small table of its own
        pivot_table_name= f"Pivot_{country[:8]}".replace(" ","_")
        pivot_table = Table(displayName=pivot_table_name, ref="B2:C3")
        pivot_table.tableStyleInfo = TableStyleInfo(
            name="TableStyleMedium9", showRowStripes=True, showColumnStripes=False
        )
        ws.add_table(pivot_table)

        # These override the uniform width set by write_df_as_table
        ws.column_dimensions["B"].width=18
        ws.column_dimensions["C"].width=20

        # Size the "Сайт" column (the table's last one) to its longest value
        if not df_all.empty:
            longest = max(
                (len(str(x)) for x in df_all["Сайт"].dropna()),
                default=0,  # guards against a column holding only NaN
            )
            # a little padding, clamped to [10, 40]
            desired_width = max(10, min(40, longest + 5))
        else:
            desired_width = 15

        ws.column_dimensions[get_column_letter(end_col)].width = desired_width

    # Save
    wb.save(OUTPUT_EXCEL)
    print(f"\nИтоговый Excel-файл создан: {OUTPUT_EXCEL}")


# Entry point: build the combined workbook only when executed directly.
if __name__ == "__main__":
    main()
