In [76]:
import time
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display
import ipywidgets as widgets
import numpy as np

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# Search-result URL templates, one per city label. The single "{}" placeholder
# in each template is filled with the 1-based page number via str.format().
# Each key must appear only once: a repeated key in a dict literal silently
# discards the earlier value.
URLS = {
    "Opole": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/opolskie/opole/opole?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Toru≈Ñ": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/kujawsko--pomorskie/torun/torun?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Bydgoszcz": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/kujawsko--pomorskie/bydgoszcz/bydgoszcz?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Lublin": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/lubelskie/lublin/lublin?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Zielona G√≥ra": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/lubuskie/zielona-gora/zielona-gora?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "≈Å√≥dz": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/lodzkie/lodz/lodz?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Rzesz√≥w": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/podkarpackie/rzeszow/rzeszow?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Bia≈Çystok": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/podlaskie/bialystok/bialystok?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Gda≈Ñsk": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/pomorskie/gdansk/gdansk?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Katowice": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/slaskie/katowice/katowice?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Kielce": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/swietokrzyskie/kielce/kielce?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Olsztyn": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/warminsko--mazurskie/olsztyn/olsztyn/olsztyn?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Pozna≈Ñ": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/wielkopolskie/poznan/poznan?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Szczecin": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/zachodniopomorskie/szczecin/szczecin?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Krak√≥w": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/malopolskie/krakow/krakow?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
    "Warszawa": "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/warszawa/warszawa?limit=72&page={}&ownerTypeSingleSelect=ALL&buildYearMax=2025&by=DEFAULT&direction=DESC",
}


# Request headers sent with every page download.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# NOTE(review): MAX_PAGES is not referenced by any code visible in this notebook.
MAX_PAGES = 500

# Show full cell contents when displaying frames (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# -----------------------------------------------------------------------------
# Data references
# -----------------------------------------------------------------------------
# =============================================================================
# 🗺️ Polish Administrative Data — Regions, Cities & Districts
# =============================================================================
# This cell defines the mapping between Polish voivodeships (regions),
# their main cities, and the corresponding districts (dzielnice).
# It's used for parsing and categorizing location strings from real-estate data.
# =============================================================================

# --- Voivodeships (regions) and their main cities ---
# Maps a lower-case voivodeship name to the list of its main cities.
# parse_localization looks tokens up with token.lower(), so keys must stay
# lower-case. Regions that list more than one city never produce the
# "okolice <city>" fallback, because that fallback requires exactly one city.
REGIONS = {
    "dolno≈õlƒÖskie": ["Wroc≈Çaw"],
    "kujawsko-pomorskie": ["Bydgoszcz", "Toru≈Ñ"],
    "lubelskie": ["Lublin"],
    "lubuskie": ["Zielona G√≥ra", "Gorz√≥w Wielkopolski"],
    "≈Ç√≥dzkie": ["≈Å√≥d≈∫"],
    "ma≈Çopolskie": ["Krak√≥w"],
    "mazowieckie": ["Warszawa"],
    "opolskie": ["Opole"],
    "podkarpackie": ["Rzesz√≥w"],
    "podlaskie": ["Bia≈Çystok"],
    "pomorskie": ["Gda≈Ñsk"],
    "≈õlƒÖskie": ["Katowice"],
    "≈õwiƒôtokrzyskie": ["Kielce"],
    "warmi≈Ñsko-mazurskie": ["Olsztyn"],
    "wielkopolskie": ["Pozna≈Ñ"],
    "zachodniopomorskie": ["Szczecin"]
}

# --- Major cities and their districts ---
# District names for Warsaw; looked up through CITY_TO_DISTRICTS.
WAW_DISTRICTS = [
    "Bemowo", "Bia≈Ço≈Çƒôka", "Bielany", "Mokot√≥w", "Ochota",
    "Praga-Po≈Çudnie", "Praga-P√≥≈Çnoc", "Rembert√≥w", "≈ör√≥dmie≈õcie",
    "Targ√≥wek", "Ursus", "Ursyn√≥w", "Wawer", "Weso≈Ça",
    "Wilan√≥w", "W≈Çochy", "Wola", "≈ªoliborz"
]

# District names for Krakow; looked up through CITY_TO_DISTRICTS.
KRA_DISTRICTS = [
    "Stare Miasto", "Grzeg√≥rzki", "PrƒÖdnik Czerwony", "PrƒÖdnik Bia≈Çy",
    "Krowodrza", "Bronowice", "Zwierzyniec", "Dƒôbniki",
    "≈Åagiewniki-Borek Fa≈Çƒôcki", "Swoszowice", "Podg√≥rze Duchackie",
    "Bie≈ºan√≥w-Prokocim", "Podg√≥rze", "Czy≈ºyny", "Mistrzejowice",
    "Bie≈Ñczyce", "Wzg√≥rza Krzes≈Çawickie"
]

# District names for Bialystok; looked up through CITY_TO_DISTRICTS.
BIA_DISTRICTS = [
    "Centrum", "Bia≈Çostoczek", "Sienkiewicza", "Bojary", "Piaski",
    "Antoniuk", "Jarosz√≥wka", "Wygoda", "Piasta I i II", "Wysoki Stoczek",
    "Dziesiƒôciny I i II", "Bacieczki", "Starosielce", "Dojlidy"
]

# District names for Bydgoszcz; looked up through CITY_TO_DISTRICTS.
BYD_DISTRICTS = [
    "Babia Wie≈õ", "Bartodzieje", "Bielawy", "B≈Çonie", "Bocianowo-≈ör√≥dmie≈õcie-Stare Miasto",
    "Brdyuj≈õcie", "Bydgoszcz Wsch√≥d-Siernieczek", "Czy≈ºk√≥wko", "Flisy",
    "Glinki-Rupienica", "G√≥rzyskowo", "Jachcice", "Kapu≈õciska", "Le≈õne",
    "≈Åƒôgnowo", "≈Åƒôgnowo Wie≈õ", "Miedzy≈Ñ-PrƒÖdy", "Nowy Fordon", "Okole",
    "Osowa G√≥ra", "Piaski", "Smuka≈Ça-Op≈Çawiec-Janowo", "Stary Fordon",
    "Szwederowo", "Tatrza≈Ñskie", "Teren√≥w Nadwi≈õla≈Ñskich", "Wilczak-Jary",
    "Wy≈ºyny", "Wzg√≥rze Wolno≈õci", "Zimne Wody‚ÄìCzersko Polskie"
]

# District names for Gdansk; looked up through CITY_TO_DISTRICTS.
GDA_DISTRICTS = [
    "Anio≈Çki", "Brƒôtowo", "Brze≈∫no", "Che≈Çm", "Jasie≈Ñ", "Kokoszki",
    "Krakowiec-G√≥rki Zachodnie", "Letnica", "Matarnia", "M≈Çyniska",
    "Nowy Port", "Oliwa", "Olszynka", "Orunia-≈öw. Wojciech-Lipce",
    "Orunia G√≥rna-Gda≈Ñsk Po≈Çudnie", "Osowa", "Piecki-Migowo", "Przer√≥bka",
    "Przymorze Ma≈Çe", "Przymorze Wielkie", "Rudniki", "Siedlce", "Stogi",
    "Suchanino", "≈ör√≥dmie≈õcie", "Uje≈õcisko-≈Åostowice", "Wrzeszcz Dolny",
    "Wrzeszcz G√≥rny", "Zaspa-M≈Çyniec", "Zaspa-Rozstaje",
    "≈ªabianka-Wejhera-Jelitkowo-TysiƒÖclecia"
]

# District names for Gorzow Wielkopolski; looked up through CITY_TO_DISTRICTS.
GOR_DISTRICTS = [
    "Baczyna", "Chr√≥≈õcik", "Chwalƒôcice", "G√≥rczyn", "Janice", "Karnin",
    "Ma≈Çyszyn Wielki", "Ma≈Çyszyn Ma≈Çy", "Nowy Dw√≥r", "Piaski", "Sady",
    "≈ör√≥dmie≈õcie", "Zakanale"
]

# District names for Kielce; looked up through CITY_TO_DISTRICTS.
KIE_DISTRICTS = [
    "Baran√≥wek", "Barwinek", "Bia≈Çogon", "Biesak", "Bocianek", "Buk√≥wka",
    "Cedro-Mazur", "Cegielnia", "Centrum", "Chƒôci≈Ñskie", "Czarn√≥w",
    "DƒÖbrowa", "Dobromy≈õl", "Domaszowice Wikaryjskie", "Dyminy-Wie≈õ",
    "G≈Çƒôboczka", "Herby", "Jagiello≈Ñskie", "Karcz√≥wka", "≈Åazy", "Malik√≥w",
    "Na Stoku", "Nowy Folwark", "Niewachl√≥w I", "Niewachl√≥w II",
    "Osiedle Jana Czarnockiego", "Osiedle Jana Kochanowskiego", "Ostra G√≥rka",
    "Pakosz", "Panorama", "Piaski", "Pietraszki", "Pod DalniƒÖ", "Podhale",
    "Podkarcz√≥wka", "Pod Telegrafem", "Pos≈Çowice", "Sady", "Sandomierskie",
    "Sieje", "Sitk√≥wka", "Skrzetle", "S≈Çoneczne Wzg√≥rze", "S≈Çowik",
    "Szyd≈Ç√≥wek", "≈ölichowice", "≈öwiƒôtokrzyskie", "Uroczysko", "Wielkopole",
    "Wietrznia", "Zacisze", "Zalesie", "Zag√≥rska Po≈Çudnie",
    "Zag√≥rska P√≥≈Çnoc", "Zag√≥rze", "ZwiƒÖzkowiec"
]

# District names for Katowice; looked up through CITY_TO_DISTRICTS.
KAT_DISTRICTS = [
    "≈ör√≥dmie≈õcie", "Koszutka", "Bogucice", "Osiedle Paderewskiego ‚Äì Muchowiec",
    "Za≈Çƒô≈ºe", "Osiedle Wincentego Witosa", "Osiedle TysiƒÖclecia", "DƒÖb",
    "We≈Çnowiec-J√≥zefowiec", "Ligota-Panewniki", "Bryn√≥w-Osiedle Zgrzebnioka",
    "Za≈Çƒôska Ha≈Çda-Bryn√≥w", "Piotrowice-Ochojec", "Szopienice-Burowiec",
    "Murkowice", "Kostuchna", "Piotrowice", "Ochojec", "Zarzecze",
    "DƒÖbr√≥wka Ma≈Ça", "Stare Bogucice", "Nowe Bogucice"
]

# District names for Lublin; looked up through CITY_TO_DISTRICTS.
LUB_DISTRICTS = [
    "Abramowice", "Bronowice", "Czech√≥w Po≈Çudniowy", "Czech√≥w P√≥≈Çnocny",
    "Czuby Po≈Çudniowe", "Czuby P√≥≈Çnocne", "DziesiƒÖta", "Felin", "G≈Çusk",
    "Hajd√≥w-Zadƒôbie", "Kalinowszczyzna", "Konstantyn√≥w", "Ko≈õminek",
    "Ponikwoda", "Rury", "S≈Çawin", "S≈Çawinek", "Stare Miasto", "Szerokie",
    "≈ör√≥dmie≈õcie", "Tatary", "Wƒôglin Po≈Çudniowy", "Wƒôglin P√≥≈Çnocny",
    "Wieniawa", "Wrotk√≥w", "Za CukrowniƒÖ", "Zemborzyce"
]

# District names for Lodz; looked up through CITY_TO_DISTRICTS.
LOD_DISTRICTS = ["Ba≈Çuty", "G√≥rna", "Polesie", "≈ör√≥dmie≈õcie", "Widzew"]

# District names for Olsztyn; looked up through CITY_TO_DISTRICTS.
OLS_DISTRICTS = [
    "Brzeziny", "Dajtki", "Genera≈Ç√≥w", "Grunwaldzkie", "Gutkowo", "Jaroty",
    "Kƒôtrzy≈Ñskiego", "Kormoran", "Kortowo", "Ko≈õciuszki", "Likusy",
    "Mazurskie", "Mleczna", "Nad Jeziorem D≈Çugim", "Nag√≥rki", "Pieczewo",
    "Podgrodzie", "Podle≈õna", "Pojezierze", "Redykajny", "≈ör√≥dmie≈õcie",
    "Wojska Polskiego", "Zatorze", "Zielona G√≥rka"
]

# District names for Opole; looked up through CITY_TO_DISTRICTS.
OPO_DISTRICTS = [
    "Borki", "Brzezie", "CzarnowƒÖsy", "≈öwierkle", "Krzanowice",
    "Wr√≥blin", "Zakrz√≥w", "Chabry", "Armii Krajowej", "Gos≈Çawice",
    "Malinka", "Nowa Wie≈õ Kr√≥lewska", "P√≥≈Çwie≈õ"
]

# District names for Poznan; looked up through CITY_TO_DISTRICTS.
POZ_DISTRICTS = [
    "Antoninek-Zieliniec-Kobylepole", "Chartowo", "Fabianowo-Kotowo",
    "G≈Ç√≥wna", "G≈Çuszyna", "G√≥rczyn", "Grunwald P√≥≈Çnoc", "Grunwald Po≈Çudnie",
    "Je≈ºyce", "Junikowo", "Kiekrz", "Krzesiny-Pokrzywno-Garaszewo",
    "Krzy≈ºowniki-Smochowice", "Kwiatowe", "≈Åawica", "Morasko-Radojewo",
    "Naramowice", "Nowe Winogrady Po≈Çudnie", "Nowe Winogrady P√≥≈Çnoc",
    "Nowe Winogrady Wsch√≥d", "Ogrody", "Ostr√≥w Tumski-≈ör√≥dka-Zawady-Komandoria",
    "PiƒÖtkowo", "Podolany", "Rataje", "So≈Çacz", "Stare Miasto",
    "Staro≈Çƒôka-Minikowo-Marlewo", "Strzeszyn", "Szczepankowo-Sp≈Çawie-Krzesinki",
    "Stare Winogrady", "≈öwierczewo", "≈öw. ≈Åazarz", "Umultowo", "Wilda",
    "Warszawskie-Pomet-Malta≈Ñskie", "Winiary", "Wola", "≈ªegrze", "Zielony Dƒôbiec"
]

# District names for Rzeszow; looked up through CITY_TO_DISTRICTS.
RZE_DISTRICTS = [
    "1000-Lecia", "Baran√≥wka", "Bia≈Ça", "Budziw√≥j", "Bzianka", "DƒÖbrowskiego",
    "Drabinianka", "Franciszka Kotuli", "Genera≈Ça Grota Roweckiego",
    "Genera≈Ça W≈Çadys≈Çawa Andersa", "Kmity", "Krakowska ‚Äì Po≈Çudnie",
    "Kr√≥la Stanis≈Çawa Augusta", "Matys√≥wka", "Mieszka I", "Mi≈Çocin ‚Äì ≈õw. Huberta",
    "Mi≈Çocin", "Nowe Miasto", "Paderewskiego", "Piast√≥w", "Pobitno",
    "Pogwizd√≥w Nowy", "Pu≈Çaskiego", "Przybysz√≥wka", "Staromie≈õcie",
    "≈ör√≥dmie≈õcie", "Wilkowyja", "Zalesie", "Zawiszy Czarnego", "Zwiƒôczyca"
]

# District names for Szczecin; looked up through CITY_TO_DISTRICTS.
SZC_DISTRICTS = [
    "Arko≈Ñskie-Niemierzyn", "Bukowe-Klƒôskowo", "Bukowo", "Centrum",
    "DƒÖbie", "Drzetowo-Grabowo", "G≈Çƒôbokie-Pilchowo", "Golƒôcino-Goc≈Çaw",
    "Gumie≈Ñce", "Kijewo", "Krzekowo-Bezrzecze", "≈Åƒôkno", "Majowe",
    "Miƒôdzyodrze-Wyspa Pucka", "Niebuszewo", "Niebuszewo-Bolinko",
    "Nowe Miasto", "Os√≥w", "P≈Çonia-≈ömierdnica-Jezierzyce", "Podjuchy",
    "Pogodno", "Pomorzany", "Skolwin", "S≈Çoneczne", "Stare Miasto",
    "Sto≈Çczyn", "≈ör√≥dmie≈õcie-P√≥≈Çnoc", "≈ör√≥dmie≈õcie-Zach√≥d", "≈öwierczewo",
    "Turzyn", "Za≈Çom", "Zawadzkiego", "Zdroje", "Z≈Çocie≈Ñ"
]

# District names for Torun; looked up through CITY_TO_DISTRICTS.
TOR_DISTRICTS = [
    "Barbarka", "Bielany", "Bielawy", "Bydgoskie Przedmie≈õcie",
    "Che≈Çmi≈Ñskie Przedmie≈õcie", "Czerniewice", "Glinki", "Grƒôbocin nad StrugƒÖ",
    "Jakubskie Przedmie≈õcie", "Kaszczorek", "Katarzynka", "Koniuchy",
    "Mokre", "Na Skarpie", "Piaski", "Podg√≥rz", "Rubinkowo", "Rudak",
    "Rybaki", "Stare Miasto", "Starotoru≈Ñskie Przedmie≈õcie", "Stawki",
    "Winnica", "Wrzosy"
]

# District names for Zielona Gora; looked up through CITY_TO_DISTRICTS.
ZIE_DISTRICTS = [
    "Barcikowice", "Drzonk√≥w", "Jany", "Jarogniewice", "Jeleni√≥w", "Kie≈Çpin",
    "Krƒôpa", "≈Åƒô≈ºyca", "≈Åugowo", "Nowy Kisielin", "Ochla", "Przylep", "Racula",
    "Raculka", "Sucha", "Zatonie", "Zawada", "Zielona G√≥ra",
    "Zielona G√≥ra ‚Äì Centrum", "Zielona G√≥ra ‚Äì P√≥≈Çnoc", "Zielona G√≥ra ‚Äì Po≈Çudnie",
    "Zielona G√≥ra ‚Äì Wsch√≥d", "Zielona G√≥ra ‚Äì Zach√≥d",
    "Zielona G√≥ra ‚Äì Osiedle M≈Çodych", "Zielona G√≥ra ‚Äì Osiedle Piastowskie",
    "Zielona G√≥ra ‚Äì Osiedle S≈Çowia≈Ñskie", "Zielona G√≥ra ‚Äì Osiedle Zawiszy Czarnego",
    "Zielona G√≥ra ‚Äì Osiedle Wyszy≈Ñskiego", "Zielona G√≥ra ‚Äì Osiedle Wroc≈Çawskie"
]

# --- All supported cities ---
# City names recognised by parse_localization. The membership test there is an
# exact, case-sensitive comparison against each comma-separated token.
CITIES = [
    "Warszawa", "Krak√≥w", "Bia≈Çystok", "Bydgoszcz", "Gda≈Ñsk",
    "Gorz√≥w Wielkopolski", "Kielce", "Katowice", "Lublin", "≈Å√≥d≈∫",
    "Olsztyn", "Opole", "Pozna≈Ñ", "Szczecin", "Rzesz√≥w",
    "Toru≈Ñ", "Wroc≈Çaw", "Zielona G√≥ra"
]

# --- Mapping cities to their districts ---
# Maps each supported city to the list of district names used to canonicalise
# the district extracted from a listing's location string.
CITY_TO_DISTRICTS = {
    "Warszawa": WAW_DISTRICTS,
    "Krak√≥w": KRA_DISTRICTS,
    "Bia≈Çystok": BIA_DISTRICTS,
    "Bydgoszcz": BYD_DISTRICTS,
    "Gda≈Ñsk": GDA_DISTRICTS,
    "Gorz√≥w Wielkopolski": GOR_DISTRICTS,
    "Kielce": KIE_DISTRICTS,
    "Katowice": KAT_DISTRICTS,
    "Lublin": LUB_DISTRICTS,
    "≈Å√≥d≈∫": LOD_DISTRICTS,
    "Olsztyn": OLS_DISTRICTS,
    "Opole": OPO_DISTRICTS,
    "Pozna≈Ñ": POZ_DISTRICTS,
    "Szczecin": SZC_DISTRICTS,
    "Rzesz√≥w": RZE_DISTRICTS,
    "Toru≈Ñ": TOR_DISTRICTS,
    # No district list is defined for this city yet. An empty list leaves the
    # scraped district untouched instead of matching it against another
    # city's districts. TODO: add the real district list.
    "Wroc≈Çaw": [],
    "Zielona G√≥ra": ZIE_DISTRICTS
}

def clean_street_name(name: str) -> str:
    """Strip a leading street-type prefix from a street name.

    Args:
        name: Raw street name; non-string or blank values are accepted.

    Returns:
        The name without a leading "ul.", "ulica", "al." or "aleja" prefix
        (matched case-insensitively and only when followed by whitespace),
        or "nieznana" when the input is not a string or is blank.
    """
    trimmed = name.strip() if isinstance(name, str) else ""
    if not trimmed:
        return "nieznana"
    without_prefix = re.sub(r'^(ul\.|ulica|al\.|aleja)\s+', '', trimmed, flags=re.I)
    return without_prefix.strip()


def _match_known_district(district: str, known_districts: list) -> str:
    """Return the reference spelling of ``district``, or ``district`` itself if nothing matches."""
    wanted = district.lower()
    # Input tokens have their en dashes replaced by hyphens, so apply the same
    # replacement to the reference names before comparing.
    candidates = [(name, name.replace("‚Äì", "-").lower()) for name in known_districts]

    # An exact match always wins, so that a name which merely contains (or is
    # contained in) the token cannot shadow the identical entry.
    for name, key in candidates:
        if key == wanted:
            return name

    # Otherwise fall back to the first entry related by substring, in list order.
    for name, key in candidates:
        if wanted in key or key in wanted:
            return name

    return district


def parse_localization(text: str) -> pd.Series:
    """Split a comma-separated location string into district, city and region.

    Args:
        text: Location string whose comma-separated parts may contain a
            voivodeship (key of ``REGIONS``, compared lower-cased), a city
            (member of ``CITIES``, compared exactly) and other parts.

    Returns:
        ``pd.Series([district, city, region])``. Parts that cannot be
        determined are "nieznana" (district) and "nieznane" (city, region).
        The district is the last part that is neither a known city nor a known
        region, replaced by its spelling in ``CITY_TO_DISTRICTS`` when one
        matches. A missing region is derived from the city. A missing city
        becomes "okolice <city>" when the region lists exactly one city.
    """
    district = "nieznana"
    city = "nieznane"
    region = "nieznane"

    if not isinstance(text, str) or not text.strip():
        return pd.Series([district, city, region])

    # Non-empty parts, with en dashes replaced by plain hyphens.
    tokens = [t.strip().replace("‚Äì", "-") for t in text.split(',') if t.strip()]

    # First part naming a known region / a known city.
    region = next((t.lower() for t in tokens if t.lower() in REGIONS), region)
    city = next((t for t in tokens if t in CITIES), city)

    possible_districts = [t for t in tokens if t not in CITIES and t.lower() not in REGIONS]
    if possible_districts:
        district = possible_districts[-1]
        known_districts = CITY_TO_DISTRICTS.get(city)
        if known_districts:
            district = _match_known_district(district, known_districts)

    # Infer the region from the city when the string did not name one.
    if city != "nieznane" and region == "nieznane":
        for voivodeship, cities in REGIONS.items():
            if city in cities:
                region = voivodeship
                break

    # Without a city, label the row as the surroundings of the region's only
    # listed city; regions listing several cities stay "nieznane".
    if city == "nieznane" and region != "nieznane":
        cities_in_region = REGIONS.get(region.lower(), [])
        if len(cities_in_region) == 1:
            city = f"okolice {cities_in_region[0]}"

    return pd.Series([district, city, region])


def scrape_multiple_cities(urls: dict, limit_per_page: int = 72) -> pd.DataFrame:
    """Scrape every city's result pages and return the listings as a cleaned frame.

    NOTE: a second function with the same name is defined further down in this
    file; when the whole cell runs, that later definition replaces this one.

    Args:
        urls: Mapping of city label to a URL template containing one "{}"
            placeholder for the page number.
        limit_per_page: Number of listings per page, used to derive the page
            count from the total shown on the first page.

    Returns:
        DataFrame with the scraped fields, numeric ``price``/``area``,
        nullable-integer ``rooms``/``floor``, a computed ``price_per_m2`` and
        the parsed ``district``/``city``/``region`` columns. Prices above
        10 million are replaced by NaN. An empty frame with those columns is
        returned when nothing was scraped.
    """
    all_data = []

    for city_name, base_url in urls.items():
        print(f"Scraping {city_name}...")

        response = requests.get(base_url.format(1), headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")

        try:
            total_listings_tag = soup.find("span", class_="css-1cwh6ya ehod8gt0")
            total_listings = int(re.sub(r"[^\d]", "", total_listings_tag.get_text(strip=True).split()[-1]))
            pages = (total_listings // limit_per_page) + 1
        except Exception:
            # Best effort: when the counter is missing or unparsable, scrape
            # only the first page. Catching Exception (not a bare except)
            # keeps Ctrl-C / kernel interrupts working.
            total_listings = "N/A"
            pages = 1
        print(f"{city_name}: total listings = {total_listings}, pages = {pages}")

        progress = widgets.IntProgress(
            value=0,
            min=0,
            max=pages,
            description=f'{city_name}:',
            bar_style='success',
            orientation='horizontal'
        )
        display(progress)

        for page in range(1, pages + 1):
            url = base_url.format(page)
            resp = requests.get(url, headers=HEADERS)
            page_soup = BeautifulSoup(resp.content, "html.parser")
            page_offers = extract_offers_from_page(page_soup)
            all_data.extend(page_offers)

            print(f"{city_name} - page {page}: {len(page_offers)} offers", end="\r")
            progress.value = page
            # Random 1-3 s pause between page requests.
            time.sleep(np.random.uniform(1, 3))

        print(f"\nFinished {city_name}. Total listings collected so far: {len(all_data)}\n")

    if not all_data:
        # Nothing scraped: return an empty frame with the usual columns
        # instead of failing on missing columns below.
        return pd.DataFrame(columns=[
            'title', 'localization', 'price', 'price_per_m2', 'type',
            'rooms', 'area', 'floor', 'district', 'city', 'region',
        ])

    df = pd.DataFrame(all_data)

    # price/area are required by the price_per_m2 computation below.
    for col in ('price', 'area'):
        if col not in df.columns:
            df[col] = np.nan

    # --- Validation and conversions ---
    # rooms and floor as nullable integers ("Int64" is the pandas dtype name).
    for col in ['rooms', 'floor']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

    # price as float; implausibly large values are discarded.
    df['price'] = pd.to_numeric(df['price'].astype(str).str.replace(r"[^\d\.]", "", regex=True), errors='coerce')
    df.loc[df['price'] > 1e7, 'price'] = np.nan  # example cap: 10 million

    # area as float
    df['area'] = pd.to_numeric(df['area'].astype(str).str.replace(",", ".").str.replace("m¬≤", "").str.replace(" ", ""), errors='coerce')

    # price per m2 only where both price and area are positive
    df['price_per_m2'] = np.where(
        (df['price'] > 0) & (df['area'] > 0),
        df['price'] / df['area'],
        np.nan
    )

    # Parse localization
    if 'localization' in df.columns:
        df[['district', 'city', 'region']] = df['localization'].apply(parse_localization)

    return df


def extract_offers_from_page(soup: "BeautifulSoup") -> list:
    """Extract one dict per listing from a parsed search-results page.

    Listings whose seller label contains "deweloper" (and neither "biuro" nor
    "oferta prywatna") are skipped.

    Args:
        soup: Parsed results page. Only ``find_all``/``find``/``get_text`` are used.

    Returns:
        List of dicts that always carry the keys ``title``, ``localization``,
        ``price``, ``price_per_m2``, ``type``, ``rooms``, ``area`` and
        ``floor``. A field that cannot be found is ``None``. ``price``,
        ``price_per_m2`` and ``area`` are raw text; ``rooms`` and ``floor``
        are ints (ground floor, "parter", is 0).
    """
    # NOTE(review): the css-* class names look auto-generated and may change
    # whenever the site is redeployed - confirm the selectors before a run.
    data = []
    offers = soup.find_all("section", class_="css-ito1if")

    for offer in offers:
        row = {}
        title_tag = offer.find("p", {"data-cy": "listing-item-title"})
        row["title"] = title_tag.get_text(strip=True) if title_tag else None

        loc_tag = offer.find("p", {"data-sentry-component": "Address"})
        row["localization"] = loc_tag.get_text(strip=True) if loc_tag else None

        # Prices use non-breaking spaces as thousands separators.
        price_tag = offer.find("span", {"data-sentry-element": "MainPrice"})
        row["price"] = price_tag.get_text(strip=True).replace("\xa0", " ") if price_tag else None

        price_m2_tag = offer.find("span", class_="css-u0t81v")
        row["price_per_m2"] = price_m2_tag.get_text(strip=True).replace("\xa0", " ") if price_m2_tag else None

        seller_wrapper = offer.find("div", {"data-sentry-element": "SellerInfoWrapper"})
        row["type"] = None
        if seller_wrapper:
            type_tag = seller_wrapper.find("span", class_="css-1ig9uyl e11ruw5v4")
            if type_tag:
                seller_type = type_tag.get_text(strip=True).lower()
                if "biuro" in seller_type:
                    row["type"] = "Biuro nieruchomo≈õci"
                elif "oferta prywatna" in seller_type:
                    row["type"] = "Oferta prywatna"
                elif "deweloper" in seller_type:
                    # Developer listings are excluded from the result.
                    continue
                else:
                    row["type"] = seller_type

        # Give every row the same keys, so the resulting frame always has
        # these columns even when a listing lacks the details section.
        row.update(rooms=None, area=None, floor=None)

        # The three detail cells are read positionally: rooms, area, floor.
        dd_tags = offer.find_all("dd", class_="css-17je0kd")
        if len(dd_tags) >= 3:
            match_rooms = re.search(r'\d+', dd_tags[0].get_text(strip=True))
            row["rooms"] = int(match_rooms.group()) if match_rooms else None

            row["area"] = dd_tags[1].get_text(strip=True)

            floor_text = dd_tags[2].get_text(strip=True).lower()
            if "parter" in floor_text:
                row["floor"] = 0
            else:
                match_floor = re.search(r"\d+", floor_text)
                row["floor"] = int(match_floor.group()) if match_floor else None

        data.append(row)
    return data


def _fetch_listing_page(url: str, timeout: float = 30.0):
    """Download ``url`` and return it parsed, or ``None`` when the request fails."""
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # One failed page must not discard everything collected so far.
        print(f"Request failed for {url}: {exc}")
        return None
    return BeautifulSoup(resp.content, "html.parser")


def scrape_multiple_cities(urls: dict, limit_per_page: int = 72) -> pd.DataFrame:
    """Scrape every city's result pages and return the listings as a cleaned frame.

    Args:
        urls: Mapping ``{"City": "URL template with {} for the page number"}``.
        limit_per_page: Number of listings per page (must agree with the URL).

    Returns:
        DataFrame with the scraped fields, numeric ``price``/``area``,
        nullable-integer ``rooms``/``floor``, ``price_per_m2`` recomputed as
        price / area rounded to 2 decimals (NaN when the area is missing or
        not positive) and the parsed ``district``/``city``/``region`` columns.
        An empty frame with those columns is returned when nothing was scraped.
    """
    all_data = []

    for city_name, base_url in urls.items():
        print(f"Scraping {city_name}...")

        # The first page also carries the total number of listings.
        first_soup = _fetch_listing_page(base_url.format(1))

        try:
            total_listings_tag = first_soup.find("span", class_="css-1cwh6ya ehod8gt0")
            total_listings = int(re.sub(r"[^\d]", "", total_listings_tag.get_text(strip=True).split()[-1]))
            # Ceiling division: an exact multiple of the page size must not
            # add an extra, empty page.
            pages = max(1, -(-total_listings // limit_per_page))
        except Exception:
            # Best effort: when the counter is missing or unparsable, scrape
            # only the first page. Catching Exception (not a bare except)
            # keeps Ctrl-C / kernel interrupts working.
            total_listings = "N/A"
            pages = 1
        print(f"{city_name}: total listings = {total_listings}, pages = {pages}")

        # Progress widget
        progress = widgets.IntProgress(
            value=0,
            min=0,
            max=pages,
            description=f'{city_name}:',
            bar_style='success',
            orientation='horizontal'
        )
        display(progress)

        for page in range(1, pages + 1):
            # Page 1 was already downloaded above; do not request it again.
            if page == 1:
                page_soup = first_soup
            else:
                page_soup = _fetch_listing_page(base_url.format(page))
            page_offers = extract_offers_from_page(page_soup) if page_soup is not None else []
            all_data.extend(page_offers)

            print(f"{city_name} - page {page}: {len(page_offers)} offers", end="\r")
            progress.value = page
            time.sleep(np.random.uniform(1, 3))  # random 1-3 s delay

        print(f"\nFinished {city_name}. Total listings collected so far: {len(all_data)}\n")

    if not all_data:
        # Nothing scraped: return an empty frame with the usual columns
        # instead of failing on missing columns below.
        return pd.DataFrame(columns=[
            'title', 'localization', 'price', 'price_per_m2', 'type',
            'rooms', 'area', 'floor', 'district', 'city', 'region',
        ])

    # --- Build the DataFrame ---
    df = pd.DataFrame(all_data)

    # Guarantee the columns the cleaning steps rely on.
    for col in ('price', 'area', 'rooms', 'floor'):
        if col not in df.columns:
            df[col] = np.nan

    # --- Column cleaning ---
    df['price'] = pd.to_numeric(df['price'].astype(str).str.replace(r"[^\d]", "", regex=True), errors='coerce')
    df['area'] = pd.to_numeric(df['area'].astype(str).str.replace(",", ".").str.replace("m¬≤", "").str.replace(" ", ""), errors='coerce')
    for col in ['rooms', 'floor']:
        # Coerce to numeric (unparsable values become NaN), then to nullable Int64.
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

    # Price per m2; a non-positive area is treated as missing so the division
    # cannot produce inf.
    df['price_per_m2'] = (df['price'] / df['area'].where(df['area'] > 0)).round(2)

    # Parse localization
    if 'localization' in df.columns:
        df[['district', 'city', 'region']] = df['localization'].apply(parse_localization)

    return df




# -----------------------------------------------------------------------------
# Run scraper
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Scrape every configured city and keep the result in `df` for the cells below.
    df = scrape_multiple_cities(URLS)

    print("\nScraping done 100%")
    print("Cities found:", df['city'].unique())
    print("Regions found:", df['region'].unique())
    # Nothing is written to disk here, so report a collection count, not a save.
    print(f"Collected {len(df)} records.")




Scraping Opole...
Opole: total listings = 267, pages = 4


IntProgress(value=0, bar_style='success', description='Opole:', max=4)

Opole - page 4: 47 offers
Finished Opole. Total listings collected so far: 259

Scraping Toru≈Ñ...
Toru≈Ñ: total listings = 905, pages = 13


IntProgress(value=0, bar_style='success', description='Toru≈Ñ:', max=13)

Toru≈Ñ - page 13: 27 offers
Finished Toru≈Ñ. Total listings collected so far: 916

Scraping Bydgoszcz...
Bydgoszcz: total listings = 1750, pages = 25


IntProgress(value=0, bar_style='success', description='Bydgoszcz:', max=25)

Bydgoszcz - page 25: 23 offers
Finished Bydgoszcz. Total listings collected so far: 2464

Scraping Lublin...
Lublin: total listings = 1908, pages = 27


IntProgress(value=0, bar_style='success', description='Lublin:', max=27)

Lublin - page 27: 36 offers
Finished Lublin. Total listings collected so far: 4053

Scraping Zielona G√≥ra...
Zielona G√≥ra: total listings = 478, pages = 7


IntProgress(value=0, bar_style='success', description='Zielona G√≥ra:', max=7)

Zielona G√≥ra - page 7: 42 offers
Finished Zielona G√≥ra. Total listings collected so far: 4436

Scraping ≈Å√≥dz...
≈Å√≥dz: total listings = 3385, pages = 48


IntProgress(value=0, bar_style='success', description='≈Å√≥dz:', max=48)

≈Å√≥dz - page 48: 2 offerss
Finished ≈Å√≥dz. Total listings collected so far: 7172

Scraping Rzesz√≥w...
Rzesz√≥w: total listings = 2046, pages = 29


IntProgress(value=0, bar_style='success', description='Rzesz√≥w:', max=29)

Rzesz√≥w - page 29: 29 offers
Finished Rzesz√≥w. Total listings collected so far: 8851

Scraping Bia≈Çystok...
Bia≈Çystok: total listings = 1204, pages = 17


IntProgress(value=0, bar_style='success', description='Bia≈Çystok:', max=17)

Bia≈Çystok - page 17: 52 offers
Finished Bia≈Çystok. Total listings collected so far: 9936

Scraping Gda≈Ñsk...
Gda≈Ñsk: total listings = 3839, pages = 54


IntProgress(value=0, bar_style='success', description='Gda≈Ñsk:', max=54)

Gda≈Ñsk - page 54: 24 offers
Finished Gda≈Ñsk. Total listings collected so far: 13247

Scraping Katowice...
Katowice: total listings = 1398, pages = 20


IntProgress(value=0, bar_style='success', description='Katowice:', max=20)

Katowice - page 20: 31 offers
Finished Katowice. Total listings collected so far: 14311

Scraping Kielce...
Kielce: total listings = 746, pages = 11


IntProgress(value=0, bar_style='success', description='Kielce:', max=11)

Kielce - page 11: 27 offers
Finished Kielce. Total listings collected so far: 15053

Scraping Olsztyn...
Olsztyn: total listings = 527, pages = 8


IntProgress(value=0, bar_style='success', description='Olsztyn:', max=8)

Olsztyn - page 8: 24 offers
Finished Olsztyn. Total listings collected so far: 15532

Scraping Pozna≈Ñ...
Pozna≈Ñ: total listings = 3301, pages = 46


IntProgress(value=0, bar_style='success', description='Pozna≈Ñ:', max=46)

Pozna≈Ñ - page 46: 28 offers
Finished Pozna≈Ñ. Total listings collected so far: 17804

Scraping Szczecin...
Szczecin: total listings = 2111, pages = 30


IntProgress(value=0, bar_style='success', description='Szczecin:', max=30)

Szczecin - page 30: 23 offers
Finished Szczecin. Total listings collected so far: 19638

Scraping Krak√≥w...
Krak√≥w: total listings = 7192, pages = 100


IntProgress(value=0, bar_style='success', description='Krak√≥w:')

Krak√≥w - page 100: 65 offers
Finished Krak√≥w. Total listings collected so far: 25124

Scraping Warszawa...
Warszawa: total listings = 12323, pages = 172


IntProgress(value=0, bar_style='success', description='Warszawa:', max=172)

Warszawa - page 172: 12 offers
Finished Warszawa. Total listings collected so far: 35956


Scraping done 100%
Cities found: ['Opole' 'okolice Opole' 'Toru≈Ñ' 'Bydgoszcz' 'nieznane' 'Lublin'
 'okolice Lublin' 'Zielona G√≥ra' '≈Å√≥d≈∫' 'okolice ≈Å√≥d≈∫' 'Rzesz√≥w'
 'okolice Rzesz√≥w' 'Bia≈Çystok' 'okolice Bia≈Çystok' 'Gda≈Ñsk'
 'okolice Gda≈Ñsk' 'Katowice' 'okolice Katowice' 'Kielce' 'okolice Kielce'
 'Olsztyn' 'okolice Olsztyn' 'Pozna≈Ñ' 'okolice Pozna≈Ñ' 'Szczecin'
 'okolice Szczecin' 'Krak√≥w' 'okolice Krak√≥w' 'Warszawa'
 'okolice Warszawa']
Regions found: ['opolskie' 'kujawsko-pomorskie' 'lubelskie' 'lubuskie' '≈Ç√≥dzkie'
 'podkarpackie' 'podlaskie' 'pomorskie' '≈õlƒÖskie' '≈õwiƒôtokrzyskie'
 'warmi≈Ñsko-mazurskie' 'wielkopolskie' 'zachodniopomorskie' 'ma≈Çopolskie'
 'mazowieckie']
Saved 35956 records.


In [98]:
# Print the frame's schema: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34793 entries, 0 to 34792
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         34793 non-null  object 
 1   localization  34793 non-null  object 
 2   price         34793 non-null  float64
 3   price_per_m2  34610 non-null  float64
 4   type          34793 non-null  object 
 5   rooms         34793 non-null  Int64  
 6   area          34793 non-null  float64
 7   floor         34793 non-null  Int64  
 8   district      34793 non-null  object 
 9   city          34793 non-null  object 
 10  region        34793 non-null  object 
dtypes: Int64(2), float64(3), object(6)
memory usage: 3.0+ MB


In [103]:
# Row count of the current frame.
print(len(df))

34610


In [99]:
# Number of missing values in each column.
df.isnull().sum()

title             0
localization      0
price             0
price_per_m2    183
type              0
rooms             0
area              0
floor             0
district          0
city              0
region            0
dtype: int64

In [102]:
# Descriptive statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

Unnamed: 0,price,price_per_m2,rooms,area,floor
count,34610.0,34610.0,34610.0,34610.0,34610.0
mean,841307.4,14423.34,2.63,58.19,2.68
std,585033.73,5677.54,0.96,24.54,2.42
min,79999.0,5000.0,1.0,11.0,0.0
25%,519000.0,10153.76,2.0,42.1,1.0
50%,690000.0,13483.5,3.0,53.12,2.0
75%,946033.0,17516.45,3.0,67.12,4.0
max,9200000.0,50000.0,10.0,200.0,10.0


In [109]:
# Write the current frame to a semicolon-separated CSV without the index column.
# NOTE(review): assumes the "cities_data" directory already exists - confirm.
df.to_csv("cities_data/otodom_apartments_demo.csv", index=False, encoding="utf-8-sig", sep=";")

In [33]:
# Show which distinct city and region labels are present in the frame.
print(df['city'].unique(), df['region'].unique())

['Olsztyn' 'okolice Olsztyn'] ['warmi≈Ñsko-mazurskie']


In [101]:
# --- Outlier filtering ---
# Plausible value ranges for each column
PRICE_MIN, PRICE_MAX = 1e4, 1e7              # total price in PLN
PRICEM2_MIN, PRICEM2_MAX = 5000, 50000       # price per square metre
AREA_MIN, AREA_MAX = 10, 200                 # floor area in square metres
ROOMS_MIN, ROOMS_MAX = 1, 10                 # number of rooms
FLOOR_MIN, FLOOR_MAX = 0, 20                 # storey

# Values outside their range become NaN; affected rows are dropped further down.
for _column, _low, _high in (
    ('price', PRICE_MIN, PRICE_MAX),
    ('area', AREA_MIN, AREA_MAX),
    ('rooms', ROOMS_MIN, ROOMS_MAX),
    ('floor', FLOOR_MIN, FLOOR_MAX),
):
    df[_column] = df[_column].where(df[_column].between(_low, _high), np.nan)

# --- Price per m2 ---
# Recomputed from the filtered price/area, then range-checked like the others.
_has_price_and_area = (df['price'] > 0) & (df['area'] > 0)
df['price_per_m2'] = np.where(_has_price_and_area, df['price'] / df['area'], np.nan)
df['price_per_m2'] = df['price_per_m2'].where(
    df['price_per_m2'].between(PRICEM2_MIN, PRICEM2_MAX), np.nan
)

# Drop rows without a usable price_per_m2, then rows without price or area,
# and renumber the index after filtering.
df = (
    df.dropna(subset=['price_per_m2'])
      .dropna(subset=['price', 'area'])
      .reset_index(drop=True)
)

# --- Summary after cleaning ---
print("Po oczyszczeniu:")
display(df.describe().round(2))
print("\nBraki w danych po oczyszczeniu:")
display(df.isnull().sum())

Po oczyszczeniu:


Unnamed: 0,price,price_per_m2,rooms,area,floor
count,34610.0,34610.0,34610.0,34610.0,34610.0
mean,841307.4,14423.34,2.63,58.19,2.68
std,585033.73,5677.54,0.96,24.54,2.42
min,79999.0,5000.0,1.0,11.0,0.0
25%,519000.0,10153.76,2.0,42.1,1.0
50%,690000.0,13483.5,3.0,53.12,2.0
75%,946033.0,17516.45,3.0,67.12,4.0
max,9200000.0,50000.0,10.0,200.0,10.0



Braki w danych po oczyszczeniu:


title           0
localization    0
price           0
price_per_m2    0
type            0
rooms           0
area            0
floor           0
district        0
city            0
region          0
dtype: int64

In [15]:
# Relabel rows whose city could not be identified ("nieznane") as the
# Bydgoszcz surroundings.
# NOTE(review): this assumes every unidentified row lies near Bydgoszcz; the
# kujawsko-pomorskie region also lists another city in REGIONS - confirm.
if (df['city'] == "nieznane").any():
    df['city'] = df['city'].replace("nieznane", "okolice Bydgoszczy")

In [108]:
# Keep only the rows whose city label does NOT contain 'okolice'
# (case-insensitive; missing labels are kept).
df = df[~df['city'].str.contains('okolice', case=False, na=False)].copy()

# Check the remaining city labels
print(df['city'].unique())

['Opole' 'Toru≈Ñ' 'Bydgoszcz' 'nieznane' 'Lublin' 'Zielona G√≥ra' '≈Å√≥d≈∫'
 'Rzesz√≥w' 'Bia≈Çystok' 'Gda≈Ñsk' 'Katowice' 'Kielce' 'Olsztyn' 'Pozna≈Ñ'
 'Szczecin' 'Krak√≥w' 'Warszawa']


In [84]:
# Discard rows whose floor value is missing.
df = df.dropna(subset=['floor'])

In [23]:
# Replace the district label "bydgoski" with the generic label "inna".
if (df['district'] == "bydgoski").any():
    df['district'] = df['district'].replace("bydgoski", "inna")

In [42]:
# Partial exports to merge into a single data set.
files = [
    "otodom_apartments1.csv",
    "otodom_apartments2.csv",
    "otodom_apartments3.csv"
]

# Load every part and concatenate them into one DataFrame with a fresh index.
dfs = [pd.read_csv(f, sep=";", encoding="utf-8-sig") for f in files]
df_combined = pd.concat(dfs, ignore_index=True)

print(f"Combined shape: {df_combined.shape}")
# Write the combined frame (not the unrelated `df` left over from other cells).
df_combined.to_csv("cities_data/otodom_apartments.csv", index=False, sep=";", encoding="utf-8-sig")
print("zapisano")


Combined shape: (40467, 12)
zapisano


In [66]:
# Inspect the rows whose price equals exactly 7924.
df[df['price'] == 7924]

Unnamed: 0,title,localization,price,price_per_m2,type,rooms,area,floor,district,city,region
85,Opole - Chmielowice mieszkania i domy STRUXI,"Chmielowice, Opole, opolskie",7924.0,,Deweloper,4,,72,Chmielowice,Opole,opolskie
