Mietwohnungen in Wien

Ziel dieses Projekts ist es, mithilfe gescrapter Daten von Immobilienseiten wie Willhaben einerseits ein Modell zu entwickeln, das Mietpreise vorhersagen kann, und andererseits die Daten explorativ zu analysieren, um spannende Erkenntnisse und interessante Insights zu gewinnen.

1. Scrape Data Willhaben

In [26]:
import time, random, requests
from requests import RequestException

def wh_fetch(url: str, max_tries: int = 3) -> str | None:
    """Download ``url`` from willhaben and return the response body.

    Up to ``max_tries`` attempts are made. Responses with status 401, 403 or
    429 as well as timeouts and connection errors are retried after a
    jittered pause that grows with the attempt number; every other request
    error ends the call immediately.

    Args:
        url: Address to fetch.
        max_tries: Maximum number of attempts.

    Returns:
        The response text, or ``None`` when the request failed or all
        attempts were used up.
    """
    attempt = 0
    while attempt < max_tries:
        attempt += 1
        try:
            response = requests.get(
                url, headers=WH_HEADERS, timeout=20, allow_redirects=True
            )

            if response.status_code not in (401, 403, 429):
                # Remaining HTTP error statuses raise here and end up in the
                # generic RequestException branch below.
                response.raise_for_status()
                # Pause between successful requests.
                time.sleep(random.uniform(1.5, 2.5))
                return response.text

            # Blocked or rate-limited: back off, then try again.
            delay = 1.5 * attempt + random.uniform(0.5, 1.5)
            print(f"[willhaben] blocked {response.status_code} -> retry in {delay:.1f}s | {url}")
            time.sleep(delay)

        except (requests.Timeout, requests.ConnectionError) as err:
            delay = 1.5 * attempt + random.uniform(0.5, 1.5)
            print(f"[willhaben] network error -> retry in {delay:.1f}s | {url} | {err}")
            time.sleep(delay)
        except RequestException as err:
            # Everything else (e.g. 404, 500, ...) is not retried.
            print(f"[willhaben] request failed -> skip | {url} | {err}")
            return None

    print(f"[willhaben] failed after {max_tries} tries -> skip | {url}")
    return None


def scrape_willhaben(pages: int = 2) -> list[dict]:
    """Scrape willhaben list pages 1..``pages`` and their detail pages.

    List pages and detail pages that cannot be fetched, and detail pages
    that cannot be parsed, are reported and skipped. Each detail URL is
    visited at most once per call.

    Args:
        pages: Number of list pages to walk through.

    Returns:
        One dict per successfully parsed detail page.
    """
    collected = []
    visited = set()

    for page_no in range(1, pages + 1):
        listing_html = wh_fetch(f"{WH_BASE}?page={page_no}")

        if not listing_html:
            print(f"[willhaben] list page failed -> continue | page={page_no}")
            continue

        for detail_url in wh_parse_list_page(listing_html):
            if detail_url not in visited:
                visited.add(detail_url)

                page_html = wh_fetch(detail_url)
                if page_html:
                    try:
                        collected.append(wh_parse_detail_page(page_html, detail_url))
                    except Exception as err:
                        print(f"[willhaben] parse failed -> skip | {detail_url} | {err}")
                else:
                    print(f"[willhaben] detail failed -> skip | {detail_url}")

        print(f"[willhaben] scraped pages: {page_no} | items total: {len(collected)}")

    return collected


1. Scrape Data Immowelt

In [27]:
import time
import random
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Search URL for the Immowelt result list; scrape_immowelt appends "&sp=<page>" to it.
IW_BASE = "https://www.immowelt.at/suche/wien/wohnungen/mieten?d=true&sd=DESC&sf=TIMESTAMP"
# Base used to resolve the hrefs found on list pages.
IW_SITE = "https://www.immowelt.at/"
# Headers sent with every Immowelt request.
IW_HEADERS = {"User-Agent": "Mozilla/5.0 (EducationalScraper/1.0)"}

def iw_fetch(url: str) -> str:
    """Fetch ``url`` from Immowelt and return the response text.

    Raises:
        requests.HTTPError: For an HTTP error status (via
            ``raise_for_status``). Other request errors propagate as well.
    """
    response = requests.get(url, headers=IW_HEADERS, timeout=20)
    response.raise_for_status()
    # Random pause between requests.
    pause = random.uniform(0.8, 1.8)
    time.sleep(pause)
    return response.text

def iw_parse_list_page(html: str) -> list[str]:
    """Return the exposé URLs linked on an Immowelt list page.

    Duplicates are removed; the order of first occurrence is kept.
    """
    soup = BeautifulSoup(html, "lxml")
    anchors = soup.select('a[href^="https://www.immowelt.at/expose/"]')

    # A dict keeps insertion order, which gives an order-preserving dedupe.
    unique = {}
    for anchor in anchors:
        unique.setdefault(urljoin(IW_SITE, anchor["href"]), None)
    return list(unique)

def iw_get_hardfact(soup, wanted_label: str):
    """Return the value of the first hard fact whose label contains ``wanted_label``.

    The label comparison is case-insensitive. Hard facts without a label
    element are ignored.

    Returns:
        The value text, or ``None`` if no hard fact matches or the matching
        one has no value element.
    """
    needle = wanted_label.casefold().strip()

    for fact in soup.select("app-hardfacts .hardfact"):
        label_node = fact.select_one(".hardfact__label")
        if label_node and needle in label_node.get_text(" ", strip=True).casefold():
            value_node = fact.select_one("strong, span.has-font-300")
            if value_node:
                return value_node.get_text(" ", strip=True)
            return None

    return None

def iw_get_address(soup):
    """Return street and city of an exposé joined by a space.

    Parts that are missing or empty are left out; the result is ``""`` when
    neither element is present.
    """
    parts = []
    for selector in ('[data-cy="address-street"]', '[data-cy="address-city"]'):
        node = soup.select_one(selector)
        text = node.get_text(" ", strip=True) if node else ""
        if text:
            parts.append(text)
    return " ".join(parts).strip()

def iw_get_equipment_value(soup, wanted_label: str):
    """Return the value of the equipment cell labelled exactly ``wanted_label``.

    A cell is read as "first <p> = label, second <p> = value"; cells with
    fewer than two paragraphs are skipped. The label comparison is
    case-insensitive.

    Returns:
        The value text of the first matching cell, or ``None``.
    """
    needle = wanted_label.casefold().strip()

    for cell in soup.select("sd-card .equipment sd-cell-col"):
        paragraphs = cell.select("p")
        if len(paragraphs) >= 2:
            label_node, value_node = paragraphs[0], paragraphs[1]
            if label_node.get_text(" ", strip=True).casefold() == needle:
                return value_node.get_text(" ", strip=True)

    return None

def iw_get_details_text(soup) -> str:
    """Return the non-empty detail list entries of an exposé, one per line."""
    entries = (li.get_text(" ", strip=True) for li in soup.select("sd-card .textlist li"))
    return "\n".join(entry for entry in entries if entry)

def iw_extract_features_from_details(details_text: str) -> dict:
    """Derive feature flags and attributes from an exposé's detail text.

    Boolean flags are case-insensitive keyword searches over the whole text.
    ``böden``, ``zustand``/``bautyp`` and ``baujahr`` are read from
    ``"<label>: <value>"`` lines.

    Args:
        details_text: Detail text with one entry per line; ``None`` or ``""``
            is treated as empty.

    Returns:
        Dict with the keys ``balkon``, ``terrasse``, ``fahrstuhl``,
        ``einbauküche``, ``keller``, ``haustiere_erlaubt``,
        ``teilmöbliert_/_möbliert`` (bools), ``böden``, ``zustand``,
        ``bautyp`` (str or ``None``) and ``baujahr`` (int or ``None``).
    """
    raw = details_text or ""
    t = raw.casefold()

    def has_any(*needles):
        return any(n.casefold() in t for n in needles)

    out = {
        "balkon": has_any("balkon", "loggia"),
        "terrasse": has_any("terrasse"),
        "fahrstuhl": has_any("personenaufzug", "aufzug", "lift"),
        "einbauküche": has_any("einbauküche"),
        "keller": has_any("keller"),
        "haustiere_erlaubt": has_any("haustiere erlaubt"),
        # "möbliert" is also contained in "unmöbliert" (unfurnished). The
        # lookbehind keeps that word from setting the flag, while
        # "teilmöbliert" and "teilweise möbliert" still match.
        "teilmöbliert_/_möbliert": re.search(r"(?<!un)möbliert", t) is not None,
    }

    m = re.search(r"böden:\s*([^\n\r]+)", raw, flags=re.IGNORECASE)
    out["böden"] = m.group(1).strip() if m else None

    # "Zustand:" lines may hold the building type (Altbau/Neubau) or the
    # condition. The last building type wins; the first other value becomes
    # the condition.
    out["zustand"] = None
    out["bautyp"] = None
    for v in (x.strip() for x in re.findall(r"zustand:\s*([^\n\r]+)", raw, flags=re.IGNORECASE)):
        if v.casefold() in ("altbau", "neubau"):
            out["bautyp"] = v
        elif out["zustand"] is None:
            out["zustand"] = v

    m = re.search(r"baujahr:\s*(\d{4})", raw, flags=re.IGNORECASE)
    out["baujahr"] = int(m.group(1)) if m else None

    return out

def iw_extract_garage_parkplatz(details_text: str) -> dict:
    """Report whether the detail text mentions a garage or a parking space.

    The search is a case-insensitive substring test; ``None`` or ``""``
    yields ``False`` for both flags.

    Returns:
        ``{"garage": bool, "parkplatz": bool}``.
    """
    haystack = (details_text or "").casefold()
    keywords = {
        "garage": ("garage", "tiefgarage"),
        "parkplatz": ("stellplatz", "parkplatz", "carport"),
    }
    return {
        flag: any(word in haystack for word in words)
        for flag, words in keywords.items()
    }

def iw_parse_detail_page(html: str, url: str) -> dict:
    """Parse an Immowelt exposé page into a flat record.

    Args:
        html: HTML of the exposé page.
        url: Address the page was loaded from; stored unchanged in the record.

    Returns:
        Dict with the raw (unparsed) text values for price, area, rooms,
        address, floor and availability, the garage/parking flags, the
        detail text and every key produced by
        ``iw_extract_features_from_details``.
    """
    soup = BeautifulSoup(html, "lxml")

    heading = soup.select_one("app-objectmeta h1")
    details_text = iw_get_details_text(soup)
    parking = iw_extract_garage_parkplatz(details_text)

    record = {
        "url": url,
        "titel": heading.get_text(" ", strip=True) if heading else "",
        "preis": iw_get_hardfact(soup, "Gesamtmiete"),
        "wohnfläche": iw_get_hardfact(soup, "Wohnfläche"),
        "zimmer": iw_get_hardfact(soup, "Zimmer"),
        "address": iw_get_address(soup),
        "stockwerk": iw_get_equipment_value(soup, "Wohnungslage"),
        "verfügbar": iw_get_equipment_value(soup, "Bezug"),
        "garage": parking["garage"],
        "parkplatz": parking["parkplatz"],
        "details": details_text,
    }
    # Feature keys come last, after the fixed fields.
    record.update(iw_extract_features_from_details(details_text))
    return record

def scrape_immowelt(pages: int = 2) -> list[dict]:
    """Scrape Immowelt list pages 1..``pages`` and their exposé pages.

    A list page or exposé that cannot be fetched, and an exposé that cannot
    be parsed, is reported and skipped, so a single failure does not discard
    the items collected so far. Each exposé URL is visited at most once per
    call.

    Args:
        pages: Number of list pages to walk through.

    Returns:
        One dict per successfully parsed exposé.
    """
    all_items = []
    seen = set()

    for page in range(1, pages + 1):
        list_url = f"{IW_BASE}&sp={page}"
        try:
            list_html = iw_fetch(list_url)
        except requests.RequestException as e:
            print(f"[immowelt] list page failed -> continue | page={page} | {e}")
            continue

        for durl in iw_parse_list_page(list_html):
            if durl in seen:
                continue
            seen.add(durl)

            try:
                detail_html = iw_fetch(durl)
            except requests.RequestException as e:
                print(f"[immowelt] detail failed -> skip | {durl} | {e}")
                continue

            try:
                all_items.append(iw_parse_detail_page(detail_html, durl))
            except Exception as e:
                # Parser errors on one exposé must not end the whole run.
                print(f"[immowelt] parse failed -> skip | {durl} | {e}")

        print(f"[immowelt] scraped pages: {page}")

    return all_items


1. Scrape Data ImmoScout

In [28]:
import time, random, requests
from requests import RequestException

def is_fetch(session: requests.Session, url: str, max_tries: int = 3) -> str | None:
    """Fetch ``url`` through ``session`` and return the response body.

    Up to ``max_tries`` attempts are made. After a 401/403/429 response the
    function waits, requests ``IS_HOME`` once more on the same session and
    retries; timeouts and connection errors are retried after a wait as
    well. Every other request error ends the call immediately.

    Args:
        session: Session used for all requests.
        url: Address to fetch.
        max_tries: Maximum number of attempts.

    Returns:
        The response text, or ``None`` when the request failed or all
        attempts were used up.
    """
    attempt = 0
    while attempt < max_tries:
        attempt += 1
        try:
            response = session.get(url, timeout=25, allow_redirects=True)

            if response.status_code not in (401, 403, 429):
                response.raise_for_status()
                # Pause between successful requests.
                time.sleep(random.uniform(1.2, 2.2))
                return response.text

            delay = 2.0 * attempt + random.uniform(0.5, 1.5)
            print(f"[scout24] blocked {response.status_code} -> retry in {delay:.1f}s | {url}")
            time.sleep(delay)
            session.get(IS_HOME, timeout=20)  # warm-up

        except (requests.Timeout, requests.ConnectionError) as err:
            delay = 2.0 * attempt + random.uniform(0.5, 1.5)
            print(f"[scout24] network error -> retry in {delay:.1f}s | {url} | {err}")
            time.sleep(delay)
        except RequestException as err:
            print(f"[scout24] request failed -> skip | {url} | {err}")
            return None

    print(f"[scout24] failed after {max_tries} tries -> skip | {url}")
    return None


def scrape_scout(pages: int = 2) -> list[dict]:
    """Scrape ImmoScout list pages 1..``pages`` and their detail pages.

    All requests share one session with browser-like headers. List pages and
    detail pages that cannot be fetched, and detail pages that cannot be
    parsed, are reported and skipped. Each detail URL is visited at most once
    per call.

    Args:
        pages: Number of list pages to walk through.

    Returns:
        One dict per successfully parsed detail page.
    """
    items = []
    seen = set()

    with requests.Session() as s:
        s.headers.update({
            "User-Agent": IS_UA,
            "Accept-Language": "de-AT,de;q=0.9,en;q=0.8",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": IS_HOME,
        })

        # Warm-up request to the home page. It stays best-effort, but a
        # failure is reported instead of being dropped silently.
        try:
            s.get(IS_HOME, timeout=20)
        except Exception as e:
            print(f"[scout24] warm-up failed -> continue without | {e}")

        for page in range(1, pages + 1):
            lurl = is_list_url(page)
            list_html = is_fetch(s, lurl)

            if not list_html:
                print(f"[scout24] list page failed -> continue | page={page}")
                continue

            detail_urls = is_parse_list_page(list_html)
            print(f"[scout24] page {page}: {len(detail_urls)} urls")

            for durl in detail_urls:
                if durl in seen:
                    continue
                seen.add(durl)

                dhtml = is_fetch(s, durl)
                if not dhtml:
                    print(f"[scout24] detail failed -> skip | {durl}")
                    continue

                try:
                    items.append(is_parse_detail_page(dhtml, durl))
                except Exception as e:
                    print(f"[scout24] parse failed -> skip | {durl} | {e}")

    return items


2. Data Parsing und Zusammenfügen

Hier werden alle Daten vereinheitlicht und zusammengefügt und gleichzeitig in die passenden Datentypen geparst.

In [31]:
import pandas as pd

def safe_scrape(fn, *args, **kwargs) -> list[dict]:
    """Run a scraper and never let its failure propagate.

    Args:
        fn: Callable to run.
        *args: Positional arguments passed on to ``fn``.
        **kwargs: Keyword arguments passed on to ``fn``.

    Returns:
        The result of ``fn``; an empty list if ``fn`` returns a falsy value
        or raises an exception (the exception is printed).
    """
    try:
        return fn(*args, **kwargs) or []
    except Exception as e:
        # Not every callable has __name__ (e.g. functools.partial objects);
        # the error report itself must not raise.
        name = getattr(fn, "__name__", None) or repr(fn)
        print(f"[merge] scraper crashed -> continue with empty | {name} | {e}")
        return []

def run_all_and_merge(pages_willhaben=2, pages_immowelt=1, pages_scout=2, save_csv_path="wohnungen_merged.csv"):
    """Run all three scrapers, merge their results and write them to CSV.

    Each scraper runs through ``safe_scrape``, so a crashing scraper
    contributes no rows. The per-source frames are converted with
    ``to_common_schema``, concatenated, de-duplicated with
    ``dedupe_listings`` and sorted by price (ascending) and living area
    (descending), with missing values last.

    Args:
        pages_willhaben: List pages to scrape on willhaben.
        pages_immowelt: List pages to scrape on Immowelt.
        pages_scout: List pages to scrape on ImmoScout.
        save_csv_path: Target path of the CSV file.

    Returns:
        The merged and sorted DataFrame.
    """
    jobs = (
        ("willhaben", scrape_willhaben, pages_willhaben),
        ("immowelt", scrape_immowelt, pages_immowelt),
        ("immobilienscout24", scrape_scout, pages_scout),
    )

    # Scrape everything first, then convert, as two separate phases.
    scraped = [(source, safe_scrape(scraper, pages=count)) for source, scraper, count in jobs]
    frames = [to_common_schema(pd.DataFrame(rows), source) for source, rows in scraped]

    merged = dedupe_listings(pd.concat(frames, ignore_index=True))
    merged = merged.sort_values(
        by=["preis", "wohnfläche"],
        ascending=[True, False],
        na_position="last",
    )
    merged = merged.reset_index(drop=True)

    merged.to_csv(save_csv_path, index=False)
    return merged

# Full run: up to 200 list pages per portal; the merged result is also written to CSV.
df = run_all_and_merge(
    pages_willhaben=200,
    pages_immowelt=200,
    pages_scout=200,
    save_csv_path="wohnungen_merged.csv"
)

# Preview of the first 20 merged rows.
df.head(20)


[willhaben] scraped pages: 1 | items total: 5
[willhaben] scraped pages: 2 | items total: 10
[willhaben] scraped pages: 3 | items total: 15
[willhaben] scraped pages: 4 | items total: 20
[willhaben] scraped pages: 5 | items total: 25
[willhaben] scraped pages: 6 | items total: 30
[willhaben] scraped pages: 7 | items total: 35
[willhaben] scraped pages: 8 | items total: 40
[willhaben] scraped pages: 9 | items total: 45
[willhaben] scraped pages: 10 | items total: 50
[willhaben] scraped pages: 11 | items total: 55
[willhaben] scraped pages: 12 | items total: 60
[willhaben] scraped pages: 13 | items total: 65
[willhaben] scraped pages: 14 | items total: 70
[willhaben] scraped pages: 15 | items total: 75
[willhaben] scraped pages: 16 | items total: 80
[willhaben] scraped pages: 17 | items total: 85
[willhaben] scraped pages: 18 | items total: 90
[willhaben] scraped pages: 19 | items total: 95
[willhaben] scraped pages: 20 | items total: 100
[willhaben] scraped pages: 21 | items total: 105


Unnamed: 0,quelle,url,titel,address,plz,bezirk,preis,wohnfläche,zimmer,balkon,...,lage,ausstattung,preis_und_detailinformation,zusatzinformationen,sonstiges,energieausweis_heizung,details,beschreibung,merkmale,highlight_raw
0,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,Exklusive Penthouse-Wohnung in bester Lage - T...,"1010 Wien, 01. Bezirk, Innere Stadt",1010,1,,480.0,7.0,,...,"Kärnterstrasse, Ring, Johannesgasse Region: Kä...",Terrassenanzahl:1 Abstellraum Anzahl: 2 Terras...,"Gesamtbelastung (exkl. MWSt): 22582,5 Eur",Anzahl Badezimmer: 3 Verfügbar ab: sofort Anza...,,,,,,
1,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,Exklusive Stilaltbauwohnung,"1010 Wien, 01. Bezirk, Innere Stadt",1010,1,,345.63,7.0,,...,,Keller: vorhanden,"Gesamtbelastung (exkl. MWSt): 17610,36 Eur mon...",Anzahl Etagen: 1 Garagenplätze: 2 Anzahl Schla...,,,,,,
2,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,LUXUSDACHTERRASSENWOHNUNG AM ROCHUSMARKT !,"Landstraßer Hauptstraße, 1030 Wien, 03. Bezirk...",1030,3,,345.27,7.0,,...,,"Terrassenanzahl:2.00 Terrassenfläche: 79,00 m²...","Gesamtbelastung (exkl. MWSt): 5662,03 Eur mona...",Anzahl Schlafzimmer: 5 Anzahl Badezimmer: 3 Kl...,,,,,,
3,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,Moderne 7er WG-Geignete Wohnung in TOP Lage mi...,"1090 Wien, 09. Bezirk, Alsergrund",1090,9,,270.53,7.0,True,...,Franz-Josefs-Bahnhof Infrastruktur / Entfernun...,Balkonanzahl:1 Bad mit Dusche und Wanne Boden:...,"Gesamtbelastung (exkl. MWSt): 3664,03 Eur Kaut...",Stockwerk: 4. Etage Anzahl Badezimmer: 3 Verfü...,,,,,,
4,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,Höchste Wohnqualität in Wien - exklusiv und ei...,"1030 Wien, 03. Bezirk, Landstraße",1030,3,,256.26,7.0,,...,Das Haus liegt in unmittelbarer Nähe zur Innen...,"Balkonanzahl:1 Balkonfläche:6,48 m² Boden: Fli...","Gesamtbelastung (exkl. MWSt): 4125,78 Eur Kaut...",Anzahl Etagen: 6 Stockwerk: Mezzanin Anzahl Ba...,,,,,,
5,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,256m² LUXUS APPARTEMENT voll möbliert für 3-5 ...,"1040 Wien, 04. Bezirk, Wieden",1040,4,,256.0,5.0,,...,"4. Bezirk, in Nachbarschaft zu Palais Schaumbu...",Abstellraum Anzahl: 1 Bad mit Dusche Einbauküc...,"Gesamtbelastung (exkl. MWSt): 3110,36 Eur Miet...",Stockwerk: 3. Etage Anzahl Badezimmer: 3 Kabel...,,,,,,
6,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,Stilvolle 6-Zimmer Wohnung im repräsentativen ...,"1040 Wien, 04. Bezirk, Wieden",1040,4,,252.49,6.0,True,...,Karlsplatz Infrastruktur / Entfernungen Gesund...,Abstellraum Anzahl: 1 Bad mit Dusche und Wanne...,"Gesamtbelastung (exkl. MWSt): 4701,36 Eur Heiz...",Stockwerk: 3. Etage Stiege: 1 Anzahl Badezimme...,,,,,,
7,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,Luxusdachgeschoss mit Hofburgblick,"1010 Wien, 01. Bezirk, Innere Stadt",1010,1,,247.9,5.0,,...,,Balkonanzahl:1.00 Terrassenanzahl:3.00 Balkonf...,Autostellplatz MWSt Prozent: 20.0% Kaution: 75...,Anzahl Etagen: 3 Autostellplatz frei: 1 Anzahl...,,,,,,
8,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,Ihr exklusiver Wohnsitz in Wien - Prestige & K...,"1030 Wien, 03. Bezirk, Landstraße",1030,3,,243.74,5.0,,...,Das Haus liegt in unmittelbarer Nähe zur Innen...,Balkonanzahl:1 Terrassenanzahl:1 Balkonfläche:...,"Gesamtbelastung (exkl. MWSt): 5444,45 Eur Kaut...",Anzahl Etagen: 6 Stockwerk: 1. DG Anzahl Badez...,,,,,,
9,willhaben,https://www.willhaben.at/iad/immobilien/d/miet...,DG Wohnung mit Blick in die Weinberge,"1190 Wien, 19. Bezirk, Döbling",1190,19,,237.76,6.5,,...,,Terrassenanzahl:3.00 Wintergarten Keller: vorh...,"Gesamtbelastung (exkl. MWSt): 6181,82 Eur",Anzahl Etagen: 3 Autostellplatz frei: 3 Garage...,,,,,,


In [37]:
# Immowelt-only run (the other two scrapers get 0 pages), saved to a separate CSV.
df_old = run_all_and_merge(
    pages_willhaben=0,
    pages_immowelt=30,
    pages_scout=0,
    save_csv_path="wohnungen_merged_old.csv"
)

[immowelt] scraped pages: 1
[immowelt] scraped pages: 2
[immowelt] scraped pages: 3
[immowelt] scraped pages: 4
[immowelt] scraped pages: 5
[immowelt] scraped pages: 6
[immowelt] scraped pages: 7
[immowelt] scraped pages: 8
[immowelt] scraped pages: 9
[immowelt] scraped pages: 10
[immowelt] scraped pages: 11
[immowelt] scraped pages: 12
[immowelt] scraped pages: 13
[immowelt] scraped pages: 14
[immowelt] scraped pages: 15
[immowelt] scraped pages: 16
[immowelt] scraped pages: 17
[immowelt] scraped pages: 18
[immowelt] scraped pages: 19
[immowelt] scraped pages: 20
[immowelt] scraped pages: 21
[immowelt] scraped pages: 22
[immowelt] scraped pages: 23
[immowelt] scraped pages: 24
[immowelt] scraped pages: 25
[immowelt] scraped pages: 26
[immowelt] scraped pages: 27
[immowelt] scraped pages: 28
[immowelt] scraped pages: 29
[immowelt] scraped pages: 30


2. MongoDB

Hier wird die Verbindung zur MongoDB-Datenbank hergestellt und die notwendigen Funktionen zum Hochladen der Daten implementiert.

In [5]:
import os
from datetime import datetime, timezone
from pymongo import MongoClient, UpdateOne

# Connection settings; each value can be overridden through an environment variable.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://mongo:27017")
# NOTE(review): DB_NAME is not referenced by the code visible in this cell — confirm it is still needed.
DB_NAME = os.getenv("MONGO_DB", "willhaben")
COLL_NAME = os.getenv("MONGO_COLLECTION", "immobilien")

def get_collection(database):
    """Open the listings collection and make sure its indexes exist.

    Args:
        database: Database name used when the ``MONGO_DB`` environment
            variable is not set; if it is set, the environment value wins.

    Returns:
        The collection ``COLL_NAME`` of the selected database.
    """
    db_name = os.getenv("MONGO_DB", database)
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    collection = client[db_name][COLL_NAME]

    # Unique index on "url" so a listing is not stored twice, plus an
    # index on "scraped_at".
    for field, options in (("url", {"unique": True}), ("scraped_at", {})):
        collection.create_index(field, **options)

    return collection

def save_items_to_mongo(items: list[dict], database: str) -> dict:
    """Upsert listing dicts into MongoDB, keyed by their ``url``.

    Items without a ``url`` are ignored. Every written document gets the
    current UTC time as ``scraped_at``; ``first_seen_at`` is only set when
    the document is inserted.

    Args:
        items: Listing records to store.
        database: Database name passed on to ``get_collection``.

    Returns:
        Dict with the counters ``matched``, ``upserted`` and ``modified``
        (all 0 when there was nothing to write).
    """
    nothing_written = {"matched": 0, "upserted": 0, "modified": 0}
    if not items:
        return nothing_written

    col = get_collection(database)
    timestamp = datetime.now(timezone.utc)

    ops = [
        UpdateOne(
            {"url": item.get("url")},
            {
                "$set": {**item, "scraped_at": timestamp},
                "$setOnInsert": {"first_seen_at": timestamp},
            },
            upsert=True,
        )
        for item in items
        if item.get("url")
    ]
    if not ops:
        return nothing_written

    # Unordered: the operations are independent of each other.
    result = col.bulk_write(ops, ordered=False)
    return {
        "matched": result.matched_count,
        "upserted": len(result.upserted_ids or {}),
        "modified": result.modified_count,
    }

# NOTE(review): df_save is not defined in the visible cells; save_items_to_mongo iterates it and calls .get("url") on each element — confirm it is a list of dicts.
stats = save_items_to_mongo(df_save, "willhaben")

ServerSelectionTimeoutError: mongo:27017: [Errno 11001] getaddrinfo failed (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 5.0s, Topology Description: <TopologyDescription id: 6960bc17e8d76b74012083e4, topology_type: Unknown, servers: [<ServerDescription ('mongo', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('mongo:27017: [Errno 11001] getaddrinfo failed (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

3. Clean Data

Hier werden die Daten bereinigt, getrimmt und geparst; zum Beispiel wird aus einem String wie „1.120 €“ der Float-Wert 1120.0. Da es viele Daten sind, ist dieser Teil entsprechend umfangreich.

In [25]:
import time
import random
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# Result list for flats to rent in Vienna; later pages get a "/seite-<n>" suffix.
BASE = "https://www.immobilienscout24.at/regional/wien/wien/wohnung-mieten"
# Base used to resolve relative hrefs.
SITE = "https://www.immobilienscout24.at"
# Home page, requested as a warm-up before and between scraping requests.
HOME = "https://www.immobilienscout24.at/"

# Browser-like User-Agent sent with every request.
UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/122.0.0.0 Safari/537.36")

# Euro amount in German notation, optionally prefixed with "ab":
# "1.200 €", "ab 950,50 €", "1200€".
# There is deliberately no "\b" after "€": "€" is not a word character, so a
# trailing word boundary would only match when a letter or digit follows the
# sign directly, and "1.200 €" at the end of a line would never be found.
# The integer part is either grouped with "." or a plain run of digits.
EURO_RE = re.compile(r"\b(?:ab\s*)?(?:\d{1,3}(?:\.\d{3})+|\d+)(?:,\d+)?\s*€")
# Number (with optional decimal part) directly in front of "m²".
AREA_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*m²", re.IGNORECASE)
# Number (with optional decimal part) directly in front of "Zimmer".
ROOMS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*Zimmer", re.IGNORECASE)

def list_url(page: int) -> str:
    """Return the URL of result page ``page``; page 1 (or lower) is ``BASE`` itself."""
    if page <= 1:
        return BASE
    return f"{BASE}/seite-{page}"

def fetch(session: requests.Session, url: str, max_tries: int = 3) -> str:
    """Fetch ``url`` through ``session`` and return the response text.

    After a 401/403/429 response the function waits, requests ``HOME`` again
    on the same session and retries, up to ``max_tries`` attempts in total.

    Raises:
        requests.HTTPError: For any other HTTP error status, or when every
            attempt was answered with 401/403/429. Network errors raised by
            ``session.get`` are not caught here.
    """
    attempt = 0
    while attempt < max_tries:
        attempt += 1
        response = session.get(url, timeout=25, allow_redirects=True)

        if response.status_code not in (401, 403, 429):
            response.raise_for_status()
            # Pause between successful requests.
            time.sleep(random.uniform(1.2, 2.2))
            return response.text

        # Typical for anti-bot measures / rate limiting: wait, then redo
        # the warm-up before the next attempt.
        delay = 2.0 * attempt + random.uniform(0.5, 1.5)
        print(f"Blocked ({response.status_code}) on {url} -> retry in {delay:.1f}s")
        time.sleep(delay)
        session.get(HOME, timeout=20)

    raise requests.HTTPError(f"Failed after {max_tries} tries: {url}")

def parse_list_page(html: str) -> list[str]:
    """Return the absolute exposé URLs linked on a result page.

    Links are resolved against ``SITE``; duplicates are removed while the
    order of first occurrence is kept.
    """
    soup = BeautifulSoup(html, "lxml")
    hrefs = (anchor.get("href") for anchor in soup.select('a[href*="/expose/"]'))
    absolute = [urljoin(SITE, href) for href in hrefs if href]
    return list(dict.fromkeys(absolute))

def _clean_lines(soup: BeautifulSoup) -> list[str]:
    """Return the page text as a list of stripped, non-empty lines."""
    stripped = (raw.strip() for raw in soup.get_text("\n", strip=True).splitlines())
    return [line for line in stripped if line]

def _find_highlight_line(lines: list[str], title: str) -> str:
    try:
        idx = next(i for i, ln in enumerate(lines) if ln.strip() == title.strip())
    except StopIteration:
        idx = 0
    for ln in lines[idx: idx + 60]:
        if "m²" in ln and "Zimmer" in ln:
            return ln
    return ""

def _extract_section(lines: list[str], heading: str) -> str:
    h = heading.casefold()
    start = None
    for i, ln in enumerate(lines):
        if ln.casefold() == h or ln.casefold().endswith(h) or h in ln.casefold():
            start = i
            break
    if start is None:
        return ""
    out = []
    for ln in lines[start + 1:]:
        if ln.startswith("### ") or ln.startswith("#### "):
            break
        out.append(ln)
    return "\n".join(out).strip()

def parse_detail_page(html: str, url: str) -> dict:
    """Parse an ImmoScout exposé page into a flat record.

    Price, address, area and rooms are taken from the page's text lines;
    the feature flags are case-insensitive keyword searches over the whole
    page text.

    Args:
        html: HTML of the exposé page.
        url: Address the page was loaded from; stored unchanged in the record.

    Returns:
        Dict with url, title, raw price/address/area/rooms strings (``""``
        when not found), boolean feature flags, three text sections and the
        raw highlight line.
    """
    soup = BeautifulSoup(html, "lxml")
    heading = soup.select_one("h1")
    title = heading.get_text(" ", strip=True) if heading else ""

    lines = _clean_lines(soup)

    # Price: first euro amount within 20 lines of the title line (or of the
    # first line when the title line is not found). Without a title no
    # price is searched.
    price = ""
    if title:
        wanted = title.strip()
        anchor = next(
            (position for position, text in enumerate(lines) if text.strip() == wanted),
            0,
        )
        price = next(
            (hit.group(0) for hit in map(EURO_RE.search, lines[anchor: anchor + 20]) if hit),
            "",
        )

    # Address: first line with a four-digit postcode followed by "Wien".
    plz_line = next(
        (text for text in lines if re.search(r"\b\d{4}\s+Wien\b", text)),
        None,
    )
    address = "" if plz_line is None else plz_line.replace("Adresse anfragen", "").strip()

    highlight = _find_highlight_line(lines, title)

    living_area = ""
    area_hit = AREA_RE.search(highlight)
    if area_hit:
        # Dots are dropped and the comma becomes the decimal point.
        # NOTE(review): a value written as "85.5" would turn into "855" —
        # confirm the site never uses "." as decimal separator.
        number = area_hit.group(1).replace(".", "").replace(",", ".")
        living_area = f"{number} m²"

    rooms = ""
    rooms_hit = ROOMS_RE.search(highlight)
    if rooms_hit:
        rooms = rooms_hit.group(1).replace(",", ".")

    page_text = "\n".join(lines).casefold()

    def mentions(*keywords):
        return any(keyword.casefold() in page_text for keyword in keywords)

    return {
        "url": url,
        "titel": title,
        "preis": price,
        "address": address,
        "wohnfläche": living_area,
        "zimmer": rooms,
        "balkon": mentions("balkon", "loggia"),
        "terrasse": mentions("terrasse"),
        "einbauküche": mentions("einbauküche"),
        "fahrstuhl": mentions("aufzug", "lift", "personenaufzug"),
        "keller": mentions("keller", "unterkellert"),
        "garage": mentions("garage", "tiefgarage"),
        "parkplatz": mentions("parkplatz", "stellplatz", "carport"),
        "beschreibung": _extract_section(lines, "### Beschreibung"),
        "merkmale": _extract_section(lines, "### Merkmale"),
        "ausstattung": _extract_section(lines, "### Ausstattung"),
        "highlight_raw": highlight,
    }

def scrape_pages(pages: int = 2) -> list[dict]:
    """Scrape ImmoScout result pages 1..``pages`` and their exposé pages.

    All requests share one session with browser-like headers. A failed
    warm-up, result page, exposé download or exposé parse is reported and
    skipped, so one failure does not discard the items collected so far.
    Each exposé URL is visited at most once per call.

    Args:
        pages: Number of result pages to walk through.

    Returns:
        One dict per successfully parsed exposé.
    """
    items = []
    seen = set()

    with requests.Session() as s:
        s.headers.update({
            "User-Agent": UA,
            "Accept-Language": "de-AT,de;q=0.9,en;q=0.8",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": HOME,
        })

        # One warm-up request; scraping continues even if it fails.
        try:
            s.get(HOME, timeout=20)
        except requests.RequestException as e:
            print(f"warm-up failed -> continue without | {e}")

        for page in range(1, pages + 1):
            lurl = list_url(page)
            try:
                list_html = fetch(s, lurl)
            except requests.RequestException as e:
                print(f"list page failed -> continue | page={page} | {e}")
                continue

            detail_urls = parse_list_page(list_html)
            print(f"page {page}: {len(detail_urls)} detail urls")

            for durl in detail_urls:
                if durl in seen:
                    continue
                seen.add(durl)

                try:
                    dhtml = fetch(s, durl)
                except requests.RequestException as e:
                    print(f"detail failed -> skip | {durl} | {e}")
                    continue

                try:
                    items.append(parse_detail_page(dhtml, durl))
                except Exception as e:
                    # Parser errors on one exposé must not end the whole run.
                    print(f"parse failed -> skip | {durl} | {e}")

            print(f"items total: {len(items)}")

    return items

# Script entry point: scrape two result pages and print the first rows.
# NOTE(review): this rebinds the module-level name `df`, which is also assigned
# from run_all_and_merge(...) elsewhere in this notebook — confirm that is intended.
if __name__ == "__main__":
    data = scrape_pages(pages=2)
    df = pd.DataFrame(data)
    print(df.head())
    # df.to_csv("immobilienscout24_wien_miete.csv", index=False)


KeyboardInterrupt: 

3. Clean Data Teil 2

Hier werden True/False-Spalten (z. B. ob es einen Fahrstuhl gibt oder nicht) auf 1 bzw. 0 gemappt.

In [43]:
# Work on a copy of df.
df_model = df.copy()

# Drop rows that lack the target (preis) or one of the core features.
# NOTE(review): the merged CSV header shown in this notebook uses lowercase "bezirk" —
# confirm that df really has a "Bezirk" column at this point.
df_model = df_model.dropna(subset=["preis", "wohnfläche", "zimmer", "Bezirk"])

# True/False columns: a missing value counts as False, then everything is stored as 0/1.
bool_cols = ["einbauküche", "fahrstuhl", "balkon", "terrasse", "garage", "parkplatz", "teilmöbliert_/_möbliert"]
for c in bool_cols:
    df_model[c] = df_model[c].fillna(False).astype(int)

# Categorical columns: missing values become the category "Unknown".
cat_cols = ["bautyp", "zustand"]
for c in cat_cols:
    df_model[c] = df_model[c].fillna("Unknown").astype(str)


# Feature matrix: numeric features, 0/1 flags and the two categorical columns.
X = df_model[["wohnfläche", "zimmer", "Bezirk",
              "einbauküche", "fahrstuhl", "balkon", "terrasse",
              "garage", "parkplatz", "teilmöbliert_/_möbliert",
              "bautyp", "zustand"]]

# Target: the price column.
y = df_model["preis"]

# One-hot encode the two categorical columns.
X = pd.get_dummies(X, columns=["bautyp", "zustand"])

  df_model[c] = df_model[c].fillna(False).astype(int)
  df_model[c] = df_model[c].fillna(False).astype(int)
  df_model[c] = df_model[c].fillna(False).astype(int)
  df_model[c] = df_model[c].fillna(False).astype(int)
  df_model[c] = df_model[c].fillna(False).astype(int)


4. Model Training / Test Split

Hier werden die Daten gesplittet und in ein Trainings-, ein Validierungs- und ein Testset aufgeteilt.

In [45]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 15 % of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.15,
    random_state=42
)

# Step 2: split the remaining 85 % again. 0.15 / 0.85 of the remainder is
# about 15 % of all rows, so validation and test set end up the same size.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval,
    test_size=0.15 / 0.85,
    random_state=42
)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


Train: (412, 18) Val: (89, 18) Test: (89, 18)


5. Modell Training

Hier wird unser Modell trainiert. Wir haben uns für einen Random Forest Regressor entschieden, da er nicht-lineare Zusammenhänge gut abbilden kann, mit gemischten Feature-Typen (numerisch/kategorisch) robust funktioniert und auch ohne aufwendiges Feature-Engineering solide Ergebnisse liefert.

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train
# Random forest with 600 trees, a fixed seed and at least 2 samples per leaf;
# n_jobs=-1 uses all available cores.
rf = RandomForestRegressor(
    n_estimators=600,
    random_state=42,
    n_jobs=-1,
    min_samples_leaf=2
)

# Fit on the training split only.
# NOTE(review): X_train / y_train come from the split cell, which has to be run first.
rf.fit(X_train, y_train)

# Evaluate
def eval_split(name, Xp, yp):
    """Print MAE, RMSE and R² of the fitted ``rf`` on one split.

    Args:
        name: Label printed in front of the metrics.
        Xp: Feature matrix of the split.
        yp: True target values of the split.

    Returns:
        The predictions of ``rf`` for ``Xp``.
    """
    predictions = rf.predict(Xp)
    mse = mean_squared_error(yp, predictions)
    mae, rmse, r2 = (
        mean_absolute_error(yp, predictions),
        np.sqrt(mse),
        r2_score(yp, predictions),
    )
    print(f"{name}: MAE={mae:.0f} €, RMSE={rmse:.0f} €, R²={r2:.3f}")
    return predictions

# Metrics for all three splits; the predictions are kept for inspection.
pred_train = eval_split("Train", X_train, y_train)
pred_val   = eval_split("Val  ", X_val, y_val)
pred_test  = eval_split("Test ", X_test, y_test)

# Per-row test errors, largest absolute error first.
# NOTE(review): np and pd are not imported in this cell — confirm an earlier cell provides them.
test_err = pd.DataFrame({
    "actual": y_test.values,
    "pred": pred_test,
    "abs_err": np.abs(y_test.values - pred_test),
}, index=y_test.index).sort_values("abs_err", ascending=False)

test_err.head(10)

NameError: name 'X_train' is not defined