In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

print("‚úÖ Selenium is working and imported successfully!")
print("ChromeDriver path:", ChromeDriverManager().install())

‚úÖ Selenium is working and imported successfully!
ChromeDriver path: C:\Users\leonb\.wdm\drivers\chromedriver\win64\141.0.7390.122\chromedriver-win32/chromedriver.exe


In [2]:
# üè† Homegate Web Scraper (for multiple Swiss cities)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
import time, re
import pandas as pd

# ---------------------------------------------------
# üßπ Helper Function - Clean numeric strings
# ---------------------------------------------------
def clean_number_string(s):
    """Remove spaces, quotes, and non-numeric characters from strings like 'CHF 1'250'000'."""
    if not s: 
        return None
    s = re.sub(r"[^\d,\.]", "", s)
    s = s.replace("'", "").replace(" ", "")
    digits = re.findall(r"\d+", s)
    return digits[0] if digits else None

# ---------------------------------------------------
# üß© Extract fields from a single property card
# ---------------------------------------------------
def extract_from_card(card):
    text = card.text
    price = None
    rooms = None
    living = None
    address = None

    # Price (look for CHF or Fr.)
    price_match = re.search(r'(?:CHF|Fr\.)\s*([0-9\'\.\s,]+)', text, re.I)
    if price_match:
        price = clean_number_string(price_match.group(1))

    # Rooms (look for "Zimmer" or "rooms")
    rooms_match = re.search(r'(\d+(?:[.,]\d+)?)\s*(?:Zimmer|Zi\.|rooms|room)', text, re.I)
    if rooms_match:
        rooms = rooms_match.group(1).replace(",", ".")

    # Living space (m¬≤)
    living_match = re.search(r'(\d{1,4}(?:[.,]\d+)?)\s*(m¬≤|m2|sqm)', text, re.I)
    if living_match:
        living = living_match.group(1).replace(",", ".")

    # Address (try direct CSS selector first, fallback to text line)
    try:
        addr_el = card.find_element(By.CSS_SELECTOR, '[itemprop="address"], [data-test*="address"]')
        address = addr_el.text.strip()
    except Exception:
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        address = lines[-1] if lines else None

    return {"price_chf": price, "rooms": rooms, "living_m2": living, "address": address}

# ---------------------------------------------------
# üöÄ Main scraping function
# ---------------------------------------------------
def scrape_homegate(base_url, max_pages=3, delay=2.5):
    """Scrape multiple pages of property listings from Homegate."""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    results = []

    for page in range(1, max_pages + 1):
        sep = '&' if '?' in base_url else '?'
        url = f"{base_url}{sep}page={page}"
        print(f"Loading page {page}: {url}")
        driver.get(url)
        time.sleep(delay)

        cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-test="result-list-item"], div[role="listitem"]')
        if not cards:
            print("No more results found. Stopping.")
            break

        for card in cards:
            try:
                data = extract_from_card(card)
                results.append(data)
            except Exception as e:
                print("Error parsing card:", e)

        time.sleep(1.5)

    driver.quit()
    df = pd.DataFrame(results)
    return df

# ---------------------------------------------------
# üßæ Run scraper and export CSV
# ---------------------------------------------------
base_url = "https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne"

# You can adjust `max_pages` (e.g. 10) for more data
df = scrape_homegate(base_url, max_pages=5, delay=3)

# Preview first few rows
display(df.head(10))

# Save results to CSV
df.to_csv("homegate_listings.csv", index=False)
print(f"‚úÖ Saved {len(df)} listings to homegate_listings.csv")

Loading page 1: https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=1
Loading page 2: https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=2
Loading page 3: https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=3
Loading page 4: https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=4
Loading page 5: https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=5


Unnamed: 0,price_chf,rooms,living_m2,address
0,,6.5,153,3 min.listingTravelTime.station: Gen√®ve-Eaux-V...
1,216.0,,850,9 min.listingTravelTime.station: Prilly-Chasseur
2,6.0,21.0,520,4 min.listingTravelTime.station: Kalkbreite/Bh...
3,3.0,5.0,190,21 min.listingTravelTime.station: Ch√™ne-Bourg-...
4,2.0,4.5,114,3 min.listingTravelTime.station: Wartau
5,5.0,7.5,280,33 min.listingTravelTime.station: Meierhofplatz
6,2.0,5.5,149,3 min.listingTravelTime.station: Wartau
7,2.0,5.5,149,3 min.listingTravelTime.station: Wartau
8,3.0,6.5,220,4 min.listingTravelTime.station: Sallaz
9,2.0,6.5,180,4 min.listingTravelTime.station: Sallaz


‚úÖ Saved 100 listings to homegate_listings.csv


In [3]:
# Debug: show raw HTML/text for first few cards (run after driver.get(url) or at top-level)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time

chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

url = "https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=1"
driver.get(url)
time.sleep(3)

cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-test="result-list-item"], div[role="listitem"]')
print("Found cards:", len(cards))
if cards:
    for i, c in enumerate(cards[:3]):
        print(f"\n--- CARD {i} TEXT ---\n")
        print(c.text[:1000])   # print first 1000 chars of text
        print(f"\n--- CARD {i} INNER HTML SNIPPET ---\n")
        print(c.get_attribute("innerHTML")[:2000])  # print first 2000 chars of HTML
else:
    print("No cards found.")
driver.quit()


Found cards: 20

--- CARD 0 TEXT ---

1 / 13
Price on requestPremium
6.5 rooms
Av. de Chamonix 3BIS, 1207 Gen√®ve
LA LAC TOWER : nouvelle promotion immobili√®re de 18 appartements √† vendre au Eaux-Vives
Au c≈ìur du quartier des Eaux-Vives, nouvelle promotion immobili√®re de 18 appartements √† vendre alliant confort, technologie, mobilit√©, qualit√© de vie et luxe. Livraison pr√©vue pour la fin de l‚Äôann√©e.Attention pour info dans cette nouvelle promotion immobili√®re pratiquement achev√©e, nous avons d√©j√† 8 appartements vendus. Il nous reste donc plus que 7 appartements libres avec 3 chambres, 2 Studios, un penthouse et 3 arcades/bureauxDISPONIBLES √Ä LA VENTE : 15 appartements, 2 studios, un penthouse, 3 arcades/bureaux √† vendre, au total ce sont 18 logements de haut standing qui se d√©clinent en 7 typologies diff√©rentes : ‚Ä¢ 1 Appartement de 153 m2 avec terrasse de 65 m2 ‚Äì √† 3'339'000.-- ‚Ä¢ 2 Appartements de 161 m2 ‚Äì √† partir de 3'381'000.-- ‚Ä¢ 1 Appartements de 189 m

In [4]:
# Improved Homegate scraper: robust price element, image_url, listing_id, badge
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time, re
import pandas as pd
from selenium.common.exceptions import NoSuchElementException

def clean_number_string(s):
    if not s:
        return None
    # keep digits only (remove thousands separators)
    s = re.sub(r"[^\d,\.]", "", s)
    s = s.replace("'", "").replace(" ", "")
    s = s.replace(",", "")  # treat comma as thousands sep for prices
    m = re.search(r"(\d+)", s)
    return m.group(1) if m else None

def find_price_leaf_text(card):
    """
    Find a leaf DOM node containing CHF or Fr. Use XPath to prefer nodes without child elements.
    """
    # XPath: any element that contains 'CHF' or 'Fr.' and does NOT have element children
    xpaths = [
        ".//*[contains(text(),'CHF') and not(.//*[normalize-space()])]",
        ".//*[contains(text(),'Fr.') and not(.//*[normalize-space()])]",
        # fallback: any element that contains CHF (even if not leaf)
        ".//*[contains(text(),'CHF')]",
        ".//*[contains(text(),'Fr.')]"
    ]
    for xp in xpaths:
        try:
            el = card.find_element(By.XPATH, xp)
            txt = el.text.strip()
            if txt:
                return txt
        except Exception:
            continue
    return None

def extract_image_url_from_card(card):
    """
    Prefer <source srcset> (take the largest url), otherwise <img src>.
    """
    try:
        # first prefer source[srcset]
        sources = card.find_elements(By.CSS_SELECTOR, "picture source[srcset], source[srcset]")
        if sources:
            # pick the first source, parse srcset to pick largest url (by width if provided)
            srcset = sources[0].get_attribute("srcset") or ""
            if srcset:
                parts = [p.strip() for p in srcset.split(",") if p.strip()]
                # pick last part (often largest) and extract URL before the space
                last = parts[-1]
                url = last.split()[0]
                return url
    except Exception:
        pass
    try:
        img = card.find_element(By.CSS_SELECTOR, "img[src]")
        src = img.get_attribute("src")
        if src and not src.startswith("data:"):
            return src
        # if src is data:, try data-src or data-lazy attributes
        for attr in ("data-src", "data-lazy", "data-original"):
            v = img.get_attribute(attr)
            if v:
                return v
    except Exception:
        pass
    return None

def extract_badge(card):
    """Try to find small badge text like 'Premium' or 'Price on request'."""
    # common badge selectors or small labels near price
    badge_selectors = [
        '[data-test*="badge"]', '.badge', '.ListingCard_badge', '.HgBadge', '[aria-label="Premium"]'
    ]
    for sel in badge_selectors:
        try:
            els = card.find_elements(By.CSS_SELECTOR, sel)
            for el in els:
                t = el.text.strip()
                if t:
                    return t
        except Exception:
            continue
    # fallback: small element near top with <= 25 chars and not numeric
    try:
        top_texts = [ln.strip() for ln in card.text.splitlines() if ln.strip()][:6]
        for t in top_texts:
            if 2 <= len(t) <= 25 and not re.search(r'^\d+$', t):
                # exclude obvious non-badges like address lines that contain numbers
                if not re.search(r'\d', t) or "CHF" in t or "Fr." in t:
                    continue
                return t
    except Exception:
        pass
    return None

def extract_from_card(card):
    # listing id from href if available
    listing_id = None
    try:
        a = card.find_element(By.TAG_NAME, "a")
        href = a.get_attribute("href") or ""
        if href:
            # last URL path segment usually is numeric id
            listing_id = href.rstrip("/").split("/")[-1]
    except Exception:
        pass

    price_text = find_price_leaf_text(card)
    price_val = clean_number_string(price_text) if price_text else None

    # rooms & living: try previous heuristics on card.text and also on meta blocks
    text = card.text or ""
    rooms = None
    m = re.search(r'(\d+(?:[.,]\d+)?)\s*(?:Zimmer|Zi\.|rooms|room)\b', text, re.I)
    if m:
        rooms = m.group(1).replace(",", ".")
    living = None
    m2 = re.search(r'(\d{1,4}(?:[.,]\d+)?)\s*(?:m¬≤|m2|sqm)\b', text, re.I)
    if m2:
        living = m2.group(1).replace(",", ".")

    # address: try dedicated selectors first
    address = None
    try:
        addr_el = card.find_element(By.CSS_SELECTOR, '[itemprop="address"], [data-test*="address"], .ListingCard_address, .listingAddress')
        address = addr_el.text.strip()
    except Exception:
        # fallback: pick a line that looks like street + postal/city
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        for ln in lines:
            if re.search(r'\d{4}\s+[A-Za-z√Ñ√ñ√ú√§√∂√º\- ]+|[A-Za-z]+\s+\d+\b', ln):
                address = ln
                break
        if not address and lines:
            address = lines[-1]

    image_url = extract_image_url_from_card(card)
    badge = extract_badge(card)

    return {
        "listing_id": listing_id,
        "price_text": price_text,
        "price_chf": int(price_val) if (price_val and price_val.isdigit()) else price_val,
        "badge": badge,
        "rooms": rooms,
        "living_m2": living,
        "address": address,
        "image_url": image_url,
        "raw_text_snippet": (text[:400] if text else None)
    }

def scrape_homegate(base_url, max_pages=3, delay=3):
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    results = []
    for page in range(1, max_pages+1):
        sep = '&' if '?' in base_url else '?'
        url = f"{base_url}{sep}page={page}"
        print("Loading page", page, url)
        driver.get(url)
        time.sleep(delay)
        cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-test="result-list-item"], div[role="listitem"]')
        print("  found cards:", len(cards))
        if not cards:
            break
        for i, card in enumerate(cards):
            try:
                rec = extract_from_card(card)
                results.append(rec)
            except Exception as e:
                print("card parse error:", e)
        time.sleep(1.0)
    driver.quit()
    return pd.DataFrame(results)

# ---- Run scraper and save ----
base_url = "https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne"
df = scrape_homegate(base_url, max_pages=4, delay=3)   # adjust pages/delay as needed
display(df.head(20))
df.to_csv("homegate_listings_final.csv", index=False)
print("Saved to homegate_listings_final.csv")


Loading page 1 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=1
  found cards: 20
Loading page 2 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=2
  found cards: 20
Loading page 3 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=3
  found cards: 20
Loading page 4 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=4
  found cards: 20


Unnamed: 0,listing_id,price_text,price_chf,badge,rooms,living_m2,address,image_url,raw_text_snippet
0,3001941475,,,1 / 13,6.5,153.0,"Av. de Chamonix 3BIS, 1207 Gen√®ve",https://media2.homegate.ch/f_auto/t_listing_ca...,1 / 13\nPrice on requestPremium\n6.5 rooms\nAv...
1,4002624585,"CHF 216,000.‚Äì",216000.0,1 / 7,,850.0,"CHF 216,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"1 / 7\nCHF 216,000.‚Äì Premium\n850m¬≤ living spa..."
2,4002563923,"CHF 6,480,000.‚Äì",6480000.0,1 / 15,,520.0,"CHF 6,480,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"1 / 15\nCHF 6,480,000.‚Äì Premium\n21 rooms520m¬≤..."
3,4002558048,"CHF 3,312,000.‚Äì",3312000.0,1 / 12,,190.0,"CHF 3,312,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"1 / 12\nCHF 3,312,000.‚Äì Premium\n5 rooms190m¬≤ ..."
4,4002276051,"CHF 2,404,800.‚Äì",2404800.0,1 / 6,,114.0,"CHF 2,404,800.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 6\nCHF 2,404,800.‚Äì Premium\n..."
5,4002653043,"CHF 5,587,200.‚Äì",5587200.0,1 / 9,,280.0,"CHF 5,587,200.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 9\nCHF 5,587,200.‚Äì Premium\n..."
6,4002275996,"CHF 2,635,200.‚Äì",2635200.0,1 / 5,,149.0,"CHF 2,635,200.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 5\nCHF 2,635,200.‚Äì Premium\n..."
7,4002275550,"CHF 2,592,000.‚Äì",2592000.0,1 / 5,,149.0,"CHF 2,592,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 5\nCHF 2,592,000.‚Äì Premium\n..."
8,4001755176,"CHF 3,384,000.‚Äì",3384000.0,1 / 11,,220.0,"CHF 3,384,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 11\nCHF 3,384,000.‚Äì Premium\..."
9,4001755174,"CHF 2,505,600.‚Äì",2505600.0,1 / 12,,180.0,"CHF 2,505,600.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 12\nCHF 2,505,600.‚Äì Premium\..."


Saved to homegate_listings_final.csv


In [6]:
# Replace / add these helpers + updated extract_from_card in your notebook

import re
from selenium.webdriver.common.by import By

def find_rooms_leaf_text(card):
    """
    Find a leaf DOM node containing room keywords (rooms, Zimmer, Zi.)
    Prefer leaf nodes (no element children) so we don't match long paragraphs.
    """
    # Case-insensitive search using translate() to lower-case the text in XPath
    lower_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ√Ñ√ñ√ú"
    lower_map   = "abcdefghijklmnopqrstuvwxyz√§√∂√º"
    # XPath expressions prefer leaf elements (no element children).
    xpaths = [
        # look for 'rooms' or 'room'
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' rooms') and not(*)]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' room') and not(*)]",
        # german 'Zimmer' and abbreviation 'Zi.'
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' zimmer') and not(*)]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' zi.') and not(*)]",
        # last-resort: any element containing the words even if not leaf
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' rooms')]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' zimmer')]"
    ]
    for xp in xpaths:
        try:
            el = card.find_element(By.XPATH, xp)
            txt = el.text.strip()
            if txt:
                return txt
        except Exception:
            continue
    return None

def parse_rooms_from_text(text):
    """
    Given a string containing rooms info (like '6.5 rooms' or '17 rooms' or '6.5 Zimmer'),
    return a numeric value (float or int), or None.
    """
    if not text:
        return None
    # Try to find number immediately before the room-word
    # We'll match patterns like '6.5 rooms', '3.5 Zi.', '17 rooms', '4 rooms 135m¬≤'
    m = re.search(r'(\d+(?:[.,]\d+)?)\s*(?=(?:rooms|room|zimmer|zi\.))', text, re.I)
    if not m:
        # fallback: find the first reasonable number in the text but ensure it's not a page index like '1 / 7'
        # skip matches like '1 / 13' or '1/13' by preferring numbers > 1 OR with decimals
        candidates = re.findall(r'(\d+(?:[.,]\d+)?)', text)
        for cand in candidates:
            # ignore single-digit 1 used in pagination '1 / 7'
            try_val = cand.replace(',', '.')
            try:
                num = float(try_val)
                if num > 1 or '.' in try_val:
                    # plausible rooms number
                    m_val = num
                    break
            except:
                continue
        else:
            return None
        # return numeric candidate
        return int(m_val) if float(m_val).is_integer() else float(m_val)

    raw = m.group(1).replace(',', '.')
    try:
        val = float(raw)
        return int(val) if val.is_integer() else val
    except:
        return None

# Updated extract_from_card (only showing the parts relevant to rooms; keep the rest of your extractor)
def extract_from_card(card):
    """
    Full extractor but with improved rooms extraction. Keep your existing price/image logic.
    Returns a dict with rooms numeric.
    """
    # (existing extraction code for id/price/image/address/etc. can be here)
    # ---- rooms extraction (improved) ----
    rooms = None

    # 1) fast DOM leaf search
    try:
        rooms_leaf = find_rooms_leaf_text(card)
        if rooms_leaf:
            rooms = parse_rooms_from_text(rooms_leaf)
    except Exception:
        rooms = None

    # 2) fallback: search in card.text lines (pick line that contains room keywords)
    if rooms is None:
        try:
            text = card.text or ""
            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
            # prefer lines that contain 'rooms' or 'Zimmer' (case-insensitive)
            for ln in lines:
                if re.search(r'(?:rooms|room|zimmer|zi\.)', ln, re.I):
                    rooms = parse_rooms_from_text(ln)
                    if rooms is not None:
                        break
            # last-resort: check first few lines for a small numeric value (avoid page indices like "1 / 13")
            if rooms is None:
                for ln in lines[:6]:
                    m = re.search(r'(\d+(?:[.,]\d+)?)', ln)
                    if m:
                        candidate = m.group(1)
                        val = candidate.replace(',', '.')
                        try:
                            f = float(val)
                            if f > 1 or '.' in val:
                                rooms = int(f) if f.is_integer() else f
                                break
                        except:
                            continue
        except Exception:
            rooms = None

    # ---- continue with other fields ----
    # For demonstration: return only rooms (in your full extractor you should include price/address/etc.)
    return {"rooms": rooms}

In [7]:
# Full Homegate scraper with improved rooms extraction (paste into one cell and run)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import time, re, pandas as pd

# -------------------------
# Rooms helpers (your pasted functions)
# -------------------------
def find_rooms_leaf_text(card):
    """
    Find a leaf DOM node containing room keywords (rooms, Zimmer, Zi.)
    Prefer leaf nodes (no element children) so we don't match long paragraphs.
    """
    lower_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ√Ñ√ñ√ú"
    lower_map   = "abcdefghijklmnopqrstuvwxyz√§√∂√º"
    xpaths = [
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' rooms') and not(*)]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' room') and not(*)]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' zimmer') and not(*)]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' zi.') and not(*)]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' rooms')]",
        f".//*[contains(translate(., '{lower_alpha}', '{lower_map}'), ' zimmer')]"
    ]
    for xp in xpaths:
        try:
            el = card.find_element(By.XPATH, xp)
            txt = el.text.strip()
            if txt:
                return txt
        except Exception:
            continue
    return None

def parse_rooms_from_text(text):
    """
    Given a string containing rooms info (like '6.5 rooms'), return numeric value (float or int) or None.
    """
    if not text:
        return None
    m = re.search(r'(\d+(?:[.,]\d+)?)\s*(?=(?:rooms|room|zimmer|zi\.))', text, re.I)
    if not m:
        # fallback: pick a plausible numeric candidate (>1 or with decimal)
        candidates = re.findall(r'(\d+(?:[.,]\d+)?)', text)
        for cand in candidates:
            val = cand.replace(',', '.')
            try:
                num = float(val)
                if num > 1 or '.' in val:
                    return int(num) if num.is_integer() else num
            except:
                continue
        return None
    raw = m.group(1).replace(',', '.')
    try:
        val = float(raw)
        return int(val) if val.is_integer() else val
    except:
        return None

# -------------------------
# Other helpers (price, image, badge, address)
# -------------------------
def clean_number_string(s):
    if not s:
        return None
    s = re.sub(r"[^\d,\.]", "", s)
    s = s.replace("'", "").replace(" ", "")
    s = s.replace(",", "")
    m = re.search(r"(\d+)", s)
    return m.group(1) if m else None

def find_price_leaf_text(card):
    xpaths = [
        ".//*[contains(text(),'CHF') and not(.//*[normalize-space()])]",
        ".//*[contains(text(),'Fr.') and not(.//*[normalize-space()])]",
        ".//*[contains(text(),'CHF')]",
        ".//*[contains(text(),'Fr.')]"
    ]
    for xp in xpaths:
        try:
            el = card.find_element(By.XPATH, xp)
            txt = el.text.strip()
            if txt:
                return txt
        except Exception:
            continue
    return None

def extract_image_url_from_card(card):
    try:
        sources = card.find_elements(By.CSS_SELECTOR, "picture source[srcset], source[srcset]")
        if sources:
            srcset = sources[0].get_attribute("srcset") or ""
            if srcset:
                parts = [p.strip() for p in srcset.split(",") if p.strip()]
                last = parts[-1]
                url = last.split()[0]
                return url
    except Exception:
        pass
    try:
        img = card.find_element(By.CSS_SELECTOR, "img[src]")
        src = img.get_attribute("src")
        if src and not src.startswith("data:"):
            return src
        for attr in ("data-src", "data-lazy", "data-original"):
            v = img.get_attribute(attr)
            if v:
                return v
    except Exception:
        pass
    return None

def extract_badge(card):
    badge_selectors = [
        '[data-test*="badge"]', '.badge', '.ListingCard_badge', '.HgBadge', '[aria-label="Premium"]'
    ]
    for sel in badge_selectors:
        try:
            els = card.find_elements(By.CSS_SELECTOR, sel)
            for el in els:
                t = el.text.strip()
                if t:
                    return t
        except Exception:
            continue
    # fallback heuristic
    try:
        top_texts = [ln.strip() for ln in card.text.splitlines() if ln.strip()][:6]
        for t in top_texts:
            if 2 <= len(t) <= 25 and not re.search(r'^\d+$', t):
                if not re.search(r'\d', t) or "CHF" in t or "Fr." in t:
                    continue
                return t
    except Exception:
        pass
    return None

# -------------------------
# Core extractor (integrates rooms helper)
# -------------------------
def extract_from_card(card):
    # listing id from href
    listing_id = None
    try:
        a = card.find_element(By.TAG_NAME, "a")
        href = a.get_attribute("href") or ""
        if href:
            listing_id = href.rstrip("/").split("/")[-1]
    except Exception:
        pass

    # price
    price_text = find_price_leaf_text(card)
    price_val = clean_number_string(price_text) if price_text else None

    # rooms (use leaf detection + fallback)
    rooms = None
    try:
        leaf = find_rooms_leaf_text(card)
        if leaf:
            rooms = parse_rooms_from_text(leaf)
    except Exception:
        rooms = None
    if rooms is None:
        # fallback scan card text lines
        try:
            lines = [ln.strip() for ln in (card.text or "").splitlines() if ln.strip()]
            for ln in lines:
                if re.search(r'(?:rooms|room|zimmer|zi\.)', ln, re.I):
                    rooms = parse_rooms_from_text(ln)
                    if rooms is not None:
                        break
            if rooms is None:
                for ln in lines[:6]:
                    m = re.search(r'(\d+(?:[.,]\d+)?)', ln)
                    if m:
                        cand = m.group(1).replace(',', '.')
                        try:
                            f = float(cand)
                            if f > 1 or '.' in cand:
                                rooms = int(f) if f.is_integer() else f
                                break
                        except:
                            continue
        except Exception:
            rooms = None

    # living space
    living = None
    try:
        m2 = re.search(r'(\d{1,4}(?:[.,]\d+)?)\s*(?:m¬≤|m2|sqm)\b', card.text or "", re.I)
        if m2:
            living = m2.group(1).replace(",", ".")
    except:
        living = None

    # address
    address = None
    try:
        addr_el = card.find_element(By.CSS_SELECTOR, '[itemprop="address"], [data-test*="address"], .ListingCard_address, .listingAddress')
        address = addr_el.text.strip()
    except Exception:
        lines = [ln.strip() for ln in (card.text or "").splitlines() if ln.strip()]
        for ln in lines:
            if re.search(r'\d{4}\s+[A-Za-z√Ñ√ñ√ú√§√∂√º\- ]+|[A-Za-z]+\s+\d+\b', ln):
                address = ln
                break
        if not address and lines:
            address = lines[-1]

    # image and badge
    image_url = extract_image_url_from_card(card)
    badge = extract_badge(card)

    return {
        "listing_id": listing_id,
        "price_text": price_text,
        "price_chf": int(price_val) if (price_val and price_val.isdigit()) else price_val,
        "badge": badge,
        "rooms": rooms,
        "living_m2": living,
        "address": address,
        "image_url": image_url,
        "raw_text_snippet": (card.text[:400] if card.text else None)
    }

# -------------------------
# Scraper runner
# -------------------------
def scrape_homegate(base_url, max_pages=3, delay=3):
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-gpu")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    results = []
    for page in range(1, max_pages+1):
        sep = '&' if '?' in base_url else '?'
        url = f"{base_url}{sep}page={page}"
        print("Loading page", page, url)
        driver.get(url)
        time.sleep(delay)
        cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-test="result-list-item"], div[role="listitem"]')
        print("  found cards:", len(cards))
        if not cards:
            break
        for card in cards:
            try:
                rec = extract_from_card(card)
                results.append(rec)
            except Exception as e:
                print("card parse error:", e)
        time.sleep(1.0)
    driver.quit()
    return pd.DataFrame(results)

# -------------------------
# Run scraper & save
# -------------------------
base_url = "https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne"
df = scrape_homegate(base_url, max_pages=4, delay=3)   # adjust max_pages/delay if you want
display(df[['listing_id','price_text','price_chf','badge','rooms','living_m2','address','image_url']].head(30))
df.to_csv("homegate_listings_rooms_fixed.csv", index=False)
print("Saved to homegate_listings_rooms_fixed.csv (rows:", len(df), ")")


Loading page 1 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=1
  found cards: 20
Loading page 2 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=2
  found cards: 20
Loading page 3 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=3
  found cards: 20
Loading page 4 https://www.homegate.ch/buy/real-estate/city-basel/matching-list?loc=geo-city-zurich%2Cgeo-city-bern%2Cgeo-city-geneve%2Cgeo-city-lausanne&page=4
  found cards: 20


Unnamed: 0,listing_id,price_text,price_chf,badge,rooms,living_m2,address,image_url
0,3001941475,,,1 / 13,6.5,153.0,"Av. de Chamonix 3BIS, 1207 Gen√®ve",https://media2.homegate.ch/f_auto/t_listing_ca...
1,4002624585,"CHF 216,000.‚Äì",216000.0,1 / 7,216.0,850.0,"CHF 216,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
2,4002563923,"CHF 6,480,000.‚Äì",6480000.0,1 / 15,21.0,520.0,"CHF 6,480,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
3,4002558048,"CHF 3,312,000.‚Äì",3312000.0,1 / 12,5.0,190.0,"CHF 3,312,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
4,4002276051,"CHF 2,404,800.‚Äì",2404800.0,1 / 6,4.5,114.0,"CHF 2,404,800.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
5,4002653043,"CHF 5,587,200.‚Äì",5587200.0,1 / 9,7.5,280.0,"CHF 5,587,200.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
6,4002275996,"CHF 2,635,200.‚Äì",2635200.0,1 / 5,5.5,149.0,"CHF 2,635,200.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
7,4002275550,"CHF 2,592,000.‚Äì",2592000.0,1 / 5,5.5,149.0,"CHF 2,592,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
8,4001755176,"CHF 3,384,000.‚Äì",3384000.0,1 / 11,6.5,220.0,"CHF 3,384,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...
9,4001755174,"CHF 2,505,600.‚Äì",2505600.0,1 / 12,6.5,180.0,"CHF 2,505,600.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...


Saved to homegate_listings_rooms_fixed.csv (rows: 80 )


In [8]:
import pandas as pd

# Read the saved CSV file
df = pd.read_csv("homegate_listings_rooms_fixed.csv")

# Display first 10 rows
df.head(10)

Unnamed: 0,listing_id,price_text,price_chf,badge,rooms,living_m2,address,image_url,raw_text_snippet
0,3001941475,,,1 / 13,6.5,153.0,"Av. de Chamonix 3BIS, 1207 Gen√®ve",https://media2.homegate.ch/f_auto/t_listing_ca...,1 / 13\nPrice on requestPremium\n6.5 rooms\nAv...
1,4002624585,"CHF 216,000.‚Äì",216000.0,1 / 7,216.0,850.0,"CHF 216,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"1 / 7\nCHF 216,000.‚Äì Premium\n850m¬≤ living spa..."
2,4002563923,"CHF 6,480,000.‚Äì",6480000.0,1 / 15,21.0,520.0,"CHF 6,480,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"1 / 15\nCHF 6,480,000.‚Äì Premium\n21 rooms520m¬≤..."
3,4002558048,"CHF 3,312,000.‚Äì",3312000.0,1 / 12,5.0,190.0,"CHF 3,312,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"1 / 12\nCHF 3,312,000.‚Äì Premium\n5 rooms190m¬≤ ..."
4,4002276051,"CHF 2,404,800.‚Äì",2404800.0,1 / 6,4.5,114.0,"CHF 2,404,800.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 6\nCHF 2,404,800.‚Äì Premium\n..."
5,4002653043,"CHF 5,587,200.‚Äì",5587200.0,1 / 9,7.5,280.0,"CHF 5,587,200.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 9\nCHF 5,587,200.‚Äì Premium\n..."
6,4002275996,"CHF 2,635,200.‚Äì",2635200.0,1 / 5,5.5,149.0,"CHF 2,635,200.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 5\nCHF 2,635,200.‚Äì Premium\n..."
7,4002275550,"CHF 2,592,000.‚Äì",2592000.0,1 / 5,5.5,149.0,"CHF 2,592,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 5\nCHF 2,592,000.‚Äì Premium\n..."
8,4001755176,"CHF 3,384,000.‚Äì",3384000.0,1 / 11,6.5,220.0,"CHF 3,384,000.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 11\nCHF 3,384,000.‚Äì Premium\..."
9,4001755174,"CHF 2,505,600.‚Äì",2505600.0,1 / 12,6.5,180.0,"CHF 2,505,600.‚Äì Premium",https://media2.homegate.ch/f_auto/t_listing_ca...,"New building\n1 / 12\nCHF 2,505,600.‚Äì Premium\..."


In [9]:
from IPython.display import FileLink
FileLink("homegate_listings_rooms_fixed.csv")