In [22]:
%cd /Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/
%pip install beautifulsoup4 lxml

import time, re
import requests
from bs4 import BeautifulSoup
import pandas as pd

HEADERS = {"User-Agent": "research-bot/1.0 (contact: you@example.com)"}
URL = "https://tieraerztekammer-berlin.de/notdienst/"  

def clean_text(s):
    return re.sub(r"\s+", " ", s).strip()

def scrape_taek_berlin_emergency(url=URL):
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Find the main container that lists clinics (adjust selector to the page)
    container = soup.select_one("main, .content, .article") or soup
    cards = container.select(".clinic, .entry, article, li")  # be flexible

    rows = []
    for card in cards:
        txt = clean_text(card.get_text(" ", strip=True))

        name_el = card.select_one("h2, h3, .title, .clinic-name")
        name = clean_text(name_el.get_text()) if name_el else None

        addr = None
        addr_el = card.select_one(".address, address, .adr")
        if addr_el:
            addr = clean_text(addr_el.get_text(" "))

        phone = None
        tel_el = card.select_one("a[href^='tel'], .phone, .tel")
        if tel_el:
            phone = clean_text(tel_el.get_text() or tel_el.get("href"))
        else:
            m = re.search(r"(?:\+49|0)\s?[\d ()\-\/]{6,}", txt)
            phone = m.group(0) if m else None

        oh = None
        oh_el = card.find(text=re.compile("Öffnungs|opening", re.I))
        if oh_el:
            oh = clean_text(oh_el.parent.get_text(" "))

        rows.append({
            "name": name or None,
            "full_text": txt,
            "address": addr,
            "phone": phone,
            "opening_hours_raw": oh
        })

    df = pd.DataFrame(rows).drop_duplicates()
    return df

df = scrape_taek_berlin_emergency()
df.to_csv("taek_berlin_emergency.csv", index=False)
print(df.head())


/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics
Note: you may need to restart the kernel to use updated packages.
   name                                          full_text address  \
0  None  Klinik für Klein- und Heimtiere, Alt-Biesdorf ...    None   
1  None  Klinik für Kleintiere (Olof Löwe), Märkische A...    None   
2  None  valera – Medizinisches Kleintierzentrum Berlin...    None   
3  None  Tierarztpraxis Bärenwiese, Uhlandstr 151, 1071...    None   
4  None  Tierarztpraxis Rödiger, Scharnweberstr. 136, 1...    None   

              phone opening_hours_raw  
0     030 51 43 760              None  
1     030 93 22 093              None  
2  030 201 80 57 50              None  
3   030 23 36 26 27              None  
4     030 412 73 57              None  


  oh_el = card.find(text=re.compile("Öffnungs|opening", re.I))


In [23]:
import pandas as pd
import re

# 1) Load your uploaded CSV
df = pd.read_csv("/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency.csv", encoding="utf-8")
assert "full_text" in df.columns, "full_text column missing"

# --- Regexes (tune for your data) ---
RE_EMAIL   = r'[\w\.-]+@[\w\.-]+\.\w+'
RE_WEBSITE = r'(https?://[^\s,;]+|www\.[^\s,;]+)'
RE_PHONE   = r'(?:(?:\+?\s?49)|(?:\+?\s?49\(0\))|(?:0))[\s()/\-]*\d[\d\s()/\-]{5,}'
RE_POSTCODE_CITY = r'(?P<postcode>\b\d{5}\b)\s+(?P<city>[A-Za-zÄÖÜäöüß\-\s]+)'
RE_STREET  = r'(?P<street>[A-Za-zÄÖÜäöüß\.\-\s]+?)\s+(?P<housenumber>\d+[A-Za-z]?)'
RE_OPENING = r'(Öffnungszeiten|Opening hours|Öffnung|hours)\s*[:\-]?\s*(?P<opening_hours>[^•\|;]+)'

def first_match(pattern, text, flags=re.IGNORECASE):
    if pd.isna(text): return None
    m = re.search(pattern, str(text), flags)
    return m.group(0) if m else None

def extract_group(pattern, text, group, flags=re.IGNORECASE):
    if pd.isna(text): return None
    m = re.search(pattern, str(text), flags)
    return m.group(group) if m else None

def norm_space(s):
    return re.sub(r"\s+", " ", s).strip() if isinstance(s, str) else s

def normalize_phone(p):
    if not p: return None
    s = re.sub(r"[^\d+]", "", p)
    if s.startswith("0"):  # naive DE normalization
        s = "+49" + s[1:]
    return s

def normalize_url(u):
    if not u: return None
    return u if u.startswith(("http://","https://")) else "https://" + u

# --- Extract ---
s = df["full_text"].fillna("")

df["email"]          = s.map(lambda t: first_match(RE_EMAIL, t))
df["website"]        = s.map(lambda t: normalize_url(first_match(RE_WEBSITE, t)))
df["phone_number"]   = s.map(lambda t: normalize_phone(first_match(RE_PHONE, t)))
df["opening_hours"]  = s.map(lambda t: extract_group(RE_OPENING, t, "opening_hours"))

df["street"]         = s.map(lambda t: extract_group(RE_STREET, t, "street")).map(norm_space)
df["house_number"]   = s.map(lambda t: extract_group(RE_STREET, t, "housenumber")).map(norm_space)
df["postcode"]       = s.map(lambda t: extract_group(RE_POSTCODE_CITY, t, "postcode"))
df["city"]           = s.map(lambda t: extract_group(RE_POSTCODE_CITY, t, "city")).map(norm_space)

# Build full_address from parts (only non-null pieces)
def compose_address(row):
    a = " ".join([x for x in [row.street, row.house_number] if pd.notna(x) and x])
    b = " ".join([x for x in [row.postcode, row.city] if pd.notna(x) and x])
    return ", ".join([x for x in [a, b] if x]) or None

df["full_address"] = df.apply(compose_address, axis=1)

# Optional: clean up whitespace in extracted columns
for col in ["email","website","phone_number","opening_hours","street","house_number","postcode","city","full_address"]:
    df[col] = df[col].map(norm_space)

# --- Save result ---
df.to_csv("taek_berlin_emergency_parsed.csv", index=False)
print(df.head(3)[["full_text","street","house_number","postcode","city","phone_number","email","website","opening_hours","full_address"]])


                                           full_text           street  \
0  Klinik für Klein- und Heimtiere, Alt-Biesdorf ...     Alt-Biesdorf   
1  Klinik für Kleintiere (Olof Löwe), Märkische A...  Märkische Allee   
2  valera – Medizinisches Kleintierzentrum Berlin...   Potsdamer Str.   

  house_number postcode    city    phone_number email website opening_hours  \
0           22    12683  Berlin    +49305143760  None    None          None   
1          258    12679  Berlin    +49309322093  None    None          None   
2           23    14163  Berlin  +4930201805750  None    None          None   

                        full_address  
0      Alt-Biesdorf 22, 12683 Berlin  
1  Märkische Allee 258, 12679 Berlin  
2    Potsdamer Str. 23, 14163 Berlin  


In [24]:
import re
import pandas as pd

# German day abbreviations
DAYS_DE = r"(?:Mo|Di|Mi|Do|Fr|Sa|So)"

# OSM-ish compact patterns like: "Mo-Fr 09:00-18:00; Sa 10:00-14:00"
RE_HOURS_OSMISH = re.compile(
    rf"\b{DAYS_DE}(?:[,\-/ ]\s*{DAYS_DE})*\s+\d{{1,2}}[:.]?\d{{2}}\s*-\s*\d{{1,2}}[:.]?\d{{2}}"
    rf"(?:\s*;\s*{DAYS_DE}(?:[,\-/ ]\s*{DAYS_DE})*\s+\d{{1,2}}[:.]?\d{{2}}\s*-\s*\d{{1,2}}[:.]?\d{{2}})*",
    re.IGNORECASE
)

# Labeled variants like "Öffnungszeiten: Mo–Fr 9-18 Uhr" or "Sprechzeiten - ..."
RE_HOURS_LABELED = re.compile(
    r"(Öffnungs(?:zeiten|zeit)|Sprechzeiten?|Sprechstunde|Opening hours)\s*[:\-–]?\s*(?P<label_hours>[^\n\r|•;]+)",
    re.IGNORECASE
)

# Common "emergency" cues
RE_EMERGENCY_FLAG = re.compile(
    r"\b(Notfall(?:e)?|Notfälle|Notdienst|Notfallsprechstunde|Notaufnahme|24\s*h|24h|24\s*Std\.?|rund um die Uhr|emergency)\b",
    re.IGNORECASE
)

# Labeled emergency details like "Notdienst: 24h" or "Notfälle – Tel. 030 ..."
RE_EMERGENCY_DETAILS = re.compile(
    r"(Notdienst|Notfälle?|Notfallsprechstunde)\s*[:\-–]?\s*(?P<em_details>[^\n\r|•;]+)",
    re.IGNORECASE
)

# Optional phone pattern if you want to capture emergency phone on the same line
RE_PHONE = re.compile(r"(?:\+49|0)[\d\s()/\-]{6,}")


In [25]:
def extract_opening_hours(text: str):
    if not isinstance(text, str):
        return None
    # Prefer structured/OSM-like first
    m = RE_HOURS_OSMISH.search(text)
    if m:
        return m.group(0).strip()
    # Fallback to labeled phrase
    m2 = RE_HOURS_LABELED.search(text)
    if m2:
        return m2.group("label_hours").strip()
    return None

def extract_emergency(text: str):
    if not isinstance(text, str):
        return False, None, None
    flag = bool(RE_EMERGENCY_FLAG.search(text))
    details = None
    phone = None

    m = RE_EMERGENCY_DETAILS.search(text)
    if m:
        details = m.group("em_details").strip()
        # try to find a phone number inside the details
        pm = RE_PHONE.search(details)
        if pm:
            phone = pm.group(0).strip()

    return flag, details, phone


In [26]:
# df = pd.read_csv("your_file.csv", encoding="utf-8")
assert "full_text" in df.columns, "full_text column missing"

# Opening hours
df["opening_hours_raw"] = df["full_text"].apply(extract_opening_hours)

# Emergency fields
out = df["full_text"].apply(extract_emergency)
df["emergency_flag"]    = out.apply(lambda t: t[0])
df["emergency_details"] = out.apply(lambda t: (t[1] or None))
df["emergency_phone"]   = out.apply(lambda t: (t[2] or None))
df.to_csv("taek_berlin_emergency_parsed_v2.csv", index=False)

In [27]:
import re
import pandas as pd
from pathlib import Path

# --- 0) Load existing file (v2) ---

df = pd.read_csv("/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency.csv", encoding="utf-8")
assert "full_text" in df.columns, "full_text column missing in v2 file"

# --- 1) Regex patterns (tuned for your German examples) ---
DAYS_DE  = r"(?:Mo|Di|Mi|Do|Fr|Sa|So)"
TIME     = r"\d{1,2}[:.]?\d{0,2}\s*(?:Uhr)?"
RANGE    = rf"{TIME}\s*[-–]\s*{TIME}"
RANGE_OR = rf"{RANGE}(?:\s*(?:u\.?|und)\s*{RANGE})*"

RE_HOURS_BLOCK = re.compile(
    rf"(?:(?:{DAYS_DE})(?:\s*[-–]\s*{DAYS_DE})?\s+{RANGE_OR})"
    rf"(?:\s*(?:[|;,\n]\s*|\s{DAYS_DE}\s))?",
    re.IGNORECASE
)
RE_HOURS_LABELED = re.compile(
    r"(Öffnungs(?:zeiten|zeit)|Sprechzeiten?|Sprechstunde|Opening hours)\s*[:\-–]?\s*(?P<label_hours>[^|;•\n\r]+)",
    re.IGNORECASE
)
RE_EMERGENCY_FLAG = re.compile(
    r"\b(?:Notdienst|Notfälle?|Notfallsprechstunde|Notaufnahme|Feiertagsnotdienst|24\s*h|24h|rund um die Uhr|emergency)\b",
    re.IGNORECASE
)
RE_EMERGENCY_DETAILS = re.compile(
    r"(Notdienst|Notfälle?|Notfallsprechstunde|Feiertagsnotdienst)\s*[:\-–]?\s*(?P<em_details>[^|;•\n\r]+)",
    re.IGNORECASE
)

def extract_opening_hours(text: str):
    if not isinstance(text, str): 
        return None
    blocks = [m.group(0).strip(" ,;|") for m in RE_HOURS_BLOCK.finditer(text)]
    if blocks:
        return " | ".join(b for b in blocks if b)
    m = RE_HOURS_LABELED.search(text)
    return m.group("label_hours").strip() if m else None

def extract_emergency(text: str):
    if not isinstance(text, str): 
        return False, None
    flag = bool(RE_EMERGENCY_FLAG.search(text))
    details = None
    m = RE_EMERGENCY_DETAILS.search(text)
    if m:
        details = m.group("em_details").strip(" ,;|")
    return flag, details

# --- 2) Compute new values (from existing full_text) ---
s = df["full_text"].fillna("")
new_hours = s.map(extract_opening_hours)
new_em    = s.map(extract_emergency)
new_flag  = new_em.map(lambda x: x[0])
new_det   = new_em.map(lambda x: x[1])

# --- 3) Add/adjust without overwriting existing non-null values ---
for col, series in {
    "opening_hours_raw": new_hours,
    "emergency_flag":    new_flag,
    "emergency_details": new_det,
}.items():
    if col not in df.columns:
        df[col] = series                      # add fresh
    else:
        # only fill where currently missing/NaN; keep existing values
        df[col] = df[col].where(df[col].notna(), series)

# --- 4) Save to a NEW versioned file (v3, v4, …) ---
base = in_path.stem.replace("_v2", "")  # handle typical naming
parent = in_path.parent
version = 3
while True:
    out_path = parent / f"{base}_v{version}.csv"
    if not out_path.exists():
        break
    version += 1

df.to_csv(out_path, index=False, encoding="utf-8")
print(f"✅ Saved without overwriting v2 → {out_path.name}")


✅ Saved without overwriting v2 → taek_berlin_emergency_parsed_v4.csv


In [None]:
%cd /Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/

In [32]:
import pandas as pd

v2 = "/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency_parsed_v2.csv"
v3 = "/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency_parsed_v3.csv"

df2 = pd.read_csv(v2, encoding="utf-8")
df3 = pd.read_csv(v3, encoding="utf-8")

print("v2 cols:", len(df2.columns))
print("v3 cols:", len(df3.columns))
print("Missing in v3:", sorted(set(df2.columns) - set(df3.columns)))
print("New in v3:", sorted(set(df3.columns) - set(df2.columns)))


v2 cols: 17
v3 cols: 7
Missing in v3: ['city', 'email', 'emergency_phone', 'full_address', 'house_number', 'opening_hours', 'phone_number', 'postcode', 'street', 'website']
New in v3: []


In [33]:
import re, pandas as pd
from pathlib import Path

v2_path = Path( "/Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency_parsed_v2.csv"
)

df = pd.read_csv(v2_path, encoding="utf-8")

assert "full_text" in df.columns, "v2 must contain full_text"

# ---- regex (your earlier extractors) ----
DAYS_DE  = r"(?:Mo|Di|Mi|Do|Fr|Sa|So)"
TIME     = r"\d{1,2}[:.]?\d{0,2}\s*(?:Uhr)?"
RANGE    = rf"{TIME}\s*[-–]\s*{TIME}"
RANGE_OR = rf"{RANGE}(?:\s*(?:u\.?|und)\s*{RANGE})*"

RE_HOURS_BLOCK = re.compile(
    rf"(?:(?:{DAYS_DE})(?:\s*[-–]\s*{DAYS_DE})?\s+{RANGE_OR})(?:\s*(?:[|;,\n]\s*|\s{DAYS_DE}\s))?",
    re.IGNORECASE
)
RE_HOURS_LABELED = re.compile(
    r"(Öffnungs(?:zeiten|zeit)|Sprechzeiten?|Sprechstunde|Opening hours)\s*[:\-–]?\s*(?P<label_hours>[^|;•\n\r]+)",
    re.IGNORECASE
)
RE_EMERGENCY_FLAG = re.compile(
    r"\b(?:Notdienst|Notfälle?|Notfallsprechstunde|Notaufnahme|Feiertagsnotdienst|24\s*h|24h|rund um die Uhr|emergency)\b",
    re.IGNORECASE
)
RE_EMERGENCY_DETAILS = re.compile(
    r"(Notdienst|Notfälle?|Notfallsprechstunde|Feiertagsnotdienst)\s*[:\-–]?\s*(?P<em_details>[^|;•\n\r]+)",
    re.IGNORECASE
)

def extract_opening_hours(text: str):
    if not isinstance(text, str): return None
    blocks = [m.group(0).strip(" ,;|") for m in RE_HOURS_BLOCK.finditer(text)]
    if blocks:
        return " | ".join(b for b in blocks if b)
    m = RE_HOURS_LABELED.search(text)
    return m.group("label_hours").strip() if m else None

def extract_emergency(text: str):
    if not isinstance(text, str): return False, None
    flag = bool(RE_EMERGENCY_FLAG.search(text))
    details = None
    m = RE_EMERGENCY_DETAILS.search(text)
    if m:
        details = m.group("em_details").strip(" ,;|")
    return flag, details

# ---- compute new series from full_text ----
s = df["full_text"].fillna("")
new_hours = s.map(extract_opening_hours)
new_flag, new_det = zip(*s.map(extract_emergency))

# ---- add/only-fill (never drop, never overwrite non-nulls) ----
def add_or_fill(col, series):
    if col not in df.columns:
        df[col] = series
    else:
        df[col] = df[col].where(df[col].notna(), pd.Series(series))

add_or_fill("opening_hours_raw", new_hours)
add_or_fill("emergency_flag",   list(new_flag))
add_or_fill("emergency_details",list(new_det))

# ---- save to a NEW file, all columns preserved ----
out_path = v2_path.with_name("taek_berlin_emergency_parsed_v3.csv")
df.to_csv(out_path, index=False, encoding="utf-8")
print(f"Saved: {out_path}")


Saved: /Users/martinsvitek/layered-populate-data-pool-da/layered-populate-data-pool-da/vet_clinics/sources/taek_berlin_emergency_parsed_v3.csv


In [51]:
pip install beautifulsoup4 lxml pandas


Note: you may need to restart the kernel to use updated packages.


In [55]:
import re, time
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import requests
import pandas as pd
from bs4 import BeautifulSoup

START_URL = "https://www.tieraerzteverband.de/bpt/ueber-den-bpt/tierarztsuche/index.php?name=&zipcode=&town=berlin&radius=100"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Safari/537.36",
    "Accept-Language": "de-DE,de;q=0.9,en;q=0.8"
}

PHONE_CLEAN_RE = re.compile(r"[^\d+()\s/\-]")
POSTCODE_CITY_RE = re.compile(r"\b(\d{5})\s+(.+)")
STREET_RE = re.compile(r".*\d+\w?$")  # Zeile mit Hausnummer

def with_page(url, p):
    """Add/replace ?p=... in URL."""
    parts = urlparse(url)
    q = parse_qs(parts.query)
    q["p"] = [str(p)]
    new_q = urlencode({k: v[0] for k, v in q.items()})
    return urlunparse(parts._replace(query=new_q))

def total_pages(html):
    soup = BeautifulSoup(html, "lxml")
    sel = soup.select_one('form[name^="pageNaviList"] select[name="p"]')
    if not sel:
        return 1
    opts = sel.select("option")
    return max(int(o.get("value", "1")) for o in opts) if opts else 1

def text_lines(el):
    return [ln.strip() for ln in el.get_text("\n", strip=True).split("\n") if ln.strip()]

def parse_latlon_from_next_script(result_div):
    # Suche das nächste <script> nach diesem Ergebnis, das 'var longtitude' enthält
    sc = result_div.find_next("script")
    tries = 0
    while sc and tries < 5:
        t = sc.string or sc.get_text()
        if t and "var longtitude" in t and "var latitude" in t:
            m = re.search(r"longtitude\s*=\s*'([\d\.]+)';\s*var\s+latitude\s*=\s*'([\d\.]+)'", t)
            if m:
                return float(m.group(2)), float(m.group(1))  # lat, lon
        sc = sc.find_next("script")
        tries += 1
    return None, None

def parse_results(html):
    soup = BeautifulSoup(html, "lxml")
    rows = []
    for card in soup.select("div.elementStandard.elementResultLine.mgStyle"):
        # Name
        name = (card.select_one(".headline h2") or card).get_text(" ", strip=True)

        # Cols
        col1 = card.select_one(".columns .col1")
        col2 = card.select_one(".columns .col2")
        col3 = card.select_one(".columns .col3")

        practice = street = postcode = city = distance = None

        if col1:
            lines = text_lines(col1)
            # Entfernung
            for ln in lines:
                if ln.lower().startswith("entfernung"):
                    distance = ln.replace("Entfernung:", "").strip()
            # Postcode & City
            for ln in lines:
                m = POSTCODE_CITY_RE.search(ln)
                if m:
                    postcode, city = m.group(1), m.group(2).strip()
            # Street line: letzte Zeile vor PLZ, die Hausnummer enthält
            if postcode:
                # take the line directly before the postcode line that has a number
                for i, ln in enumerate(lines):
                    if POSTCODE_CITY_RE.search(ln) and i > 0:
                        candidate = lines[i-1]
                        if STREET_RE.match(candidate):
                            street = candidate
                        break
            # Praxis/Einrichtung: meist erste Zeile (ohne Entfernung/PLZ/Street)
            if lines:
                first = lines[0]
                if not POSTCODE_CITY_RE.search(first) and not first.lower().startswith("entfernung"):
                    practice = first

        phone_list, email, website = [], None, None
        if col2:
            for a in col2.select("a.phone, a.mobile"):
                ph = a.get_text(" ", strip=True)
                ph = PHONE_CLEAN_RE.sub("", ph).strip()
                if ph and ph not in phone_list:
                    phone_list.append(ph)
            a_mail = col2.select_one('a.wpst[href^="mailto:"]')
            if a_mail:
                email = a_mail.get("href").split("mailto:")[-1]
            a_www = col2.select_one('a.www[href^="http"]')
            if a_www:
                website = a_www.get("href")

        species = None
        if col3:
            checks = [c.get_text(" ", strip=True) for c in col3.select(".checkbox")]
            if checks:
                species = "; ".join(checks)

        lat, lon = parse_latlon_from_next_script(card)

        rows.append({
            "name": name or None,
            "practice": practice,
            "street": street,
            "postcode": postcode,
            "city": city,
            "distance": distance,
            "phone": " / ".join(phone_list) if phone_list else None,
            "email": email,
            "website": website,
            "species": species,
            "lat": lat,
            "lon": lon,
        })
    return rows

def scrape_all(start_url=START_URL, out_csv="bpt_tierarztsuche_berlin.csv"):
    s = requests.Session()
    s.headers.update(HEADERS)

    # 1) Erste Seite laden, Seitenzahl bestimmen
    r0 = s.get(start_url, timeout=30)
    r0.raise_for_status()
    n_pages = total_pages(r0.text)
    print(f"Gefundene Seiten: {n_pages}")
    all_rows = parse_results(r0.text)
    print(f"Seite 1: +{len(all_rows)}")

    # 2) Restliche Seiten p=2..N
    for p in range(2, n_pages + 1):
        url_p = with_page(start_url, p)
        r = s.get(url_p, timeout=30)
        r.raise_for_status()
        chunk = parse_results(r.text)
        print(f"Seite {p}: +{len(chunk)}")
        all_rows.extend(chunk)
        time.sleep(0.8)  # höflich

    # 3) Dedupe und speichern
    df = pd.DataFrame(all_rows)
    # einfache Dedupe-Heuristik:
    df["dedupe_key"] = (df["name"].fillna("") + "|" + df["street"].fillna("") + "|" + df["postcode"].fillna(""))
    df = df.drop_duplicates(subset=["dedupe_key"]).drop(columns=["dedupe_key"])
    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"✅ Gespeichert: {out_csv} – {len(df)} Zeilen")

if __name__ == "__main__":
    scrape_all()


Gefundene Seiten: 9
Seite 1: +10
Seite 2: +10
Seite 3: +10
Seite 4: +10
Seite 5: +10
Seite 6: +10
Seite 7: +10
Seite 8: +10
Seite 9: +4
✅ Gespeichert: bpt_tierarztsuche_berlin.csv – 84 Zeilen
