In [72]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

# Expanded list of 10 search inputs
search_inputs = [
    {"plz": "10115", "city": "Berlin", "country": "Deutschland"},
    {"plz": "80331", "city": "München", "country": "Deutschland"},
    {"plz": "50667", "city": "Köln", "country": "Deutschland"},
    {"plz": "20095", "city": "Hamburg", "country": "Deutschland"},
    {"plz": "70173", "city": "Stuttgart", "country": "Deutschland"},
    {"plz": "01067", "city": "Dresden", "country": "Deutschland"},
    {"plz": "90402", "city": "Nürnberg", "country": "Deutschland"},
    {"plz": "04109", "city": "Leipzig", "country": "Deutschland"},
    {"plz": "28195", "city": "Bremen", "country": "Deutschland"},
    {"plz": "65183", "city": "Wiesbaden", "country": "Deutschland"},
]

# Setup options
options = Options()
# options.add_argument("--headless")  # Uncomment after testing
options.add_argument("--window-size=1920,1080")

# Start the browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)


response = []

In [73]:
def extract_results():
    try:
        expert_blocks = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'surveyor-search')]"))
        )
        for block in expert_blocks:
            try:
                name = block.find_element(By.XPATH, ".//span[contains(@class, 'surveyor-name')]").text.strip() \
                    if block.find_elements(By.XPATH, ".//span[contains(@class, 'surveyor-name')]") else ""

                cert = block.find_element(By.XPATH, ".//span[contains(@class, 'surveyor-title')]").text.strip() \
                    if block.find_elements(By.XPATH, ".//span[contains(@class, 'surveyor-title')]") else ""

                company = block.find_element(By.XPATH, ".//span[contains(@class, 'surveyor-company')]").text.strip() \
                    if block.find_elements(By.XPATH, ".//span[contains(@class, 'surveyor-company')]") else ""

                address = block.find_element(By.XPATH, "./p[2]").get_attribute("innerText").strip() \
                    if block.find_elements(By.XPATH, "./p[2]") else ""

                phone_block = block.find_element(By.XPATH, ".//span[contains(@class, 'surveyor-phone')]/..").text \
                    if block.find_elements(By.XPATH, ".//span[contains(@class, 'surveyor-phone')]/..") else ""
                phone = phone_block.replace("T", "").strip()

                email_block = block.find_element(By.XPATH, ".//span[contains(@class, 'surveyor-mail')]/..").text \
                    if block.find_elements(By.XPATH, ".//span[contains(@class, 'surveyor-mail')]/..") else ""
                email = email_block.replace("M", "").strip()

            # if name and address:
                response.append({
                        "Full Name": name,
                        "Address": address,
                        "Phone Number": phone,
                        "Email Address": email,
                        "Website": "",
                        "Certification Type / Details": cert,
                        "Source Directory Name": "HypZert"
                    })
                #else:
                #   print("Skipping block due to missing name or address")

            except Exception as block_err:
                print(f"❗ Error extracting a block: {str(block_err)}")

    except TimeoutException:
            print("⏱️ Timeout: No search results found; check selectors or form submission")
    except Exception as e:
        print(f"❗ Unexpected error extracting results: {str(e)}")


In [None]:

# Start search loop
for search in search_inputs:
    driver.get("https://www.hypzert.de/de/service/gutachtersuche")
    time.sleep(2)

    try:
        # Accept cookie banner
        try:
            cookie = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//*[@id='popup-buttons']/button[1]"))
            )
            driver.execute_script("arguments[0].click();", cookie)
            time.sleep(1)
        except:
            print("✅ Cookie banner not found or already accepted.")

        # Fill in form
        plz_input = wait.until(EC.presence_of_element_located((By.ID, "zip_code")))
        plz_input.clear()
        plz_input.send_keys(search["plz"])

        city_input = driver.find_element(By.ID, "location")
        city_input.clear()
        city_input.send_keys(search["city"])
        

        # Select country from dropdown
        try:
            country_dropdown_label = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id='valuer-search-form-block-form']/div/div/div/div/div[1]/div[2]/div[5]/div/div/label")
            ))
            driver.execute_script("arguments[0].click();", country_dropdown_label)
            time.sleep(1)
            country_option = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id='valuer-search-form-block-form']//li[contains(text(), 'Deutschland')]")
            ))
            driver.execute_script("arguments[0].click();", country_option)
            time.sleep(1)
        except Exception as block_err:
            print(f"❗ Error extracting a block: {block_err}")


        # Click the search button
        try:
            search_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//button[@class='button  is-unbreakable']")  # Update based on button text or class
            ))
            search_button.click()
            time.sleep(1)
        except TimeoutException:
            print("Search button not found; proceeding anyway")
        time.sleep(5)

        # Pagination loop
        while True:
            extract_results()
            try:
                next_button = driver.find_element(By.XPATH, '//a[@title="next" and contains(@class, "is-enabled")]')
                driver.execute_script("arguments[0].click();", next_button)
                time.sleep(3)
            except Exception as e:
                print(f"🔁 All the Data Extract for the search")
                break

    except Exception as e:
        print(f"❌ Error with search: {search['city']} ({search['plz']}) — {type(e).__name__}: {e}")
        driver.save_screenshot(f"error_{search['plz']}_{search['city']}.png")
        continue

driver.quit()



print("✅ HypZert scraping with pagination complete.")


In [None]:
import copy

# Make a backup before filtering out entries without addresses
backup_response = copy.deepcopy(response)


In [None]:
print(backup_response)



[{'Full Name': 'Abaew, Shirley', 'Address': '', 'Phone Number': '030/ 8818908', 'Email Address': 'sa@gleser-dalhoefer.de', 'Website': '', 'Certification Type / Details': 'HypZert F', 'Source Directory Name': 'HypZert'}, {'Full Name': '', 'Address': '', 'Phone Number': '', 'Email Address': '', 'Website': '', 'Certification Type / Details': '', 'Source Directory Name': 'HypZert'}, {'Full Name': '', 'Address': '', 'Phone Number': '', 'Email Address': '', 'Website': '', 'Certification Type / Details': '', 'Source Directory Name': 'HypZert'}, {'Full Name': '', 'Address': '', 'Phone Number': '', 'Email Address': '', 'Website': '', 'Certification Type / Details': '', 'Source Directory Name': 'HypZert'}, {'Full Name': 'Abaew, Shirley', 'Address': '', 'Phone Number': '030/ 8818908', 'Email Address': 'sa@gleser-dalhoefer.de', 'Website': '', 'Certification Type / Details': 'HypZert F', 'Source Directory Name': 'HypZert'}, {'Full Name': '', 'Address': '', 'Phone Number': '', 'Email Address': '', '

In [88]:
print(f"🔎 Original: {len(backup_response)} entries")
print(f"✅ After cleaning: {len(response)} entries")


🔎 Original: 2727 entries
✅ After cleaning: 516 entries


In [None]:
# Clean: remove entries without an address
response = [entry for entry in response if entry.get("Address")]
print(response)


[{'Full Name': 'Abaew, Shirley', 'Address': 'Kurfürstendamm 49\n10707 Berlin\nDeutschland', 'Phone Number': '030/ 8818908', 'Email Address': 'sa@gleser-dalhoefer.de', 'Website': '', 'Certification Type / Details': 'HypZert F', 'Source Directory Name': 'HypZert'}, {'Full Name': 'Berger, Udo', 'Address': 'Lützowplatz 4\n10785 Berlin\nDeutschland', 'Phone Number': '069 / 935345318', 'Email Address': 'udo.berger@kenstone.de', 'Website': '', 'Certification Type / Details': 'HypZert F', 'Source Directory Name': 'HypZert'}, {'Full Name': 'Bergmann, Ernst Ulrich H.', 'Address': 'Hilbertstr. 18\n12307 Berlin\nDeutschland', 'Phone Number': '030/224 11 55-11', 'Email Address': 'bergmann@be-wert.de', 'Website': '', 'Certification Type / Details': 'HypZert F\nHypZert M', 'Source Directory Name': 'HypZert'}, {'Full Name': 'Biehr, Anja', 'Address': 'Litfaß - Platz 2\n10178 Berlin\nDeutschland', 'Phone Number': '03025760870', 'Email Address': 'anja.biehr@wuestpartner.com', 'Website': '', 'Certificatio

In [85]:
# Save results
df = pd.DataFrame(response)
df.drop_duplicates(inplace=True)
df.to_csv("hypzert_results.csv", index=False)
df.to_excel("hypzert_results.xlsx", index=False)

In [87]:
df.head(100)

Unnamed: 0,Full Name,Address,Phone Number,Email Address,Website,Certification Type / Details,Source Directory Name
0,"Abaew, Shirley",Kurfürstendamm 49\n10707 Berlin\nDeutschland,030/ 8818908,sa@gleser-dalhoefer.de,,HypZert F,HypZert
1,"Berger, Udo",Lützowplatz 4\n10785 Berlin\nDeutschland,069 / 935345318,udo.berger@kenstone.de,,HypZert F,HypZert
2,"Bergmann, Ernst Ulrich H.",Hilbertstr. 18\n12307 Berlin\nDeutschland,030/224 11 55-11,bergmann@be-wert.de,,HypZert F\nHypZert M,HypZert
3,"Biehr, Anja",Litfaß - Platz 2\n10178 Berlin\nDeutschland,03025760870,anja.biehr@wuestpartner.com,,HypZert F,HypZert
4,"Blume, Lars",Einsteinufer 63a\n10587 Berlin\nDeutschland,015254675830,lars.blume@appcon.ag,,HypZert F,HypZert
...,...,...,...,...,...,...,...
95,"Büchner, Romy",Am Kartoffelgarten 14\n81671 München\nDeutschland,015140078223,buechner@einwert.com,,HypZert F,HypZert
96,"Dachsberger, Andreas",Parkring 28\n85748 Garching\nDeutschland,089 / 288010273,andreas.dachsberger@pfandbriefbank.com,,HypZert F,HypZert
97,"Denz, Michael",Am Floßkanal 4\n82515 Wolfratshausen\nDeutschland,08171/ 4203650,michael.denz@rileg.de,,HypZert S,HypZert
98,"Dienelt, Sebastian",Am Pfanderling 5\n85778 Haimhausen\nDeutschland,08133 / 43 99 390,sdienelt@immo-finanz-muenchen.de,,HypZert S,HypZert
