In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait, Select
import time
import pandas as pd
import copy

# Setup options
options = Options()
# options.add_argument("--headless")  # Uncomment after testing
options.add_argument("--window-size=1920,1080")

# Start the browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 10)

search_inputs = [
    {"plz": "10115"},
    {"plz": "20095"},
    {"plz": "50667"},
    {"plz": "20095"},
    {"plz": "70173"},
    {"plz": "01067"},
    {"plz": "90402"},
    {"plz": "04109"},
    {"plz": "28195"},
    {"plz": "65183"},
    {"plz": "80331"},
    {"plz": "39104"},
    {"plz": "99084"},
    {"plz": "24103"},
    {"plz": "66111"},
    {"plz": "45127"},
    {"plz": "79539"},
    {"plz": "02826"},

]
response=[]

In [2]:
def extract_results():
    try:
        rows = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//tr[@class='row1']"))
        )

        for i in range(len(rows)):
            try:
                row = rows[i]
                driver.execute_script("arguments[0].scrollIntoView(true);", row)
                time.sleep(0.3)

                try:
                    row.click()
                except Exception:
                    driver.execute_script("arguments[0].click();", row)

                active_row = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, "//tr[contains(@class, 'row1') and contains(@class, 'active')]"))
                )
                full_name_elem = active_row.find_element(By.XPATH, ".//td[1]/a")
                full_name = full_name_elem.text.strip()

                det = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, "//tr[@class='full active' and contains(@style, 'display')]//div[@class='personDetail']"))
                )

                lines = det.get_attribute("innerText").splitlines()
                lines = [line.strip() for line in lines if line.strip()]

                import re

                company = ""
                address_lines = []
                numbers = {"Tel": None, "Mobil": None, "Fax": None}
                cert_lines = []  # initialize cert_lines to avoid referenced before assignment

                collecting_address = False
                for line in lines:
                    if "Adresse" in line:
                        collecting_address = True
                        continue

                    # Extract phone/mobile/fax numbers anywhere in lines
                    for key in ["Tel", "Mobil", "Fax"]:
                        if key in line:
                            pattern = rf"{key}\s*\.?\s*[:.]?\s*(.*)"
                            match = re.search(pattern, line, re.IGNORECASE)
                            if match:
                                numbers[key] = match.group(1).strip()
                                collecting_address = False
                                break

                    if collecting_address:
                        if any(x in line for x in ["Tel", "Mobil", "Fax", "eMail", "Tätigkeitsgebiet"]):
                            collecting_address = False
                            continue
                        if not company:
                            company = line
                        else:
                            address_lines.append(line)
                    elif "Tätigkeitsgebiet" in line:
                        cert_lines = lines[i + 2:]
                

                # Join company and address lines for full address
                address = ", ".join(filter(None, [company] + address_lines))
                cert = ", ".join(cert_lines)
                # Prepare phone from Tel number or empty string
                phone = numbers["Tel"] or ""

                # Extract email
                try:
                    email_elem = det.find_element(By.CSS_SELECTOR, "a.mymail")
                    email = email_elem.get_attribute("href").replace("mailto:", "").strip()
                except Exception:
                    email = ""

                # Extract website
                try:
                    website_elem = det.find_element(By.XPATH, ".//a[starts-with(@href, 'http')]")
                    website = website_elem.get_attribute("href").strip()
                except Exception:
                    website = ""

                # If you want to extract cert_lines from the lines, you need the logic here.
                # For now, leave cert empty or implement your logic:
                #cert = ", ".join([c for c in cert_lines if "Drucken" not in c])

                response.append({
                    "Full Name": full_name,
                    "Address": address,
                    "Phone Number": phone,
                    "Email Address": email,
                    "Website": website,
                    "Certification Type / Details": cert,
                    "Source Directory Name": "BVS e.V."
                })

            except Exception as block_err:
                print(f"❗ Error extracting a row: {str(block_err)}")

    except TimeoutException:
        print("⏱️ Timeout: No detailed results found.")
    except Exception as e:
        print(f"❗ Error: {str(e)}")


In [3]:
# Start search loop
for search in search_inputs:
    driver.get("https://www.bvs-ev.de/sachverstaendige-suchen?keyword=")
    time.sleep(2)

    wait = WebDriverWait(driver, 15)
    try:
        cookie = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@id='uc-btn-accept-banner']"))
        )
        driver.execute_script("arguments[0].click();", cookie)
        time.sleep(1)
    except:
        print("✅ Cookie banner not found or already accepted.")
    driver.switch_to.frame("svzFrame")
    try:   
        plz_input = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@id='plz']")))
        plz_input.clear()
        plz_input.send_keys(search["plz"])


        umkreis_select = wait.until(EC.presence_of_element_located((By.XPATH, "//select[@id='umkreis']")))
        driver.execute_script("arguments[0].style.display = 'block';", umkreis_select)
        Select(umkreis_select).select_by_visible_text("100 km")

        search_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@type='submit' and @value='Suchen']")))
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", search_button)
        time.sleep(5)
        search_button.click()
        
        print("✅ Form submitted successfully.")       
        try:
            extract_results()
            time.sleep(3)
        except Exception as e:
            print(f"🔁 All the Data Extract for the search")
            

    except Exception as e:
        print(f"❌ Error with search: ({search['plz']}) — {type(e).__name__}: {e}")
        driver.save_screenshot(f"error_{search['plz']}_{search['city']}.png")
        continue

driver.quit()



print("✅ BVS scraping with All Serach Inputs complete.")

✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or already accepted.
✅ Form submitted successfully.
✅ Cookie banner not found or alre

In [5]:
print(response)

[{'Full Name': 'Abbas, Tarek', 'Address': 'Kreisauer Ring 94, 24119 Kiel', 'Phone Number': '', 'Email Address': 'F_P_Z@outlook.de', 'Website': '', 'Certification Type / Details': 'Kreisauer Ring 94, 24119 Kiel, Mobil: 0175 8118778, eMail: F_P_Zoutlook.de, Tätigkeitsgebiet, GTÜ Prüfingenieur für, Kraftfahrzeugprüfwesen, Drucken', 'Source Directory Name': 'BVS e.V.'}, {'Full Name': 'Afridi, Rasul', 'Address': 'GTÜ Ing.-Büro Afridi, Eidelstedter Brook 9, 22523 Hamburg', 'Phone Number': '0152 02102303', 'Email Address': 'rasul.afridi86@gmail.com', 'Website': '', 'Certification Type / Details': 'Eidelstedter Brook 9, 22523 Hamburg, Tel.: 0152 02102303, eMail: rasul.afridi86gmail.com, Tätigkeitsgebiet, GTÜ-Vertragspartner für, Kraftfahrzeugprüfwesen, Drucken', 'Source Directory Name': 'BVS e.V.'}, {'Full Name': 'Alfter, Michael', 'Address': 'Kiefernweg 11, 21465 Reinbek', 'Phone Number': '040 7113777', 'Email Address': 'mail@ing-alfter.de', 'Website': 'http://www.ing-alfter.de/', 'Certificat

In [6]:
print(len(response))

3491


In [7]:
# Make a backup before filtering out entries without addresses
backup_response = copy.deepcopy(response)

In [8]:
response = [entry for entry in response if entry.get("Address")]

In [9]:
print(f"🔎 Original: {len(backup_response)} entries")
print(f"✅ After cleaning: {len(response)} entries")


🔎 Original: 3491 entries
✅ After cleaning: 3483 entries


In [10]:
# Save results
df = pd.DataFrame(response)
df.drop_duplicates(inplace=True)
df.to_csv("final_BVS_e.V._results.csv", index=False)
df.to_excel("Final_BVS_e.V._results.xlsx", index=False)

In [11]:
df.head(10)

Unnamed: 0,Full Name,Address,Phone Number,Email Address,Website,Certification Type / Details,Source Directory Name
0,"Abbas, Tarek","Kreisauer Ring 94, 24119 Kiel",,F_P_Z@outlook.de,,"Kreisauer Ring 94, 24119 Kiel, Mobil: 0175 811...",BVS e.V.
1,"Afridi, Rasul","GTÜ Ing.-Büro Afridi, Eidelstedter Brook 9, 22...",0152 02102303,rasul.afridi86@gmail.com,,"Eidelstedter Brook 9, 22523 Hamburg, Tel.: 015...",BVS e.V.
2,"Alfter, Michael","Kiefernweg 11, 21465 Reinbek",040 7113777,mail@ing-alfter.de,http://www.ing-alfter.de/,"Tel.: 040 7113777, Mobil: 0172 4005705, Fax.: ...",BVS e.V.
3,"Ancker, Michael","Sachverständigenbüro Ancker, Brodschrangen 4, ...",040 35743740,michael.ancker@ma-ic.de,http://www.ma-ic.de/,"Tel.: 040 35743740, Mobil: 0170 5369373, Fax.:...",BVS e.V.
4,"Dr. Baermann, Axel","Dr. Baermann & Partner, Hochallee 40D, 20149 H...",040 44809850,dr.baermann@t-online.de,http://www.baermannundpartner.de/,"Fax.: 040 44809851, eMail: dr.baermannt-online...",BVS e.V.
5,"Bartholdt, Thorsten",Ingenieurbüro für Fahrzeugtechnik Dipl.-Ing. T...,04503 7793850,fahrzeugtechnik-bartholdt@t-online.de,,"eMail: fahrzeugtechnik-bartholdtt-online.de, T...",BVS e.V.
6,"Bartsch, Eyk","Ingenieurbüro-Kfz-Motorentechnik Bartsch, In d...",04131 224915,Ing-BueroBartsch@gmx.de,http://www.kfz-pruefungen.de/,"eMail: Ing-BueroBartschgmx.de, Web: www.kfz-pr...",BVS e.V.
7,"Becker, Friederike","Baugenetik, Gluckstraße 57, 22081 Hamburg",040 334670590,becker@baugenetik.de,http://www.baugenetik.de/,"Web: www.baugenetik.de, Tätigkeitsgebiet, Hand...",BVS e.V.
8,"Behrens, Adolf","Weidling & Weidling Bauanalyse GmbH & Co. KG, ...",04181 93390,adolf.behrens@weidling-bauanalyse.de,http://www.weidling-bauanalyse.de/,"Tätigkeitsgebiet, IHK Lüneburg-Wolfsburg: öbuv...",BVS e.V.
9,"Beier, Patrick","Schölischer Straße 101 A, 21682 Stade",04141 8029080,p.beier@beierundpartner.de,http://www.sachverstandmitherz.de/,,BVS e.V.
