### Exercise

1. Use Selenium to open the homepage of your favourite newspaper (not the New York Times, that one is too easy)
2. Close the cookie banner (if it appears)
3. Get the link of the first article of the page and open it
4. Print the title and the content of the article

**Tip:** [Newspaper3k](https://pypi.org/project/newspaper3k/) is a powerful library for scraping articles from newspapers. Have a look at the `fulltext` function.

In [30]:
# The hard version of the code

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Specify the path to the ChromeDriver
service = Service(r"C:\Users\fomic\WebDrivers\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service)

# From here on a browser is running, so the rest of the cell is wrapped in
# try/finally: driver.quit() must run even when loading the page fails.
try:
    # Open the page
    driver.get("https://www.immoweb.be/en/search/house-and-apartment/for-sale")

    # Wait for the page to load and the shadow DOM to be available
    time.sleep(3)

    try:
        # The consent button is looked up through the shadow root attached to
        # the 'usercentrics-root' host element.
        shadow_host = driver.find_element(By.ID, 'usercentrics-root')
        shadow_root = shadow_host.shadow_root
        cookie_button = shadow_root.find_element(
            By.CSS_SELECTOR, "button[data-testid='uc-accept-all-button']"
        )

        # Click the button to accept cookies
        cookie_button.click()
        print("Cookie consent accepted.")

        # Print cookies to verify if it's working
        print(driver.get_cookies())
    except Exception as e:
        # Best effort: a missing banner is reported, not treated as fatal.
        print("Error:", e)
finally:
    # Close the driver
    driver.quit()


Cookie consent accepted.
[{'domain': 'www.immoweb.be', 'httpOnly': False, 'name': '_hjHasCachedUserAttributes', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'true'}, {'domain': '.immoweb.be', 'expiry': 1731449208, 'httpOnly': False, 'name': '_hjSession_927717', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'eyJpZCI6ImM2NTFlZTA1LTNjNGYtNDkzYi05ZjQ3LWE1ZWQ0MDgxM2QyNyIsImMiOjE3MzE0NDc0MDgwMzcsInMiOjAsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjoxLCJzcCI6MX0='}, {'domain': '.immoweb.be', 'expiry': 1766007407, 'httpOnly': False, 'name': '_ga', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GA1.1.888850712.1731447408'}, {'domain': '.immoweb.be', 'expiry': 1766007407, 'httpOnly': False, 'name': '_ga_X0HFVGCJ51', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GS1.1.1731447407.1.0.1731447407.0.0.1449139655'}, {'domain': '.immoweb.be', 'expiry': 1739223407, 'httpOnly': False, 'name': '_fbp', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Set up WebDriver
service = Service(r"C:\Users\fomic\WebDrivers\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service)

# List to store all the links
all_links = []

# The browser is closed in `finally`, so a failure on any page cannot leave
# a Chrome process running.
try:
    # range(1, 3) visits result pages 1 and 2; raise the stop value to crawl
    # more pages.
    for page in range(1, 3):
        # Construct the URL for each page
        url = f"https://www.immoweb.be/en/search/house-and-apartment/for-sale?countries=BE&page={page}&orderBy=relevance"
        driver.get(url)
        time.sleep(3)  # Wait for the page to load

        # Close the cookie banner on the first page only. find_elements()
        # returns an empty list instead of raising, so a banner that does not
        # appear is skipped rather than aborting the whole run.
        if page == 1:
            shadow_hosts = driver.find_elements(By.ID, 'usercentrics-root')
            accept_buttons = (
                shadow_hosts[0].shadow_root.find_elements(
                    By.CSS_SELECTOR, "button[data-testid='uc-accept-all-button']"
                )
                if shadow_hosts
                else []
            )
            if accept_buttons:
                accept_buttons[0].click()
                time.sleep(2)  # Ensure the cookie banner is closed before continuing
            else:
                print("Cookie banner not found, continuing without closing it.")

        # Find all listing title links and keep their non-empty href values
        link_elements = driver.find_elements(By.CSS_SELECTOR, "a.card__title-link")
        hrefs = (element.get_attribute("href") for element in link_elements)
        all_links.extend(href for href in hrefs if href)

        # Optional: print progress
        print(f"Collected links from page {page}")
finally:
    # Close the browser
    driver.quit()

# Print the total number of links collected
print(f"Total links collected: {len(all_links)}")

# Save the links to a file, one per line (explicit encoding so the result does
# not depend on the platform default)
with open("immoweb_links.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{link}\n" for link in all_links)


Collected links from page 1
Collected links from page 2
Total links collected: 120


In [None]:
# Try 1_0 with webdriver_manager.chrome

import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Start Chrome; the value returned by ChromeDriverManager().install() is
# handed to Service as the driver executable path.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Accumulates one details dict per scraped property
property_data = list()

# Function to get property details from a single page
def get_property_details(link):
    """Load ``link`` in the module-level ``driver`` and return its details.

    The returned dict always has the same 19 keys; each value stays ``None``
    unless it was extracted from the page. Any exception raised while
    extracting is printed and the partially filled dict is still returned.
    """
    driver.get(link)
    time.sleep(3)  # Adjust sleep time if needed

    # Every field starts out as None so the result has a fixed shape
    field_names = (
        "Locality",
        "Type of property",
        "Subtype of property",
        "Price",
        "Type of sale",
        "Number of rooms",
        "Living Area",
        "Fully equipped kitchen",
        "Furnished",
        "Open fire",
        "Terrace",
        "Terrace Area",
        "Garden",
        "Garden Area",
        "Surface of the land",
        "Surface area of the plot of land",
        "Number of facades",
        "Swimming pool",
        "State of the building",
    )
    details = dict.fromkeys(field_names)

    # Extract property information (the selector strings are placeholders
    # that still have to be replaced with real ones)
    try:
        locality_element = driver.find_element(By.CSS_SELECTOR, "selector_for_locality")
        details["Locality"] = locality_element.text
        type_element = driver.find_element(By.CSS_SELECTOR, "selector_for_property_type")
        details["Type of property"] = type_element.text
        # Extract other fields similarly
        # ...
    except Exception as e:
        print(f"Error extracting details: {e}")

    return details

# Collect property links
base_url = "https://www.immoweb.be/en/search/house-and-apartment/for-sale"

# The browser is closed in `finally` so an error on any page cannot leave a
# Chrome process running.
try:
    for page in range(1, 11):  # Scrape the first 10 pages as an example
        driver.get(f"{base_url}?countries=BE&page={page}&orderBy=relevance")
        time.sleep(3)

        # Read every href BEFORE visiting a listing: get_property_details()
        # navigates this same driver, after which the WebElements found on the
        # results page are stale and can no longer be queried.
        anchors = driver.find_elements(By.CSS_SELECTOR, "a.card__title-link")
        hrefs = [anchor.get_attribute("href") for anchor in anchors]
        property_urls = [href for href in hrefs if href]

        for property_url in property_urls:
            property_data.append(get_property_details(property_url))
finally:
    driver.quit()

# Save data to CSV. The header is taken from the first row, so an empty result
# is reported instead of failing on property_data[0].
csv_file = "real_estate_data.csv"
if property_data:
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=property_data[0].keys())
        writer.writeheader()
        writer.writerows(property_data)

    print(f"Data collection complete. Saved to {csv_file}")
else:
    print("No data extracted.")


In [None]:
# Try 2_0

import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Location of the ChromeDriver executable
chromedriver_path = r"C:\Users\fomic\WebDrivers\chromedriver-win64\chromedriver.exe"  # Replace with the actual path

# Start Chrome through that driver executable
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

# Accumulates one details dict per scraped property
property_data = list()

# Function to get property details from a single page
def get_property_details(link):
    """Visit one listing and collect its fields into a dict.

    ``link`` is opened in the module-level ``driver``, followed by a fixed
    three-second pause. The result maps each of the 19 field names to the
    extracted text, or to ``None`` when nothing was extracted. Extraction
    errors are printed, never raised.
    """
    driver.get(link)
    time.sleep(3)  # Adjust sleep time if needed

    # Result skeleton: every known field, initially empty
    details = {
        field: None
        for field in [
            "Locality",
            "Type of property",
            "Subtype of property",
            "Price",
            "Type of sale",
            "Number of rooms",
            "Living Area",
            "Fully equipped kitchen",
            "Furnished",
            "Open fire",
            "Terrace",
            "Terrace Area",
            "Garden",
            "Garden Area",
            "Surface of the land",
            "Surface area of the plot of land",
            "Number of facades",
            "Swimming pool",
            "State of the building",
        ]
    }

    # (field, selector) pairs in extraction order; the selector strings are
    # placeholders to be replaced with real ones
    selectors = (
        ("Locality", "selector_for_locality"),
        ("Type of property", "selector_for_property_type"),
        # Extract other fields similarly
        # ...
    )

    # One shared try block: the first failure stops the remaining lookups
    try:
        for field, css in selectors:
            details[field] = driver.find_element(By.CSS_SELECTOR, css).text
    except Exception as e:
        print(f"Error extracting details: {e}")

    return details

# Collect property links
base_url = "https://www.immoweb.be/en/search/house-and-apartment/for-sale"

# driver.quit() sits in `finally` so the browser is closed even when a page
# fails to load.
try:
    for page in range(1, 11):  # Scrape the first 10 pages as an example
        driver.get(f"{base_url}?countries=BE&page={page}&orderBy=relevance")
        time.sleep(3)

        # Snapshot the hrefs first. get_property_details() navigates the same
        # driver, which invalidates the link elements of the results page, so
        # they must not be touched again after the first listing is opened.
        property_urls = []
        for anchor in driver.find_elements(By.CSS_SELECTOR, "a.card__title-link"):
            href = anchor.get_attribute("href")
            if href:
                property_urls.append(href)

        for property_url in property_urls:
            property_details = get_property_details(property_url)
            property_data.append(property_details)
finally:
    driver.quit()

# Save data to CSV. Column names come from the first row, so an empty result
# is reported instead of raising on property_data[0].
csv_file = "real_estate_data.csv"
if property_data:
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=property_data[0].keys())
        writer.writeheader()
        writer.writerows(property_data)

    print(f"Data collection complete. Saved to {csv_file}")
else:
    print("No data extracted.")


In [1]:
# Try 3_0

import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

# Location of the ChromeDriver executable
chromedriver_path = r"C:\Users\fomic\WebDrivers\chromedriver-win64\chromedriver.exe"  # Replace with the actual path

# Start Chrome through that driver executable
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

# Accumulates one details dict per scraped property
property_data = list()

# Function to wait for element and get property details from a single page
def get_property_details(link):
    """Open ``link`` in the module-level ``driver`` and return its details.

    Waits up to 10 seconds for the details container to be present, then
    fills in the fields it can find. The returned dict always holds the same
    19 keys, with ``None`` for anything not extracted. Every exception raised
    by navigation, waiting or extraction is printed together with the link
    and the dict built so far is returned.
    """
    field_names = (
        "Locality",
        "Type of property",
        "Subtype of property",
        "Price",
        "Type of sale",
        "Number of rooms",
        "Living Area",
        "Fully equipped kitchen",
        "Furnished",
        "Open fire",
        "Terrace",
        "Terrace Area",
        "Garden",
        "Garden Area",
        "Surface of the land",
        "Surface area of the plot of land",
        "Number of facades",
        "Swimming pool",
        "State of the building",
    )
    details = dict.fromkeys(field_names)

    try:
        driver.get(link)

        # Block until the (placeholder) details container is in the DOM
        wait = WebDriverWait(driver, 10)
        container_locator = (By.CSS_SELECTOR, "selector_for_property_details")
        wait.until(EC.presence_of_element_located(container_locator))

        # Each field gets its own guard so one missing element does not stop
        # the remaining lookups
        try:
            locality_element = driver.find_element(By.CSS_SELECTOR, "selector_for_locality")
            details["Locality"] = locality_element.text
        except NoSuchElementException:
            details["Locality"] = None

        # Continue extracting other fields similarly, ensuring each has a try-except block

    except Exception as e:
        print(f"Error extracting details from {link}: {e}")

    return details

# Function to safely get the links from a page
def get_links_from_page(page_url):
    """Return the non-empty ``href`` values of the listing links on a page.

    ``page_url`` is opened in the module-level ``driver``; the function waits
    up to 10 seconds for the listing links to be present before reading them.
    A link whose element has gone stale is reported and skipped. Exceptions
    from navigation or from the wait are not caught here.
    """
    link_selector = "a.card__title-link"

    driver.get(page_url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, link_selector))
    )

    collected = []
    for anchor in driver.find_elements(By.CSS_SELECTOR, link_selector):
        try:
            target = anchor.get_attribute("href")
        except StaleElementReferenceException:
            print("Encountered stale element, skipping this link.")
            continue
        if target:
            collected.append(target)

    return collected

# Iterate through the pages and gather property links
base_url = "https://www.immoweb.be/en/search/house-and-apartment/for-sale"

# get_links_from_page() lets wait/navigation errors propagate, so the browser
# is closed in `finally` to avoid leaving Chrome running when that happens.
try:
    for page in range(1, 3):  # Adjust the range as needed
        page_url = f"{base_url}?countries=BE&page={page}&orderBy=relevance"
        property_links = get_links_from_page(page_url)

        for property_link in property_links:
            property_data.append(get_property_details(property_link))
finally:
    driver.quit()

# Save data to CSV. The header is derived from the first row, so an empty
# result is reported instead of failing on property_data[0].
csv_file = "real_estate_data.csv"
if property_data:
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=property_data[0].keys())
        writer.writeheader()
        writer.writerows(property_data)

    print(f"Data collection complete. Saved to {csv_file}")
else:
    print("No data extracted.")


Error extracting details from https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/anderlecht/1070/20313048: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6A88638A5+3004357]
	(No symbol) [0x00007FF6A84F9970]
	(No symbol) [0x00007FF6A83A582A]
	(No symbol) [0x00007FF6A83F5B8E]
	(No symbol) [0x00007FF6A83F5E7C]
	(No symbol) [0x00007FF6A843EC27]
	(No symbol) [0x00007FF6A841BC1F]
	(No symbol) [0x00007FF6A843BA4C]
	(No symbol) [0x00007FF6A841B983]
	(No symbol) [0x00007FF6A83E7628]
	(No symbol) [0x00007FF6A83E8791]
	GetHandleVerifier [0x00007FF6A888A00D+3161901]
	GetHandleVerifier [0x00007FF6A88DE060+3506048]
	GetHandleVerifier [0x00007FF6A88D400D+3465005]
	GetHandleVerifier [0x00007FF6A8650EEB+830987]
	(No symbol) [0x00007FF6A850467F]
	(No symbol) [0x00007FF6A85009D4]
	(No symbol) [0x00007FF6A8500B6D]
	(No symbol) [0x00007FF6A84F0149]
	BaseThreadInitThunk [0x00007FFC727E259D+29]
	RtlUserThreadStart [0x00007FFC734AAF38+40]

Error extracting details from https:

In [4]:
# Try 4_0

import csv
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

# Start Chrome through the ChromeDriver executable at the configured path
chromedriver_path = "C:\\path\\to\\your\\chromedriver.exe"  # Replace with your path
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

# Function to accept cookies
def accept_cookies():
    """Open the search page in the module-level ``driver`` and accept cookies.

    The accept button is looked up through the shadow root of the
    ``usercentrics-root`` element, the same way the other cells of this
    notebook reach it; a plain document-level CSS lookup does not go through
    that shadow root. Any failure is printed and otherwise ignored, so a
    banner that never shows up does not stop the run.
    """
    accept_selector = "button[data-testid='uc-accept-all-button']"

    try:
        driver.get("https://www.immoweb.be/en/search/house-and-apartment/for-sale")
        wait = WebDriverWait(driver, 10)

        # Wait for the shadow host, then search inside its shadow root
        shadow_host = wait.until(
            EC.presence_of_element_located((By.ID, "usercentrics-root"))
        )
        shadow_root = shadow_host.shadow_root

        def _clickable_accept_button(_driver):
            # A NoSuchElementException raised here is ignored by WebDriverWait
            # by default, so the lookup is retried until the timeout.
            button = shadow_root.find_element(By.CSS_SELECTOR, accept_selector)
            return button if button.is_displayed() and button.is_enabled() else False

        wait.until(_clickable_accept_button).click()
        time.sleep(2)  # Wait to ensure cookies are fully handled
    except Exception as e:
        print(f"Error handling cookies: {e}")

# Function to get property links from the page using BeautifulSoup
def get_property_links(page_url):
    """Download ``page_url`` with requests and return its listing URLs.

    Every ``a.card__title-link`` anchor with a non-empty ``href`` contributes
    one absolute URL. Relative hrefs are resolved against the site root; an
    href that is already absolute is returned unchanged instead of having the
    site root prepended a second time.

    Exceptions raised by ``requests.get`` (including the timeout) propagate
    to the caller.
    """
    # Local import keeps this cell self-contained
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block this call forever
    response = requests.get(page_url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    hrefs = (a_tag.get('href') for a_tag in soup.select("a.card__title-link"))
    return [urljoin("https://www.immoweb.be", href) for href in hrefs if href]

# Function to extract property details from a link
def get_property_details(link):
    """Download ``link`` with requests and return a dict of property fields.

    The dict always contains the same 19 keys; a value is ``None`` when the
    field was not extracted. Download or parsing errors are printed together
    with the link, and the dict built so far is returned.
    """
    details = {
        "Locality": None,
        "Type of property": None,
        "Subtype of property": None,
        "Price": None,
        "Type of sale": None,
        "Number of rooms": None,
        "Living Area": None,
        "Fully equipped kitchen": None,
        "Furnished": None,
        "Open fire": None,
        "Terrace": None,
        "Terrace Area": None,
        "Garden": None,
        "Garden Area": None,
        "Surface of the land": None,
        "Surface area of the plot of land": None,
        "Number of facades": None,
        "Swimming pool": None,
        "State of the building": None
    }
    try:
        # A timeout keeps one stalled request from blocking the whole run;
        # the resulting exception is handled by the except clause below.
        response = requests.get(link, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Example extraction logic (update with real selectors/logic).
        # select_one() returns None when nothing matches, so the field is
        # only set when an element was found.
        locality_node = soup.select_one("selector_for_locality")
        if locality_node is not None:
            details["Locality"] = locality_node.text.strip()

        # Continue extracting other details similarly...

    except Exception as e:
        print(f"Error extracting details from {link}: {e}")

    return details

# Collect property links and details
property_data = []
base_url = "https://www.immoweb.be/en/search/house-and-apartment/for-sale"

# The browser is closed in `finally` so a failed download cannot leave a
# Chrome process running.
try:
    # Accept cookies initially
    # NOTE(review): the pages below are fetched with requests, not with this
    # driver, so the consent given here is presumably not shared with those
    # requests. Confirm whether this step is still needed.
    accept_cookies()

    for page in range(1, 2):  # Set the desired page range
        page_url = f"{base_url}?countries=BE&page={page}&orderBy=relevance"
        property_links = get_property_links(page_url)

        for property_link in property_links:
            property_details = get_property_details(property_link)
            # get_property_details() always returns a non-empty dict, so a
            # plain truthiness test can never fail. Extraction counts as
            # successful only when at least one field was filled in.
            if any(value is not None for value in property_details.values()):
                property_data.append(property_details)
finally:
    driver.quit()

# Save data to CSV if there is valid data
if property_data:
    csv_file = "real_estate_data.csv"
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=property_data[0].keys())
        writer.writeheader()
        writer.writerows(property_data)

    print(f"Data collection complete. Saved to {csv_file}")
else:
    print("No data extracted.")


ModuleNotFoundError: No module named 'requests'

In [None]:
# The easiest version of the code to accept the cookies
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Set up WebDriver
service = Service(r"C:\Users\fomic\WebDrivers\chromedriver-win64\chromedriver.exe")
driver = webdriver.Chrome(service=service)

# try/finally guarantees the browser is closed even when the page fails to
# load or the consent banner cannot be found.
try:
    # Open the page and wait for it to load
    driver.get("https://www.immoweb.be/en/search/house-and-apartment/for-sale")
    time.sleep(3)

    # Access shadow DOM and click the cookie consent button
    shadow_host = driver.find_element(By.ID, 'usercentrics-root')
    shadow_root = shadow_host.shadow_root
    cookie_button = shadow_root.find_element(
        By.CSS_SELECTOR, "button[data-testid='uc-accept-all-button']"
    )
    cookie_button.click()

    # Print cookies to verify if they are set
    print(driver.get_cookies())
finally:
    # Close the browser
    driver.quit()
