### The block below defines the functions for collecting search results from Immoweb.

In [4]:
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import random
from random import randint
import re

def get_search_results(maxpage):
    """Gather property URLs from the newest 'house' and 'apartment' search result pages.

    Pages 1 through {maxpage} (inclusive) are scraped for each property type via
    results_page_scrape. The return value is a dictionary of the form
    {'url1': 0/1, 'url2': 0/1, ...} where 1 marks a link found in the house
    search and 0 marks a link found in the apartment search. A URL that was
    already collected keeps the flag it received first.
    """
    # (search type, flag) pairs, processed in this order on every page
    type_flags = (("house", 1), ("apartment", 0))
    found = {}
    for page in range(1, maxpage + 1):
        # one results page per property type on each pass
        for kind, flag in type_flags:
            for link in results_page_scrape(page, kind):
                # setdefault only stores the flag when the link is new
                found.setdefault(link, flag)
    return found

def results_page_scrape(pagenumber,propertytype):
    '''Scrape the property links from one specific search result page.

    pagenumber   -- number of the results page, inserted into the "page" query parameter
    propertytype -- search category inserted into the url path (the caller uses
                    "house" and "apartment")

    Returns a list of property urls with the "?searchId=..." suffix removed.
    Links containing "-project-" are ignored. Anchors without an href are skipped,
    and a link without a searchId suffix is kept unchanged.
    '''
    # initialise the return
    links = []
    # I slow down the frequency of requests to avoid being identified and therefore banned from the site
    time.sleep(random.uniform(1.0, 2.0))
    url=f'https://www.immoweb.be/en/search/{propertytype}/for-sale?countries=BE&isALifeAnnuitySale=false&page={pagenumber}&orderBy=newest'
    # setup the selenium webdriver
    driver = webdriver.Chrome()
    try:
        # NOTE: the implicit wait only applies to selenium element lookups; the page
        # source below is read as soon as get() returns.
        driver.implicitly_wait(10)
        driver.get(url)
        html = driver.page_source
    finally:
        # quit() (rather than close()) ends the whole browser session, and the
        # finally clause makes sure that happens even when loading the page fails
        driver.quit()
    soup = BeautifulSoup(html,'lxml')
    for elem in soup.find_all('a', attrs={"class":"card__title-link"}):
        # get hyperlink to property page
        hyperlink = elem.get('href')
        if not hyperlink:
            # anchor without a usable href: nothing to collect
            continue
        # cut the searchID off; keep the link as it is when there is no searchId part
        match = re.match(r"(.+)\?searchId=.+", hyperlink)
        if match:
            hyperlink = match.group(1)
        # include in the return if it is not a -project-
        if "-project-" not in hyperlink:
            links.append(hyperlink)
    return links

### Getting 12K results would require running this function with a max page number of 200, which would take about 1 hour.
#### Let's just do 4:

In [5]:
# Demo run: scrape result pages 1 to 4 of both the house and the apartment search;
# the returned {url: 0/1} dictionary is shown as the cell output.
get_search_results(4)

{'https://www.immoweb.be/en/classified/house/for-sale/leopoldsburg/3970/8954996': 1,
 'https://www.immoweb.be/en/classified/house/for-sale/uccle/1180/8954956': 1,
 'https://www.immoweb.be/en/classified/house/for-sale/westerlo/2260/8954991': 1,
 'https://www.immoweb.be/en/classified/house/for-sale/rouveroy-(ht.)/7120/8954945': 1,
 'https://www.immoweb.be/en/classified/villa/for-sale/auderghem/1160/8954938': 1,
 'https://www.immoweb.be/en/classified/villa/for-sale/rhode-saint-genese/1640/8913207': 1,
 'https://www.immoweb.be/en/classified/villa/for-sale/lasne/1380/8954931': 1,
 'https://www.immoweb.be/en/classified/house/for-sale/berchem/2600/8821388': 1,
 'https://www.immoweb.be/en/classified/house/for-sale/waremme/4300/8954930': 1,
 'https://www.immoweb.be/en/classified/house/for-sale/emines/5080/8954927': 1,
 'https://www.immoweb.be/en/classified/villa/for-sale/boncelles/4100/8954922': 1,
 'https://www.immoweb.be/en/classified/house/for-sale/couillet/6010/8954926': 1,
 'https://www.im