### The block below defines the function for getting a number of search results from immoweb.

In [9]:
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import random
from random import randint
import re
from IPython.display import clear_output

def get_search_results(minresults=40):
    """Collect property urls and types from the immoweb search result pages.

    Each iteration scrapes one result page of houses and one of apartments
    (newest first) and merges the links into a dictionary, until at least
    `minresults` distinct urls are known or a pair of pages yields nothing new.

    Args:
        minresults: minimum number of distinct urls to collect. With the
            default only the first page of each type is normally needed.

    Returns:
        A dictionary {'url1': True/False, 'url2': True/False, ...} where
        True means house and False means apartment. It can hold fewer than
        `minresults` entries when the site runs out of new listings.
    """
    # the webdriver is global because results_page_scrape() reads it by name
    global driver

    # the dictionary with the results and the page to start the search on
    search_results = {}
    page_number = 1

    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(10)

        # start the progress indicator logic
        start_time = time.monotonic()

        while len(search_results) < minresults:
            count_before = len(search_results)
            # scrape one results page of houses and one of apartments;
            # setdefault keeps the type of a url that was already collected
            for houselink in results_page_scrape(page_number, "house"):
                search_results.setdefault(houselink, True)
            for apartmentlink in results_page_scrape(page_number, "apartment"):
                search_results.setdefault(apartmentlink, False)
            result_count = len(search_results)
            page_number += 1

            # no new urls on this pair of pages: the listings are exhausted
            # (or the page layout changed), so stop instead of looping forever
            if result_count == count_before:
                break

            # update progress indicator; result_count is > 0 here, so the
            # extrapolation below cannot divide by zero
            clear_output(wait=True)
            time_spent = time.monotonic() - start_time
            total_time_estimation = minresults / result_count * time_spent
            time_remaining = max(total_time_estimation - time_spent, 0.0)
            print(f"Finishing in {time_remaining/60:.1f} minutes")
    finally:
        # always end the browser session, even when scraping raised
        driver.quit()

    clear_output(wait=True)
    print("Finished")
    return search_results

def results_page_scrape(page_number,property_type):
    '''A subroutine scraping links from 1 specific search result page, links to projects are ignored.

    page_number is the search result page to load and property_type is the
    url segment used in the search ("house" or "apartment" in this notebook).
    Uses the module-level `driver` webdriver to load the page.
    Returns a list of property urls with the "?searchId=..." suffix removed.
    '''
    # initialise the return
    links = []
    # I slow down the frequency of requests to avoid being identified and therefore banned from the site
    time.sleep(random.uniform(1.0, 2.0))
    url=f'https://www.immoweb.be/en/search/{property_type}/for-sale?countries=BE&isALifeAnnuitySale=false&page={page_number}&orderBy=newest'
    driver.get(url)
    soup = BeautifulSoup(driver.page_source,'lxml')

    for elem in soup.find_all('a', attrs={"class":"card__title-link"}):
        # get hyperlink to property page
        hyperlink = elem.get('href')
        # skip anchors without a href and links to a -project-
        if not hyperlink or "-project-" in hyperlink:
            continue
        # cut the searchID off; a link without a searchId is kept as it is
        # instead of crashing on a failed match
        match = re.match(r"(.+)\?searchId=.+", hyperlink)
        links.append(match.group(1) if match else hyperlink)

    return links

### Getting 12K results would take about 20 minutes.
#### Let's run 1000 for testing:

In [10]:
# Test run asking for 1000 results; the returned dictionary is shown as the cell output
get_search_results(1000)

Finished


{'https://www.immoweb.be/en/classified/house/for-sale/beauraing/5570/8958510': True,
 'https://www.immoweb.be/en/classified/house/for-sale/boussu/7300/8958506': True,
 'https://www.immoweb.be/en/classified/house/for-sale/berlare-overmere/9290/8958489': True,
 'https://www.immoweb.be/en/classified/house/for-sale/souvret/6182/8921917': True,
 'https://www.immoweb.be/en/classified/bungalow/for-sale/saive/4671/8931132': True,
 'https://www.immoweb.be/en/classified/villa/for-sale/wavre/1300/8845463': True,
 'https://www.immoweb.be/en/classified/villa/for-sale/bierges/1301/8844117': True,
 'https://www.immoweb.be/en/classified/house/for-sale/roeselare/8800/8958485': True,
 'https://www.immoweb.be/en/classified/house/for-sale/roeselare/8800/8958484': True,
 'https://www.immoweb.be/en/classified/house/for-sale/roeselare/8800/8958483': True,
 'https://www.immoweb.be/en/classified/house/for-sale/roeselare/8800/8958482': True,
 'https://www.immoweb.be/en/classified/house/for-sale/de-pinte/9840/89