import libraries

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 


In [28]:
# Accumulator for the scraped records; scrape_page() appends one dict per
# listing to this list.
mydata = []

# Browser-like User-Agent sent with every request to prevent bot detection.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/85.0.4183.121 Safari/537.36'
    ),
}

In [29]:
# function to scrape pages

def scrape_page(url):
    """Download one search-results page and append its listings to ``mydata``.

    Every ``div`` with class ``text-block`` is treated as a listing. A listing
    without a ``span`` of class ``overview__text`` is skipped.

    Args:
        url: Address of the results page to fetch.

    Returns:
        None. One dict per listing is appended to the module-level ``mydata``.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            complete within the timeout.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    for listing in soup.find_all('div', class_='text-block'):
        overview = listing.find('span', class_='overview__text')
        if overview is None:
            # Nothing to extract from this listing; skip it.
            continue

        # NOTE(review): every field below is derived from this single element,
        # because all lookups use the same selector and ``find`` returns only
        # the first match. Field-specific selectors still need to be supplied
        # for the columns to carry distinct values.
        text = overview.text.strip()
        lowered = text.lower()
        print(text)

        # Strip the currency symbol and thousands separators from the price.
        price = text.replace('€', '').replace('.', '').replace(',', '').strip()
        # Drop the unit so the area can be parsed as an integer.
        living_area = text.replace('m²', '').strip()

        mydata.append({
            "Locality": text,
            "Type of property": "Apartment",
            "Subtype of property": text,
            "Price": price,
            "Type of sale": "For Sale",
            "Number of rooms": int(text) if text.isdigit() else None,
            "Living Area": int(living_area) if living_area.isdigit() else None,
            "Fully equipped kitchen": 1 if 'equipped' in lowered else 0,
            "Furnished": 1 if 'furnished' in lowered else 0,
            "Open fire": 1 if 'yes' in lowered else 0,
            "Terrace": 1 if 'yes' in lowered else 0,
            "Garden": 1 if 'yes' in lowered else 0,
            "Surface of land": text,
            "Surface area plot of land": text,
            "Number of facades": text,
            "Swimming pool": 1 if 'yes' in lowered else 0,
            "State of the building": text,
        })

# Scrape result pages 1 through 9 one after another.
for page in range(1, 10):
    url = (
        'https://www.immoweb.be/en/search/apartment/for-sale'
        f'?countries=BE&page={page}&orderBy=relevance'
    )
    scrape_page(url)


# Turn the collected records into a DataFrame and write it out as a CSV file
# (without the index column).
df = pd.DataFrame(mydata)
df.to_csv('proporties-dateset.csv', index=False)

# Adding concurrency for faster scraping


In [None]:
from concurrent.futures import ThreadPoolExecutor

def scrape_multiple_pages(strating_page, ending_page):
    """Scrape a range of result pages concurrently with up to five threads.

    Args:
        strating_page: Number of the first page to scrape.
        ending_page: Number of the last page to scrape (inclusive).

    Returns:
        None. Each page is handed to ``scrape_page``; a page whose scrape
        raised is reported on stdout and does not abort the others.
    """
    urls = [
        f'https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page={page}&orderBy=relevance'
        for page in range(strating_page, ending_page + 1)
    ]

    # Leaving the ``with`` block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(scrape_page, url): url for url in urls}

    # Exceptions raised inside a worker are stored on its future; inspect each
    # one so that failures are reported instead of being silently discarded.
    for future, url in futures.items():
        error = future.exception()
        if error is not None:
            print(f'Failed to scrape {url}: {error!r}')


scrape_multiple_pages(1,10) # scrapes pages 1-10 inclusive; widen the range to collect more listings (original target: 10,000 listings)
