## scrap_list

Main public function to scrape a list of urls.<br>
dict_urls : dictionary with urls (string) as keys and is_house (bool) as values<br>
Returns a dictionary compatible with the dataframe model<br>

## scrap

Private function to scrape a specific url.<br>
url : url string <br>
is_house : flag indicating whether the estate is a house or an apartment<br>
Returns a dictionary with the property name as key and its value as value.<br>

## get_property_value

Private function to scrape a specific property and get its value.<br>
soup : BeautifulSoup object with the html data <br>
name : name of the property on the website <br>
Returns the value of the requested property, or None if there is no value.

## get_property_bool

Private function to scrape a specific property as a boolean flag.<br>
soup : BeautifulSoup object with the html data <br>
name : name of the property on the website <br>
Returns 1 if the requested property is present on the page, 0 otherwise.

    

In [83]:
from bs4 import BeautifulSoup 
import re 
import requests

def get_property_value(soup, name):
    """Return the value of the first table row whose header matches *name*.

    soup : BeautifulSoup object with the html data
    name : name of the property on the website; it is used as a regular
        expression pattern (``re.search``) against the text of the row header
    Returns the stripped first piece of content of the row's value cell,
    or None when no matching row with a usable value cell is found.
    """
    #Looks into every row of the tables
    for row in soup.find_all('tr'):
        header = row.th
        #Rows without a header cell, or whose header does not match, are ignored
        if not header or not re.search(name, str(header.string)):
            continue

        #A matching header with no (or an empty) value cell carries no value:
        #keep looking instead of raising AttributeError / IndexError
        cell = row.td
        if cell is None or not cell.contents:
            continue

        first = cell.contents[0]
        if isinstance(first, str):
            return first.strip()
        #The first child is a nested tag rather than plain text: use its text
        return first.get_text(strip=True)

    #If nothing was found, will return None
    return None

def get_property_bool(soup, name):
    """Tell whether any table row has a header matching *name*.

    soup : BeautifulSoup object with the html data
    name : name of the property on the website, used as a ``re.search`` pattern
    Returns 1 when a row with a matching header exists, 0 otherwise.
    """
    #Header cell of every table row, produced lazily so the scan stops at the first hit
    headers = (row.th for row in soup.find_all('tr'))
    found = any(header and re.search(name, str(header.string)) for header in headers)
    return 1 if found else 0


def scrap_list(dict_urls):
    """Scrap every url of *dict_urls* and gather the results column by column.

    dict_urls : dictionary with urls (string) as key and is_house (bool) as value
    Returns a dictionary with each property name as key and, as value, a list
    holding one entry per url, in the iteration order of dict_urls. A property
    that is missing, None or an empty string for a url is stored as None.
    """
    #listing all the property names
    properties = [
        "hyperlink", "locality", "postcode", "house_is", "property_subtype",
        "price", "sale", "rooms_number", "area", "kitchen_has", "furnished",
        "open_fire", "terrace", "terrace_area", "garden", "garden_area",
        "land_surface", "land_plot_surface", "facades_number",
        "swimming_pool_has",
    ]

    #making a dict with all the property names as key and an empty list as value
    dict_dataframe = {property_name: [] for property_name in properties}

    #scrap each url of the input and spread the result over the columns
    for url, is_house in dict_urls.items():
        dict_result_scrapping = scrap(url, is_house)

        for property_name, column in dict_dataframe.items():
            value = dict_result_scrapping.get(property_name)
            #Only an empty string means "no data". A legitimate 0 / False
            #(e.g. house_is for an apartment) must be kept, not turned into None
            column.append(None if value == "" else value)

    return dict_dataframe

def scrap(url, is_house, timeout=30):
    """Scrap one classified page and return its properties.

    url : url string. The "/"-separated parts at index 5, 7 and 8 are read as
        property subtype, locality and postcode, which matches urls shaped like
        https://www.immoweb.be/en/classified/apartment/for-sale/auderghem/1160/8899851
    is_house : flag indicating whether the estate is a house or an apartment
    timeout : seconds to wait for the server before giving up (default 30),
        so that one unresponsive page cannot block the whole run forever
    Returns a dictionary with the property name as key and its value as value.
    Raises IndexError when the url has fewer than 9 "/"-separated parts, and
    lets the exceptions of requests.get (connection error, timeout) propagate.
    """
    #Split the url once, before any network access, so a malformed url fails fast
    url_parts = url.split("/")
    property_subtype = url_parts[5]
    locality = url_parts[7]
    postcode = url_parts[8]

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    #for every property, call the right function to get the needed data
    return {
        "hyperlink": url,
        "locality": locality,
        "postcode": postcode,
        'house_is': is_house,
        'property_subtype': property_subtype,
        'price': get_property_value(soup, 'Price'),
        'sale': '',
        'rooms_number': get_property_value(soup, 'Bedrooms'),
        'area': get_property_value(soup, 'Living area'),
        'kitchen_has': get_property_bool(soup, 'Kitchen type'),
        'furnished': get_property_bool(soup, 'Furnished'),
        'open_fire': get_property_bool(soup, 'Fireplace'),
        'terrace': get_property_bool(soup, 'Terrace surface'),
        'terrace_area': get_property_value(soup, 'Terrace surface'),
        'garden': get_property_bool(soup, 'Garden'),
        'garden_area': get_property_value(soup, 'Garden surface'),
        #not scrapped: these two columns are always left empty
        'land_surface': None,
        'land_plot_surface': None,
        'facades_number': get_property_value(soup, 'Facades'),
        'swimming_pool_has': get_property_bool(soup, 'Swimming pool'),
    }

{'hyperlink': 'https://www.immoweb.be/en/classified/apartment/for-sale/auderghem/1160/8899851?searchId=5f6c49990b1b6', 'locality': 'auderghem', 'postcode': '1160', 'house_is': 0, 'property_subtype': 'apartment', 'price': '', 'sale': '', 'rooms_number': '2', 'area': '87', 'kitchen_has': True, 'furnished': False, 'open_fire': False, 'terrace': True, 'terrace_area': '13', 'garden': False, 'garden_area': None, 'land_surface': '', 'land_plot_surface': '', 'facades_number': '2', 'swimming_pool_has': False}
