In [5]:

import time
from threading import Thread
from time import perf_counter
import requests
from bs4 import BeautifulSoup
import json
import csv
from typing import Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import pandas as pd
import re



def get_in(data: dict, keys: list, default: Any = None):
    """Follow a path of keys through nested dicts and return the value found.

    Args:
        data: The outermost dict (may be ``None`` or empty).
        keys: Keys to look up, outermost first. An empty list returns
            ``data`` itself.
        default: Value returned when the path cannot be followed to the end.

    Returns:
        The value stored at the end of the path, or ``default`` when a key is
        missing or an intermediate value is not a dict.
    """
    obj = data
    for key in keys:
        # Anything that is not a dict (None, a number, a string, ...) cannot be
        # descended into; treat it as a missing path instead of letting
        # ``key in obj`` / ``obj[key]`` raise TypeError.
        if not isinstance(obj, dict) or key not in obj:
            return default
        obj = obj[key]
    return obj

def getTypeOfSale(data):
    """Return the name of the first sale-type flag that is set on a listing.

    The flags are read from ``data["flags"]`` and checked in the fixed order
    below. The first flag whose value equals ``True`` wins, and its name is
    returned with every "is" substring removed (e.g. "isPublicSale" becomes
    "PublicSale"). Returns ``None`` when no flag is set.
    """
    flag_names = (
        "isPublicSale",
        "isNotarySale",
        "isLifeAnnuitySale",
        "isAnInteractiveSale",
        "isNewlyBuilt",
        "isInvestmentProject",
        "isUnderOption",
        "isNewRealEstateProject",
    )
    # Lazy generator: lookups stop at the first flag that compares equal to True.
    set_flags = (
        name for name in flag_names if get_in(data, ["flags", name]) == True
    )
    first_set = next(set_flags, None)
    if first_set is None:
        return None
    return first_set.replace("is", "")

def extract_energy_certificate(script_content):
    """Pull the ``"energy_certificate"`` string value out of a script's text.

    Searches ``script_content`` for the first ``"energy_certificate": "..."``
    pair and returns the quoted value (possibly an empty string), or ``None``
    when no such pair is present.
    """
    found = re.search(r'"energy_certificate"\s*:\s*"([^"]*)"', script_content)
    return found.group(1) if found else None

def _to_flag(value):
    """Collapse any truthy/falsy value to the integer 1 or 0."""
    return 1 if value else 0


def get_property_data(url, timeout=30):
    """Download one listing page and extract its property attributes.

    The attributes come from the JSON stored in the ``:classified`` attribute
    of the page's ``iw-load-advertisements`` element. The energy certificate
    is read separately from the inline ``<script>`` that mentions ``av_items``.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait on the server before ``requests`` gives up,
            so an unresponsive host cannot block the caller indefinitely.

    Returns:
        A dict of listing attributes. Boolean-like fields (``kitchen``,
        ``furnished``, ``terrace``, ``fireplace``, ``garden``, ``pool``) are
        stored as 1/0, and the terrace/fireplace/garden detail fields are 0
        when the corresponding feature is absent. An empty dict is returned
        when the response status is not 200 or the page carries no
        ``:classified`` data.

    Raises:
        requests.RequestException: On network failure or timeout.
        json.JSONDecodeError: If the ``:classified`` attribute is not valid JSON.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    elements = soup.select('iw-load-advertisements')
    if not elements or not elements[0].has_attr(":classified"):
        return {}
    data = json.loads(elements[0].attrs[":classified"])

    # The av_items script is optional: a page without it still yields the
    # listing data, just with no energy certificate.
    script_tag = soup.find('script', string=lambda text: text and 'av_items' in text)
    script_content = script_tag.string if script_tag is not None else None
    if script_content is not None:
        energy_certificate = extract_energy_certificate(script_content)
    else:
        energy_certificate = None

    has_terrace = get_in(data, ["property", "hasTerrace"])
    has_fireplace = get_in(data, ["property", "fireplaceExists"])
    has_garden = get_in(data, ["property", "hasGarden"])
    land_surface = get_in(data, ["property", "land", "surface"])

    # Key order is kept stable because it determines column order downstream
    # (e.g. when rows are written to CSV or loaded into a DataFrame).
    return {
        "bedrooms": get_in(data, ["property", "bedroomCount"]),
        "property_type": get_in(data, ["property", "type"]),
        "property_subtype": get_in(data, ["property", "subtype"]),
        "locality": get_in(data, ["property", "location", "locality"]),
        "postal_code": get_in(data, ["property", "location", "postalCode"]),
        "street": get_in(data, ["property", "location", "street"]),
        "number": get_in(data, ["property", "location", "number"]),
        "box": get_in(data, ["property", "location", "box"]),
        "kitchen": _to_flag(get_in(data, ["property", "kitchen", "type"])),
        "facades": get_in(data, ["property", "building", "facadeCount"]),
        "price": get_in(data, ["transaction", "sale", "price"]),
        "furnished": _to_flag(get_in(data, ["transaction", "sale", "isFurnished"])),
        "terrace": _to_flag(has_terrace),
        "terraceSurface": get_in(data, ["property", "terraceSurface"]) if has_terrace else 0,
        "fireplace": _to_flag(has_fireplace),
        "fireplaceCount": get_in(data, ["property", "fireplaceCount"]) if has_fireplace else 0,
        "buildingState": get_in(data, ["property", "building", "condition"]),
        "garden": _to_flag(has_garden),
        "gardenSurface": get_in(data, ["property", "gardenSurface"]) if has_garden else 0,
        "pool": _to_flag(get_in(data, ["property", "hasSwimmingPool"])),
        # landSurface and surfaceOfThePlot are both read from property.land.surface.
        "landSurface": land_surface,
        "livingArea": get_in(data, ["property", "netHabitableSurface"]),
        "surfaceOfThePlot": land_surface,
        "typeOfSale": getTypeOfSale(data),
        "energy_certificate": energy_certificate,
    }

 


def main():
    """Fetch one sample listing and print the attributes extracted from it."""
    listing_url = "https://www.immoweb.be/en/classified/apartment/for-sale/deurne/2100/20365958"
    print(get_property_data(listing_url))


# Run the demo only when executed directly (script or notebook cell), so that
# importing this module does not trigger a network request.
if __name__ == "__main__":
    main()

{'bedrooms': 1, 'property_type': 'APARTMENT', 'property_subtype': 'APARTMENT', 'locality': 'DEURNE', 'postal_code': '2100', 'street': 'August Van de Wielelei', 'number': '85', 'box': '002', 'kitchen': 1, 'facades': 2, 'price': 239000, 'furnished': 0, 'terrace': 0, 'terraceSurface': 0, 'fireplace': 0, 'fireplaceCount': 0, 'buildingState': 'GOOD', 'garden': 1, 'gardenSurface': 45, 'pool': 0, 'landSurface': None, 'livingArea': 88, 'surfaceOfThePlot': None, 'typeOfSale': None, 'energy_certificate': 'D'}
