In [16]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import json
import csv
import pandas as pd
import time
import random
from datetime import datetime

In [23]:
# CONFIG

URL = "https://www.immobiliare.it/vendita-case/milano/?criterio=rilevanza"
today = datetime.today().strftime('%Y%m%d')
destination_folder = f"../../../data/immobiliare/{today}/"

In [1]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
}

@staticmethod
def request_get(url):
    print(url)
    req = requests.get(url, headers=headers)
    time.sleep(random.uniform(10, 30))
    return req

In [9]:
class ImmobiliareScraper:
    def __init__(
        self, 
        url, 
        get_data_of_following_pages = False,
        destination_folder = destination_folder
    ) -> None:
        self.base_url = url
        self.last_scraped_url = self.url
        self.get_data_of_following_pages = get_data_of_following_pages
        self.response = request(self.url)
        self.last_response = self.response
        self.real_estates = []
        self.gather_real_estate_data()
        self.data_frame = pd.DataFrame(self.real_estates)
        self.destination_folder = destination_folder

    def __str__(self) -> str:
        return f"Immobiliare scraper - url='{self.url}'"

    def _check_url(self) -> None:
        if not "https://www.immobiliare.it" in self.url:
            raise ValueError(f"Given url must include 'https://www.immobiliare.it'.")

        if "mapCenter" in self.url:
            raise ValueError(f"Given url must not include 'mapCenter' as it uses another api to retrieve data.")

        if "search-list" in self.url:
            raise ValueError(f"Given url must not include 'search-list' as it uses another api to retrieve data.")

        if self.response.status_code != 200:
            self.response.raise_for_status()

    @staticmethod
    def do_request(self, url):
        self._check_url()

    def gather_real_estate_data(self) -> None:
        self._check_url()
        if self.get_data_of_following_pages:
            parsed_url = urlparse(self.url)
            query_params = parse_qs(parsed_url.query)
            pag_value = int(query_params.get("pag", ["1"])[0])
            self.last_scraped_url = urlunparse((parsed_url.scheme, parsed_url.netloc,
                parsed_url.path, parsed_url.params, urlencode(query_params, doseq=True),
                parsed_url.fragment))
            print(self.last_scraped_url)
            self.last_response = request(self.last_scraped_url)

            while self.last_response.status_code == 200:
                print(f"Getting real estate data of {self.last_scraped_url}")
                self.real_estates += self.filter_json_data(self.last_response)
                pag_value += 1
                query_params['pag'] = [str(pag_value)]
                self.last_scraped_url = urlunparse((parsed_url.scheme, parsed_url.netloc,
                    parsed_url.path, parsed_url.params, urlencode(query_params, doseq=True),
                    parsed_url.fragment))
                if self.last_response.status_code == 200:
                    print(self.last_scraped_url)
                    self.last_response = request(self.last_scraped_url)
        else:
            print(f"Getting real estate data of {self.url}")
            self.real_estates += self.filter_json_data(self.response)

    def filter_json_data(self, response) -> list:
        soup = BeautifulSoup(response.text, 'html.parser')
        try:
            json_data = json.loads(soup.find("script", {"id": "__NEXT_DATA__"}).text)
            json_data = json_data["props"]["pageProps"]["dehydratedState"]["queries"][0]["state"]["data"]["results"]
        except KeyError:
            json_data = []

        real_estates = []
        if json_data:
            for record in json_data:
                real_estate = {}
                real_estate["id"] = record["realEstate"]["id"]
                real_estate["url"] = record["seo"]["url"]
                real_estate["contract"] = record["realEstate"]["contract"]
                real_estate["agency_id"], real_estate["agency_url"], real_estate["agency_name"] =  None, None, None
                real_estate["is_private_ad"] = 1 if record["realEstate"]["advertiser"].get("agency", None) == None else 0
                if not real_estate["is_private_ad"]:
                    real_estate["agency_id"] =  record["realEstate"]["advertiser"]["agency"]["label"]
                    real_estate["agency_url"] = record["realEstate"]["advertiser"]["agency"]["agencyUrl"]
                    real_estate["agency_name"] = record["realEstate"]["advertiser"]["agency"]["displayName"]
                real_estate["is_new"] = 1 if record["realEstate"]["isNew"] else 0
                real_estate["is_luxury"] = 1 if record["realEstate"]["luxury"] else 0
                real_estate["formatted_price"] = record["realEstate"]["price"]["formattedValue"]
                real_estate["price"] = record["realEstate"]["price"].get("value", None)
                if (not real_estate["price"]):
                    price_match = re.search(r'\d+\.?\d*', real_estate["formatted_price"])
                    if price_match:
                        real_estate["price"] = price_match.group(0).replace('.', '')
                real_estate["bathrooms"] = record["realEstate"]["properties"][0].get("bathrooms", None)
                real_estate["bedrooms"] = record["realEstate"]["properties"][0].get("bedRoomsNumber", None)
                real_estate["floor"], real_estate["formatted_floor"] = None, None
                floor_data = record["realEstate"]["properties"][0].get("floor", None)
                if floor_data:
                    real_estate["floor"] = floor_data["abbreviation"]
                    real_estate["formatted_floor"] = floor_data["value"]
                real_estate["total_floors"] = record["realEstate"]["properties"][0].get("floors", None)
                real_estate["condition"] = record["realEstate"]["properties"][0].get("condition", None)
                real_estate["rooms"] = record["realEstate"]["properties"][0].get("rooms", None)
                real_estate["has_elevators"] = 1 if record["realEstate"]["properties"][0].get("hasElevators", None) else 0
                real_estate["surface"] = None
                real_estate["surface_formatted"] = record["realEstate"]["properties"][0].get("surface", None)
                if real_estate["surface_formatted"]:
                    real_estate["surface"] = re.search(r'(\d+\.?\d*)', real_estate["surface_formatted"]).group(1) or None
                real_estate["type"] = record["realEstate"]["properties"][0]["typologyGA4Translation"]
                real_estate["caption"] = record["realEstate"]["properties"][0].get("caption", None)
                real_estate["category"] = record["realEstate"]["properties"][0]["category"]["name"]
                real_estate["description"] = record["realEstate"]["properties"][0].get("description", None)
                energy_data = record["realEstate"]["properties"][0].get("energy", None)
                real_estate["heating_type"], real_estate["air_conditioning"] = None, None
                if energy_data:
                    real_estate["heating_type"] = energy_data.get("heatingType", None)
                    real_estate["air_conditioning"] = energy_data.get("airConditioning", None)
                real_estate["latitude"] = record["realEstate"]["properties"][0]["location"]["latitude"]
                real_estate["longitude"] = record["realEstate"]["properties"][0]["location"]["longitude"]
                real_estate["region"] = record["realEstate"]["properties"][0]["location"]["region"]
                real_estate["province"] = record["realEstate"]["properties"][0]["location"]["province"]
                real_estate["macrozone"] = record["realEstate"]["properties"][0]["location"]["macrozone"]
                real_estate["microzone"] = record["realEstate"]["properties"][0]["location"]["microzone"]
                real_estate["city"] = record["realEstate"]["properties"][0]["location"]["city"]
                real_estate["country"] = record["realEstate"]["properties"][0]["location"]["nation"]["id"]
                real_estates.append(real_estate)
        else:
            self.last_response.status_code = 404
        return real_estates

    def save_data_json(self, filename="immobiliare.json") -> None:
        file = self.destination_folder + filename
        print("Writing JSON into:", file)
        with open(file, "w") as fb:
            json.dump(self.real_estates, fb, indent=4)

    def save_data_csv(self, filename="immobiliare.csv") -> None:
        file = self.destination_folder + filename
        print("Writing CSV into:", file)
        
        with open(file, "w", newline="", encoding="utf-8") as csvfile:
            fieldnames = self.real_estates[0].keys() if self.real_estates else []
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for real_estate in self.real_estates:
                writer.writerow(real_estate)

Testing:

In [10]:
if __name__ == '__main__':
    immobiliare = Immobiliare(
        url=URL, 
        get_data_of_following_pages=False
    )
    immobiliare.save_data_json()
    immobiliare.save_data_csv()
    data = immobiliare.data_frame

https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza
https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza
https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza
Getting real estate data of https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza
https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza&pag=2
https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza&pag=2
Getting real estate data of https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza&pag=2
https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza&pag=3
https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza&pag=3
Getting real estate data of https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza&pag=3
https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=rilevanza&pag=4
https://www.immobiliare.it/