In [88]:

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import pickle

# Display settings for this notebook, applied through the pandas options object:
# no limit on how much of a column's content is shown ...
pd.options.display.max_colwidth = None
# ... and no fixed display width.
pd.options.display.width = None

### The objective of this web scraper is to collect data on all the real estate listings available in Luxembourg.

There are mainly 2 sources:
1. Athome (I will focus on this now)
2. Luxhouse maybe later

1. Ideally, we aim to develop a straightforward tool that allows users to retrieve information for selected places through the UI.

This approach does not necessitate fetching all available data, which would be considerably more exhaustive.

Nonetheless, starting from this point, we have the potential to automate the process using AWS Lambda to gather comprehensive information for all places on a daily basis.

For the time being, however, this expanded functionality is not required.


In [84]:
"""
Function to fetch the number of the last results page for a listing search on the athome.lu website.

Parameters:
    url_base (str): The URL of the search results page whose pagination is inspected.

Returns:
    int: The number of the last page available for the given type. In case the last page link cannot be found, it returns 1.

Raises:
    requests.exceptions.RequestException: If an error occurs while making the GET request.

Example:
    The last page number is: 50
    50
"""


def get_latest_page(url_base: str) -> int:
    """
    Return the number of the last results page linked from ``url_base``.

    The page is downloaded and parsed, and the first ``<a>`` tag carrying the
    CSS class ``last`` is looked up. The text after the final ``=`` of its
    ``href`` is interpreted as the page number.

    Args:
        url_base (str): URL of the results page to inspect.

    Returns:
        int: The last page number. Falls back to 1 when no usable "last" link
        exists (tag missing, ``href`` missing/empty, or a non-numeric value),
        so callers can always iterate ``range(1, result + 1)``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            when the request itself fails.
    """
    # Send a GET request to the URL
    response = requests.get(url_base)

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the 'a' tag with the class 'last' to get the last page link
    last_page_tag = soup.find("a", class_="last")
    last_page_link = last_page_tag.get("href") if last_page_tag is not None else None

    if last_page_link:
        # The page number is whatever follows the final '=' in the href.
        last_page_number = last_page_link.split("=")[-1]
        try:
            parsed_number = int(last_page_number)
        except ValueError:
            # Link is present but does not end in a number: treat it as not found.
            pass
        else:
            print(f"The last page number is: {last_page_number}")
            return parsed_number

    # Previously this path crashed with UnboundLocalError; a result set without
    # a usable "last" link is treated as a single page instead.
    print("Could not find the last page link.")
    return 1


# Exact hrefs that are dropped from every scraped page. Built once at
# definition time (not on every call) and stored as a frozenset so each
# membership test is O(1). Duplicate entries from the original list were
# collapsed, and the literal "/vente?page=N" entries were omitted because
# _EXCLUDED_LINK_RE below already rejects any link containing "/vente?page=<digits>".
# NOTE(review): the two "/en/srp/?...&page=N" entries are pagination links of
# one specific search query; pagination links of other queries are not covered
# by this set or by the regex -- confirm whether that is intended.
_EXCLUDED_LINKS = frozenset(
    {
        "",
        "/",
        "/agences-immobilieres",
        "/assurance",
        "/blog/",
        "/blog/finance-et-assurance/assurance-habitation",
        "/blog/finance-et-assurance/prets-immobilier",
        "/blog/guides/achat",
        "/blog/guides/location",
        "/blog/guides/vente",
        "/connexion",
        "/en/buy",
        "/estimer",
        "/finance/partenaires",
        "/finance/pret-immobilier",
        "/immobilier",
        "/location",
        "/location/appartement",
        "/location/appartement/studio",
        "/location/bureau",
        "/location/commerce",
        "/location/garage-parking",
        "/location/immeuble-de-rapport",
        "/location/immeuble-rapport",
        "/location/maison",
        "/location/maison-a-construire",
        "/location/projet-neuf",
        "/location/terrain",
        "/mes-favoris",
        "/mes-favoris?type=search",
        "/services/demenager",
        "/vendre",
        "/vente",
        "/vente/appartement",
        "/vente/appartement/studio",
        "/vente/bureau",
        "/vente/commerce",
        "/vente/garage-parking",
        "/vente/immeuble-de-rapport",
        "/vente/maison",
        "/vente/maison-a-construire",
        "/vente/maison-a-construire/modele-de-maison",
        "/vente/projet-neuf",
        "/vente/projet-neuf/lotissement",
        "/vente/projet-neuf/programme-neuf",
        "/vente/terrain",
        "https://www.athome.lu",
        "https://www.athome.lu/publier",
        "https://www.athome.lu/en/",
        "https://www.athome.lu/en/buy",
        "https://www.athome.lu/en/connect",
        "https://www.athome.lu/en/estimate",
        "https://www.athome.lu/en/finance/mortgage",
        "https://www.athome.lu/en/finance/partners",
        "https://www.athome.lu/en/insurance",
        "https://www.athome.lu/en/list",
        "https://www.athome.lu/en/my-favourites",
        "https://www.athome.lu/en/my-favourites?type=search",
        "https://www.athome.lu/en/rent",
        "https://www.athome.lu/en/sell",
        "https://www.athome.lu/en/services/move",
        "https://www.athome.lu/en/srp/?tr=rent&q=faee1a4a&loc=L2-luxembourg&ptypes=house%2Cflat%2Cnew-property%2C4%2C7%2C5%2C6%2C42%2C32%2C41%2C43&page=1",
        "https://www.athome.lu/en/srp/?tr=rent&q=faee1a4a&loc=L2-luxembourg&ptypes=house%2Cflat%2Cnew-property%2C4%2C7%2C5%2C6%2C42%2C32%2C41%2C43&page=95",
    }
)

# A link is dropped when this pattern matches anywhere in it. The first
# alternative covers "/vente?page=N", "/location?page=N" and "/agence?page=N",
# with or without a scheme and host in front (the search is unanchored); the
# other two cover agency pages addressed by numeric id.
_EXCLUDED_LINK_RE = re.compile(
    r"/(?:vente|location|agence)\?page=\d+"
    r"|https?://[^/]+/agence/\d+"
    r"|https?://www\.athome\.lu/realestate-agent/\d+"
)


def get_all_url(page_number: int, url_base: str):
    """
    Fetch one results page and return the hyperlinks that survive filtering.

    The page URL is ``url_base`` with a ``page=<page_number>`` query parameter
    appended. Every ``<a>`` tag's ``href`` is collected; links that appear in
    ``_EXCLUDED_LINKS`` or match ``_EXCLUDED_LINK_RE`` are discarded.

    Args:
        page_number (int): The results page to fetch.
        url_base (str): URL of the listing search, without the page parameter.

    Returns:
        list: The remaining ``href`` strings, in document order.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``.
    """
    # Use '&' when url_base already carries a query string, '?' otherwise, so
    # the page parameter is well-formed in both cases.
    separator = "&" if "?" in url_base else "?"
    url = f"{url_base}{separator}page={page_number}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Read each anchor's href once; anchors without one yield None and are skipped.
    hrefs = (a_tag.get("href") for a_tag in soup.find_all("a"))

    return [
        href
        for href in hrefs
        if href is not None
        and href not in _EXCLUDED_LINKS
        and not _EXCLUDED_LINK_RE.search(href)
    ]


def scraper_thread(page, all_links, url_base):
    """
    Worker that scrapes a single results page and records what it found.

    Intended to be submitted to a ThreadPoolExecutor. The links returned by
    ``get_all_url`` for ``page`` are stored, together with the page number, as
    one dictionary appended to the shared ``all_links`` list.

    Args:
        page (int): The page number to scrape.
        all_links (list): Shared list that receives one
                          ``{"page": ..., "links": [...]}`` entry per call.
        url_base (str): The base URL to use for scraping.

    Returns:
        None: the result is delivered by appending to ``all_links``.
    """
    all_links.append(
        {
            "page": page,
            "links": get_all_url(page_number=page, url_base=url_base),
        }
    )
    print(f"Page {page} done")



def get_all_links(threads, url_base: str, last_page_number, listing_type=None):
    """
    Collect the listing links of pages ``1..last_page_number`` using a thread pool.

    One ``scraper_thread`` task is submitted per page; each task appends a
    ``{"page": ..., "links": [...]}`` entry to a shared list. The entries are
    then flattened into one row per link.

    Args:
        threads (int): The number of worker threads to use.
        url_base (str): The base URL for the real estate listings.
        last_page_number (int): Number of the last page to fetch (inclusive).
        listing_type (str, optional): Label stored in the ``type`` column of
            every row. When omitted, a module-level ``TYPE`` variable is used
            if one exists (the original code read that global), else ``None``.

    Returns:
        pandas.DataFrame: Columns 'page', 'href', 'type' and 'link', sorted by
        'page'. 'link' is 'href' resolved against ``https://www.athome.lu``.
        When no link was collected, an empty frame with those columns.

    Raises:
        Exception: Any exception raised inside a worker is re-raised here.
    """
    from urllib.parse import urljoin

    if listing_type is None:
        # Backward compatibility with notebooks that define TYPE globally.
        listing_type = globals().get("TYPE")

    all_links = []

    with ThreadPoolExecutor(threads) as executor:
        # Argument order must match scraper_thread(page, all_links, url_base).
        futures = [
            executor.submit(scraper_thread, page, all_links, url_base)
            for page in range(1, last_page_number + 1)
        ]

        for future in as_completed(futures):
            # Workers return None and deliver data through all_links; result()
            # is called only so that a worker's exception surfaces here.
            future.result()

    # Format Data: one row per (page, link) pair.
    flattened_data = [
        {"page": item["page"], "href": link, "type": listing_type}
        for item in all_links
        for link in item["links"]
    ]

    columns = ["page", "href", "type", "link"]
    if not flattened_data:
        # Sorting a column-less frame by 'page' would raise KeyError.
        return pd.DataFrame(columns=columns)

    # Stable sort keeps the on-page order of links within each page, even
    # though pages finish in arbitrary order.
    df = pd.DataFrame(flattened_data).sort_values(by="page", kind="mergesort")
    # urljoin prefixes relative hrefs and leaves absolute URLs untouched
    # (plain concatenation produced a doubled host for absolute ones).
    df["link"] = [urljoin("https://www.athome.lu", href) for href in df["href"]]

    return df


In [90]:
def main():
    """
    Scrape every results page of the rental-flat search and pickle the links.

    The last page number is obtained from ``get_latest_page``; the pages are
    then fetched one after another with ``get_all_url`` and their links are
    concatenated, in page order, into a single list that is written to
    ``list_of_links.pkl`` in the current working directory.
    """
    url_base = "https://www.athome.lu/srp/?tr=rent&q=a2d9b00c&ptypes=flat"
    page_numbers = range(1, get_latest_page(url_base) + 1)

    # Pages are requested sequentially, lowest page number first.
    list_of_links = [
        link
        for page in page_numbers
        for link in get_all_url(page_number=page, url_base=url_base)
    ]

    with open("list_of_links.pkl", "wb") as f:
        pickle.dump(list_of_links, f)


# Entry point: run the scraper only when this code executes as the main
# module (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

The last page number is: 9
