# 1. Idealista Web Scraper

**Author:** Carlos Gómez Gómez

This notebook contains the complete code for scraping property data from the Spanish real estate portal, `Idealista`. The process is divided into two main steps:
1.  **Fetching Property IDs:** First, we scrape the listing pages for a specific zone to collect the unique ID of each property.
2.  **Parsing Property Details:** Then, we visit each individual property page to extract detailed information, such as price, size, number of rooms, and other features.

In [2]:
# --- 1. Library Imports ---
from bs4 import BeautifulSoup as bs
import random
import time
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import undetected_chromedriver as uc

## 2. Core Scraping Functions

Here we define the core functions responsible for the data extraction process.

### 2.1 Get Property IDs

The `get_id_list` function automates browsing through Idealista's paginated search results for a given geographical zone. It simulates user behavior by navigating page by page, handling cookie banners, and extracting the unique `data-element-id` for each property listing it finds. Finally, it returns a clean list of unique IDs.

In [3]:
def get_id_list(target_zone):
    """
    Scrapes Idealista for a given search zone to retrieve a list of all property IDs.

    Result pages are visited one after another (the zone root first, then
    `pagina-2.htm`, `pagina-3.htm`, ...). The loop ends on the first page that
    has no `<main class="listing-items">` container or no `<article>` inside it.

    Args:
        target_zone (str): The specific search path for the zone (e.g., 'barcelona/sarria-sant-gervasi').

    Returns:
        list: A list of unique property IDs, in the order they were first seen.
    """
    base_url = f'https://www.idealista.com/en/venta-viviendas/{target_zone}/'
    page_number = 1
    id_list = []
    driver = uc.Chrome()

    # try/finally guarantees the browser is closed even when a page load fails
    # or the run is interrupted; otherwise the Chrome process would be leaked.
    try:
        while True:
            if page_number == 1:
                url = base_url
            else:
                url = f'{base_url}pagina-{page_number}.htm'

            driver.get(url)
            print(f"Scraping page {page_number} for IDs...")
            time.sleep(random.randint(8, 12))

            try:
                # Click cookie banner if it appears
                driver.find_element("xpath", '//*[@id="didomi-notice-agree-button"]').click()
            except Exception:
                # Best effort: the banner is absent or not clickable on this page.
                # `Exception` (not a bare except) so Ctrl+C still stops the run.
                pass

            soup = bs(driver.page_source, 'lxml')

            # Stop when the results container is missing from the page
            listing = soup.find('main', {'class': 'listing-items'})
            if listing is None:
                print("No more items found. Ending ID scrape.")
                break

            articles = listing.find_all('article')
            if not articles:
                print("No articles found on this page. Ending ID scrape.")
                break

            for article in articles:
                data_id = article.get('data-element-id')
                # Articles without the attribute (or with an empty one) are skipped
                if data_id:
                    id_list.append(data_id)

            # A small delay to mimic human behavior
            time.sleep(random.randint(1, 2))
            page_number += 1
    finally:
        driver.quit()

    # Remove duplicates while keeping first-seen order, so repeated runs over
    # the same pages produce the same list (a plain set() has no stable order).
    id_list = list(dict.fromkeys(id_list))
    print(f"Found a total of {len(id_list)} unique IDs.")
    return id_list

### 2.2 Parse Property Details

The `parse_property_details` function takes a single property ID and a Selenium driver instance. It navigates to the property's specific URL and extracts all relevant features like price, location, area, number of rooms, and amenities (terrace, elevator, etc.). It handles potential errors gracefully and returns the scraped data as a single-row Pandas DataFrame.

In [4]:
def parse_property_details(property_id, driver):
    """
    Parses the details of a single property page.

    The page `https://www.idealista.com/en/inmueble/<property_id>/` is loaded with
    the given driver and its HTML is parsed in three independent sections (basic
    info, features, amenities). A failure in one section is reported on stdout
    and does not prevent the other sections from being parsed.

    Args:
        property_id (str): The unique ID of the property.
        driver: The Selenium WebDriver instance.

    Returns:
        pd.DataFrame: A single-row DataFrame containing the details of the property.
            `id`, `title`, `location`, `city` and `price` use the string "NaN"
            when the element is not found. Feature and amenity columns are only
            present when the corresponding page section/feature was found.
    """
    print(f"Parsing data for property ID: {property_id}")
    url = f"https://www.idealista.com/en/inmueble/{property_id}/"
    driver.get(url)
    time.sleep(random.randint(5, 10))

    try:
        driver.find_element("xpath", '//*[@id="didomi-notice-agree-button"]').click()
    except Exception:
        # Best effort: the cookie banner is absent or not clickable.
        # `Exception` (not a bare except) so Ctrl+C still stops the run.
        pass

    soup = bs(driver.page_source, 'lxml')

    details_dict = {'id': property_id}

    # --- Basic Info ---
    try:
        title_tag = soup.select_one('.main-info__title-main')
        details_dict['title'] = title_tag.text if title_tag is not None else "NaN"

        location_tag = soup.select_one('.main-info__title-minor')
        if location_tag is not None:
            # Text before the first comma is the location, the next piece the city
            location_parts = location_tag.text.split(',')
            details_dict['location'] = location_parts[0]
            details_dict['city'] = location_parts[1] if len(location_parts) > 1 else "NaN"
        else:
            details_dict['location'], details_dict['city'] = "NaN", "NaN"

        price_tag = soup.select_one(".info-data-price span.h1-simulated")
        # Dots are removed from the price text (e.g. thousands separators)
        details_dict['price'] = price_tag.text.replace('.', '') if price_tag is not None else "NaN"

    except Exception as e:
        print(f"Error parsing main info: {e}")

    # NOTE(review): the page is requested under `/en/` while the keywords matched
    # below ("hab.", "Planta", "aire acondicionado", ...) are Spanish — confirm
    # that the rendered page actually contains these strings.

    # --- Features ---
    try:
        features_div = soup.find("div", {"class": "info-features"})
        # A missing features block simply yields no feature columns
        feature_spans = features_div.find_all('span') if features_div is not None else []
        for feature in feature_spans:
            feature_text = feature.text.strip()
            if "m²" in feature_text:
                details_dict['area_m2'] = feature_text.split()[0]
            elif "hab." in feature_text:
                details_dict['rooms'] = feature_text.split()[0]
            elif "Planta" in feature_text or "Bajo" in feature_text:
                details_dict['floor_info'] = feature_text
    except Exception as e:
        # Report instead of silently hiding a broken parser
        print(f"Error parsing features: {e}")

    # --- Additional Details & Amenities ---
    try:
        details_div = soup.find("div", {"class": "details-property"})
        if details_div is not None:
            # All paragraphs are merged into one lowercase string for keyword search
            full_details_text = " ".join(p.text for p in details_div.find_all('p')).lower()
            details_dict['air_conditioning'] = 'aire acondicionado' in full_details_text
            details_dict['terrace'] = 'terraza' in full_details_text
            details_dict['storage'] = 'trastero' in full_details_text
            details_dict['elevator'] = 'ascensor' in full_details_text
            details_dict['garage'] = 'garaje' in full_details_text
            details_dict['pool'] = 'piscina' in full_details_text
            details_dict['garden'] = 'jardín' in full_details_text or 'zonas verdes' in full_details_text

            # First matching keyword wins
            if "obra nueva" in full_details_text:
                details_dict['status'] = "New build"
            elif "buen estado" in full_details_text:
                details_dict['status'] = "Good condition"
            elif "a reformar" in full_details_text:
                details_dict['status'] = "To renovate"
            else:
                details_dict['status'] = "Not specified"
    except Exception as e:
        # Report instead of silently hiding a broken parser
        print(f"Error parsing amenities: {e}")

    return pd.DataFrame([details_dict])

## 3. Execution

This section runs the scraping process.

### 3.1. How to Find the Target Zone

The `TARGET_ZONE` variable defines which geographical area to scrape. To find the correct value for a specific district in Barcelona:

1.  Go to the Idealista map for Barcelona: [https://www.idealista.com/venta-viviendas/barcelona-barcelona/mapa](https://www.idealista.com/venta-viviendas/barcelona-barcelona/mapa)
2.  Click on the desired district on the map.
3.  Look at the URL in your browser's address bar.
4.  Copy the part of the path that comes after `.../venta-viviendas/`. For example, for the "Gràcia" district, the path will be `barcelona/gracia`.

Once you have the zone, set it in the configuration cell below.

In [None]:
# --- Configuration ---
# Define the zone you want to scrape.
# You can get this path from the Idealista map URL for Barcelona, as explained above.

# List of available zones for Barcelona:
# 'barcelona/ciutat-vella'
# 'barcelona/eixample'
# 'barcelona/sants-montjuic'
# 'barcelona/les-corts'
# 'barcelona/sarria-sant-gervasi'
# 'barcelona/gracia'
# 'barcelona/horta-guinardo'
# 'barcelona/nou-barris'
# 'barcelona/sant-andreu'
# 'barcelona/sant-marti'

TARGET_ZONE = 'barcelona/sarria-sant-gervasi'

# --- Step 1: Get all property IDs ---
# This function can take a long time to run.
# It's recommended to run it once and save the results.
id_list = get_id_list(TARGET_ZONE)

# Save the IDs to a CSV file for backup and later use.
# The file goes to '../data/', the same folder every later cell reads from and
# writes to; saving under 'data/' would leave Step 2 unable to find the file.
import os

ids_df = pd.DataFrame(id_list, columns=['id'])
ids_filename = f'../data/id_list_{TARGET_ZONE.replace("/", "_")}.csv'
# to_csv does not create missing folders; make sure a long scrape is not lost
# because the output directory does not exist yet.
os.makedirs(os.path.dirname(ids_filename), exist_ok=True)
ids_df.to_csv(ids_filename, index=False)

print(f"Saved {len(ids_df)} IDs to {ids_filename}")

### 3.2. Parse Details for Each Property ID

Now that we have the list of IDs, we will loop through each one, call our `parse_property_details` function to scrape its specific page, and append the results to a final DataFrame. This process is intentionally slow, with pauses between requests to be respectful to Idealista's servers.

In [None]:
# --- Step 2: Parse details for each ID ---

# Define the target zone to construct the correct filename
# (repeated here so this cell can be run without re-running Step 1)
TARGET_ZONE = 'barcelona/sarria-sant-gervasi'

# Construct the filename and load the previously saved property IDs
ids_filename = f'../data/id_list_{TARGET_ZONE.replace("/", "_")}.csv'
ids_df = pd.read_csv(ids_filename)

print(f"Loaded {len(ids_df)} IDs from '{ids_filename}'")

# --- Main scraping loop ---
# Single-row frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies all previous rows on every iteration.
property_frames = []
all_properties_df = pd.DataFrame()
driver = uc.Chrome() # Initialize the browser once

try:
    for property_id in ids_df['id']:
        try:
            property_frames.append(parse_property_details(property_id, driver))
        except Exception as e:
            print(f"A critical error occurred for ID {property_id}: {e}. Skipping.")

        # A random pause to be respectful to the server
        time.sleep(random.randint(4, 8))
finally:
    # Runs even if the loop is interrupted: close the browser and keep
    # whatever was scraped so far available in `all_properties_df`.
    driver.quit()
    if property_frames:
        all_properties_df = pd.concat(property_frames, ignore_index=True)

print("Scraping finished.")
all_properties_df.info()

## 4. Save Final Data

Finally, we save the complete dataset into a single CSV file for the next stage of the analysis (data cleaning).

In [None]:
# --- Persist the full scraped dataset ---
# The zone path is turned into a file-name-friendly suffix ("/" -> "_"),
# and the DataFrame is written without its index column.
final_filename = f'../data/scraped_data_{TARGET_ZONE.replace("/", "_")}.csv'
all_properties_df.to_csv(path_or_buf=final_filename, index=False)

print(f"Complete dataset saved to {final_filename}")
# Last expression of the cell: shows the first rows as the cell output
all_properties_df.head()

## 5. Utility: Resuming an Interrupted Scrape

Web scraping can often be interrupted due to network errors. The code below provides a utility to resume the process from the last successfully scraped item, so you don't have to start from scratch.

To use it, you would typically uncomment the code and run these cells manually.

In [None]:
# # --- SCRIPT TO RESUME A FAILED SCRAPE ---

# # 1. Load the dataframe of already scraped properties
# main_df_filename = f'../data/scraped_data_{TARGET_ZONE.replace("/", "_")}.csv'
# main_df = pd.read_csv(main_df_filename)

# # 2. Load the complete list of all property IDs
# ids_filename = f'../data/id_list_{TARGET_ZONE.replace("/", "_")}.csv'
# ids_df = pd.read_csv(ids_filename)

# # 3. Find which IDs are missing by comparing the two lists
# scraped_ids = set(main_df['id'].astype(str))
# all_ids = set(ids_df['id'].astype(str))
# missing_ids = all_ids - scraped_ids

# print(f"Previously scraped: {len(scraped_ids)} properties.")
# print(f"Found {len(missing_ids)} IDs to resume scraping.")

# # 4. Scrape only the missing IDs
# if missing_ids:
#     resumed_properties_df = pd.DataFrame()
#     driver = uc.Chrome()

#     for property_id in list(missing_ids):
#         try:
#             temp_df = parse_property_details(property_id, driver)
#             resumed_properties_df = pd.concat([resumed_properties_df, temp_df], ignore_index=True)
#         except Exception as e:
#             print(f"A critical error occurred for ID {property_id}: {e}. Skipping.")
#         time.sleep(random.randint(4, 8))

#     driver.quit()
#     print(f"Finished scraping {len(resumed_properties_df)} new properties.")

### Combine and Save Recovered Data

After running the recovery script, you can combine the original dataframe with the newly scraped data and save the final, complete version.

In [None]:
# # --- Combine original and resumed data ---

# # 1. Check if there is new data to combine
# if not resumed_properties_df.empty:
#     # 2. Concatenate the two dataframes
#     final_df = pd.concat([main_df, resumed_properties_df], ignore_index=True)

#     # 3. Clean up the dataframe by removing any unnamed columns and resetting the index
#     final_df = final_df.loc[:, ~final_df.columns.str.contains('^Unnamed')]
#     final_df.reset_index(drop=True, inplace=True)

#     # 4. Save the final, complete file
#     final_filename = f'../data/scraped_data_{TARGET_ZONE.replace("/", "_")}_complete.csv'
#     final_df.to_csv(final_filename, index=False)

#     print(f"Successfully saved combined dataset with {len(final_df)} properties to {final_filename}.")
#     final_df.info()
# else:
#     print("No new properties were scraped. The original dataset is already complete.")