In [109]:
import os
import re
import json
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
from fake_useragent import UserAgent
import time
import random

# Configuration
# Directory (relative to the working directory) that holds every CSV this notebook reads or writes.
BASE_DIR = "Files"
# Working file: scrape_listings() appends one batch of rows per page, scrape_details() rewrites it in place.
LISTINGS_CSV = os.path.join(BASE_DIR, "apartmentsPortugal.csv")
# Output of merge_data(): listings accumulated across runs, de-duplicated on the URL column.
CONSOLIDATED_CSV = os.path.join(BASE_DIR, "Consolidated.csv")
# Search-results URL prefix; scrape_page() appends the page number to it.
BASE_URL = "https://www.imovirtual.com/pt/resultados/comprar/apartamento/todo-o-pais?page="
# Static request headers sent by scrape_page(); scrape_details() builds its own random User-Agent instead.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Ensure directory exists
# exist_ok=True makes re-running this cell a no-op when the directory is already there.
os.makedirs(BASE_DIR, exist_ok=True)

In [110]:
def _text_or_nan(tag):
    """Return the stripped text of a BeautifulSoup tag, or NaN when the tag was not found."""
    return tag.text.strip() if tag is not None else np.nan


def scrape_page(page_num):
    """Scrape one search-results page into a DataFrame shaped like Consolidated.csv.

    Parameters
    ----------
    page_num : int
        Page number appended to ``BASE_URL``.

    Returns
    -------
    pandas.DataFrame or None
        One row per ``<article data-cy="listing-item">`` found on the page.
        The detail columns (Bathroom, Listing Type, Useful area, Gross area,
        Construction year, Energetic certificate, Description) are left as NaN
        for ``scrape_details`` to fill. ``None`` is returned when the request
        fails, the status code is not 200, or the page contains no listings.
    """
    url = f"{BASE_URL}{page_num}"
    print(f"Fetching URL: {url}")  # Debugging line to verify the requested page

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # network errors are reported and treated like any other failed page.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page {page_num}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page {page_num}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', {'data-cy': 'listing-item'})

    if not articles:
        return None

    # Column order matches Consolidated.csv.
    columns = [
        'URL', 'Titles', 'Price', 'Location', 'Rooms', 'Areas',
        'Bathroom', 'Listing Type', 'Useful area', 'Gross area',
        'Construction year', 'Energetic certificate',
        'DateScraped', 'Page', 'Description',
    ]
    scrape_date = datetime.now().strftime('%Y-%m-%d')

    rows = []
    for article in articles:
        # .get() tolerates an <a> tag that carries no href attribute.
        link = article.find('a')
        href = link.get('href') if link is not None else None

        # Rooms and Areas come from the <dt>/<dd> pairs of the summary list.
        rooms, area = np.nan, np.nan
        dl = article.find('dl', class_='css-12dsp7a')
        if dl is not None:
            dt_dd = {dt.text.strip(): dd.text.strip()
                     for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd'))}
            rooms = dt_dd.get('Tipologia', np.nan)
            # Default to None rather than NaN: NaN is truthy, so the old
            # `if area` guard let a missing entry reach `.split()` and crash.
            zona = dt_dd.get('Zona')
            area = zona.split()[0] if zona else np.nan

        rows.append({
            'URL': f"https://www.imovirtual.com{href}" if href else np.nan,
            'Titles': _text_or_nan(article.find('p', class_='css-u3orbr')),
            'Price': _text_or_nan(article.find('span', class_='css-2bt9f1')),
            'Location': _text_or_nan(article.find('p', class_='css-42r2ms')),
            'Rooms': rooms,
            'Areas': area,
            # Empty columns to be filled in Part 2 (scrape_details)
            'Bathroom': np.nan,
            'Listing Type': np.nan,
            'Useful area': np.nan,
            'Gross area': np.nan,
            'Construction year': np.nan,
            'Energetic certificate': np.nan,
            'DateScraped': scrape_date,
            'Page': page_num,
            'Description': np.nan,
        })

    # Building one record per article guarantees every column has the same
    # length, so no separate length validation is needed.
    return pd.DataFrame(rows, columns=columns)

def _next_page_to_scrape(csv_path):
    """Return the page number to resume from: one past the highest 'Page' stored in csv_path, else 1."""
    if not os.path.exists(csv_path):
        return 1
    try:
        existing = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has nothing to resume from.
        return 1
    if 'Page' not in existing.columns:
        return 1
    last_page = pd.to_numeric(existing['Page'], errors='coerce').max()
    # max() is NaN when the file has a header but no usable page numbers.
    if pd.isna(last_page):
        return 1
    # Cast to a plain int: a float (from NaN-containing columns) would break range().
    return int(last_page) + 1


def scrape_listings(num_pages):
    """Scrape multiple listing pages with resume support.

    Scraping starts one page after the highest 'Page' value already stored in
    ``LISTINGS_CSV`` (or at page 1 when there is nothing to resume from) and
    covers ``num_pages`` consecutive pages. Each page is appended to the CSV as
    soon as it is scraped. The loop stops early at the first page for which
    ``scrape_page`` returns ``None``.

    Parameters
    ----------
    num_pages : int
        Maximum number of pages to scrape in this call.
    """
    start_page = _next_page_to_scrape(LISTINGS_CSV)
    end_page = start_page + num_pages - 1

    for page in range(start_page, end_page + 1):
        print(f"Scraping page {page}...")
        df_page = scrape_page(page)
        if df_page is None:
            print(f"No data on page {page}. Stopping.")
            break

        # Save incrementally; write the header only when the file is new or empty.
        needs_header = (
            not os.path.exists(LISTINGS_CSV)
            or os.path.getsize(LISTINGS_CSV) == 0
        )
        df_page.to_csv(LISTINGS_CSV, mode='a', header=needs_header, index=False)
        print(f"Page {page} saved.")


In [111]:
def _button_text(button):
    """Return the text of the value <div> inside a summary button, or "N/A" when it is absent."""
    cell = button.find('div', class_='css-1ftqasz')
    return cell.get_text(strip=True) if cell is not None else "N/A"


def _find_labelled_value(containers, label):
    """Return the text of the <p> that follows the first <p> containing `label`, or "N/A"."""
    for container in containers:
        paragraphs = container.find_all('p', class_='e15n0fyo2 css-nlohq6')
        for pos, paragraph in enumerate(paragraphs):
            if label in paragraph.get_text(strip=True):
                if pos + 1 < len(paragraphs):
                    value = paragraphs[pos + 1].get_text(strip=True)
                    if value != "N/A":
                        return value
                # Label matched in this container; move on to the next one.
                break
    return "N/A"


def _extract_details(soup):
    """Map each detail column name to the value parsed from a listing page ("N/A" when missing)."""
    details = {
        # Useful area is not parsed from the page; it always gets the placeholder.
        'Useful area': "N/A",
        'Gross area': "N/A",
        'Rooms': "N/A",
        'Bathroom': "N/A",
    }

    # The first three summary buttons hold gross area, rooms and bathrooms, in that order.
    buttons = soup.find_all('button', class_='eezlw8k1 css-ds0a69')
    if len(buttons) >= 3:
        details['Gross area'] = _button_text(buttons[0])
        details['Rooms'] = _button_text(buttons[1])
        details['Bathroom'] = _button_text(buttons[2])

    divs = soup.find_all('div', class_='css-t7cajz e15n0fyo1')
    details['Construction year'] = _find_labelled_value(divs, "Ano de construção")
    details['Energetic certificate'] = _find_labelled_value(divs, "Certificado energético")
    return details


def scrape_details():
    """Fill the detail columns of LISTINGS_CSV by visiting each listing's own page.

    Rows whose 'Construction year' or 'Bathroom' is still empty are fetched one
    by one. For every page that answers with status 200 the columns
    'Useful area', 'Gross area', 'Rooms', 'Bathroom', 'Construction year' and
    'Energetic certificate' are overwritten, using "N/A" for anything not found.
    Progress is written back to LISTINGS_CSV after every 50th URL and once more
    at the end. Errors on a single URL are printed and that row is skipped.
    """
    # pandas parses the literal "N/A" as NaN by default, which made every
    # already-processed row look unprocessed on the next run. Treat only empty
    # cells as missing so the "N/A" placeholder survives a round trip.
    df = pd.read_csv(LISTINGS_CSV, keep_default_na=False, na_values=[''])

    # Columns that are entirely empty load as float64; cast to object so that
    # string values can be assigned without a dtype conflict.
    detail_cols = ['Rooms', 'Bathroom', 'Useful area', 'Gross area',
                   'Construction year', 'Energetic certificate']
    for col in detail_cols:
        if col in df.columns:
            df[col] = df[col].astype(object)

    # Identify URLs needing processing
    mask = df['Construction year'].isna() | df['Bathroom'].isna()
    indices = df[mask].index.tolist()

    ua = UserAgent()
    session = requests.Session()

    # `position` is dedicated to checkpointing; the helpers use their own loop
    # variables so it can no longer be clobbered by an inner loop.
    for position, idx in enumerate(indices, start=1):
        url = df.loc[idx, 'URL']
        try:
            if not isinstance(url, str):
                raise ValueError("listing has no URL")

            response = session.get(url, headers={'User-Agent': ua.random}, timeout=10)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                for column, value in _extract_details(soup).items():
                    df.at[idx, column] = value

                # Random pause between successful requests.
                time.sleep(random.uniform(0.5, 1.5))
            else:
                print(f"Skipping {url} due to status code {response.status_code}")

        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

        # Save progress every 50 URLs
        if position % 50 == 0:
            df.to_csv(LISTINGS_CSV, index=False)

    df.to_csv(LISTINGS_CSV, index=False)

In [112]:
def merge_data():
    """Fold the current listings CSV into the consolidated CSV.

    Loads CONSOLIDATED_CSV when it exists (otherwise starts from an empty frame
    with the expected columns), appends the rows of LISTINGS_CSV, keeps only the
    last row for each URL, drops any column whose name contains 'Unnamed', and
    writes the result back to CONSOLIDATED_CSV.
    """
    expected_columns = [
        'URL', 'Titles', 'Price', 'Location', 'Rooms', 'Areas',
        'Bathroom', 'Listing Type', 'Useful area', 'Gross area',
        'Construction year', 'Energetic certificate',
        'DateScraped', 'Page', 'Description'
    ]
    consolidated = (
        pd.read_csv(CONSOLIDATED_CSV)
        if os.path.exists(CONSOLIDATED_CSV)
        else pd.DataFrame(columns=expected_columns)
    )

    new_data = pd.read_csv(LISTINGS_CSV)

    # Non-numeric entries in these columns become NaN.
    for col in ('Construction year', 'Page'):
        new_data[col] = pd.to_numeric(new_data[col], errors='coerce')

    # New rows are appended after the old ones, so keep='last' lets the newest
    # scrape of a URL win.
    stacked = pd.concat([consolidated, new_data], ignore_index=True)
    deduplicated = stacked.drop_duplicates('URL', keep='last')

    # Discard columns whose name contains 'Unnamed'.
    is_unnamed = deduplicated.columns.str.contains('Unnamed')
    combined = deduplicated.loc[:, ~is_unnamed]

    combined.to_csv(CONSOLIDATED_CSV, index=False)
    print(f"Merged data saved. Total records: {len(combined)}")

In [None]:
# Run the full pipeline in order; each step reads the file the previous one wrote.
# 1) Append up to 2 more result pages to LISTINGS_CSV, resuming after the highest page already stored.
scrape_listings(2)
# 2) Visit each listing still missing a construction year or bathroom count and fill its detail columns.
scrape_details()
# 3) Merge LISTINGS_CSV into CONSOLIDATED_CSV, keeping the most recent row per URL.
merge_data()

In [95]:
# Load the scraped listings for inspection; index_col=False stops pandas from using the first column as the index.
# NOTE: with default NA parsing, the "N/A" placeholders written by scrape_details() are read back as NaN here.
listings = pd.read_csv(LISTINGS_CSV, index_col=False)

In [None]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
listings.info()