In [2]:
import os
import re
import json
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime


In [4]:
# Read the consolidated listings export. index_col=False tells pandas not to
# use the first CSV column as the index.
consolidated_csv_path = "Files/Consolidated.csv"
consolidated_df = pd.read_csv(consolidated_csv_path, index_col=False)

# Summarise column names, non-null counts and dtypes.
consolidated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   URL                    90000 non-null  object 
 1   Titles                 90000 non-null  object 
 2   Price                  90000 non-null  object 
 3   Location               90000 non-null  object 
 4   Rooms                  90000 non-null  object 
 5   Areas                  90000 non-null  object 
 6   Bathrooms              81941 non-null  object 
 7   Listing Type           62115 non-null  object 
 8   Useful area            89747 non-null  object 
 9   Gross area             71697 non-null  object 
 10  Construction year      49716 non-null  float64
 11  Energetic certificate  89575 non-null  object 
 12  Enterprise             80494 non-null  object 
dtypes: float64(1), object(12)
memory usage: 8.9+ MB


In [3]:
# Search-results endpoint; the page number is appended directly to this string.
base_url = "https://www.imovirtual.com/comprar/apartamento/?page="

# Specify the number of pages to scrape
num_pages = 1

# Starts as an empty list; the name is rebound to a DataFrame further down.
df = list()

# Browser-like User-Agent sent with every request that uses `headers`.
browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
headers = {'User-Agent': browser_user_agent}

In [4]:
def scrape_data(url, timeout=30):
    """Fetch one search-results page and extract the fields of every listing.

    Args:
        url: Address of the results page to download.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        A dict with the keys 'titles', 'prices', 'locations', 'rooms', 'areas',
        'urls' and 'dates'. Each maps to a list holding one entry per
        ``<article data-cy="listing-item">`` on the page; a field that cannot
        be found is stored as ``np.nan``. Returns ``None`` when the request
        raises or the response status is not 200.
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a bad status code: report and let the caller skip the page.
        print(f"Failed to retrieve data. Request error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    article_elements = soup.find_all('article', {'data-cy': 'listing-item'})

    def text_or_nan(element):
        """Return the stripped text of ``element``, or NaN when it was not found."""
        return element.text.strip() if element else np.nan

    # Lists to store extracted data (always appended in lockstep)
    titles = []
    prices = []
    locations = []
    rooms = []
    areas = []
    urls = []
    dates = []

    scrape_date = datetime.now().strftime('%Y-%m-%d')

    for article_element in article_elements:
        # Extract title
        titles.append(text_or_nan(article_element.find('p', class_='css-u3orbr e1g5xnx10')))

        # Extract URL. Only anchors that carry an href are considered, and
        # urljoin leaves an already-absolute href untouched instead of
        # gluing the site prefix in front of it.
        link_element = article_element.find('a', href=True)
        urls.append(
            urljoin("https://www.imovirtual.com", link_element['href'])
            if link_element else np.nan
        )

        # Extract price
        prices.append(text_or_nan(article_element.find('span', class_='css-2bt9f1 evk7nst0')))

        # Extract location
        locations.append(text_or_nan(article_element.find('p', class_='css-42r2ms eejmx80')))

        # Extract room count and area from <dl>. All <dt>/<dd> pairs are read
        # first so the result does not depend on the order of the labels.
        details = {}
        dl_element = article_element.find('dl', class_='css-12dsp7a e1clni9t1')
        if dl_element:
            pairs = zip(dl_element.find_all('dt'), dl_element.find_all('dd'))
            for dt, dd in pairs:
                # First occurrence of a label wins.
                details.setdefault(dt.text.strip(), dd.text.strip())

        rooms.append(details.get("Tipologia", np.nan))
        # Keep only the first whitespace-separated token of the "Zona" value.
        areas.append(details["Zona"].split(" ")[0] if "Zona" in details else np.nan)

        # Add scraping date for each item
        dates.append(scrape_date)

    print(f"Titles: {len(titles)}, Prices: {len(prices)}, Locations: {len(locations)}, Rooms: {len(rooms)}, Areas: {len(areas)}, URLs: {len(urls)}")

    return {
        'titles': titles,
        'prices': prices,
        'locations': locations,
        'rooms': rooms,
        'areas': areas,
        'urls': urls,
        'dates': dates
    }

In [None]:
# Notebook-level accumulators; the DataFrame cell below reads these names.
titles, prices, locations, rooms, areas, urls, dates = [], [], [], [], [], [], []

# Result key -> accumulator it feeds, in the order used for the length check.
accumulators = {
    "titles": titles,
    "prices": prices,
    "locations": locations,
    "rooms": rooms,
    "areas": areas,
    "urls": urls,
    "dates": dates,
}

for page in range(1, num_pages + 1):
    print(f"Processing page {page}...")
    page_url = base_url + str(page)

    # Scrape data for the current page; a falsy result means the fetch failed.
    scraped_data = scrape_data(page_url)
    if not scraped_data:
        print(f"Skipping page {page} due to scraping error.")
        continue

    page_columns = [scraped_data[key] for key in accumulators]

    # Validate that all lists have the same length before merging anything.
    lengths = [len(column) for column in page_columns]
    if len(set(lengths)) != 1:
        print(f"Warning: Data length mismatch on page {page}. Skipping...")
        print(f"Lengths: {lengths}")
        continue

    # Append to the main lists (in place, so the names above stay bound to them).
    for accumulator, column in zip(accumulators.values(), page_columns):
        accumulator.extend(column)

    print(f"Page {page} processed: {lengths[0]} items.")

In [6]:
# Pair each output column label with the list scraped for it.
column_labels = ["Title", "Price", "Location", "Rooms", "Area", "URL", "DateScraped"]
column_values = [titles, prices, locations, rooms, areas, urls, dates]
data = dict(zip(column_labels, column_values))

# One row per scraped listing.
df = pd.DataFrame(data)

In [7]:
# index must be the boolean False here: a non-empty string such as "0" is
# truthy, which makes pandas write the row index as an extra unnamed column
# that then shows up as "Unnamed: 0" when the file is read back.
df.to_csv("Files/apartmentsPortugal.csv", index=False)

In [None]:
from fake_useragent import UserAgent
import time
import random

# Load your DataFrame containing property URLs
df = pd.read_csv('Files/apartmentsPortugal.csv')

# Placeholder written whenever a field cannot be read from the page.
MISSING = "N/A"
# Seconds to wait for a property page before treating the request as failed.
REQUEST_TIMEOUT = 30
# Columns filled from the first three summary buttons, in button order.
SUMMARY_COLUMNS = ('Gross area', 'Rooms', 'Bathroom')

# Create empty columns in the existing DataFrame to store the extracted information.
# NOTE(review): 'Rooms' is also written by the listing scrape, so the value
# loaded from the CSV is reset here — confirm this is intended.
for column in (
    'Useful area',
    'Gross area',
    'Construction year',
    'Energetic certificate',
    'Enterprise',
    'Rooms',
    'Bathroom',
    'Description',
):
    df[column] = None


def _summary_value(button):
    """Return the text of the value <div> inside a summary button, or "N/A"."""
    value_div = button.find('div', class_='css-1ftqasz')
    return value_div.get_text(strip=True) if value_div else MISSING


def _labelled_value(soup, label):
    """Return the text of the <p> that follows the first <p> containing ``label``.

    Only <p> elements inside the detail <div> containers are searched. "N/A" is
    returned when the label is absent or has no following <p>.
    """
    for div in soup.find_all('div', class_='css-t7cajz e15n0fyo1'):
        p_elements = div.find_all('p', class_='e15n0fyo2 css-nlohq6')
        for i, p in enumerate(p_elements):
            # The value lives in the <p> right after the label, if there is one.
            if label in p.get_text(strip=True) and i + 1 < len(p_elements):
                return p_elements[i + 1].get_text(strip=True)
    return MISSING


# Initialize UserAgent object
ua = UserAgent()

url_counter = 0

for property_number, url in enumerate(df['URL']):
    # Send a request to the URL with a random User-Agent. A separate name is
    # used so the module-level `headers` read by other cells is not rebound.
    request_headers = {'User-Agent': ua.random}
    try:
        response = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A single unreachable or malformed URL must not abort the whole run.
        print(f"Failed to retrieve property page for {url} ({exc})")
        response = None

    if response is not None and response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Useful area is not extracted from the page; only the placeholder is stored.
        df.at[property_number, 'Useful area'] = MISSING

        # Get the gross_area, rooms, and bathrooms from the summary buttons.
        buttons = soup.find_all('button', class_='eezlw8k1 css-ds0a69')
        if len(buttons) >= 3:
            for column, button in zip(SUMMARY_COLUMNS, buttons):
                df.at[property_number, column] = _summary_value(button)
        else:
            # Fewer than three buttons: mark the same three columns as missing
            # (never the listing 'Area' column, which holds scraped data).
            for column in SUMMARY_COLUMNS:
                df.at[property_number, column] = MISSING

        # Get the construction year and energetic certificate from the detail list.
        df.at[property_number, 'Construction year'] = _labelled_value(soup, "Ano de construção")
        df.at[property_number, 'Energetic certificate'] = _labelled_value(soup, "Certificado energético")

        # Enterprise and description are not extracted from the page; only
        # the placeholder is stored.
        df.at[property_number, 'Enterprise'] = MISSING
        df.at[property_number, 'Description'] = MISSING

        # Increment the URL counter
        url_counter += 1

        # Check if 100 URLs have been processed and save the DataFrame
        if url_counter % 100 == 0:
            df.to_csv('Files/apartmentsPortugal.csv', index=False)
            print(f"Saved progress at {url_counter} URLs.")
    elif response is not None:
        print(f"Failed to retrieve property page for {url}")

    # Random delay between requests (0.5 to 1.5 seconds), applied after
    # failures too so a struggling server is not hit in a tight loop.
    time.sleep(random.uniform(0.5, 1.5))

# Save the updated DataFrame to the existing CSV file
df.to_csv('Files/apartmentsPortugal.csv', index=False)

print("Information extracted and added to apartmentsPortugal.csv.")


In [None]:
# Load your DataFrame containing property URLs
df = pd.read_csv('Files/apartmentsPortugal.csv')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Seconds to wait on any single HTTP request before treating it as failed.
REQUEST_TIMEOUT = 30
# Matches the "large" image variants embedded in the page's inline scripts.
image_url_pattern = re.compile(r'"large":"(https://ireland\.apollo\.olxcdn\.com[^"]+)"')

for property_number, url in enumerate(df['URL']):
    # Send a request to the URL; a network error skips this property instead
    # of aborting the whole run.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve property page for {url} ({exc})")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve property page for {url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect image URLs from every inline script, then drop repeats while
    # keeping first-seen order so each image gets exactly one file name.
    image_urls = []
    for script_element in soup.find_all('script'):
        script_content = script_element.string
        if script_content:
            image_urls.extend(image_url_pattern.findall(script_content))
    image_urls = list(dict.fromkeys(image_urls))

    if not image_urls:
        # Reported once per property rather than once per <script> tag.
        print("No image URLs found in script")
        continue

    property_folder = f'Files/PropertiesImages/Property{property_number}'
    os.makedirs(property_folder, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        try:
            image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print(f"Failed to download image {index + 1}")
            continue

        if image_response.status_code != 200:
            print(f"Failed to download image {index + 1}")
            continue

        image_extension = "webp"  # Assuming images are in webp format
        image_filename = f"Property{property_number}-Image{index + 1}.{image_extension}"
        image_path = os.path.join(property_folder, image_filename)

        with open(image_path, 'wb') as f:
            f.write(image_response.content)
        print(f"Image {index + 1} saved to {image_path}")

print("Images downloaded and saved for all properties.")


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re

# Load your DataFrame containing property URLs
df = pd.read_csv('Files/apartmentsPortugal.csv')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Choose a property to start from (zero-based position in the URL column).
start_property = 13843

# Seconds to wait on any single HTTP request before treating it as failed.
REQUEST_TIMEOUT = 30
# Matches the "large" image variants embedded in the page's inline scripts.
image_url_pattern = re.compile(r'"large":"(https://ireland\.apollo\.olxcdn\.com[^"]+)"')

# Slice by position so earlier properties are skipped up front, while
# property_number keeps counting from the start of the file.
remaining_urls = df['URL'].iloc[start_property:]

for property_number, url in enumerate(remaining_urls, start=start_property):
    # Send a request to the URL; a network error skips this property instead
    # of aborting the run and forcing another manual restart.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve property page for {url} ({exc})")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve property page for {url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect image URLs from every inline script, then drop repeats while
    # keeping first-seen order so each image gets exactly one file name.
    image_urls = []
    for script_element in soup.find_all('script'):
        script_content = script_element.string
        if script_content:
            image_urls.extend(image_url_pattern.findall(script_content))
    image_urls = list(dict.fromkeys(image_urls))

    if not image_urls:
        # Reported once per property rather than once per <script> tag.
        print("No image URLs found in script")
        continue

    property_folder = f'Files/PropertiesImages/Property{property_number}'
    os.makedirs(property_folder, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        try:
            image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print(f"Failed to download image {index + 1}")
            continue

        if image_response.status_code != 200:
            print(f"Failed to download image {index + 1}")
            continue

        image_extension = "webp"  # Assuming images are in webp format
        image_filename = f"Property{property_number}-Image{index + 1}.{image_extension}"
        image_path = os.path.join(property_folder, image_filename)

        with open(image_path, 'wb') as f:
            f.write(image_response.content)
        print(f"Image {index + 1} saved to {image_path}")

print("Images downloaded and saved for all properties.")
