In [1]:
import os
import re
import requests
import numpy as np
from bs4 import BeautifulSoup


In [2]:
base_url = "https://www.imovirtual.com/comprar/apartamento/?page="
num_pages = 2  # Specify the number of pages to scrape
df = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [3]:
def scrape_data(url):
    response = requests.get(url, headers=headers)
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')
    article_elements = soup.find_all('article')

    titles = []
    prices = []
    locations = []
    rooms = []
    areas = []
    bathrooms = []
    listing_types = []
    urls = []


    for article_element in article_elements:
        # Extract title
        title_element = article_element.find('span', class_='offer-item-title')
        title = title_element.text.strip() if title_element else np.nan
        titles.append(title)

        # Extract URL
        url_element = article_element.find('a')
        url = url_element['href'] if url_element else np.nan
        urls.append(url)

        # Extract price
        price_element = article_element.find(class_='offer-item-price')
        price = price_element.text.strip() if price_element else np.nan
        prices.append(price)

        # Extract location
        location_element = article_element.find('p', class_='text-nowrap')
        location = location_element.text.strip().split(':')[-1].strip() if location_element else np.nan
        locations.append(location)

        # Extract room
        room_element = article_element.find(class_='offer-item-rooms')
        room = room_element.text.strip() if room_element else np.nan
        rooms.append(room)

        # Extract area
        area_element = article_element.find(class_='offer-item-area')
        area = area_element.text.strip() if area_element else np.nan
        areas.append(area)

        # Extract bathrooms and listing type
        details_element = article_element.find(class_=['parameters-view', 'params-small'])
        bathroom = np.nan
        listing_type = np.nan

        if details_element:
            li_elements = details_element.find_all('li')
            for li_element in li_elements:
                text = li_element.text.strip()
                if 'Casas de Banho' in text:
                    bathroom = text.split(':')[-1].strip()
                elif text in ['Em construção', 'Usado', 'Novo', 'Remodelado', 'Ruína', 'Para recuperar']:
                    listing_type = text

        bathrooms.append(bathroom)
        listing_types.append(listing_type)


    return titles, prices, locations, rooms, areas, bathrooms, listing_types, urls


In [4]:
# Scrape prices, rooms, areas, bathrooms, and listing types from the specified number of pages
titles = []
prices = []
locations = []
rooms = []
areas = []
bathrooms = []
listing_types = []
urls = []

for page in range(1, num_pages + 1):
    print(page)
    page_url = base_url + str(page)
    (
        page_titles,
        page_prices,
        page_locations,
        page_rooms,
        page_areas,
        page_bathrooms,
        page_listing_types,
        page_urls,
    ) = scrape_data(page_url)

    titles += page_titles
    prices += page_prices
    locations += page_locations
    rooms += page_rooms
    areas += page_areas
    bathrooms += page_bathrooms
    listing_types += page_listing_types
    urls += page_urls

    print(
        f"Page {page}: Titles: {len(page_titles)}, Prices: {len(page_prices)}, ..."
    )

1
Page 1: Titles: 24, Prices: 24, ...
2
Page 2: Titles: 24, Prices: 24, ...


In [5]:
len(listing_types)

48

In [6]:
import pandas as pd
# Create a DataFrame
data = {
    "URL": urls,
    "Titles": titles,
    "Price": prices,
    "Location": locations,
    "Rooms": rooms,
    "Areas": areas,
    "Bathrooms": bathrooms,
    "Listing Type": listing_types,
}

df = pd.DataFrame(data)
# Print the DataFrame
df

Unnamed: 0,URL,Titles,Price,Location,Rooms,Areas,Bathrooms,Listing Type
0,https://www.imovirtual.com/pt/anuncio/lisboa-s...,Lisboa Sta Maria Maior vende-se Apartamento T1...,215 000 €,"Santa Maria Maior, Lisboa",T1,30 m²,1,
1,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T3 com terraço,520 000 €,"Ericeira, Mafra, Lisboa",T3,151 m²,2,Usado
2,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T2 na Cavaleira,225 000 €,"Algueirão-Mem Martins, Sintra, Lisboa",T2,95 m²,1,Usado
3,https://www.imovirtual.com/pt/anuncio/t4-total...,T4 totalmente remodelado com vista rio,520 000 €,"Carcavelos e Parede, Cascais, Lisboa",T4,128 m²,2,
4,https://www.imovirtual.com/pt/anuncio/apartame...,"Apartamento T1 novo, em condomínio fechado de ...",179 000 €,Braga (São José de São Lázaro e São João do So...,T1,55 m²,1,Novo
5,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T2 na zona do Hospital S. João,239 000 €,"São Mamede de Infesta e Senhora da Hora, Matos...",T2,75 m²,1,Usado
6,https://www.imovirtual.com/pt/anuncio/apartame...,"Apartamento T1, Empreendimento Asprela Easy, e...",225 000 €,"Paranhos, Porto",T1,60 m²,1,Em construção
7,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T1Kit no Asprela Domus III,235 020 €,"Paranhos, Porto",T1,62 m²,1,Novo
8,https://www.imovirtual.com/pt/anuncio/apartame...,"Apartamento T1 Novo - Paranhos, Porto",210 000 €,"Paranhos, Porto",T1,59 m²,1,Novo
9,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T2 - Matriz Loja - Comércio,94 500 €,"Valongo, Porto",T2,"70,40 m²",2,


In [7]:
df.to_csv("apartmentsPortugal.csv")

In [8]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Load your DataFrame containing property URLs
# Replace 'df' with the actual name of your DataFrame
df = pd.read_csv('apartmentsPortugal.csv')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Create a directory to save images if it doesn't exist
image_folder = 'property_images'
os.makedirs(image_folder, exist_ok=True)

# Loop through the DataFrame and scrape image URLs from property pages
for index, row in df.iterrows():
    property_url = row['URL']  # Replace 'URL' with the actual column name

    response = requests.get(property_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all image tags
        img_elements = soup.find_all('img')
        if img_elements:
            # Create a folder for the property if it doesn't exist
            property_folder = os.path.join(image_folder, f"property_{index}")
            os.makedirs(property_folder, exist_ok=True)

            for img_element in img_elements:
                image_url = img_element.get('src')
                if image_url:
                    # Extract image filename from the URL
                    image_filename = os.path.basename(image_url)
                    image_path = os.path.join(property_folder, image_filename)

                    # Download and save the image
                    image_response = requests.get(image_url)
                    if image_response.status_code == 200:
                        with open(image_path, 'wb') as f:
                            f.write(image_response.content)
                        print(f"Image for property {index} saved to {image_path}")
                    else:
                        print(f"Failed to download image for property {index}")
                else:
                    print(f"Image URL not found in property {index}")
        else:
            print(f"No image tags found in property {index}")
    else:
        print(f"Failed to retrieve property page for {index}")

print("All images downloaded and saved.")


Image for property 0 saved to property_images\property_0\imovirtualpt.svg
Image for property 0 saved to property_images\property_0\image;s=1280x1024;q=80
Image for property 0 saved to property_images\property_0\image;s=314x236;q=80
Image for property 0 saved to property_images\property_0\image;s=314x236;q=80
Image for property 0 saved to property_images\property_0\image;s=314x236;q=80
Image for property 0 saved to property_images\property_0\image;s=314x236;q=80
Image for property 0 saved to property_images\property_0\imovirtualpt.svg
Image for property 0 saved to property_images\property_0\app_store.png
Image for property 0 saved to property_images\property_0\google_play.png
Image for property 0 saved to property_images\property_0\escolha-consumidor-2022.webp
Image for property 0 saved to property_images\property_0\cinco-estrelas-2022.webp
Image for property 1 saved to property_images\property_1\imovirtualpt.svg
Image for property 1 saved to property_images\property_1\image;s=1280x1024

KeyboardInterrupt: 