In [1]:
import os
import re
import json
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime


In [2]:
# Load the listings CSV that the scraping cells further down write to this path.
df = pd.read_csv("Files/apartmentsPortugal.csv")
# A bare name as the last expression makes the notebook render the frame as
# this cell's output.
df

Unnamed: 0,URL,Titles,Price,Location,Rooms,Areas,Bathrooms,Listing Type,Useful area,Gross area,Construction year,Energetic certificate,Enterprise
0,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T2+1 c/ Lugar de Garagem e Arrumo ...,260 000 €,"Mafamude e Vilar do Paraíso, Vila Nova de Gaia...",T3,106 m²,2,Usado,106 m²,121 m²,2001.0,C,
1,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T1 para venda na Praia da Rocha,185 000 €,"Portimão, Faro",T1,"58,30 m²",1,Usado,"58,30 m²","68,30 m²",2001.0,D,não
2,https://www.imovirtual.com/pt/anuncio/apartame...,"Apartamento T4, 2 Suítes, Luxo, Fonte Nova, Av...",950 000 €,"Glória e Vera Cruz, Aveiro",T4,175 m²,4 ou mais,Em construção,175 m²,245 m²,2023.0,A+,sim
3,https://www.imovirtual.com/pt/anuncio/apartame...,Apartamento T4,780 000 €,"Vila do Conde, Porto",T4,"181,65 m²",,Em construção,"181,65 m²","207,90 m²",,B,sim
4,https://www.imovirtual.com/pt/anuncio/venda-de...,Venda de Apartamento T3 duplex no centro de Vi...,300 000 €,"Vila do Conde, Porto",T3,183 m²,2,,183 m²,,,D,não
...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,https://www.imovirtual.com/pt/anuncio/t1-novo-...,T1 Novo ao ISCAP,175 000 €,"São Mamede de Infesta e Senhora da Hora, Matos...",T1,55 m²,,Novo,55 m²,,,A,não
89996,https://www.imovirtual.com/pt/anuncio/apartame...,"Apartamento T2 Venda em Aldoar, Foz do Douro e...",625 000 €,"Aldoar, Foz do Douro e Nevogilde, Porto",T2,"78,80 m²",2,Usado,"78,80 m²","78,80 m²",,Isento / Em Trâmite,não
89997,https://www.imovirtual.com/pt/anuncio/t3-1-em-...,T3+1 em construção com excelentes acabamentos ...,470 000 €,"Montijo e Afonsoeiro, Montijo, Setúbal",T4,140 m²,3,Em construção,140 m²,190 m²,,A+,não
89998,https://www.imovirtual.com/pt/anuncio/apartame...,APARTAMENTO T3 NOVO NO CENTRO DE LEIRIA,290 000 €,"Leiria, Pousos, Barreira e Cortes, Leiria",T3,"138,70 m²",2,Em construção,"138,70 m²",,,Isento / Em Trâmite,não


In [2]:
# Scraper configuration.
# Search-results URL; the page loop appends the 1-based page number to it.
base_url = "https://www.imovirtual.com/comprar/apartamento/?page="
num_pages = 2  # Specify the number of pages to scrape
# Browser-like User-Agent sent with the search-page requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [3]:
def scrape_data(url, timeout=30):
    """Scrape one search-results page into parallel column lists.

    Parameters
    ----------
    url : str
        Full URL of the results page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict or None
        ``None`` when the request raises or the HTTP status is not 200.
        Otherwise a dict with the keys ``'titles'``, ``'prices'``,
        ``'locations'``, ``'rooms'``, ``'areas'``, ``'urls'`` and ``'dates'``.
        Every value is a list holding one entry per ``<article>`` found on the
        page; a field that could not be located is stored as ``np.nan`` and
        ``'dates'`` repeats the scrape date (``YYYY-MM-DD``).
    """
    try:
        # Sends the module-level `headers`; the timeout keeps a stalled
        # connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data. Request error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    article_elements = soup.find_all('article', {'data-cy': 'listing-item'})

    # Lists to store extracted data
    titles = []
    prices = []
    locations = []
    rooms = []
    areas = []
    urls = []
    dates = []

    scrape_date = datetime.now().strftime('%Y-%m-%d')

    for article_element in article_elements:
        # Extract title
        title_element = article_element.find('p', class_='css-u3orbr e1g5xnx10')
        titles.append(title_element.text.strip() if title_element else np.nan)

        # Extract URL; an anchor without an href is recorded as missing
        # instead of raising KeyError.
        url_element = article_element.find('a')
        href = url_element.get('href') if url_element else None
        urls.append(f"https://www.imovirtual.com{href}" if href else np.nan)

        # Extract price
        price_element = article_element.find('span', class_='css-2bt9f1 evk7nst0')
        prices.append(price_element.text.strip() if price_element else np.nan)

        # Extract location
        location_element = article_element.find('p', class_='css-42r2ms eejmx80')
        locations.append(location_element.text.strip() if location_element else np.nan)

        # Extract room count and area from <dl>
        dl_element = article_element.find('dl', class_='css-12dsp7a e1clni9t1')
        room = np.nan
        area = np.nan

        if dl_element:
            dt_elements = dl_element.find_all('dt')
            dd_elements = dl_element.find_all('dd')

            # Walk every <dt>/<dd> pair without stopping early, so both fields
            # are picked up whichever order the page lists them in.
            for dt, dd in zip(dt_elements, dd_elements):
                label = dt.text.strip()
                if label == "Tipologia":
                    room = dd.text.strip()
                elif label == "Zona":
                    # NOTE(review): "Zona" is assumed to be the label of the
                    # area entry - confirm against the live page markup.
                    area = dd.text.strip().split(" ")[0]  # Extract just the numeric value

        rooms.append(room)
        areas.append(area)

        # Add scraping date for each item
        dates.append(scrape_date)

    print(f"Titles: {len(titles)}, Prices: {len(prices)}, Locations: {len(locations)}, Rooms: {len(rooms)}, Areas: {len(areas)}, URLs: {len(urls)}")

    return {
        'titles': titles,
        'prices': prices,
        'locations': locations,
        'rooms': rooms,
        'areas': areas,
        'urls': urls,
        'dates': dates
    }

In [5]:
# Accumulators for the combined result: one list per field, one entry per listing.
titles, prices, locations, rooms, areas, urls, dates = [], [], [], [], [], [], []

for page in range(1, num_pages + 1):
    print(f"Processing page {page}...")
    page_url = f"{base_url}{page}"

    # scrape_data returns None on failure; an unusable result skips the page.
    scraped_data = scrape_data(page_url)
    if not scraped_data:
        print(f"Skipping page {page} due to scraping error.")
        continue

    # Unpack this page's columns in a fixed field order.
    (page_titles, page_prices, page_locations, page_rooms,
     page_areas, page_urls, page_dates) = (
        scraped_data[key]
        for key in ("titles", "prices", "locations", "rooms", "areas", "urls", "dates")
    )
    page_columns = (page_titles, page_prices, page_locations, page_rooms,
                    page_areas, page_urls, page_dates)

    # All columns must be equally long, otherwise rows would be misaligned.
    lengths = [len(column) for column in page_columns]
    if len(set(lengths)) != 1:
        print(f"Warning: Data length mismatch on page {page}. Skipping...")
        print(f"Lengths: {lengths}")
        continue

    # Extend each accumulator with the matching column of this page.
    all_columns = (titles, prices, locations, rooms, areas, urls, dates)
    for accumulator, column in zip(all_columns, page_columns):
        accumulator.extend(column)

    print(f"Page {page} processed: {lengths[0]} items.")

Processing page 1...
Titles: 40, Prices: 40, Locations: 40, Rooms: 40, Areas: 40, URLs: 40
Page 1 processed: 40 items.
Processing page 2...
Titles: 40, Prices: 40, Locations: 40, Rooms: 40, Areas: 40, URLs: 40
Page 2 processed: 40 items.


In [6]:
# Map each output column name to the list collected for it, then build the
# DataFrame in one step (column order follows the order given here).
data = dict(
    Title=titles,
    Price=prices,
    Location=locations,
    Rooms=rooms,
    Area=areas,
    URL=urls,
    DateScraped=dates,
)

df = pd.DataFrame(data)

In [7]:
# Write the listings without the row index. The previous `index="0"` was a
# truthy string, so the index was written anyway and came back as an
# "Unnamed: 0" column when the file was read again.
df.to_csv("Files/apartmentsPortugal.csv", index=False)

In [None]:
from fake_useragent import UserAgent
import time
import random

# Load your DataFrame containing property URLs
df = pd.read_csv('Files/apartmentsPortugal.csv')

# Columns filled in by this cell. They are created up front (as None) so every
# row has them even when its page cannot be fetched.
DETAIL_COLUMNS = [
    'Useful area',
    'Gross area',
    'Construction year',
    'Energetic certificate',
    'Enterprise',
    'Description',
]
for column in DETAIL_COLUMNS:
    df[column] = None

# Seconds to wait for a property page before giving up on it.
REQUEST_TIMEOUT = 30

# Initialize UserAgent object
ua = UserAgent()

url_counter = 0


def _text_or_na(soup, tag, class_):
    """Return the stripped text of the first matching element, or "N/A" if none."""
    element = soup.find(tag, class_=class_)
    return element.get_text(strip=True) if element else "N/A"


# `property_number` doubles as the row label for `df.at`; this relies on the
# default 0..n-1 index that read_csv produces.
for property_number, url in enumerate(df['URL']):
    # Send a request to the URL with a random User-Agent
    headers = {'User-Agent': ua.random}
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole run.
        print(f"Failed to retrieve property page for {url}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve property page for {url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Values are written to the columns created above. The earlier
    # 'useful_area' / 'gross_area' / ... spellings created separate columns
    # and left the intended ones empty.

    # Useful area and enterprise are not extracted yet; store the placeholder.
    df.at[property_number, 'Useful area'] = "N/A"
    df.at[property_number, 'Enterprise'] = "N/A"

    df.at[property_number, 'Gross area'] = _text_or_na(soup, 'div', 'css-1ftqasz')

    # NOTE(review): construction year and energetic certificate use the same
    # selector, so both columns receive the same text. Confirm the correct
    # selectors against the live page markup.
    df.at[property_number, 'Construction year'] = _text_or_na(soup, 'p', 'e15n0fyo2 css-nlohq6')
    df.at[property_number, 'Energetic certificate'] = _text_or_na(soup, 'p', 'e15n0fyo2 css-nlohq6')

    # Description stays None when the page has no description block.
    description_element = soup.find('div', {'data-cy': 'adPageAdDescription'})
    if description_element is not None:
        df.at[property_number, 'Description'] = description_element.text.strip()

    # Introduce a random delay between requests (e.g., between 0.5 and 1.5 seconds)
    delay = random.uniform(0.5, 1.5)
    time.sleep(delay)
    # Increment the URL counter
    url_counter += 1

    # Check if 100 URLs have been processed and save the DataFrame
    if url_counter % 100 == 0:
        df.to_csv('Files/apartmentsPortugal.csv', index=False)
        print(f"Saved progress at {url_counter} URLs.")

# Save the updated DataFrame to the existing CSV file
df.to_csv('Files/apartmentsPortugal.csv', index=False)

print("Information extracted and added to apartmentsPortugal.csv.")


In [None]:
# Load your DataFrame containing property URLs
df = pd.read_csv('Files/apartmentsPortugal.csv')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Seconds to wait for a page or an image before giving up on it.
REQUEST_TIMEOUT = 30

for property_number, url in enumerate(df['URL']):
    # Send a request to the URL; a network error skips this property only.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve property page for {url}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve property page for {url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Gather the image URLs from every <script> tag first, so image numbering
    # runs across the whole page and files from one script cannot overwrite
    # those from another.
    image_urls = []
    for script_element in soup.find_all('script'):
        script_content = script_element.string
        if script_content:
            # Use regex to extract image URLs from the script
            image_urls.extend(
                re.findall(r'"large":"(https://ireland\.apollo\.olxcdn\.com[^"]+)"', script_content)
            )
    # Drop repeated URLs while keeping first-seen order.
    image_urls = list(dict.fromkeys(image_urls))

    if not image_urls:
        # Reported once per property rather than once per script tag.
        print(f"No image URLs found for property {property_number}")
        continue

    property_folder = f'Files/PropertiesImages/Property{property_number}'
    os.makedirs(property_folder, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        try:
            image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print(f"Failed to download image {index + 1}")
            continue

        if image_response.status_code != 200:
            print(f"Failed to download image {index + 1}")
            continue

        image_extension = "webp"  # Assuming images are in webp format
        image_filename = f"Property{property_number}-Image{index + 1}.{image_extension}"
        image_path = os.path.join(property_folder, image_filename)

        with open(image_path, 'wb') as f:
            f.write(image_response.content)
        print(f"Image {index + 1} saved to {image_path}")

print("Images downloaded and saved for all properties.")


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re

# Load your DataFrame containing property URLs
df = pd.read_csv('Files/apartmentsPortugal.csv')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Seconds to wait for a page or an image before giving up on it.
REQUEST_TIMEOUT = 30

# Choose a property to start from (0-based row position in the CSV)
start_property = 13843

# Skip the rows before the starting property; `start=` keeps property_number
# equal to the row position, so folder and file names match a full run.
for property_number, url in enumerate(df['URL'].iloc[start_property:], start=start_property):
    # Send a request to the URL; a network error skips this property only.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve property page for {url}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve property page for {url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Gather the image URLs from every <script> tag first, so image numbering
    # runs across the whole page and files from one script cannot overwrite
    # those from another.
    image_urls = []
    for script_element in soup.find_all('script'):
        script_content = script_element.string
        if script_content:
            # Use regex to extract image URLs from the script
            image_urls.extend(
                re.findall(r'"large":"(https://ireland\.apollo\.olxcdn\.com[^"]+)"', script_content)
            )
    # Drop repeated URLs while keeping first-seen order.
    image_urls = list(dict.fromkeys(image_urls))

    if not image_urls:
        # Reported once per property rather than once per script tag.
        print(f"No image URLs found for property {property_number}")
        continue

    property_folder = f'Files/PropertiesImages/Property{property_number}'
    os.makedirs(property_folder, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        try:
            image_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print(f"Failed to download image {index + 1}")
            continue

        if image_response.status_code != 200:
            print(f"Failed to download image {index + 1}")
            continue

        image_extension = "webp"  # Assuming images are in webp format
        image_filename = f"Property{property_number}-Image{index + 1}.{image_extension}"
        image_path = os.path.join(property_folder, image_filename)

        with open(image_path, 'wb') as f:
            f.write(image_response.content)
        print(f"Image {index + 1} saved to {image_path}")

print("Images downloaded and saved for all properties.")
