In [2]:
import os
import re
import json
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup


In [None]:
# --- Scraper configuration -------------------------------------------------
# Browser-like request headers sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Search-results address; the scraping loop appends the page number to it.
base_url = "https://www.imovirtual.com/comprar/apartamento/?page="

# Number of result pages to scrape (pages 1..num_pages are requested).
num_pages = 3700

# Placeholder only: `df` is rebound to a DataFrame once scraping has finished.
df = list()

In [None]:
# Texts recognised as a listing type when they make up a whole <li> item.
_LISTING_TYPES = ('Em construção', 'Usado', 'Novo', 'Remodelado', 'Ruína', 'Para recuperar')


def _text_or_nan(element):
    """Return the stripped text of a parsed tag, or NaN when the tag was not found."""
    return element.text.strip() if element is not None else np.nan


def scrape_data(url, timeout=30):
    """Scrape one search-results page and return its listings as parallel lists.

    The page is downloaded with the module-level ``headers`` and every
    ``<article>`` element in it is treated as one listing.

    Parameters
    ----------
    url : str
        Address of the results page to download.
    timeout : float, optional
        Seconds to wait for the server. ``requests`` waits indefinitely when
        no timeout is given, which would stall a multi-thousand-page run.

    Returns
    -------
    tuple of list
        ``(titles, prices, locations, rooms, areas, bathrooms, listing_types,
        urls)``. Every list holds one entry per ``<article>`` element, in page
        order; a field that is not found is ``np.nan``. All eight lists are
        empty when the server answers with a status other than 200.

    Raises
    ------
    requests.RequestException
        If the request itself fails (connection error, timeout, ...).
    """
    titles = []
    prices = []
    locations = []
    rooms = []
    areas = []
    bathrooms = []
    listing_types = []
    urls = []

    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # An error page is not a results page; do not parse it as listings.
        print(f"Failed to retrieve listing page for {url} (status {response.status_code})")
        return titles, prices, locations, rooms, areas, bathrooms, listing_types, urls

    soup = BeautifulSoup(response.content, 'html.parser')

    for article_element in soup.find_all('article'):
        titles.append(_text_or_nan(article_element.find('span', class_='offer-item-title')))

        # Only consider anchors that actually carry an href, so a bare <a>
        # cannot raise KeyError. A distinct local name avoids shadowing `url`.
        link_element = article_element.find('a', href=True)
        listing_url = link_element['href'] if link_element is not None else np.nan
        urls.append(listing_url)

        prices.append(_text_or_nan(article_element.find(class_='offer-item-price')))

        # Keep only what follows the last ':' in the location paragraph.
        location = _text_or_nan(article_element.find('p', class_='text-nowrap'))
        if isinstance(location, str):
            location = location.split(':')[-1].strip()
        locations.append(location)

        rooms.append(_text_or_nan(article_element.find(class_='offer-item-rooms')))
        areas.append(_text_or_nan(article_element.find(class_='offer-item-area')))

        # Bathrooms and listing type are read from the <li> items of the
        # first element carrying either of these two classes.
        bathroom = np.nan
        listing_type = np.nan
        details_element = article_element.find(class_=['parameters-view', 'params-small'])
        if details_element is not None:
            for li_element in details_element.find_all('li'):
                text = li_element.text.strip()
                if 'Casas de Banho' in text:
                    bathroom = text.split(':')[-1].strip()
                elif text in _LISTING_TYPES:
                    listing_type = text

        bathrooms.append(bathroom)
        listing_types.append(listing_type)

    return titles, prices, locations, rooms, areas, bathrooms, listing_types, urls


In [None]:

# One accumulator per scraped field. The lists grow in lock-step, so position i
# in each of them refers to the same listing.
titles, prices, locations, rooms = [], [], [], []
areas, bathrooms, listing_types, urls = [], [], [], []

for page in range(1, num_pages + 1):
    print(page)
    page_url = f"{base_url}{page}"

    # scrape_data returns the eight per-page lists in this fixed order.
    page_results = scrape_data(page_url)
    (page_titles, page_prices, page_locations, page_rooms,
     page_areas, page_bathrooms, page_listing_types, page_urls) = page_results

    # Append this page's values to the running totals.
    titles.extend(page_titles)
    prices.extend(page_prices)
    locations.extend(page_locations)
    rooms.extend(page_rooms)
    areas.extend(page_areas)
    bathrooms.extend(page_bathrooms)
    listing_types.extend(page_listing_types)
    urls.extend(page_urls)

    print(f"Page {page}: Titles: {len(page_titles)}, Prices: {len(page_prices)}, ...")

In [None]:
# Number of listings collected so far: scrape_data appends exactly one entry
# per listing to each of the accumulated lists, so any of them gives the total.
len(listing_types)

In [None]:
# Assemble the scraped columns into a single table.
# Column labels and the lists that fill them, paired by position.
column_labels = (
    "URL", "Titles", "Price", "Location",
    "Rooms", "Areas", "Bathrooms", "Listing Type",
)
column_values = (
    urls, titles, prices, locations,
    rooms, areas, bathrooms, listing_types,
)
data = dict(zip(column_labels, column_values))

df = pd.DataFrame(data)
# Leave the frame as the last expression so the notebook renders it.
df

In [None]:
# Make sure the output folder exists: to_csv raises OSError when asked to write
# into a directory that is missing (e.g. on a fresh checkout).
os.makedirs("Files", exist_ok=True)

# The row index is written as the first, unnamed column (pandas default).
df.to_csv("Files/apartmentsPortugal.csv")

In [None]:
# Download the photos of every scraped listing.
#
# For each row of the saved CSV the listing page is fetched, image addresses
# are pulled out of its <script> tags, and each image is written to
# Files/PropertiesImages/Property<N>/Property<N>-Image<k>.webp, where N is the
# row position in the CSV. Failures are reported and skipped so that one bad
# listing does not stop the whole run.

# Load your DataFrame containing property URLs
df = pd.read_csv('Files/apartmentsPortugal.csv')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Seconds to wait for a response; requests never times out unless told to.
request_timeout = 30

# Matches the "large" image variant embedded in the page's script data.
image_url_pattern = re.compile(r'"large":"(https://ireland\.apollo\.olxcdn\.com[^"]+)"')

for property_number, url in enumerate(df['URL']):
    # Listings scraped without a link are stored as NaN; there is nothing to fetch.
    if pd.isna(url):
        print(f"Skipping property {property_number}: no URL")
        continue

    # Send a request to the URL
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        print(f"Failed to retrieve property page for {url}: {error}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve property page for {url}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect image URLs from every script tag before saving anything, so that
    # matches found in several scripts are numbered once instead of
    # overwriting each other's files.
    image_urls = []
    for script_element in soup.find_all('script'):
        script_content = script_element.string
        if script_content:
            image_urls.extend(image_url_pattern.findall(script_content))
    # Drop duplicates while keeping first-seen order.
    image_urls = list(dict.fromkeys(image_urls))

    if not image_urls:
        print(f"No image URLs found for {url}")
        continue

    property_folder = f'Files/PropertiesImages/Property{property_number}'
    os.makedirs(property_folder, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        try:
            image_response = requests.get(image_url, timeout=request_timeout)
        except requests.RequestException as error:
            print(f"Failed to download image {index + 1}: {error}")
            continue

        if image_response.status_code != 200:
            print(f"Failed to download image {index + 1}")
            continue

        image_extension = "webp"  # Assuming images are in webp format
        image_filename = f"Property{property_number}-Image{index + 1}.{image_extension}"
        image_path = os.path.join(property_folder, image_filename)

        with open(image_path, 'wb') as f:
            f.write(image_response.content)
        print(f"Image {index + 1} saved to {image_path}")

print("Images downloaded and saved for all properties.")
