In [1]:
from fake_useragent import UserAgent

import re
import numpy as np
from bs4 import BeautifulSoup
import selenium
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService

In [2]:
# OLX listing URL that open_ad_page uses as its default target
# (path segments: imoveis/aluguel/casas, region of Uberlândia, MG).
URL = "https://www.olx.com.br/imoveis/aluguel/casas/estado-mg/regiao-de-uberlandia-e-uberaba/uberlandia"

In [3]:
# Webdriver configuration
def setup_webdriver():
    """Create a Chrome webdriver with a random User-Agent and maximize its window.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``.
    The created driver is also stored in the module-level global ``driver``.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    global driver
    opts = Options()
    # `random` is an instance property of UserAgent. Reading it from the class
    # (UserAgent.random) yields the property object itself, which would be
    # formatted as "<property object at 0x...>" instead of a real User-Agent.
    opts.add_argument(f"--user-agent={UserAgent().random}")

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=opts
    )
    driver.maximize_window()
    return driver

In [4]:
# Opens the page in the browser controlled by the webdriver
def open_ad_page(driver, url: str = URL):
    """Load ``url`` in ``driver``; ``url`` defaults to the module-level ``URL``."""
    driver.get(url)

In [5]:
# Looks up an element on the page, waiting until it becomes clickable
def find_element_with_implicit_wait(by, value, driver, TIMEOUT=10):
    """Wait up to ``TIMEOUT`` seconds for the element located by ``(by, value)`` to be clickable.

    Despite the function name, the wait is performed with ``WebDriverWait``
    and the ``element_to_be_clickable`` expected condition.

    Returns:
        Whatever ``WebDriverWait.until`` returns on success, or ``None`` if any
        exception is raised (the exception is printed, not re-raised).
    """
    locator = (by, value)
    try:
        found = WebDriverWait(driver, TIMEOUT).until(
            EC.element_to_be_clickable(locator)
        )
    except Exception as e:
        print(f'ERRO AO ENCONTRAR EM TEMPO LIMITE({TIMEOUT}s): {e.__class__} - {e.__repr__()}')
        return None
    else:
        return found

In [6]:
# Finds an element and clicks on it
def find_element_and_click(by, value, driver):
    """Locate the element via ``find_element_with_implicit_wait`` and click it.

    Does nothing when the lookup returns a falsy value (e.g. ``None``).
    """
    target = find_element_with_implicit_wait(by, value, driver)
    if not target:
        return
    target.click()

In [7]:
# Changes the page's viewing layout
def click_change_layout(driver):
    """Click the button found at a fixed absolute XPath on the current page.

    NOTE(review): the XPath is positional, so it presumably breaks whenever the
    page structure changes; confirm it still points at the layout toggle.
    """
    layout_button_xpath = '/html/body/div[1]/div[1]/main/div/div[2]/main/div[3]/div[1]/div[2]/button[2]'
    find_element_and_click(By.XPATH, layout_button_xpath, driver)

In [8]:
# Returns the tag's text with `token` replaced by a space and stripped,
# or `default` when the tag was not found.
def _clean_text(tag, token, default):
    if tag is None:
        return default
    return tag.text.replace(token, " ").strip()


# Collects the house-ad data from the current page
def coleta(driver):
    """Parse ``driver.page_source`` and extract one row per ad card.

    Ad cards are the ``<section>`` elements whose class attribute is
    ``"olx-ad-card olx-ad-card--vertical"``.

    Returns:
        A list of rows, each
        ``[price, bedrooms, square_meters, garage, bathrooms, city, neighborhood]``.
        Found values are the scraped strings (with "R$", "m²" or "+" replaced
        by a space and stripped). A missing field is ``np.nan``, except a
        missing garage, which is ``0``.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")
    all_houses = soup.find_all(
        "section", attrs={"class": "olx-ad-card olx-ad-card--vertical"}
    )

    # Loop-invariant patterns, compiled once.
    price_class = re.compile(r"\bolx-ad-card__price\b")
    square_label = re.compile(r"quadrado?")
    garage_label = re.compile(r"garagem")
    bathroom_label = re.compile(r"banheiro?")
    bedroom_label = re.compile(r"quarto?")
    address_class = "olx-text olx-text--caption olx-text--block olx-text--regular"

    house_list = []
    for house in all_houses:
        house_prices = _clean_text(
            house.find("h3", attrs={"class": price_class}), "R$", np.nan
        )
        square_meters = _clean_text(
            house.find("span", attrs={"aria-label": square_label}), "m²", np.nan
        )
        # A card without a garage label is recorded as 0 garages, not as missing.
        garage = _clean_text(
            house.find("span", attrs={"aria-label": garage_label}), "+", 0
        )
        bathroom = _clean_text(
            house.find("span", attrs={"aria-label": bathroom_label}), "+", np.nan
        )
        bedroom = _clean_text(
            house.find("span", attrs={"aria-label": bedroom_label}), "+", np.nan
        )

        city = np.nan
        neighborhood = np.nan
        address = house.find("p", attrs={"class": address_class})
        if address is not None:
            parts = [part.strip() for part in address.text.split(",")]
            city = parts[0]
            # An address without a comma has no neighborhood part. Keep the ad
            # with a NaN neighborhood instead of silently discarding the row.
            if len(parts) > 1:
                neighborhood = parts[1]

        house_list.append(
            [house_prices, bedroom, square_meters, garage, bathroom, city, neighborhood]
        )

    return house_list

In [9]:
# Moves from the current page to the next page
def click_change_page(driver, current_page, base_url=None):
    """Navigate ``driver`` to the results page that follows ``current_page``.

    The next page is requested by adding the ``o=<page>`` query parameter to
    the listing URL.

    Args:
        driver: Object exposing ``get(url)``.
        current_page: Number of the page currently shown; page
            ``current_page + 1`` is requested.
        base_url: Listing URL to paginate. Defaults to the module-level
            ``URL`` so the address is not duplicated here.

    Returns:
        ``True`` if the navigation call completed, ``False`` if it raised
        (the exception is printed, not re-raised).
    """
    if base_url is None:
        base_url = URL
    try:
        next_page = current_page + 1
        # Append to an existing query string rather than starting a second one.
        separator = "&" if "?" in base_url else "?"
        new_url = f"{base_url}{separator}o={next_page}"
        driver.get(new_url)
        return True
    except Exception as e:
        print(f'ERRO AO IR PARA PRÓXIMA PÁGINA: {e.__class__} - {e.__repr__()}')
        return False
        

In [10]:
# Starts the webdriver and collects the data
LAST_PAGE = 51  # pages 1..LAST_PAGE are scraped

driver = setup_webdriver()
open_ad_page(driver)
click_change_layout(driver)
data = list()

for current_page in range(1, LAST_PAGE + 1):
    data += coleta(driver)
    # Navigate only when another page will actually be scraped; loading a page
    # after the last collected one is a wasted request.
    if current_page == LAST_PAGE:
        break
    if not click_change_page(driver, current_page):
        break

ERRO AO ENCONTRAR EM TEMPO LIMITE(10s): <class 'selenium.common.exceptions.NoSuchWindowException'> - NoSuchWindowException()


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=126.0.6478.128)
Stacktrace:
	GetHandleVerifier [0x0090C203+27395]
	(No symbol) [0x008A3E04]
	(No symbol) [0x007A1B7F]
	(No symbol) [0x0077E483]
	(No symbol) [0x0080A06F]
	(No symbol) [0x0081C3D6]
	(No symbol) [0x00803736]
	(No symbol) [0x007D7541]
	(No symbol) [0x007D80BD]
	GetHandleVerifier [0x00BC3AB3+2876339]
	GetHandleVerifier [0x00C17F7D+3221629]
	GetHandleVerifier [0x0098D674+556916]
	GetHandleVerifier [0x0099478C+585868]
	(No symbol) [0x008ACE44]
	(No symbol) [0x008A9858]
	(No symbol) [0x008A99F7]
	(No symbol) [0x0089BF4E]
	BaseThreadInitThunk [0x75DAFA29+25]
	RtlGetAppContainerNamedObjectPath [0x77AE75F4+228]
	RtlGetAppContainerNamedObjectPath [0x77AE75C4+180]


In [None]:
# Column labels, in the same order as the fields of each row in `data`
columns_name = [
    'Rent_price',
    'Bedrooms',
    'Square_meters',
    'Garage',
    'Bathrooms',
    'City',
    'Neighborhood',
]
df = pd.DataFrame(data=data, columns=columns_name)

In [None]:
# Saves the DataFrame to a CSV file (without the index column)
namefile_csv = "rent_house_price_da.csv"
df.to_csv(path_or_buf=namefile_csv, index=False)

In [None]:
# Shut down the webdriver created by setup_webdriver()
driver.quit()