<a href="https://colab.research.google.com/github/joaquimor/WebScrappingBooking/blob/main/WebScrappingBooking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Web Scraping Booking

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import random
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
import os


# Browser setup (optional tweaks): build the Chrome options and start the
# WebDriver session used by the rest of the script.
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # Uncomment to run in headless mode
# options.add_argument('--disable-gpu') # Uncomment to run in headless mode

# Starts a Chrome session; it is closed by driver.quit() at the end of the script.
driver = webdriver.Chrome(options=options)

# Booking.com search-results URL to scrape. The query string fixes the search:
# ss=Sao Paulo, checkin=2024-11-04, checkout=2024-11-08, group_adults=1,
# no_rooms=1, group_children=0, lang=en-us.
# NOTE(review): the check-in/check-out dates are hard-coded in the URL — update
# them (or the whole URL) before running.
url = "https://www.booking.com/searchresults.html?ss=Sao+Paulo%2C+Sao+Paulo+State%2C+Brazil&ssne=Philippines&ssne_untouched=Philippines&efdco=1&label=gen173nr-1FCAEoggI46AdIM1gEaCCIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AriwpLkGwAIB0gIkYjc0N2NmZDItMmRkNC00MzgxLTg0MzQtNGVmYTkzODAxOTQ12AIF4AIB&aid=304142&lang=en-us&sb=1&src_elem=sb&src=index&dest_id=-671824&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=35c984dc072f0636&ac_meta=GhAzNWM5ODRkYzA3MmYwNjM2IAAoATICZW46CFPDo28gcGF1QABKAFAA&checkin=2024-11-04&checkout=2024-11-08&group_adults=1&no_rooms=1&group_children=0"  # **REPLACE** with your own search URL

# Locator for the "Load more results" control: matches a <button> or a <span>
# whose text contains the label (a matching <span> is resolved to its ancestor
# <button> before clicking).
LOAD_MORE_LOCATOR = (
    By.XPATH,
    "//*[self::button or self::span][contains(text(), 'Load more results')]",
)
# First integer/decimal number found in the review-score text.
REVIEW_SCORE_RE = re.compile(r"(\d+\.?\d*)")
# How many times a stale button reference is re-located before giving up.
max_tentativas_stale = 3


def _texto_ou_none(card, seletor):
    """Return the stripped text of the first node in *card* matching *seletor*.

    Returns None when the selector matches nothing, so a missing field becomes
    an empty cell instead of raising AttributeError.
    """
    elemento = card.select_one(seletor)
    return elemento.text.strip() if elemento else None


try:
    driver.get(url)

    # Wait until at least one result card is present before interacting.
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="property-card"]')))

    # Keep clicking "Load more results" until the control can no longer be
    # found (timeout), is disabled, or an unexpected error occurs.
    while True:
        try:
            load_more = wait.until(EC.presence_of_element_located(LOAD_MORE_LOCATOR))

            fim_dos_resultados = False
            for tentativa in range(max_tentativas_stale):
                try:
                    # The locator may have matched the inner <span>; click the button itself.
                    load_more_button = (
                        load_more
                        if load_more.tag_name == "button"
                        else load_more.find_element(By.XPATH, "./ancestor::button")
                    )

                    if not load_more_button.is_enabled():
                        # A disabled button is a normal end-of-results signal;
                        # leave both loops without raising.
                        print("O botão 'Load more results' está desabilitado. Fim dos resultados.")
                        fim_dos_resultados = True
                        break

                    driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
                    time.sleep(random.uniform(1, 2))
                    load_more_button.click()
                    # Randomized pause to give the next batch of cards time to load.
                    time.sleep(random.uniform(5, 7))
                    break  # Click succeeded; look for the next "Load more" button.

                except StaleElementReferenceException:
                    # The DOM was re-rendered under us: re-locate and retry.
                    print(f"Stale element. Tentando obter o botão novamente ({tentativa + 1}/{max_tentativas_stale})")
                    load_more = wait.until(EC.presence_of_element_located(LOAD_MORE_LOCATOR))

            else:
                # for/else: every attempt hit a stale reference.
                print("Não foi possível clicar no botão 'Load more results' após várias tentativas.")
                break

            if fim_dos_resultados:
                break

        except NoSuchElementException:
            print("Botão 'Load more results' não encontrado. Fim dos resultados.")
            break

        except TimeoutException:
            print("Tempo limite excedido ao procurar o botão 'Load more results'.")
            break

        except Exception as e:
            # Best-effort pagination: report the problem and scrape what is loaded.
            print(f"Erro inesperado: {e}")
            break

    # Scrape every card currently present in the rendered page.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    cards = soup.select('[data-testid="property-card"]')

    dados_hoteis = []

    for card in cards:
        nome = _texto_ou_none(card, '[data-testid="title"]')
        localizacao = _texto_ou_none(card, '[data-testid="address"]')
        preco = _texto_ou_none(card, '[data-testid="price-and-discounted-price"]')

        # The rating is the first number in the review-score text, if any.
        review_score_text = _texto_ou_none(card, '[data-testid="review-score"]')
        match = REVIEW_SCORE_RE.search(review_score_text) if review_score_text else None
        avaliacao = float(match.group(1)) if match else None

        dados_hoteis.append([nome, localizacao, preco, avaliacao])

    # One row per card; missing fields are None.
    df = pd.DataFrame(dados_hoteis, columns=["Nome", "Localização", "Preço", "Avaliação"])

    # Save into the current user's Downloads folder instead of a path that
    # only exists on one machine.
    download_path = os.path.join(os.path.expanduser("~"), "Downloads")
    filename = "dados_booking.xlsx"
    filepath = os.path.join(download_path, filename)

    # Write the DataFrame to Excel; failures are reported, not raised.
    try:
        os.makedirs(download_path, exist_ok=True)
        df.to_excel(filepath, index=False)
        print(f"Arquivo salvo em: {filepath}")
    except Exception as e:
        print(f"Erro ao salvar o arquivo: {e}")

finally:
    # Always close the browser, even when scraping fails.
    driver.quit()