## **Crawling**


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to the GeckoDriver executable (machine-specific; adjust if it lives elsewhere)
service = Service("C:\\geckodriver\\geckodriver.exe")

# Start the Firefox browser
driver = webdriver.Firefox(service=service)

# Search results page to crawl
url = "https://www.vrbo.com/search?destination=Paris%20%28and%20vicinity%29%2C%20France&flexibility=0_DAY&d1=&startDate=&d2=&endDate=&adults=2&regionId=179898&isInvalidatedDate=false&theme=&userIntent=&semdtl=&sort=RECOMMENDED"

# Explicit wait helper: gives up after 15 seconds
wait = WebDriverWait(driver, 15)
try:
    # Navigate inside the try block so that the finally clause still closes
    # the browser when loading the page fails.
    driver.get(url)

    # Wait until the anchors that link to the individual listings are present
    listings = wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "a[data-stid='open-hotel-information']"))
    )

    # Extract the listing URLs
    room_links = [listing.get_attribute("href") for listing in listings]

    # Show the extracted URLs
    print(f"URLs récupérées : {room_links}")

except Exception as e:
    print(f"Erreur lors de l'extraction des URLs : {e}")

finally:
    # Always close the browser
    driver.quit()


URLs récupérées : ['https://www.vrbo.com/622862?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1743154498782&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212953602&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=0f85ebff-3448-4b9e-b443-ad52bf0ad8dc&sort=RECOMMENDED&userIntent=&expediaPropertyId=34516544', 'https://www.vrbo.com/907637a?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1743154498786&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212953586&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=0f85ebff-3448-4b9e-b443-ad52bf0ad8dc&sort=RECOMMENDED&userIntent=&expediaPropertyId=27692749', 'https://www.vrbo.com/570176?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1743154498789&referrerUrl=aHR0c

## **Crawling**

In [9]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Set up the local Firefox driver
service = Service("C:\\geckodriver\\geckodriver.exe")  # <-- change if GeckoDriver lives elsewhere
driver = webdriver.Firefox(service=service)
wait = WebDriverWait(driver, 15)

# Open the search results page. The driver must stay open for the cells'
# later scraping steps, so it is only closed here when navigation itself fails.
try:
    driver.get("https://www.vrbo.com/search?destination=Paris%20%28and%20vicinity%29%2C%20France&adults=2&sort=RECOMMENDED")
except Exception:
    driver.quit()
    raise
time.sleep(5)

# Collect the listing links
room_links = []
try:
    listings = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[data-stid='open-hotel-information']")))
    hrefs = (el.get_attribute("href") for el in listings)
    # dict.fromkeys removes duplicates while keeping the order in which the
    # links appear on the page (a set would return them in arbitrary order).
    # Anchors whose href attribute is missing are skipped.
    room_links = list(dict.fromkeys(href for href in hrefs if href))
    print(f"{len(room_links)} liens récupérés.")
except Exception as e:
    print("Erreur lors de l'extraction des URLs :", e)

# 🔎 Scraping d'un logement
def scrape_listing(url):
    """Scrape the description and the guest comments of one listing page.

    Relies on the module-level ``driver`` and ``wait`` objects. Errors derived
    from ``Exception`` are reported with ``print`` instead of being raised, so
    one failing page does not abort a batch run; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the run can still be stopped.

    Args:
        url: Address of the listing page to open.

    Returns:
        A dict with three keys: ``"url"`` (the input), ``"description"`` (text
        of the first ``div`` with ``data-stid="content-markup"``, or ``""``)
        and ``"comments"`` (list of review texts longer than 10 characters,
        possibly empty).
    """
    data = {"url": url, "description": "", "comments": []}
    try:
        driver.get(url)
        time.sleep(5)

        # Explicit wait for the description block; a miss is only reported
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[@data-stid='content-markup']")))
        except Exception:
            print("⚠️ Description non trouvée")

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        desc = soup.find("div", {"data-stid": "content-markup"})
        if desc:
            data["description"] = desc.get_text(strip=True)

        # --- Comments ---
        try:
            see_reviews_btn = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@data-stid='reviews-link']"))
            )
            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", see_reviews_btn)
            time.sleep(2)

            # Scroll to the bottom several times to load more reviews
            for _ in range(5):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

            # Click every "See more" button to expand the review texts
            more_btns = driver.find_elements(By.XPATH, "//button[contains(text(), 'See more')]")
            for btn in more_btns:
                try:
                    driver.execute_script("arguments[0].click();", btn)
                    time.sleep(0.5)
                except Exception:
                    # Best effort: a button that cannot be clicked is skipped
                    continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            spans = soup.find_all("span", {"itemprop": "description"})
            for span in spans:
                txt = span.get_text(strip=True)
                # Keep only texts longer than 10 characters
                if txt and len(txt) > 10:
                    data["comments"].append(txt)
        except Exception as e:
            # The listing is still returned, with whatever was collected so far
            print(f"[{url}] Commentaires non récupérés :", type(e).__name__)

    except Exception as e:
        print(f"[{url}] Erreur :", e)

    return data


# Scrape the first MAX_LISTINGS listings (raise the limit if needed)
MAX_LISTINGS = 50
links_to_scrape = room_links[:MAX_LISTINGS]

all_data = []
try:
    for i, link in enumerate(links_to_scrape, start=1):
        # The total shown is the number of listings actually scraped,
        # not the number of links found on the search page.
        print(f"Scraping {i}/{len(links_to_scrape)}: {link}")
        result = scrape_listing(link)
        print(f"→ {len(result['comments'])} commentaires trouvés.")
        all_data.append(result)
finally:
    # Close the browser even when the loop is interrupted or fails
    driver.quit()

# Export to Excel: one row per (listing, comment) pair
flat_rows = []

for entry in all_data:
    # A listing without comments still gets one row, with an empty comment
    comments = entry["comments"] if entry["comments"] else [""]
    flat_rows.extend(
        {
            "url": entry["url"],
            "description": entry["description"],
            "comment": text,
        }
        for text in comments
    )

df = pd.DataFrame(flat_rows)
df.to_excel("scraping_resultss.xlsx", index=False)
print("✅ Données enregistrées dans scraping_resultss.xlsx")


50 liens récupérés.
Scraping 1/50: https://www.vrbo.com/1516567ha?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1743256223261&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212953593&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=8b5a5035-9715-4e22-ab39-040ad9d5d09c&sort=RECOMMENDED&userIntent=&expediaPropertyId=30363821
→ 100 commentaires trouvés.
Scraping 2/50: https://www.vrbo.com/888705?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1743256223307&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212953609&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=8b5a5035-9715-4e22-ab39-040ad9d5d09c&sort=RECOMMENDED&userIntent=&expediaPropertyId=33492071
→ 109 commentaires trouvés.
Scraping 3/50: 

## **Scraping d'une description d'une URL**

In [9]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Start Firefox (no explicit GeckoDriver path is passed here)
driver = webdriver.Firefox()
url = "https://www.vrbo.com/pdp/lo/10116304?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1735673883028&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212958361&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=6c3e040e-c944-4f40-b8c2-5f4249af8813&sort=RECOMMENDED&userIntent=&expediaPropertyId=10116304&pdpImageUrls=https%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F093c7021.jpg%3Fimpolicy%3Dresizecrop%26rw%3D598%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F09ee5527.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F0ec02ac8.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2Fw1867h2493x54y53-103d366f.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F13f7868d.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit&propertyName=Le+Rayz+Vend%C3%B4me"

try:
    driver.get(url)

    # Fixed pause to give the page time to load
    time.sleep(5)

    # Grab the page HTML as a string; the browser is not needed afterwards
    html_content = driver.page_source
finally:
    # Close the browser even if loading the page failed
    driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find the description blocks (div elements with data-stid="content-markup")
descriptions = soup.find_all('div', {'data-stid': 'content-markup'})

for desc in descriptions:
    print(desc.get_text(strip=True))


Aparthotel with a 24-hour front desk, a short walk to Palais Garnier
Along with concierge services, this aparthotel has dry cleaning and a 24-hour front desk. WiFi in public areas is free. Additionally, express check-out, tour/ticket assistance, and a garden are onsite.  Each apartment offers free WiFi, premium bedding, and an LED TV with satellite channels. A separate tub and shower, an espresso maker, and free bottled water are among the other amenities available to guests.
Le Rayz Vendôme offers 20 air-conditioned accommodations with espresso makers and safes. Beds feature premium bedding. 83-cm LED televisions come with satellite channels.Bathrooms include separate bathtubs and showers, designer toiletries, complimentary toiletries, and hair dryers. Guests can surf the web using the complimentary wireless Internet access. Additionally, rooms include complimentary bottled water and blackout drapes/curtains. Housekeeping is offered daily and irons/ironing boards can be requested.
The

## **Scraping d'un commentaire**

In [16]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Start Firefox (no GeckoDriver path is passed to the constructor here)
driver = webdriver.Firefox()

# Target listing page
url = "https://www.vrbo.com/pdp/lo/10116304?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1735673883028&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212958361&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=6c3e040e-c944-4f40-b8c2-5f4249af8813&sort=RECOMMENDED&userIntent=&expediaPropertyId=10116304"

try:
    # Navigate inside the try block so that the finally clause still closes
    # the browser when loading the page fails.
    driver.get(url)

    # Step 1: wait (up to 10 seconds) for the reviews button to be clickable
    wait = WebDriverWait(driver, 10)
    see_reviews_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-stid='reviews-link']")))

    # Step 2: click the reviews button
    see_reviews_button.click()

    # Step 3: wait for at least one review article to be present
    wait.until(EC.presence_of_element_located((By.XPATH, "//article[@itemprop='review']")))

    # Step 4: read the text of the first review
    first_comment = driver.find_element(By.XPATH, "//article[@itemprop='review']//span[@itemprop='description']").text
    print("Premier commentaire extrait : ", first_comment)

except Exception as e:
    print("Erreur : ", e)

finally:
    # Always close the browser
    driver.quit()


Premier commentaire extrait :  Our room was very very nice. We also had a terrace but the weather was too cold to enjoy it. The bed was comfortable and the room was very modern. Location was central to the Louvre and gardens. The Hop On Hop off bus was also nearby.


In [None]:
# Scratch cell: a bare string literal holding a listing URL. The string is not
# assigned to any name, so running the cell only echoes it as the cell output.
"https://www.vrbo.com/pdp/lo/10116304?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1735673883028&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212958361&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=6c3e040e-c944-4f40-b8c2-5f4249af8813&sort=RECOMMENDED&userIntent=&expediaPropertyId=10116304&pdpImageUrls=https%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F093c7021.jpg%3Fimpolicy%3Dresizecrop%26rw%3D598%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F09ee5527.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F0ec02ac8.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2Fw1867h2493x54y53-103d366f.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit%2Chttps%3A%2F%2Fimages.trvl-media.com%2Flodging%2F11000000%2F10120000%2F10116400%2F10116304%2F13f7868d.jpg%3Fimpolicy%3Dresizecrop%26rw%3D297%26ra%3Dfit&propertyName=Le+Rayz+Vend%C3%B4me"

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# NOTE(review): this cell repeats the single-comment extraction cell found
# earlier in the notebook; consider keeping only one of the two.

# Start Firefox (no GeckoDriver path is passed to the constructor here)
driver = webdriver.Firefox()

# Target listing page
url = "https://www.vrbo.com/pdp/lo/10116304?dateless=true&x_pwa=1&rfrr=HSR&pwa_ts=1735673883028&referrerUrl=aHR0cHM6Ly93d3cudnJiby5jb20vSG90ZWwtU2VhcmNo&useRewards=false&adults=2&regionId=179898&destination=Paris+%28and+vicinity%29%2C+France&destType=MARKET&neighborhoodId=553248635212958361&latLong=48.853564%2C2.348095&privacyTrackingState=CAN_TRACK&searchId=6c3e040e-c944-4f40-b8c2-5f4249af8813&sort=RECOMMENDED&userIntent=&expediaPropertyId=10116304"

try:
    # Navigate inside the try block so that the finally clause still closes
    # the browser when loading the page fails.
    driver.get(url)

    # Step 1: wait (up to 10 seconds) for the reviews button to be clickable
    wait = WebDriverWait(driver, 10)
    see_reviews_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-stid='reviews-link']")))

    # Step 2: click the reviews button
    see_reviews_button.click()

    # Step 3: wait for at least one review article to be present
    wait.until(EC.presence_of_element_located((By.XPATH, "//article[@itemprop='review']")))

    # Step 4: read the text of the first review
    first_comment = driver.find_element(By.XPATH, "//article[@itemprop='review']//span[@itemprop='description']").text
    print("Premier commentaire extrait : ", first_comment)

except Exception as e:
    print("Erreur : ", e)

finally:
    # Always close the browser
    driver.quit()

Premier commentaire extrait :  Our room was very very nice. We also had a terrace but the weather was too cold to enjoy it. The bed was comfortable and the room was very modern. Location was central to the Louvre and gardens. The Hop On Hop off bus was also nearby.
