In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randint, choice
import logging
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def setup_logging():
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger(__name__)

def create_session():
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def get_headers():
    ua = UserAgent()
    return {
        'User-Agent': ua.random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
        'TE': 'Trailers'
    }

def get_page_content(url, logger, session):
    try:
        # Add longer timeout and verify=False for testing
        response = session.get(
            url, 
            headers=get_headers(),
            timeout=30,
            verify=False  # Note: In production, keep verify=True
        )
        response.raise_for_status()
        return response.text
    except Exception as e:
        logger.error(f"Error fetching URL {url}: {str(e)}")
        return None

def scrape_restaurants(city_url, num_pages=5):
    logger = setup_logging()
    restaurants_data = []
    session = create_session()
    
    for page in range(num_pages):
        try:
            if page == 0:
                page_url = city_url
            else:
                page_url = city_url.replace('.html', f'-oa{page*30}.html')
                
            logger.info(f"Scraping page {page + 1}")
            
            # Add longer delay between requests
            time.sleep(randint(5, 10))
            
            content = get_page_content(page_url, logger, session)
            if not content:
                continue
                
            soup = BeautifulSoup(content, 'html.parser')
            restaurant_divs = soup.find_all('div', class_='listing')
            
            if not restaurant_divs:
                restaurant_divs = soup.find_all('div', class_='location-meta-block')  # Alternative class
            
            if not restaurant_divs:
                logger.warning(f"No restaurants found on page {page + 1}")
                continue
            
            for div in restaurant_divs:
                restaurant = parse_restaurant(div, logger)
                if restaurant:
                    restaurants_data.append(restaurant)
            
        except Exception as e:
            logger.error(f"Error processing page {page + 1}: {str(e)}")
            continue
    
    return pd.DataFrame(restaurants_data)

def main():
    base_url = "https://www.tripadvisor.com/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html"
    df = scrape_restaurants(base_url, num_pages=3)
    
    if df.empty:
        print("No restaurants were found")
        return
        
    df.to_csv('lyon_restaurants.csv', index=False)
    print(f"Successfully scraped {len(df)} restaurants")

if __name__ == "__main__":
    main()

INFO:__main__:Scraping page 1
INFO:__main__:Scraping page 2
INFO:__main__:Scraping page 3


No restaurants were found


In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import logging

def setup_logging():
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger(__name__)

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def parse_restaurant(restaurant_element, logger):
    try:
        return {
            'name': restaurant_element.find_element(By.CSS_SELECTOR, '[data-test="title"]').text.strip(),
            # 'rating': restaurant_element.find_element(By.CSS_SELECTOR, '[data-test="rating-review"]').get_attribute('aria-label').split()[0] if restaurant_element.find_elements(By.CSS_SELECTOR, '[data-test="rating-review"]') else 'N/A',
            # 'reviews': restaurant_element.find_element(By.CSS_SELECTOR, '[data-test="review-count"]').text.strip('()') if restaurant_element.find_elements(By.CSS_SELECTOR, '[data-test="review-count"]') else '0',
            # 'cuisine': restaurant_element.find_element(By.CSS_SELECTOR, '[data-test="cuisine"]').text if restaurant_element.find_elements(By.CSS_SELECTOR, '[data-test="cuisine"]') else 'N/A',
            # 'price_range': restaurant_element.find_element(By.CSS_SELECTOR, '[data-test="price-range"]').text if restaurant_element.find_elements(By.CSS_SELECTOR, '[data-test="price-range"]') else 'N/A',
            # 'address': restaurant_element.find_element(By.CSS_SELECTOR, '[data-test="address"]').text if restaurant_element.find_elements(By.CSS_SELECTOR, '[data-test="address"]') else 'N/A'
        }
    except Exception as e:
        logger.error(f"Error parsing restaurant: {str(e)}")
        return None

def scrape_restaurants(city_url, num_pages=5):
    logger = setup_logging()
    restaurants_data = []
    driver = setup_driver()
    wait = WebDriverWait(driver, 10)
    
    try:
        for page in range(num_pages):
            page_url = f"{city_url.replace('.html', '')}-oa{page*30}.html" if page > 0 else city_url
            logger.info(f"Scraping page {page + 1}")
            
            driver.get(page_url)
            time.sleep(5)  # Allow page to load
            
            # Wait for restaurants to load
            restaurant_elements = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-test='restaurant-item']"))
            )
            
            if not restaurant_elements:
                logger.warning(f"No restaurants found on page {page + 1}")
                continue
                
            for element in restaurant_elements:
                restaurant = parse_restaurant(element, logger)
                if restaurant:
                    restaurants_data.append(restaurant)
            
            time.sleep(randint(3, 5))
            
    except Exception as e:
        logger.error(f"Error during scraping: {str(e)}")
    finally:
        driver.quit()
    
    return pd.DataFrame(restaurants_data)

def main():
    base_url = "https://www.tripadvisor.com/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html"
    df = scrape_restaurants(base_url, num_pages=3)
    
    if df.empty:
        print("No restaurants were found")
        return
        
    df.to_csv('lyon_restaurants.csv', index=False)
    print(f"Successfully scraped {len(df)} restaurants")

if __name__ == "__main__":
    main()

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [C:\Users\ediad\.wdm\drivers\chromedriver\win64\131.0.6778.108\chromedriver-win32/chromedriver.exe] found in cache
INFO:__main__:Scraping page 1
ERROR:__main__:Error during scraping: Message: 



No restaurants were found


In [12]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import time
from random import randint
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TripAdvisorScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.ua = UserAgent()
        self.session = requests.Session()

    def get_headers(self):
        return {
            "User-Agent": self.ua.random,
            "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1"
        }

    def get_page_content(self, url, retries=3):
        for attempt in range(retries):
            try:
                response = self.session.get(
                    url, 
                    headers=self.get_headers(), 
                    timeout=15
                )
                response.raise_for_status()
                return BeautifulSoup(response.text, 'html.parser')
            except requests.exceptions.RequestException as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == retries - 1:
                    return None
                time.sleep(randint(2, 5))

    def parse_restaurant(self, restaurant):
        print(restaurant)
        try:
            data = {
                "nom": restaurant.find('a', class_='Lwqic Cj b').text.strip() if restaurant.find('a', class_='Lwqic Cj b') else "N/A",
                # "adresse": restaurant.find('span', class_='fHibO').text.strip() if restaurant.find('span', class_='fHibO') else "N/A",
                # "note": restaurant.find('svg', class_='UctUV')['aria-label'].split()[0] if restaurant.find('svg', class_='UctUV') else "N/A",
                # "nb_avis": restaurant.find('span', class_='IiChw').text.strip('()') if restaurant.find('span', class_='IiChw') else "0",
                # "cuisine": ', '.join([c.text.strip() for c in restaurant.find_all('span', class_='DsyBj DxyfE')]) if restaurant.find_all('span', class_='DsyBj DxyfE') else "N/A",
                # "prix": restaurant.find('span', class_='DsyBj DxyfE Gi z').text.strip() if restaurant.find('span', class_='DsyBj DxyfE Gi z') else "N/A"
            }
            return data
        except Exception as e:
            logger.error(f"Error parsing restaurant: {str(e)}")
            return None

    def scrape_page(self, page_number=0):
        url = self.base_url if page_number == 0 else self.base_url.replace('.html', f'-oa{page_number*30}.html')
        logger.info(f"Scraping page: {url}")
        
        soup = self.get_page_content(url)
        if not soup:
            return []

        restaurants = []
        restaurant_divs = soup.find_all('div', class_='nJbYN Wh S2 H2 f')
        
        for restaurant_div in restaurant_divs:
            data = self.parse_restaurant(restaurant_div)
            if data:
                restaurants.append(data)
            time.sleep(randint(1, 3))  # Respect rate limiting

        return restaurants

    def scrape_restaurants(self, num_pages=3):
        all_restaurants = []
        
        for page in range(num_pages):
            restaurants = self.scrape_page(page)
            if restaurants:
                all_restaurants.extend(restaurants)
            time.sleep(randint(3, 5))  # Delay between pages
            
        return pd.DataFrame(all_restaurants)

def main():
    base_url = "https://www.tripadvisor.fr/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html"
    scraper = TripAdvisorScraper(base_url)
    
    try:
        df = scraper.scrape_restaurants(num_pages=3)
        if not df.empty:
            df.to_csv('restaurants_lyon.csv', index=False, encoding='utf-8-sig')
            logger.info(f"Successfully scraped {len(df)} restaurants")
            logger.info(f"Data saved to restaurants_lyon.csv")
        else:
            logger.error("No restaurants found")
    except Exception as e:
        logger.error(f"Scraping failed: {str(e)}")

if __name__ == "__main__":
    main()

INFO:__main__:Scraping page: https://www.tripadvisor.fr/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes.html
INFO:__main__:Scraping page: https://www.tripadvisor.fr/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes-oa30.html
INFO:__main__:Scraping page: https://www.tripadvisor.fr/Restaurants-g187265-Lyon_Rhone_Auvergne_Rhone_Alpes-oa60.html
ERROR:__main__:No restaurants found


In [2]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import time
from random import randint
import logging
from urllib.parse import urljoin

# Configuration du logger
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

class TripAdvisorScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.ua = UserAgent()
        self.session = requests.Session()

    def get_headers(self):
        """Génère des en-têtes HTTP réalistes pour simuler un vrai navigateur."""
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,/;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
        }

    def get_page_content(self, url, retries=3):
        """Récupère le contenu d'une page avec gestion des erreurs et des délais."""
        for attempt in range(retries):
            try:
                response = self.session.get(url, headers=self.get_headers(), timeout=30)
                response.raise_for_status()
                logger.debug(f"Page récupérée avec succès : {url}")
                time.sleep(randint(2, 4))  # Respect des limites
                return BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logger.error(f"Tentative {attempt + 1} échouée : {str(e)}")
                if attempt == retries - 1:
                    return None
                time.sleep(randint(5, 10))

    def parse_restaurant(self, restaurant):
        """Analyse les informations d'un restaurant dans une carte."""
        try:
            name_elem = restaurant.find('a', {"data-test": "restaurant-name"})
            address_elem = restaurant.find('div', class_='vQlTa')
            rating_elem = restaurant.find('svg', {'aria-label': True})
            reviews_elem = restaurant.find('span', {'data-test': 'reviews-count'})
            price_elem = restaurant.find('span', {'data-test': 'price-range'})

            data = {
                "nom": name_elem.text.strip() if name_elem else "N/A",
                "adresse": address_elem.text.strip() if address_elem else "N/A",
                "note": rating_elem['aria-label'].split()[0] if rating_elem else "N/A",
                "nb_avis": reviews_elem.text.strip().split()[0] if reviews_elem else "0",
                "prix": price_elem.text.strip() if price_elem else "N/A",
                "url": urljoin(self.base_url, name_elem['href']) if name_elem and name_elem.has_attr('href') else "N/A"
            }
            logger.debug(f"Restaurant analysé : {data}")
            return data
        except Exception as e:
            logger.error(f"Erreur lors de l'analyse d'un restaurant : {str(e)}")
            return None

    def get_next_page_url(self, page_number):
        """Génère l'URL pour paginer les résultats."""
        offset = page_number * 30  # TripAdvisor utilise un offset par tranches de 30
        return f"{self.base_url}&o=a{offset}"

    def scrape_page(self, page_number=0):
        """Récupère les données des restaurants d'une page donnée."""
        url = self.get_next_page_url(page_number)
        logger.info(f"Scraping de la page : {url}")
        soup = self.get_page_content(url)
        # print(soup)
        if not soup:
            logger.warning(f"Pas de contenu récupéré pour la page {page_number}")
            return []

        restaurants = []
        restaurant_divs = soup.find_all('div', {"data-test": "restaurant-item"})

        for restaurant_div in restaurant_divs:
            data = self.parse_restaurant(restaurant_div)
            if data:
                restaurants.append(data)
            time.sleep(randint(1, 2))  # Délai entre les analyses

        return restaurants

    def scrape_restaurants(self, num_pages=3):
        """Récupère les informations de plusieurs pages de restaurants."""
        all_restaurants = []
        for page in range(num_pages):
            restaurants = self.scrape_page(page)
            if restaurants:
                all_restaurants.extend(restaurants)
                logger.info(f"Page {page + 1} : {len(restaurants)} restaurants récupérés.")
            time.sleep(randint(3, 5))  # Délai entre les pages
        return pd.DataFrame(all_restaurants)

def main():
    # URL de recherche des restaurants à Lyon
    base_url = "https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity"
    scraper = TripAdvisorScraper(base_url)
    
    try:
        df = scraper.scrape_restaurants(num_pages=1)
        if not df.empty:
            df.to_csv('restaurants_lyon.csv', index=False, encoding='utf-8-sig')
            logger.info(f"Scraping réussi : {len(df)} restaurants enregistrés dans 'restaurants_lyon.csv'")
        else:
            logger.error("Aucun restaurant trouvé.")
    except Exception as e:
        logger.error(f"Échec du scraping : {str(e)}")

if __name__ == "__main__":
    main()


INFO:__main__:Scraping de la page : https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.tripadvisor.fr:443
DEBUG:urllib3.connectionpool:https://www.tripadvisor.fr:443 "GET /RestaurantSearch?geo=187265&sortOrder=popularity&o=a0 HTTP/11" 200 None
DEBUG:__main__:Page récupérée avec succès : https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0
ERROR:__main__:Aucun restaurant trouvé.


In [7]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import time
from random import randint
import logging
from urllib.parse import urljoin

# Configuration du logger
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

class TripAdvisorScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.ua = UserAgent()
        self.session = requests.Session()

    def get_headers(self):
        """Génère des en-têtes HTTP réalistes pour simuler un vrai navigateur."""
        return {
            # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
            "User-Agent": self.ua.random,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,/;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
        }

    def get_page_content(self, url, retries=3):
        """Récupère le contenu d'une page avec gestion des erreurs et des délais."""
        for attempt in range(retries):
            try:
                response = self.session.get(url, headers=self.get_headers(), timeout=30)
                response.raise_for_status()
                logger.debug(f"Page récupérée avec succès : {url}")
                time.sleep(randint(2, 4))  # Respect des limites
                return BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logger.error(f"Tentative {attempt + 1} échouée : {str(e)}")
                if attempt == retries - 1:
                    return None
                time.sleep(randint(5, 10))

    def parse_restaurant(self, restaurant):
        """Analyse les informations d'un restaurant dans une carte."""
        try:
            name_elem = restaurant.find('a', {"data-test": "restaurant-name"})
            address_elem = restaurant.find('div', class_='vQlTa')
            rating_elem = restaurant.find('svg', {'aria-label': True})
            reviews_elem = restaurant.find('span', {'data-test': 'reviews-count'})
            price_elem = restaurant.find('span', {'data-test': 'price-range'})

            data = {
                "nom": name_elem.text.strip() if name_elem else "N/A",
                "adresse": address_elem.text.strip() if address_elem else "N/A",
                "note": rating_elem['aria-label'].split()[0] if rating_elem else "N/A",
                "nb_avis": reviews_elem.text.strip().split()[0] if reviews_elem else "0",
                "prix": price_elem.text.strip() if price_elem else "N/A",
                "url": urljoin(self.base_url, name_elem['href']) if name_elem and name_elem.has_attr('href') else "N/A"
            }
            logger.debug(f"Restaurant analysé : {data}")
            return data
        except Exception as e:
            logger.error(f"Erreur lors de l'analyse d'un restaurant : {str(e)}")
            return None

    def get_next_page_url(self, page_number):
        """Génère l'URL pour paginer les résultats."""
        offset = page_number * 30  # TripAdvisor utilise un offset par tranches de 30
        return f"{self.base_url}&o=a{offset}"

    def scrape_page(self, page_number=0):
        """Récupère les données des restaurants d'une page donnée."""
        url = self.get_next_page_url(page_number)
        logger.info(f"Scraping de la page : {url}")
        soup = self.get_page_content(url)
        # print(soup)
        with open('page_content.html', 'w', encoding='utf-8') as file:
            file.write(str(soup))
        

        if not soup:
            logger.warning(f"Pas de contenu récupéré pour la page {page_number}")
            return []

        restaurants = []
        restaurant_divs = soup.find_all('div', {"data-test": "restaurant-item"})

        for restaurant_div in restaurant_divs:
            data = self.parse_restaurant(restaurant_div)
            if data:
                restaurants.append(data)
            time.sleep(randint(1, 2))  # Délai entre les analyses

        return restaurants

    def scrape_restaurants(self, num_pages=3):
        """Récupère les informations de plusieurs pages de restaurants."""
        all_restaurants = []
        for page in range(num_pages):
            restaurants = self.scrape_page(page)
            if restaurants:
                all_restaurants.extend(restaurants)
                logger.info(f"Page {page + 1} : {len(restaurants)} restaurants récupérés.")
            time.sleep(randint(3, 5))  # Délai entre les pages
        return pd.DataFrame(all_restaurants)

def main():
    # URL de recherche des restaurants à Lyon
    base_url = "https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity"
    scraper = TripAdvisorScraper(base_url)
    
    try:
        df = scraper.scrape_restaurants(num_pages=1)
        if not df.empty:
            df.to_csv('restaurants_lyon.csv', index=False, encoding='utf-8-sig')
            logger.info(f"Scraping réussi : {len(df)} restaurants enregistrés dans 'restaurants_lyon.csv'")
        else:
            logger.error("Aucun restaurant trouvé.")
    except Exception as e:
        logger.error(f"Échec du scraping : {str(e)}")

if __name__ == "__main__":
    main()


INFO:__main__:Scraping de la page : https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.tripadvisor.fr:443
DEBUG:urllib3.connectionpool:https://www.tripadvisor.fr:443 "GET /RestaurantSearch?geo=187265&sortOrder=popularity&o=a0 HTTP/11" 200 None
DEBUG:__main__:Page récupérée avec succès : https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0
ERROR:__main__:Aucun restaurant trouvé.


In [2]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import time
from random import randint
import logging
from urllib.parse import urljoin
from itertools import cycle

# Configuration du logger
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

class TripAdvisorScraper:
    def __init__(self, base_url, min_delay=5, max_delay=10, retries=3, proxies=None):
        self.base_url = base_url
        self.ua = UserAgent()
        self.session = requests.Session()
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.retries = retries
        self.proxies = cycle(proxies) if proxies else None  # Rotation des proxies

    def get_headers(self):
        """Génère des en-têtes HTTP réalistes pour simuler un vrai navigateur."""
        return {
            "User-Agent": self.ua.random,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }

    def get_page_content(self, url):
        """Récupère le contenu HTML d'une page avec gestion des erreurs."""
        for attempt in range(self.retries):
            try:
                proxy = next(self.proxies) if self.proxies else None
                proxies = {"http": proxy, "https": proxy} if proxy else None

                response = self.session.get(url, headers=self.get_headers(), proxies=proxies, timeout=30)
                response.raise_for_status()

                if "captcha" in response.url.lower():
                    logger.error("Redirection vers une page CAPTCHA. Impossible de continuer.")
                    return None

                logger.debug(f"Page récupérée avec succès : {url}")
                time.sleep(randint(self.min_delay, self.max_delay))  # Respect des délais
                return BeautifulSoup(response.text, 'html.parser')

            except requests.RequestException as e:
                logger.error(f"Tentative {attempt + 1} échouée pour {url} : {e}")
                if attempt == self.retries - 1:
                    return None
                time.sleep(randint(5, 10))

    def parse_restaurant(self, restaurant):
        """Analyse les informations d'un restaurant dans une carte."""
        try:
            name_elem = restaurant.find('a', {"data-test": "restaurant-name"})
            address_elem = restaurant.find('div', class_='vQlTa')
            rating_elem = restaurant.find('svg', {'aria-label': True})
            reviews_elem = restaurant.find('span', {'data-test': 'reviews-count'})
            price_elem = restaurant.find('span', {'data-test': 'price-range'})

            data = {
                "nom": name_elem.text.strip() if name_elem else "N/A",
                "adresse": address_elem.text.strip() if address_elem else "N/A",
                "note": rating_elem['aria-label'].split()[0] if rating_elem else "N/A",
                "nb_avis": reviews_elem.text.strip().split()[0] if reviews_elem else "0",
                "prix": price_elem.text.strip() if price_elem else "N/A",
                "url": urljoin(self.base_url, name_elem['href']) if name_elem and name_elem.has_attr('href') else "N/A",
            }
            logger.debug(f"Restaurant analysé : {data}")
            return data
        except Exception as e:
            logger.error(f"Erreur lors de l'analyse d'un restaurant : {e}")
            return None

    def get_next_page_url(self, page_number):
        """Génère l'URL pour paginer les résultats."""
        offset = page_number * 30  # TripAdvisor utilise un offset par tranches de 30
        return f"{self.base_url}&o=a{offset}"

    def scrape_page(self, page_number=0):
        """Récupère les données des restaurants d'une page donnée."""
        url = self.get_next_page_url(page_number)
        logger.info(f"Scraping de la page : {url}")
        soup = self.get_page_content(url)

        if not soup:
            logger.warning(f"Pas de contenu récupéré pour la page {page_number}")
            return []

        restaurants = []
        restaurant_divs = soup.find_all('div', {"data-test": "restaurant-item"})

        for restaurant_div in restaurant_divs:
            data = self.parse_restaurant(restaurant_div)
            if data:
                restaurants.append(data)
            time.sleep(randint(1, 2))  # Délai entre les analyses

        return restaurants

    def scrape_restaurants(self, num_pages=3):
        """Récupère les informations de plusieurs pages de restaurants."""
        all_restaurants = []
        for page in range(num_pages):
            restaurants = self.scrape_page(page)
            if restaurants:
                all_restaurants.extend(restaurants)
                logger.info(f"Page {page + 1} : {len(restaurants)} restaurants récupérés.")
            time.sleep(randint(self.min_delay + 1, self.max_delay + 2))  # Délai entre les pages
        return pd.DataFrame(all_restaurants)

def main():
    # URL de recherche des restaurants à Lyon
    base_url = "https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity"
    # Liste de proxies à utiliser (à remplacer par vos propres proxies)
    proxies = ["http://proxy1:port", "http://proxy2:port", "http://proxy3:port"]

    scraper = TripAdvisorScraper(base_url, proxies=proxies)

    try:
        df = scraper.scrape_restaurants(num_pages=1)
        if not df.empty:
            df.to_csv('restaurants_lyon.csv', index=False, encoding='utf-8-sig')
            logger.info(f"Scraping réussi : {len(df)} restaurants enregistrés dans 'restaurants_lyon.csv'")
        else:
            logger.error("Aucun restaurant trouvé.")
    except Exception as e:
        logger.error(f"Échec du scraping : {e}")

if __name__ == "__main__":
    main()


2024-12-18 20:31:28,862 - INFO - Scraping de la page : https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0
2024-12-18 20:31:28,866 - ERROR - Tentative 1 échouée pour https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0 : Failed to parse: http://proxy1:port
2024-12-18 20:31:38,868 - ERROR - Tentative 2 échouée pour https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0 : Failed to parse: http://proxy2:port
2024-12-18 20:31:46,889 - ERROR - Tentative 3 échouée pour https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0 : Failed to parse: http://proxy3:port
2024-12-18 20:31:57,895 - INFO - Scraping de la page : https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a30
2024-12-18 20:31:57,904 - ERROR - Tentative 1 échouée pour https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a30 : Failed to parse: http://proxy1:port
2024-12-18 20:

In [12]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent, FakeUserAgentError
import pandas as pd
import time
from random import randint
import logging
from urllib.parse import urljoin

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class TripAdvisorScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.setup_user_agent()
        self.session = requests.Session()
        self.min_delay = 3
        self.max_delay = 7

    def setup_user_agent(self):
        try:
            self.ua = UserAgent()
        except FakeUserAgentError:
            logger.warning("FakeUserAgent failed, using fallback")
            self.ua = None

    def get_headers(self):
        default_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        return {
            "User-Agent": self.ua.random if self.ua else default_ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,/;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
        }

    def make_request(self, url, retries=3):
        for attempt in range(retries):
            try:
                response = self.session.get(
                    url,
                    headers=self.get_headers(),
                    timeout=30
                )
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Request failed (attempt {attempt + 1}): {str(e)}")
                if attempt == retries - 1:
                    raise
                time.sleep(randint(5, 10))

    def parse_restaurant(self, element):
        """Extrait les informations d'un restaurant."""
        try:
            name_elem = element.find('a', class_='Lwqic Cj b')
            return {
                'Nom': name_elem.text.strip() if name_elem else "N/A",
                'URL': urljoin(self.base_url, name_elem['href']) if name_elem and name_elem.has_attr('href') else "N/A"
            }
        except Exception as e:
            logger.error(f"Erreur lors de l'analyse d'un restaurant : {str(e)}")
            return None

    def scrape_page(self, page_number):
        url = f"{self.base_url}&o=a{page_number * 30}"
        logger.info(f"Scraping page {page_number + 1}")
        
        try:
            content = self.make_request(url)
            soup = BeautifulSoup(content, 'html.parser')
            
            restaurant_divs = soup.find_all('div', class_='YtrWs')
            print(soup.find_all('div', {"data-test-target": "restaurants-list"}))
            if not restaurant_divs:
                logger.warning(f"No restaurants found on page {page_number + 1}")
                return []

            restaurants = []
            for div in restaurant_divs:
                data = self.parse_restaurant(div)
                if data:
                    restaurants.append(data)
                time.sleep(randint(1, 2))

            return restaurants
            
        except Exception as e:
            logger.error(f"Error scraping page {page_number + 1}: {str(e)}")
            return []

    def scrape_restaurants(self, num_pages=1):
        all_restaurants = []
        empty_pages = 0
        
        for page in range(num_pages):
            restaurants = self.scrape_page(page)
            if restaurants:
                all_restaurants.extend(restaurants)
                empty_pages = 0
                logger.info(f"Found {len(restaurants)} restaurants on page {page + 1}")
            else:
                empty_pages += 1
                if empty_pages >= 2:
                    logger.info("Stopping due to consecutive empty pages")
                    break
            time.sleep(randint(self.min_delay, self.max_delay))
            
        return pd.DataFrame(all_restaurants)

def main():
    base_url = "https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity"
    scraper = TripAdvisorScraper(base_url)
    
    try:
        df = scraper.scrape_restaurants(num_pages=1)
        if not df.empty:
            df.to_csv('restaurants_lyon.csv', index=False, encoding='utf-8-sig')
            logger.info(f"Successfully scraped {len(df)} restaurants")
        else:
            logger.error("No restaurants found")
    except Exception as e:
        logger.error(f"Scraping failed: {str(e)}")

if __name__ == "__main__":
    main()

2024-12-18 22:43:03,086 - INFO - Scraping page 1
2024-12-18 22:43:03,096 - DEBUG - Starting new HTTPS connection (1): www.tripadvisor.fr:443
2024-12-18 22:43:03,689 - DEBUG - https://www.tripadvisor.fr:443 "GET /RestaurantSearch?geo=187265&sortOrder=popularity&o=a0 HTTP/11" 200 None


[<div class="YtrWs" data-test-target="restaurants-list"></div>]


2024-12-18 22:43:04,777 - INFO - Found 1 restaurants on page 1
2024-12-18 22:43:11,790 - INFO - Successfully scraped 1 restaurants


In [14]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent, FakeUserAgentError
import pandas as pd
import time
from random import randint
import logging

# Configuration du logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class TripAdvisorScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.setup_user_agent()
        self.session = requests.Session()
        self.min_delay = 2
        self.max_delay = 5

    def setup_user_agent(self):
        """Initialise le User-Agent avec gestion des erreurs."""
        try:
            self.ua = UserAgent()
        except FakeUserAgentError:
            logger.warning("Impossible d'utiliser FakeUserAgent. Utilisation d'un User-Agent par défaut.")
            self.ua = None

    def get_headers(self):
        """Crée des en-têtes HTTP pour simuler un navigateur."""
        default_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        return {
            "User-Agent": self.ua.random if self.ua else default_ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
            "Connection": "keep-alive",
            "Cache-Control": "no-cache"
        }

    def make_request(self, url, retries=3):
        """Effectue une requête GET avec gestion des erreurs."""
        for attempt in range(retries):
            try:
                response = self.session.get(url, headers=self.get_headers(), timeout=30)
                response.raise_for_status()
                return response.text
            except Exception as e:
                logger.warning(f"Requête échouée (tentative {attempt + 1}): {str(e)}")
                if attempt == retries - 1:
                    raise
                time.sleep(randint(3, 7))

    def parse_restaurant(self, element):
        """Extrait les informations d'un restaurant à partir d'un élément HTML."""
        try:
            name_elem = element.find('a', {"data-test": "restaurant-name"})
            rating_elem = element.find('svg', {'aria-label': True})
            reviews_elem = element.find('span', {'data-test': 'reviews-count'})
            price_elem = element.find('span', {'data-test': 'price-range'})
            
            return {
                'name': name_elem.text.strip() if name_elem else "N/A",
                'rating': rating_elem['aria-label'].split()[0] if rating_elem else "N/A",
                'reviews': reviews_elem.text.strip().split()[0] if reviews_elem else "0",
                'price': price_elem.text.strip() if price_elem else "N/A",
            }
        except Exception as e:
            logger.error(f"Erreur lors de l'analyse d'un restaurant : {str(e)}")
            return None

    def scrape_page(self, page_number):
        """Récupère les informations des restaurants sur une page spécifique."""
        url = f"{self.base_url}&o=a{page_number * 30}"
        logger.info(f"Scraping de la page {page_number + 1} : {url}")
        
        try:
            content = self.make_request(url)
            soup = BeautifulSoup(content, 'html.parser')

            with open(f'page_{page_number + 1}.html', 'w', encoding='utf-8') as file:
                file.write(str(soup))

            
            # Extraction des restaurants (sélecteur basé sur la classe RfBGI)
            restaurant_divs = soup.find_all('div', class_='RfBGI')
            print(restaurant_divs)
            if not restaurant_divs:
                logger.warning(f"Aucun restaurant trouvé sur la page {page_number + 1}")
                return []

            restaurants = []
            for div in restaurant_divs:
                data = self.parse_restaurant(div)
                if data:
                    restaurants.append(data)
                time.sleep(randint(1, 2))  # Petite pause entre les éléments

            return restaurants
            
        except Exception as e:
            logger.error(f"Erreur lors du scraping de la page {page_number + 1} : {str(e)}")
            return []

    def scrape_all_pages(self, max_pages=10):
        """Récupère les informations des restaurants sur plusieurs pages."""
        all_restaurants = []
        empty_pages = 0

        for page_number in range(max_pages):
            restaurants = self.scrape_page(page_number)
            if restaurants:
                all_restaurants.extend(restaurants)
                empty_pages = 0
                logger.info(f"Page {page_number + 1} : {len(restaurants)} restaurants trouvés.")
            else:
                empty_pages += 1
                if empty_pages >= 2:
                    logger.info("Arrêt du scraping après 2 pages vides consécutives.")
                    break
            time.sleep(randint(self.min_delay, self.max_delay))  # Pause entre les pages
            
        return pd.DataFrame(all_restaurants)

def main():
    base_url = "https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity"
    scraper = TripAdvisorScraper(base_url)

    try:
        df = scraper.scrape_all_pages(max_pages=1)  # Scrape jusqu'à 5 pages
        if not df.empty:
            df.to_csv('restaurants_lyon.csv', index=False, encoding='utf-8-sig')
            logger.info(f"Scraping terminé avec succès : {len(df)} restaurants trouvés.")
        else:
            logger.error("Aucun restaurant trouvé.")
    except Exception as e:
        logger.error(f"Échec du scraping : {str(e)}")

if __name__ == "__main__":
    main()


2024-12-18 22:56:27,994 - INFO - Scraping de la page 1 : https://www.tripadvisor.fr/RestaurantSearch?geo=187265&sortOrder=popularity&o=a0
2024-12-18 22:56:28,003 - DEBUG - Starting new HTTPS connection (1): www.tripadvisor.fr:443
2024-12-18 22:56:28,706 - DEBUG - https://www.tripadvisor.fr:443 "GET /RestaurantSearch?geo=187265&sortOrder=popularity&o=a0 HTTP/11" 200 None


[<div class="RfBGI"><span><a class="Lwqic Cj b" href="/Restaurant_Review-g187265-d15114321-Reviews-L_affreux_Jojo-Lyon_Rhone_Auvergne_Rhone_Alpes.html" target="_blank">1<!-- -->. <!-- -->L'affreux Jojo</a></span></div>, <div class="RfBGI"><span><a class="Lwqic Cj b" href="/Restaurant_Review-g187265-d18626103-Reviews-Table_Partage-Lyon_Rhone_Auvergne_Rhone_Alpes.html" target="_blank">2<!-- -->. <!-- -->Table &amp; Partage</a></span></div>, <div class="RfBGI"><span><a class="Lwqic Cj b" href="/Restaurant_Review-g187265-d23110895-Reviews-Frazarin-Lyon_Rhone_Auvergne_Rhone_Alpes.html" target="_blank">3<!-- -->. <!-- -->Frazarin Bistrot Franco Italien</a></span></div>, <div class="RfBGI"><span><a class="Lwqic Cj b" href="/Restaurant_Review-g187265-d2027277-Reviews-La_Bouteillerie-Lyon_Rhone_Auvergne_Rhone_Alpes.html" target="_blank">4<!-- -->. <!-- -->La Bouteillerie</a></span></div>, <div class="RfBGI"><span><a class="Lwqic Cj b" href="/Restaurant_Review-g187265-d19896976-Reviews-Empanadas

2024-12-18 22:57:12,138 - INFO - Page 1 : 30 restaurants trouvés.
2024-12-18 22:57:17,145 - INFO - Scraping terminé avec succès : 30 restaurants trouvés.
