In [1]:
!pip install fake_useragent beautifulsoup4 pandas requests

Collecting fake_useragent
  Downloading fake_useragent-2.0.3-py3-none-any.whl.metadata (17 kB)
Downloading fake_useragent-2.0.3-py3-none-any.whl (201 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.1/201.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fake_useragent
Successfully installed fake_useragent-2.0.3


In [2]:
# Attach Google Drive to this Colab runtime so the project files under
# /content/drive are readable and writable from the notebook.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Project folder on the mounted Google Drive; every path below is built from it.
base_path = "/content/drive/My Drive/Projet-crawler-TopRestaurant"

# Restaurant list that the crawler reads.
csv_file = "/".join([base_path, "Restaurants.csv"])
# Checkpoint file holding the index of the last restaurant processed.
last_index_file = "/".join([base_path, "last_restaurant.txt"])
# CSV file the scraped reviews are appended to.
output_file = "/".join([base_path, "user.csv"])

In [4]:
import os

# Sanity check before crawling: report whether the restaurant list is reachable
# at the configured Drive path. This only prints a status; it does not stop the run.
print(
    "❌ Le fichier Restaurants.csv est introuvable. Vérifie le chemin dans Google Drive."
    if not os.path.exists(csv_file)
    else "✅ Le fichier Restaurants.csv existe."
)

✅ Le fichier Restaurants.csv existe.


In [5]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import os
import time


In [7]:

# ---------------------------------------------------------------------------
# Resumable review scraper.
#
# For every restaurant row in `csv_file` (columns "name" and "link") that comes
# after the index stored in `last_index_file`, download the page, extract one
# record per review container, append the records to `output_file`, and then
# store the row index as the new checkpoint.
#
# NOTE(review): a restaurant whose request fails is skipped without touching
# the checkpoint; once a later restaurant succeeds the checkpoint moves past
# it, so it will not be retried on the next run.
# ---------------------------------------------------------------------------

# Base URL for TripAdvisor; prepended to the href of each reviewer profile link.
base_url = "https://www.tripadvisor.com"

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 10
# Pause after every request (successful or not) to avoid getting blocked.
REQUEST_DELAY_SECONDS = 3

# Column order of the output CSV; fixed so every appended chunk lines up.
REVIEW_COLUMNS = [
    "restaurant_name",
    "user_name",
    "location",
    "review_title",
    "review",
    "date",
    "rating",
    "user_profile_link",
]


def _text_or_default(element, default):
    """Return ``element.text``, or ``default`` when the lookup found nothing (None)."""
    return element.text if element is not None else default


def _child_text_or_default(parent, child_tag, default):
    """Return the text of the first ``child_tag`` inside ``parent``.

    Falls back to ``default`` when ``parent`` is None or has no such child.
    """
    if parent is None:
        return default
    return _text_or_default(parent.find(child_tag), default)


def _parse_review(user_infos, restaurant_name):
    """Build one review record (a dict keyed by REVIEW_COLUMNS) from a review container.

    Args:
        user_infos: the BeautifulSoup element wrapping a single review.
        restaurant_name: name of the restaurant the review belongs to.

    Returns:
        dict with one entry per column; any field that cannot be located in the
        markup is filled with its "no ..." placeholder instead of raising.
    """
    # The same anchor carries both the reviewer's display name and profile href.
    profile_anchor = user_infos.find("a", class_="BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS")
    if profile_anchor is not None and profile_anchor.has_attr("href"):
        user_profile_link = base_url + profile_anchor["href"].strip()
    else:
        user_profile_link = "no link"

    return {
        "restaurant_name": restaurant_name,  # Include restaurant name for clarity
        "user_name": _text_or_default(profile_anchor, "no name"),
        "location": _child_text_or_default(
            user_infos.find("div", class_="biGQs _P pZUbB osNWb"), "span", "no location"
        ),
        "review_title": _child_text_or_default(
            user_infos.find("div", class_="biGQs _P fiohW qWPrE ncFvv fOtGX"), "a", "no review title"
        ),
        "review": _text_or_default(user_infos.find("span", class_="JguWG"), "no review"),
        "date": _text_or_default(user_infos.find("div", class_="aVuQn"), "no date"),
        "rating": _child_text_or_default(
            user_infos.find("svg", class_="UctUV d H0"), "title", "no rating"
        ),
        "user_profile_link": user_profile_link,
    }


# Read last processed index; a missing or unreadable checkpoint means "start over".
try:
    with open(last_index_file, 'r') as f:
        last_index = int(f.read().strip())
except (FileNotFoundError, ValueError):
    last_index = -1  # Start from the beginning

# Read CSV file
df = pd.read_csv(csv_file)

# A header is only needed when the output file is absent or still empty.
file_exists = os.path.exists(output_file) and os.path.getsize(output_file) > 0

# Build the User-Agent source once; `.random` still yields a fresh value per request.
user_agent = UserAgent()

# Iterate through restaurants
for index, row in df.iterrows():
    if index <= last_index:
        continue  # Skip already processed ones

    restaurant_name = row["name"]
    restaurant_url = row["link"]

    # Generate new User-Agent for each request
    headers = {
        "User-Agent": user_agent.random,
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
    }

    # Handle connection errors. Failures are throttled too, so a run of bad
    # responses does not turn into a burst of back-to-back requests.
    try:
        response = requests.get(restaurant_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        print(f"🚨 Request error for {restaurant_name}: {e}")
        time.sleep(REQUEST_DELAY_SECONDS)
        continue
    if response.status_code != 200:
        print(f"⚠ Failed to retrieve {restaurant_name} (Status: {response.status_code})")
        time.sleep(REQUEST_DELAY_SECONDS)
        continue

    # Parse HTML; each div with class "_c" is treated as one review container.
    soup = BeautifulSoup(response.text, "html.parser")
    users_infos = soup.find_all("div", class_="_c")

    # Collect reviews
    reviews = []
    for user_infos in users_infos:
        review_record = _parse_review(user_infos, restaurant_name)
        reviews.append(review_record)
        print(f'{review_record["user_name"]} processed for {restaurant_name}')

    if reviews:
        # Save reviews
        df_reviews = pd.DataFrame(reviews, columns=REVIEW_COLUMNS)
        df_reviews.to_csv(output_file, mode='a', index=False, header=not file_exists)
        file_exists = True  # Ensure header isn't written again
        print(f"✅ Saved reviews for {restaurant_name}.\n")
    else:
        # Appending an empty frame would write a junk line and, on a fresh
        # file, consume the one chance to emit the real header.
        print(f"⚠ No reviews found for {restaurant_name}; nothing written.\n")

    # Save last processed index
    with open(last_index_file, 'w') as f:
        f.write(str(index))

    # Sleep to avoid getting blocked
    time.sleep(REQUEST_DELAY_SECONDS)

Maxime T. processed for 492. Met Her At A Bar
VirginiaCW processed for 492. Met Her At A Bar
betsnow processed for 492. Met Her At A Bar
Kathleen Alicia M processed for 492. Met Her At A Bar
Linda B processed for 492. Met Her At A Bar
246810911 processed for 492. Met Her At A Bar
Dan R processed for 492. Met Her At A Bar
lemonsherbets processed for 492. Met Her At A Bar
RoguePincushion processed for 492. Met Her At A Bar
kathig77 processed for 492. Met Her At A Bar
Susan J processed for 492. Met Her At A Bar
Robert R processed for 492. Met Her At A Bar
Adam Kassel processed for 492. Met Her At A Bar
Yadi Ganuza processed for 492. Met Her At A Bar
Austin Patton processed for 492. Met Her At A Bar
✅ Saved reviews for 492. Met Her At A Bar.

rhnyrtye2A processed for 493. Jimmy Buffett's Margaritaville
Chris K processed for 493. Jimmy Buffett's Margaritaville
caraj00 processed for 493. Jimmy Buffett's Margaritaville
Annie F processed for 493. Jimmy Buffett's Margaritaville
JourneyWithUss p

KeyboardInterrupt: 