In [1]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

all_links = []

def get_links_from_url(url, page_num, total_pages):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Calculate the progress percentage
    progress_percentage = (page_num + 1) / total_pages * 100
    print(f"Fetching links progress: {progress_percentage:.2f}% - from page: {url}")

    response = requests.get(url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.select_one('body > main > section.section-main.search-results.search-listing-result > div.row > div')

    if not container:
        print("No container found for links on page: ", page_num + 1)
        return []

    link_elements = container.select('div.row.restaurant__list-row.js-restaurant__list_items > div > div > a')
    return [urljoin(url, link['href']) for link in link_elements if 'href' in link.attrs]

base_url = "https://guide.michelin.com/us/en/restaurants"
total_pages = 844

# Looping through the Michelin pages to gather restaurant links
for page_num in range(total_pages): # Include the base page as well
    if page_num == 0:
        url = base_url
    else:
        url = f"{base_url}/page/{page_num}"

    links = get_links_from_url(url, page_num, total_pages)

    all_links.extend(links)
    time.sleep(2)  # Adding a delay to prevent being rate-limited


Fetching links progress: 0.12% - from page: https://guide.michelin.com/us/en/restaurants
Fetching links progress: 0.24% - from page: https://guide.michelin.com/us/en/restaurants/page/1
Fetching links progress: 0.36% - from page: https://guide.michelin.com/us/en/restaurants/page/2
Fetching links progress: 0.47% - from page: https://guide.michelin.com/us/en/restaurants/page/3
Fetching links progress: 0.59% - from page: https://guide.michelin.com/us/en/restaurants/page/4
Fetching links progress: 0.71% - from page: https://guide.michelin.com/us/en/restaurants/page/5
Fetching links progress: 0.83% - from page: https://guide.michelin.com/us/en/restaurants/page/6
Fetching links progress: 0.95% - from page: https://guide.michelin.com/us/en/restaurants/page/7
Fetching links progress: 1.07% - from page: https://guide.michelin.com/us/en/restaurants/page/8
Fetching links progress: 1.18% - from page: https://guide.michelin.com/us/en/restaurants/page/9
Fetching links progress: 1.30% - from page: htt

In [None]:
len(all_links)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_restaurant_data(restaurant_url, current_index, total_links):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Calculate the progress percentage
    progress_percentage = (current_index + 1) / total_links * 100
    print(f"Scraping progress: {progress_percentage:.2f}% - getting data from {restaurant_url}")

    response = requests.get(restaurant_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Helper function to safely extract data and handle potential Unicode issues
    def safe_extract(selector):
        try:
            text = soup.select_one(selector).text.strip()
            return text.encode('utf-8').decode('utf-8')
        except AttributeError:
            return ""

    # Extracting various details about the restaurant
    name = safe_extract('#online-booking-desktop > h1')
    address = safe_extract('#online-booking-desktop > ul > li.restaurant-details__heading--address')
    value_content = safe_extract('#online-booking-desktop > ul > li.restaurant-details__heading-price > span')
    value = value_content.split('\n')[0]
    cuisine_type = value_content.split('  ')[-1]
    rating = safe_extract('body > main > div.restaurant-details > div.container > div > div.col-xl-8.col-lg-7.restaurant-details__components > section.section.restaurant-details__main > div.restaurant-details__description > div.restaurant-details__classification > div > div > div:nth-child(2)')
    description = safe_extract('body > main > div.restaurant-details > div.container > div > div.col-xl-8.col-lg-7.restaurant-details__components > section.section.restaurant-details__main > div.restaurant-details__description > div.restaurant-details__description--text > p')

    return {
        "Name": name,
        "Address": address,
        "Value": value,
        "Type": cuisine_type,
        "Rating": rating,
        "Description": description,
        "Link": restaurant_url
    }

def main():
    restaurant_links = all_links

    dataset = []
    total_links = len(restaurant_links)
    for index, link in enumerate(restaurant_links):
        data = scrape_restaurant_data(link, index, total_links)
        dataset.append(data)

    # Convert the dataset to a DataFrame
    df = pd.DataFrame(dataset)

    # Save the DataFrame to a CSV file with utf-8 encoding
    df.to_csv('C:/Users/agutierrez/Downloads/MichelinRestaurants.xlsx', index=False, encoding='utf-8-sig')

    # Print the DataFrame
    print(df)

if __name__ == "__main__":
    main()
