In [8]:
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# Locality listing pages to crawl. Every restaurant card found on a listing
# page is opened in the browser and scraped individually.
localities = [
    "https://www.zomato.com/hyderabad/begumpet-restaurants",
    # Add more locality URLs as needed...
]

# Column-wise accumulators: index i across all lists describes one restaurant.
all_urls = []
all_rest_name = []
all_ratings = []
all_price = []
all_cuisine = []
all_images = []  # For image URLs
all_opening_hours = []
all_locations = []
all_signature_dishes = []  # Popular dishes
all_special_features = []  # What people say about the restaurant
all_safety_measures = []  # Safety and hygiene measures
all_address = []  # Address of the restaurant


def _text_or_default(tag, default='Not available'):
    """Return the stripped text of *tag*, or *default* when *tag* is None."""
    return tag.text.strip() if tag is not None else default


def _paragraph_after_heading(page, heading, default='Not available'):
    """Return the text of the first <p> after the <h3> whose string is *heading*.

    Falls back to *default* when either the heading or the following
    paragraph is absent, so one missing section does not discard the row.
    """
    heading_tag = page.find('h3', string=heading)
    if heading_tag is None:
        return default
    return _text_or_default(heading_tag.find_next('p'), default)


def _scroll_to_bottom(browser, pause=2):
    """Scroll *browser* down until the document height stops growing.

    *pause* is the number of seconds to wait after each scroll so that
    lazily loaded content has time to extend the page.
    """
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def _extract_restaurant_fields(page):
    """Pull the scraped fields out of a parsed restaurant page.

    *page* is the BeautifulSoup tree of a single restaurant page. Returns a
    dict of plain strings ('Not available' for anything missing), except
    'image', which is None when no matching <img> is found.

    NOTE(review): the 'sc-...' class names are copied from the page markup
    and look auto-generated; verify them against the live site before a run.
    """
    image_tag = page.find('img', class_='sc-s1isp7-5 fyZwWD')

    # Safety information is gathered from two places and merged into one
    # comma-separated string.
    safety_section = page.find('section', class_='sc-bgxRrC fHqOaY')
    safety_items = list(safety_section.find_all('p')) if safety_section is not None else []
    safety_items += page.find_all('p', class_='sc-1hez2tp-0 fvARMW')
    safety_texts = [item.text.strip() for item in safety_items]

    return {
        'rating': _text_or_default(page.find('div', class_='sc-1q7bklc-1 cILgox')),
        'price': _text_or_default(page.find('p', class_='sc-1hez2tp-0 sc-adtsK iuDANL')),
        'cuisine': _text_or_default(page.find('div', class_='sc-fgfRvd gBMRZZ')),
        'opening_hours': _text_or_default(page.find('span', class_='sc-kasBVs dfwCXs')),
        'location': _text_or_default(page.find('a', class_='sc-clNaTc vNCcy')),
        'signature_dishes': _paragraph_after_heading(page, 'Popular Dishes'),
        'special_features': _paragraph_after_heading(page, 'People Say This Place Is Known For'),
        'image': image_tag.get("src") if image_tag is not None else None,
        'safety_measures': ", ".join(safety_texts) if safety_texts else 'Not available',
        'address': _text_or_default(page.find('p', class_='sc-1hez2tp-0 clKRrC')),
    }


# Set up the Selenium WebDriver
driver = webdriver.Chrome()

try:
    for link in localities:
        driver.get(link)
        time.sleep(3)

        # Scroll and load all contents for the current locality
        _scroll_to_bottom(driver)

        # Parse a static snapshot of the fully loaded listing. Because the
        # cards come from this snapshot, the browser never has to navigate
        # back to the listing between restaurants.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        divs = soup.find_all('div', class_='jumbo-tracker')

        for parent in divs:
            name_tag = parent.find("h4")
            link_tag = parent.find("a")
            href = link_tag.get('href') if link_tag is not None else None
            # Skip cards without a name or a usable link: urljoin() with an
            # empty href would silently resolve to the site root.
            if name_tag is None or not href:
                continue

            rest_name = name_tag.text.strip()
            restaurant_link = urljoin("https://www.zomato.com", href)

            try:
                driver.get(restaurant_link)
                time.sleep(3)  # Allow time for the page to load
                inner_soup = BeautifulSoup(driver.page_source, 'html.parser')
                details = _extract_restaurant_fields(inner_soup)
            except Exception as e:
                # Best effort: report the failing page and carry on.
                print(f"Error processing {restaurant_link}: {e}")
                continue

            # Append to every list together so the columns stay aligned.
            all_urls.append(restaurant_link)
            all_rest_name.append(rest_name)
            all_ratings.append(details['rating'])
            all_price.append(details['price'])
            all_cuisine.append(details['cuisine'])
            all_images.append(details['image'])
            all_opening_hours.append(details['opening_hours'])
            all_locations.append(details['location'])
            all_signature_dishes.append(details['signature_dishes'])
            all_special_features.append(details['special_features'])
            all_safety_measures.append(details['safety_measures'])
            all_address.append(details['address'])
finally:
    # Always close the browser, even when the crawl aborts part-way.
    driver.quit()

# Create a DataFrame to store the collected data
df = pd.DataFrame({
    'links': all_urls,
    'names': all_rest_name,
    'ratings': all_ratings,
    'price for two': all_price,
    'cuisine': all_cuisine,
    'images': all_images,
    'opening & closing time': all_opening_hours,
    'location': all_locations,
    'signature dishes': all_signature_dishes,
    'special features': all_special_features,
    'safety measures': all_safety_measures,
    'address': all_address
})

# Save the DataFrame to a CSV file
df.to_csv('zomato_restaurants_data.csv', index=False)
print("Data collected and saved to zomato_restaurants_data.csv")

Data collected and saved to zomato_restaurants_data.csv


In [None]:
"""
1. Links -
2. Restaurant Names-
3. Location & Accessibility (Address)-
4. Cuisine Type-
5. Price Range (Per 2)-
6. Reviews & Ratings (Out of 5)-
7. Opening Hours & Reservation Policies (Timings)-
8. Special Features-
9. Signature Dishes-
10. Service Quality
11. Dietary Preferences
12. Hygiene & Safety
13. Crowd & Wait Time
14. Family-Friendliness


"""

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# Locality listing pages to crawl. Every restaurant card found on a listing
# page is opened in the browser and scraped individually.
localities = [
    "https://www.zomato.com/hyderabad/jubilee-hills-restaurants",
    "https://www.zomato.com/hyderabad/gachibowli-restaurants",
    # Add more locality URLs as needed...
]

# Output columns, in CSV order. Passing them to the DataFrame explicitly
# keeps the header row even when nothing was scraped.
RESULT_COLUMNS = [
    "Restaurant Name",
    "Link",
    "Rating",
    "Price",
    "Cuisine",
    "Image URL",
    "Opening Hours",
    "Location",
    "Signature Dishes",
    "Special Features",
    "Safety Measures",
    "Address",
]

# One dict per successfully scraped restaurant.
all_data = []


def _text_or_default(tag, default='Not available'):
    """Return the stripped text of *tag*, or *default* when *tag* is None."""
    return tag.text.strip() if tag is not None else default


def _paragraph_after_heading(page, heading, default='Not available'):
    """Return the text of the first <p> after the <h3> whose string is *heading*.

    Falls back to *default* when either the heading or the following
    paragraph is absent, so one missing section does not discard the row.
    """
    # `string=` is the current keyword; `text=` is its deprecated alias.
    heading_tag = page.find('h3', string=heading)
    if heading_tag is None:
        return default
    return _text_or_default(heading_tag.find_next('p'), default)


def _scroll_to_bottom(browser, pause=2):
    """Scroll *browser* down until the document height stops growing.

    *pause* is the number of seconds to wait after each scroll so that
    lazily loaded content has time to extend the page.
    """
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def _extract_restaurant_record(page):
    """Pull the scraped fields out of a parsed restaurant page.

    *page* is the BeautifulSoup tree of a single restaurant page. Returns a
    dict keyed by output column name; values are plain strings
    ('Not available' for anything missing), except "Image URL", which is
    None when no matching <img> is found.

    NOTE(review): the 'sc-...' class names are copied from the page markup
    and look auto-generated; verify them against the live site before a run.
    """
    image_tag = page.find('img', class_='sc-s1isp7-5 fyZwWD')

    # Safety information is gathered from two places and merged into one
    # comma-separated string.
    safety_section = page.find('section', class_='sc-bgxRrC fHqOaY')
    safety_items = list(safety_section.find_all('p')) if safety_section is not None else []
    safety_items += page.find_all('p', class_='sc-1hez2tp-0 fvARMW')
    safety_texts = [item.text.strip() for item in safety_items]

    return {
        "Rating": _text_or_default(page.find('div', class_='sc-1q7bklc-1 cILgox')),
        "Price": _text_or_default(page.find('p', class_='sc-1hez2tp-0 sc-adtsK iuDANL')),
        "Cuisine": _text_or_default(page.find('div', class_='sc-fgfRvd gBMRZZ')),
        "Image URL": image_tag.get("src") if image_tag is not None else None,
        "Opening Hours": _text_or_default(page.find('span', class_='sc-kasBVs dfwCXs')),
        "Location": _text_or_default(page.find('a', class_='sc-clNaTc vNCcy')),
        "Signature Dishes": _paragraph_after_heading(page, 'Popular Dishes'),
        "Special Features": _paragraph_after_heading(page, 'People Say This Place Is Known For'),
        "Safety Measures": ", ".join(safety_texts) if safety_texts else 'Not available',
        "Address": _text_or_default(page.find('p', class_='sc-1hez2tp-0 clKRrC')),
    }


# Set up the Selenium WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    for link in localities:
        driver.get(link)
        time.sleep(2)

        # Scroll and load all contents for the current locality
        _scroll_to_bottom(driver)

        # Parse a static snapshot of the fully loaded listing. Because the
        # cards come from this snapshot, the browser never has to navigate
        # back to the listing between restaurants.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        divs = soup.find_all('div', class_='jumbo-tracker')  # Adjust class as needed based on Zomato's HTML structure

        for parent in divs:
            # Extract restaurant name and link
            name_tag = parent.find("h4")
            link_tag = parent.find("a")
            href = link_tag.get('href') if link_tag is not None else None
            # Skip cards without a name or a usable link: urljoin() with an
            # empty href would silently resolve to the site root.
            if name_tag is None or not href:
                continue

            rest_name = name_tag.text.strip()
            restaurant_link = urljoin("https://www.zomato.com", href)

            try:
                # Open the restaurant page and parse it
                driver.get(restaurant_link)
                time.sleep(2)  # Allow time for the page to load
                inner_soup = BeautifulSoup(driver.page_source, 'html.parser')
                record = _extract_restaurant_record(inner_soup)
            except Exception as e:
                # Best effort: report the failing page and carry on.
                print(f"Error processing {restaurant_link}: {e}")
                continue

            all_data.append({
                "Restaurant Name": rest_name,
                "Link": restaurant_link,
                **record,
            })
finally:
    # Always close the browser, even when the crawl aborts part-way.
    driver.quit()

# Create a DataFrame to store the collected data
df = pd.DataFrame(all_data, columns=RESULT_COLUMNS)

# Save the DataFrame to a CSV file
df.to_csv('zomato_restaurants_data.csv', index=False)
print("Data scraped and saved to zomato_restaurants_data.csv")