UPDATE: RAN THIS SUCCESSFULLY FOR PAGE 1

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
from bs4 import BeautifulSoup

def init_driver():
    """Start a Chrome WebDriver session, maximize its window and return it."""
    print("Initializing the Chrome driver...")
    browser = webdriver.Chrome()
    browser.maximize_window()
    return browser

def load_page_and_agree(driver, base_url):
    """Open ``base_url`` and click the consent notice's agree button if possible.

    Args:
        driver: Selenium WebDriver used to load the page.
        base_url: URL to navigate to.

    Any error while locating or clicking the button is printed and
    otherwise ignored, so the function always returns normally (``None``).
    """
    consent_xpath = '//*[@id="didomi-notice-agree-button"]/span'

    print(f"Navigating to {base_url}...")
    driver.get(base_url)
    time.sleep(10)  # fixed pause while the page loads; raise it if needed

    try:
        print("Looking for the 'Agree and Close' button...")
        driver.find_element(By.XPATH, consent_xpath).click()
        print("'Agree and Close' button clicked.")
        time.sleep(10)  # fixed pause after the click; raise it if needed
    except Exception as err:
        print("No 'Agree and Close' button found or error in clicking:", err)

def scrape_reviews(driver, star_xpath, max_reviews=200):
    """Collect up to ``max_reviews`` reviews from the review modal for one star filter.

    Clicks the element at ``star_xpath`` to apply the filter, then repeatedly
    scrolls the modal's scrollable container to its bottom and reads the
    review elements it contains. When finished, the same element is clicked
    again to clear the filter.

    Args:
        driver: Selenium WebDriver with the review modal already open.
        star_xpath: XPath of the star-rating filter element to click.
        max_reviews: Upper bound on the number of reviews returned.

    Returns:
        A list of dicts with the keys 'Rating', 'Review Text',
        'Commenter Name' and 'Review Date'. Only the first review seen for a
        given commenter name is kept. If an error interrupts scraping, the
        reviews collected up to that point are returned.
    """
    reviews = []
    try:
        print(f"Clicking on the star rating with XPath: {star_xpath}...")
        star_button = driver.find_element(By.XPATH, star_xpath)
        star_button.click()
        time.sleep(10)  # Increase the wait time if needed

        unique_commenters = set()
        scrollable_popup = driver.find_element(By.XPATH, '//*[@id="baseModal"]/div/div[2]')

        last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_popup)
        more_content_loaded = True

        while more_content_loaded and len(reviews) < max_reviews:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scrollable_popup)
            time.sleep(5)  # Increase the wait time if needed

            new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_popup)
            # Only record whether the list grew; the elements are still read
            # below, so a list that was fully loaded from the start is not lost.
            more_content_loaded = new_height != last_height
            last_height = new_height

            review_elements = driver.find_elements(By.XPATH, '//*[@id="baseModal"]/div/div[2]/div[3]/div')
            print(f"Found {len(review_elements)} review elements")
            # Skip as many leading elements as there are stored reviews instead
            # of re-reading the whole list on every pass.
            for review_element in review_elements[len(reviews):]:
                if len(reviews) >= max_reviews:
                    break
                try:
                    rating = review_element.find_element(By.XPATH, 'div/div[1]/div[1]/a/span[1]').text.strip()

                    # The text is the third span when exactly three are present,
                    # otherwise the second one.
                    spans = review_element.find_elements(By.XPATH, 'div/div[1]/div[1]/a/span')
                    if len(spans) == 3:
                        review_text = spans[2].text.strip()
                    else:
                        review_text = spans[1].text.strip()

                    commenter_name = review_element.find_element(By.XPATH, 'div/div[1]/div[2]/div[1]/div/a[1]').text.strip()
                    review_date = review_element.find_element(By.XPATH, 'div/div[1]/div[2]/div[1]/div/a[2]').text.strip()

                    if commenter_name not in unique_commenters:
                        reviews.append({
                            'Rating': rating,
                            'Review Text': review_text,
                            'Commenter Name': commenter_name,
                            'Review Date': review_date
                        })
                        unique_commenters.add(commenter_name)
                except Exception as e:
                    print("Error in extracting review details:", e)

        # Report the filter by its XPath; the star value is not known here.
        print(f"{len(reviews)} reviews scraped for star filter {star_xpath}.")

        # Unclick the star rating
        star_button.click()
        time.sleep(3)  # Wait for the action to be processed

        return reviews

    except Exception as e:
        print("Error in scraping reviews for star rating:", e)
        # Keep whatever was gathered before the failure instead of dropping it.
        return reviews

def scrape_wine_reviews(driver, wine_url, wine_name):
    """Open a wine page and gather its reviews for each star rating in turn.

    Loads ``wine_url``, clicks the average-rating link, scrolls to 70% of the
    document height, clicks the "show more reviews" button and then calls
    ``scrape_reviews`` once per star filter, from 5 stars down to 1.

    Args:
        driver: Selenium WebDriver used for navigation.
        wine_url: URL of the wine page.
        wine_name: Wine name, used only in log messages.

    Returns:
        The concatenated list of review dicts, or an empty list if any step
        outside ``scrape_reviews`` raises.

    NOTE: the review dicts are returned as produced by ``scrape_reviews``;
    nothing here adds the wine name to them.
    """
    # Star filters inside the review modal, listed 5-star first.
    star_filters = (
        ('5', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[1]'),
        ('4', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[2]'),
        ('3', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[3]'),
        ('2', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[4]'),
        ('1', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[5]'),
    )

    driver.get(wine_url)
    time.sleep(10)

    print(f"Scraping reviews for {wine_name}")

    try:
        driver.find_element(By.XPATH, '//*[@id="wine-location-header"]/div/div/div/div[2]/a').click()
        time.sleep(10)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.7);")
        time.sleep(3)

        driver.find_element(By.XPATH, '//*[@id="all_reviews"]/div[2]/div[1]/button/span').click()
        time.sleep(10)

        collected = []
        for star, xpath in star_filters:
            print(f"Scraping {star}-star reviews...")
            collected += scrape_reviews(driver, xpath)

        print(f"Total reviews scraped: {len(collected)}")
        return collected

    except Exception as err:
        print(f"Error in scraping reviews for {wine_name}: {err}")
        return []

def scrape_all_banfi_wines(driver, base_url):
    """Scrape details and reviews for every "Banfi" wine on one search page.

    Loads ``base_url``, tries to accept the consent notice, scrolls until the
    document height stops growing and parses the wine cards from the page
    source. Each card whose name contains "Banfi" and has not been handled
    yet is opened to read its price and passed to ``scrape_wine_reviews``;
    ``base_url`` is loaded again after each such wine.

    Args:
        driver: Selenium WebDriver used for all navigation.
        base_url: URL of the search-results page to process.

    Returns:
        A tuple ``(wine details, reviews)``: a list of per-wine dicts with
        'Wine Name', 'Brand', 'Country' and, when it could be read, 'Price',
        plus a flat list of the review dicts gathered for those wines.
    """
    wine_records = []
    review_records = []
    handled_names = set()

    driver.get(base_url)
    time.sleep(10)

    try:
        driver.find_element(By.XPATH, '//*[@id="didomi-notice-agree-button"]').click()
        time.sleep(10)
    except Exception as err:
        print("No 'Agree and Continue' button found or error in clicking:", err)

    print("Scraping wines on the current page")

    # Scroll to the bottom repeatedly until the document stops getting taller.
    known_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == known_height:
            break
        known_height = current_height

    listing = BeautifulSoup(driver.page_source, 'html.parser')
    cards = listing.find_all('div', class_='card card-lg')

    for card in cards:
        try:
            name_link = card.find('a', class_='link-color-alt-grey')
            wine_name = name_link.text.strip()
            if "Banfi" not in wine_name or wine_name in handled_names:
                continue

            wine_url = f"https://www.vivino.com{name_link['href']}"
            print(f"Scraping reviews for {wine_name}")

            # Brand and country are fixed values, not read from the page.
            wine_details = {
                'Wine Name': wine_name,
                'Brand': 'Banfi',
                'Country': 'Italy'
            }

            driver.get(wine_url)
            time.sleep(10)
            wine_page = BeautifulSoup(driver.page_source, 'html.parser')
            try:
                wine_details['Price'] = wine_page.find('span', class_='wine-price-value').text.strip()
            except Exception as err:
                # A missing price element ends up here; 'Price' is then left out.
                print("Error in extracting price:", err)

            review_records.extend(scrape_wine_reviews(driver, wine_url, wine_name))
            wine_records.append(wine_details)
            handled_names.add(wine_name)

            driver.get(base_url)
            time.sleep(10)
        except Exception as err:
            print("Error in processing wine:", err)

    # Check if there is a next page (only reported, never followed).
    try:
        driver.find_element(By.XPATH, '/html/body/div[3]/section[1]/div/div/div/div[2]/button[2]')
        print("Next page button found, terminating the script.")
    except Exception as err:
        print("No more pages to load or error in finding next button:", err)

    return wine_records, review_records

def main():
    """Scrape the first "banfi" search page on Vivino and save the results.

    The wine details and the reviews are each written to a CSV file and to a
    line-delimited JSON file. The browser is closed exactly once on exit,
    and only if it was actually started.

    On Ctrl+C the files are (re)written only when the scrape had already
    returned its data. An interrupt that arrives earlier has nothing to
    save, so it is reported and existing output files are left untouched.
    """
    def save_results(wine_details, reviews):
        """Write both result sets as CSV and as line-delimited JSON."""
        df_wine_details = pd.DataFrame(wine_details)
        df_reviews = pd.DataFrame(reviews)

        df_wine_details.to_csv('banfi_wine_details.csv', index=False)
        df_reviews.to_csv('banfi_wine_reviews.csv', index=False)

        df_wine_details.to_json('banfi_wine_details.json', orient='records', lines=True)
        df_reviews.to_json('banfi_wine_reviews.json', orient='records', lines=True)

        print("DataFrames saved as CSV and JSON files.")

    driver = None
    scraped = None  # stays None until scrape_all_banfi_wines has returned

    try:
        driver = init_driver()
        base_url = "https://www.vivino.com/search/wines?q=banfi+"
        scraped = scrape_all_banfi_wines(driver, base_url)
        save_results(*scraped)

    except KeyboardInterrupt:
        if scraped is None:
            # The data only exists once the scrape call has returned.
            print("Process interrupted before scraping finished. Nothing to save.")
        else:
            print("Process interrupted. Saving files...")
            save_results(*scraped)
    finally:
        # Guard against init_driver() having failed before a driver existed.
        if driver is not None:
            driver.quit()

if __name__ == "__main__":
    main()


PAGE 2

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
from bs4 import BeautifulSoup

def init_driver():
    """Start a Chrome WebDriver session, maximize its window and return it."""
    print("Initializing the Chrome driver...")
    browser = webdriver.Chrome()
    browser.maximize_window()
    return browser

def load_page_and_agree(driver, base_url):
    """Open ``base_url`` and click the consent notice's agree button if possible.

    Args:
        driver: Selenium WebDriver used to load the page.
        base_url: URL to navigate to.

    Any error while locating or clicking the button is printed and
    otherwise ignored, so the function always returns normally (``None``).
    """
    consent_xpath = '//*[@id="didomi-notice-agree-button"]/span'

    print(f"Navigating to {base_url}...")
    driver.get(base_url)
    time.sleep(10)  # fixed pause while the page loads; raise it if needed

    try:
        print("Looking for the 'Agree and Close' button...")
        driver.find_element(By.XPATH, consent_xpath).click()
        print("'Agree and Close' button clicked.")
        time.sleep(10)  # fixed pause after the click; raise it if needed
    except Exception as err:
        print("No 'Agree and Close' button found or error in clicking:", err)

def scrape_reviews(driver, star_xpath, max_reviews=200):
    """Collect up to ``max_reviews`` reviews from the review modal for one star filter.

    Clicks the element at ``star_xpath`` to apply the filter, then repeatedly
    scrolls the modal's scrollable container to its bottom and reads the
    review elements it contains. When finished, the same element is clicked
    again to clear the filter.

    Args:
        driver: Selenium WebDriver with the review modal already open.
        star_xpath: XPath of the star-rating filter element to click.
        max_reviews: Upper bound on the number of reviews returned.

    Returns:
        A list of dicts with the keys 'Rating', 'Review Text',
        'Commenter Name' and 'Review Date'. Only the first review seen for a
        given commenter name is kept. If an error interrupts scraping, the
        reviews collected up to that point are returned.
    """
    reviews = []
    try:
        print(f"Clicking on the star rating with XPath: {star_xpath}...")
        star_button = driver.find_element(By.XPATH, star_xpath)
        star_button.click()
        time.sleep(10)  # Increase the wait time if needed

        unique_commenters = set()
        scrollable_popup = driver.find_element(By.XPATH, '//*[@id="baseModal"]/div/div[2]')

        last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_popup)
        more_content_loaded = True

        while more_content_loaded and len(reviews) < max_reviews:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scrollable_popup)
            time.sleep(5)  # Increase the wait time if needed

            new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_popup)
            # Only record whether the list grew; the elements are still read
            # below, so a list that was fully loaded from the start is not lost.
            more_content_loaded = new_height != last_height
            last_height = new_height

            review_elements = driver.find_elements(By.XPATH, '//*[@id="baseModal"]/div/div[2]/div[3]/div')
            print(f"Found {len(review_elements)} review elements")
            # Skip as many leading elements as there are stored reviews instead
            # of re-reading the whole list on every pass.
            for review_element in review_elements[len(reviews):]:
                if len(reviews) >= max_reviews:
                    break
                try:
                    rating = review_element.find_element(By.XPATH, 'div/div[1]/div[1]/a/span[1]').text.strip()

                    # The text is the third span when exactly three are present,
                    # otherwise the second one.
                    spans = review_element.find_elements(By.XPATH, 'div/div[1]/div[1]/a/span')
                    if len(spans) == 3:
                        review_text = spans[2].text.strip()
                    else:
                        review_text = spans[1].text.strip()

                    commenter_name = review_element.find_element(By.XPATH, 'div/div[1]/div[2]/div[1]/div/a[1]').text.strip()
                    review_date = review_element.find_element(By.XPATH, 'div/div[1]/div[2]/div[1]/div/a[2]').text.strip()

                    if commenter_name not in unique_commenters:
                        reviews.append({
                            'Rating': rating,
                            'Review Text': review_text,
                            'Commenter Name': commenter_name,
                            'Review Date': review_date
                        })
                        unique_commenters.add(commenter_name)
                except Exception as e:
                    print("Error in extracting review details:", e)

        # Report the filter by its XPath; the star value is not known here.
        print(f"{len(reviews)} reviews scraped for star filter {star_xpath}.")

        # Unclick the star rating
        star_button.click()
        time.sleep(3)  # Wait for the action to be processed

        return reviews

    except Exception as e:
        print("Error in scraping reviews for star rating:", e)
        # Keep whatever was gathered before the failure instead of dropping it.
        return reviews

def scrape_wine_reviews(driver, wine_url, wine_name):
    """Open a wine page and gather its reviews for each star rating in turn.

    Loads ``wine_url``, clicks the average-rating link, scrolls to 70% of the
    document height, clicks the "show more reviews" button and then calls
    ``scrape_reviews`` once per star filter, from 5 stars down to 1.

    Args:
        driver: Selenium WebDriver used for navigation.
        wine_url: URL of the wine page.
        wine_name: Wine name, used only in log messages.

    Returns:
        The concatenated list of review dicts, or an empty list if any step
        outside ``scrape_reviews`` raises.

    NOTE: the review dicts are returned as produced by ``scrape_reviews``;
    nothing here adds the wine name to them.
    """
    # Star filters inside the review modal, listed 5-star first.
    star_filters = (
        ('5', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[1]'),
        ('4', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[2]'),
        ('3', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[3]'),
        ('2', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[4]'),
        ('1', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[5]'),
    )

    driver.get(wine_url)
    time.sleep(10)

    print(f"Scraping reviews for {wine_name}")

    try:
        driver.find_element(By.XPATH, '//*[@id="wine-location-header"]/div/div/div/div[2]/a').click()
        time.sleep(10)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.7);")
        time.sleep(3)

        driver.find_element(By.XPATH, '//*[@id="all_reviews"]/div[2]/div[1]/button/span').click()
        time.sleep(10)

        collected = []
        for star, xpath in star_filters:
            print(f"Scraping {star}-star reviews...")
            collected += scrape_reviews(driver, xpath)

        print(f"Total reviews scraped: {len(collected)}")
        return collected

    except Exception as err:
        print(f"Error in scraping reviews for {wine_name}: {err}")
        return []

def scrape_all_banfi_wines(driver, base_url):
    """Scrape details and reviews for every "Banfi" wine on one search page.

    Loads ``base_url``, tries to accept the consent notice, scrolls until the
    document height stops growing and parses the wine cards from the page
    source. Each card whose name contains "Banfi" and has not been handled
    yet is opened to read its price and passed to ``scrape_wine_reviews``;
    ``base_url`` is loaded again after each such wine.

    Args:
        driver: Selenium WebDriver used for all navigation.
        base_url: URL of the search-results page to process.

    Returns:
        A tuple ``(wine details, reviews)``: a list of per-wine dicts with
        'Wine Name', 'Brand', 'Country' and, when it could be read, 'Price',
        plus a flat list of the review dicts gathered for those wines.
    """
    wine_records = []
    review_records = []
    handled_names = set()

    driver.get(base_url)
    time.sleep(10)

    try:
        driver.find_element(By.XPATH, '//*[@id="didomi-notice-agree-button"]').click()
        time.sleep(10)
    except Exception as err:
        print("No 'Agree and Continue' button found or error in clicking:", err)

    print("Scraping wines on the current page")

    # Scroll to the bottom repeatedly until the document stops getting taller.
    known_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == known_height:
            break
        known_height = current_height

    listing = BeautifulSoup(driver.page_source, 'html.parser')
    cards = listing.find_all('div', class_='card card-lg')

    for card in cards:
        try:
            name_link = card.find('a', class_='link-color-alt-grey')
            wine_name = name_link.text.strip()
            if "Banfi" not in wine_name or wine_name in handled_names:
                continue

            wine_url = f"https://www.vivino.com{name_link['href']}"
            print(f"Scraping reviews for {wine_name}")

            # Brand and country are fixed values, not read from the page.
            wine_details = {
                'Wine Name': wine_name,
                'Brand': 'Banfi',
                'Country': 'Italy'
            }

            driver.get(wine_url)
            time.sleep(10)
            wine_page = BeautifulSoup(driver.page_source, 'html.parser')
            try:
                wine_details['Price'] = wine_page.find('span', class_='wine-price-value').text.strip()
            except Exception as err:
                # A missing price element ends up here; 'Price' is then left out.
                print("Error in extracting price:", err)

            review_records.extend(scrape_wine_reviews(driver, wine_url, wine_name))
            wine_records.append(wine_details)
            handled_names.add(wine_name)

            driver.get(base_url)
            time.sleep(10)
        except Exception as err:
            print("Error in processing wine:", err)

    # Check if there is a next page (only reported, never followed).
    try:
        driver.find_element(By.XPATH, '/html/body/div[3]/section[1]/div/div/div/div[2]/button[2]')
        print("Next page button found, terminating the script.")
    except Exception as err:
        print("No more pages to load or error in finding next button:", err)

    return wine_records, review_records

def main():
    """Scrape the "banfi" search page with ``start=2`` on Vivino and save the results.

    The wine details and the reviews are each written to a ``_page2`` CSV
    file and a ``_page2`` line-delimited JSON file. The browser is closed
    exactly once on exit, and only if it was actually started.

    On Ctrl+C the files are (re)written only when the scrape had already
    returned its data. An interrupt that arrives earlier has nothing to
    save, so it is reported and existing output files are left untouched.
    """
    def save_results(wine_details, reviews):
        """Write both result sets as CSV and as line-delimited JSON."""
        df_wine_details = pd.DataFrame(wine_details)
        df_reviews = pd.DataFrame(reviews)

        df_wine_details.to_csv('banfi_wine_details_page2.csv', index=False)
        df_reviews.to_csv('banfi_wine_reviews_page2.csv', index=False)

        df_wine_details.to_json('banfi_wine_details_page2.json', orient='records', lines=True)
        # Same name on every path; the normal path used to write
        # 'banfi_wine_reviews.json_page2' by mistake.
        df_reviews.to_json('banfi_wine_reviews_page2.json', orient='records', lines=True)

        print("DataFrames saved as CSV and JSON files.")

    driver = None
    scraped = None  # stays None until scrape_all_banfi_wines has returned

    try:
        driver = init_driver()
        base_url = "https://www.vivino.com/search/wines?q=banfi&start=2"
        scraped = scrape_all_banfi_wines(driver, base_url)
        save_results(*scraped)

    except KeyboardInterrupt:
        if scraped is None:
            # The data only exists once the scrape call has returned.
            print("Process interrupted before scraping finished. Nothing to save.")
        else:
            print("Process interrupted. Saving files...")
            save_results(*scraped)
    finally:
        # Guard against init_driver() having failed before a driver existed.
        if driver is not None:
            driver.quit()

if __name__ == "__main__":
    main()


PAGE 3

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
from bs4 import BeautifulSoup

def init_driver():
    """Start a Chrome WebDriver session, maximize its window and return it."""
    print("Initializing the Chrome driver...")
    browser = webdriver.Chrome()
    browser.maximize_window()
    return browser

def load_page_and_agree(driver, base_url):
    """Open ``base_url`` and click the consent notice's agree button if possible.

    Args:
        driver: Selenium WebDriver used to load the page.
        base_url: URL to navigate to.

    Any error while locating or clicking the button is printed and
    otherwise ignored, so the function always returns normally (``None``).
    """
    consent_xpath = '//*[@id="didomi-notice-agree-button"]/span'

    print(f"Navigating to {base_url}...")
    driver.get(base_url)
    time.sleep(10)  # fixed pause while the page loads; raise it if needed

    try:
        print("Looking for the 'Agree and Close' button...")
        driver.find_element(By.XPATH, consent_xpath).click()
        print("'Agree and Close' button clicked.")
        time.sleep(10)  # fixed pause after the click; raise it if needed
    except Exception as err:
        print("No 'Agree and Close' button found or error in clicking:", err)

def scrape_reviews(driver, star_xpath, max_reviews=200):
    """Collect up to ``max_reviews`` reviews from the review modal for one star filter.

    Clicks the element at ``star_xpath`` to apply the filter, then repeatedly
    scrolls the modal's scrollable container to its bottom and reads the
    review elements it contains. When finished, the same element is clicked
    again to clear the filter.

    Args:
        driver: Selenium WebDriver with the review modal already open.
        star_xpath: XPath of the star-rating filter element to click.
        max_reviews: Upper bound on the number of reviews returned.

    Returns:
        A list of dicts with the keys 'Rating', 'Review Text',
        'Commenter Name' and 'Review Date'. Only the first review seen for a
        given commenter name is kept. If an error interrupts scraping, the
        reviews collected up to that point are returned.
    """
    reviews = []
    try:
        print(f"Clicking on the star rating with XPath: {star_xpath}...")
        star_button = driver.find_element(By.XPATH, star_xpath)
        star_button.click()
        time.sleep(10)  # Increase the wait time if needed

        unique_commenters = set()
        scrollable_popup = driver.find_element(By.XPATH, '//*[@id="baseModal"]/div/div[2]')

        last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_popup)
        new_reviews_found = True

        while new_reviews_found and len(reviews) < max_reviews:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scrollable_popup)
            time.sleep(5)  # Increase the wait time if needed

            new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_popup)
            # The elements are still read on the pass where the height stops
            # changing; the loop ends only afterwards.
            new_reviews_found = new_height != last_height
            last_height = new_height

            review_elements = driver.find_elements(By.XPATH, '//*[@id="baseModal"]/div/div[2]/div[3]/div')
            print(f"Found {len(review_elements)} review elements")
            # Skip as many leading elements as there are stored reviews.
            for review_element in review_elements[len(reviews):]:
                if len(reviews) >= max_reviews:
                    break
                try:
                    rating = review_element.find_element(By.XPATH, 'div/div[1]/div[1]/a/span[1]').text.strip()

                    # The text is the third span when exactly three are present,
                    # otherwise the second one.
                    spans = review_element.find_elements(By.XPATH, 'div/div[1]/div[1]/a/span')
                    if len(spans) == 3:
                        review_text = spans[2].text.strip()
                    else:
                        review_text = spans[1].text.strip()

                    commenter_name = review_element.find_element(By.XPATH, 'div/div[1]/div[2]/div[1]/div/a[1]').text.strip()
                    review_date = review_element.find_element(By.XPATH, 'div/div[1]/div[2]/div[1]/div/a[2]').text.strip()

                    if commenter_name not in unique_commenters:
                        reviews.append({
                            'Rating': rating,
                            'Review Text': review_text,
                            'Commenter Name': commenter_name,
                            'Review Date': review_date
                        })
                        unique_commenters.add(commenter_name)
                except Exception as e:
                    print("Error in extracting review details:", e)

        # Report the filter by its XPath; the star value is not known here.
        print(f"{len(reviews)} reviews scraped for star filter {star_xpath}.")

        # Unclick the star rating
        star_button.click()
        time.sleep(3)  # Wait for the action to be processed

        return reviews

    except Exception as e:
        print("Error in scraping reviews for star rating:", e)
        # Keep whatever was gathered before the failure instead of dropping it.
        return reviews

def scrape_wine_reviews(driver, wine_url, wine_name):
    """Open a wine page and gather its reviews for each star rating in turn.

    Loads ``wine_url``, clicks the average-rating link, scrolls to 70% of the
    document height, clicks the "show more reviews" button and then calls
    ``scrape_reviews`` once per star filter, from 5 stars down to 1.

    Args:
        driver: Selenium WebDriver used for navigation.
        wine_url: URL of the wine page.
        wine_name: Wine name, used only in log messages.

    Returns:
        The concatenated list of review dicts, or an empty list if any step
        outside ``scrape_reviews`` raises.

    NOTE: the review dicts are returned as produced by ``scrape_reviews``;
    nothing here adds the wine name to them.
    """
    # Star filters inside the review modal, listed 5-star first.
    star_filters = (
        ('5', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[1]'),
        ('4', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[2]'),
        ('3', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[3]'),
        ('2', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[4]'),
        ('1', '//*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[5]'),
    )

    driver.get(wine_url)
    time.sleep(10)

    print(f"Scraping reviews for {wine_name}")

    try:
        driver.find_element(By.XPATH, '//*[@id="wine-location-header"]/div/div/div/div[2]/a').click()
        time.sleep(10)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.7);")
        time.sleep(3)

        driver.find_element(By.XPATH, '//*[@id="all_reviews"]/div[2]/div[1]/button/span').click()
        time.sleep(10)

        collected = []
        for star, xpath in star_filters:
            print(f"Scraping {star}-star reviews...")
            collected += scrape_reviews(driver, xpath)

        print(f"Total reviews scraped: {len(collected)}")
        return collected

    except Exception as err:
        print(f"Error in scraping reviews for {wine_name}: {err}")
        return []

def scrape_all_banfi_wines(driver, base_url):
    """Scrape details and reviews for every "Banfi" wine on one search page.

    Loads ``base_url``, tries to accept the consent notice, scrolls until the
    document height stops growing and parses the wine cards from the page
    source. Each card whose name contains "Banfi" and has not been handled
    yet is opened to read its price and passed to ``scrape_wine_reviews``;
    ``base_url`` is loaded again after each such wine.

    Args:
        driver: Selenium WebDriver used for all navigation.
        base_url: URL of the search-results page to process.

    Returns:
        A tuple ``(wine details, reviews)``: a list of per-wine dicts with
        'Wine Name', 'Brand', 'Country' and, when it could be read, 'Price',
        plus a flat list of the review dicts gathered for those wines.
    """
    wine_records = []
    review_records = []
    handled_names = set()

    driver.get(base_url)
    time.sleep(10)

    try:
        driver.find_element(By.XPATH, '//*[@id="didomi-notice-agree-button"]').click()
        time.sleep(10)
    except Exception as err:
        print("No 'Agree and Continue' button found or error in clicking:", err)

    print("Scraping wines on the current page")

    # Scroll to the bottom repeatedly until the document stops getting taller.
    known_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == known_height:
            break
        known_height = current_height

    listing = BeautifulSoup(driver.page_source, 'html.parser')
    cards = listing.find_all('div', class_='card card-lg')

    for card in cards:
        try:
            name_link = card.find('a', class_='link-color-alt-grey')
            wine_name = name_link.text.strip()
            if "Banfi" not in wine_name or wine_name in handled_names:
                continue

            wine_url = f"https://www.vivino.com{name_link['href']}"
            print(f"Scraping reviews for {wine_name}")

            # Brand and country are fixed values, not read from the page.
            wine_details = {
                'Wine Name': wine_name,
                'Brand': 'Banfi',
                'Country': 'Italy'
            }

            driver.get(wine_url)
            time.sleep(10)
            wine_page = BeautifulSoup(driver.page_source, 'html.parser')
            try:
                wine_details['Price'] = wine_page.find('span', class_='wine-price-value').text.strip()
            except Exception as err:
                # A missing price element ends up here; 'Price' is then left out.
                print("Error in extracting price:", err)

            review_records.extend(scrape_wine_reviews(driver, wine_url, wine_name))
            wine_records.append(wine_details)
            handled_names.add(wine_name)

            driver.get(base_url)
            time.sleep(10)
        except Exception as err:
            print("Error in processing wine:", err)

    # Check if there is a next page (only reported, never followed).
    try:
        driver.find_element(By.XPATH, '/html/body/div[3]/section[1]/div/div/div/div[2]/button[2]')
        print("Next page button found, terminating the script.")
    except Exception as err:
        print("No more pages to load or error in finding next button:", err)

    return wine_records, review_records

def main():
    """Scrape the "banfi" search page with ``start=3`` on Vivino and save the results.

    The wine details and the reviews are each written to a ``_page3`` CSV
    file and a ``_page3`` line-delimited JSON file. The browser is closed
    exactly once on exit, and only if it was actually started.

    On Ctrl+C the files are (re)written only when the scrape had already
    returned its data. An interrupt that arrives earlier has nothing to
    save, so it is reported and existing output files are left untouched.
    """
    def save_results(wine_details, reviews):
        """Write both result sets as CSV and as line-delimited JSON."""
        df_wine_details = pd.DataFrame(wine_details)
        df_reviews = pd.DataFrame(reviews)

        df_wine_details.to_csv('banfi_wine_details_page3.csv', index=False)
        df_reviews.to_csv('banfi_wine_reviews_page3.csv', index=False)

        df_wine_details.to_json('banfi_wine_details_page3.json', orient='records', lines=True)
        df_reviews.to_json('banfi_wine_reviews_page3.json', orient='records', lines=True)

        print("DataFrames saved as CSV and JSON files.")

    driver = None
    scraped = None  # stays None until scrape_all_banfi_wines has returned

    try:
        driver = init_driver()
        base_url = "https://www.vivino.com/search/wines?q=banfi&start=3"
        scraped = scrape_all_banfi_wines(driver, base_url)
        save_results(*scraped)

    except KeyboardInterrupt:
        if scraped is None:
            # The data only exists once the scrape call has returned.
            print("Process interrupted before scraping finished. Nothing to save.")
        else:
            print("Process interrupted. Saving files...")
            save_results(*scraped)
    finally:
        # Guard against init_driver() having failed before a driver existed.
        if driver is not None:
            driver.quit()

if __name__ == "__main__":
    main()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Initializing the Chrome driver...
Scraping wines on the current page
Scraping reviews for Banfi Tener Sauvignon - Chardonnay Extra Dry
Error in extracting price: 'NoneType' object has no attribute 'text'
Scraping reviews for Banfi Tener Sauvignon - Chardonnay Extra Dry
Scraping 5-star reviews...
Clicking on the star rating with XPath: //*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[1]...
Found 20 review elements
Found 22 review elements
Found 22 review elements
22 reviews scraped for //*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[1]-star rating.
Scraping 4-star reviews...
Clicking on the star rating with XPath: //*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[2]...
Found 20 review elements
Found 30 review elements
Found 40 review elements
Found 50 review elements
Found 60 review elements
Found 70 review elements
Found 78 review elements
Found 78 review elements
78 reviews scraped for //*[@id="baseModal"]/div/div[2]/div[2]/div[1]/div/div/div[2]-star rating.

CLEANING AND CONSOLIDATING DATA INTO ONE CSV FILE

In [8]:
import pandas as pd

# Per-page scraper outputs to merge: one list for reviews, one for wine details.
review_files = [
    'banfi_wine_reviews.csv',
    'banfi_wine_reviews_page2.csv',
    'banfi_wine_reviews_page3.csv',
]
wine_details_files = [
    'banfi_wine_details.csv',
    'banfi_wine_details_page2.csv',
    'banfi_wine_details_page3.csv',
]

def consolidate_csv(file_list, output_file):
    """Concatenate the CSV files in ``file_list`` and write them to ``output_file``.

    Args:
        file_list: Paths of the CSV files to read, in the order they are stacked.
        output_file: Path of the CSV file to write.

    Rows in which every column is NaN are dropped before writing, and the
    output is written without an index column. Returns ``None``.
    """
    frames = [pd.read_csv(path) for path in file_list]

    # Stack the frames under a fresh index, then remove fully-empty rows.
    merged = pd.concat(frames, ignore_index=True).dropna(how='all')

    merged.to_csv(output_file, index=False)
    print(f"Consolidated data saved to {output_file}")

# Merge the per-page review files into a single CSV.
consolidate_csv(
    file_list=review_files,
    output_file='banfi_wine_reviews_consolidated.csv',
)

# Merge the per-page wine-detail files into a single CSV.
consolidate_csv(
    file_list=wine_details_files,
    output_file='banfi_wine_details_consolidated.csv',
)


Consolidated data saved to banfi_wine_reviews_consolidated.csv
Consolidated data saved to banfi_wine_details_consolidated.csv
