**Team Members: Ethan Wong, Timmy Ren, Mason Shu, Medha Nalamada, Carson Mullen, Bethel Kim**

**Morning Cohort (11 AM - 1 PM)**

*Note to all: Please pull any changes from the repo before working on this file!*

In [3]:
# Import Libraries

# Task A: Scrape from ratebeer.com - extract 5-6K reviews

In [14]:
# May end up with only 1700-2000 reviews with text

In [19]:
# Import necessary libraries
# Import necessary libraries
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
)
from webdriver_manager.chrome import ChromeDriverManager
import time

# Set up Selenium to use Chrome browser
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Function to handle cookie consent banner
def handle_cookie_consent():
    try:
        # Wait for the cookie consent banner to be present
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, "onetrust-banner-sdk"))
        )
        # Click the "Accept" button
        accept_button = driver.find_element(
            By.XPATH, "//button[@id='onetrust-accept-btn-handler']"
        )
        accept_button.click()
        print("Cookie consent accepted.")
        # Wait for the banner to disappear
        WebDriverWait(driver, 5).until(
            EC.invisibility_of_element_located((By.ID, "onetrust-banner-sdk"))
        )
    except (TimeoutException, NoSuchElementException):
        # If the banner does not appear, proceed
        print("No cookie consent banner to accept.")

# Function to scrape reviews from a single beer page
def scrape_beer_reviews(beer_name, beer_link):
    driver.get(beer_link)
    all_product_names = []
    all_reviews = []
    all_ratings = []

    # Handle cookie consent if present
    handle_cookie_consent()

    try:
        # Wait for the product name element to be present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[contains(@class,'MuiTypography-h4') and contains(@class,'mt-3')]")
            )
        )
        product_name_element = driver.find_element(
            By.XPATH, "//div[contains(@class,'MuiTypography-h4') and contains(@class,'mt-3')]"
        )
        product_name = product_name_element.text
    except TimeoutException:
        print(f"Product name not found for beer: {beer_name}")
        return [], [], []

    page_number = 1  # Initialize page number

    while True:
        print(f"Scraping page {page_number} of reviews for {beer_name}")

        try:
            # Wait for the review elements to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, "//div[@class='py-4']"))
            )
            review_elements = driver.find_elements(By.XPATH, "//div[@class='py-4']")
        except TimeoutException:
            print("No review elements found")
            break  # No reviews found on this page, exit loop

        if not review_elements:
            print("No review elements found")
            break  # No reviews found on this page, exit loop

        for review_element in review_elements:
            try:
                rating_element = review_element.find_element(
                    By.XPATH, ".//span[contains(@class,'MuiTypography-subtitle1')]"
                )
                rating = rating_element.text
            except NoSuchElementException:
                rating = None
            try:
                review_text_element = review_element.find_element(
                    By.XPATH, ".//div[contains(@class,'BeerReviewListItem___StyledText')]"
                )
                review_text = review_text_element.text
            except NoSuchElementException:
                review_text = None

            if review_text:
                all_product_names.append(product_name)
                all_reviews.append(review_text)
                all_ratings.append(rating)

        # Check if 'Next' button exists and is enabled
        try:
            # Using the relative XPath for the 'Next' button
            next_button = driver.find_element(
                By.XPATH,
                "//div[@class='MuiTablePagination-actions']/button[@title='Next page']",
            )
            # Check if the 'Next' button is not disabled
            if next_button.is_enabled():
                try:
                    # Scroll to the 'Next' button to ensure it's clickable
                    driver.execute_script("arguments[0].scrollIntoView();", next_button)
                    next_button.click()
                    page_number += 1
                    # Wait for new reviews to load by waiting for new content
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.XPATH, f"//div[@class='py-4']//span[contains(text(), '{page_number}')]")
                        )
                    )
                except (ElementClickInterceptedException, TimeoutException):
                    # Handle any overlays or delays
                    print("Handling potential overlay or slow loading.")
                    handle_cookie_consent()
                    time.sleep(2)  # Wait a bit before retrying
                    next_button.click()
            else:
                print("Next button is disabled, moving to next beer")
                break  # 'Next' button is disabled, exit the loop
        except NoSuchElementException:
            print("No 'Next' button found, moving to next beer")
            break  # No 'Next' button, exit the loop

    return all_product_names, all_reviews, all_ratings

# Main scraping logic
def main():
    # Go to the top beers page
    driver.get("https://www.ratebeer.com/top-beers")

    # Handle cookie consent if present
    handle_cookie_consent()

    # Wait for the beer elements to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//div[contains(@class,'DataTable__Row')]/div[2]/a")
        )
    )

    # Find beer elements
    beer_elements = driver.find_elements(
        By.XPATH, "//div[contains(@class,'DataTable__Row')]/div[2]/a"
    )

    # Collect beer names and links
    beer_links = []
    for beer_element in beer_elements:
        beer_name = beer_element.text
        beer_link = beer_element.get_attribute("href")
        beer_links.append((beer_name, beer_link))

    # Now, for each beer, collect the reviews
    all_product_names = []
    all_reviews = []
    all_ratings = []

    for beer_name, beer_link in beer_links:
        print(f"Scraping reviews for beer: {beer_name}")
        names, reviews, ratings = scrape_beer_reviews(beer_name, beer_link)
        all_product_names.extend(names)
        all_reviews.extend(reviews)
        all_ratings.extend(ratings)

        # Optional: stop if we have collected enough reviews
        if len(all_reviews) >= 6000:
            print("Collected 6000 reviews, stopping.")
            break

    # Close the browser
    driver.quit()

    # Create a DataFrame and save it to a CSV file
    df = pd.DataFrame(
        {
            "product_name": all_product_names,
            "product_review": all_reviews,
            "user_rating": all_ratings,
        }
    )

    # Remove duplicates and reviews without text
    df = df.drop_duplicates(subset=["product_review"]).dropna(subset=["product_review"])
    df = df[df["product_review"].str.strip().astype(bool)]  # Remove empty strings

    # Save the DataFrame to CSV
    df.to_csv("BeerReviews.csv", index=False, encoding="utf-8")
    print(f"Saved {len(df)} reviews to 'BeerReviews.csv'")


if __name__ == "__main__":
    main()

Cookie consent accepted.
Scraping reviews for beer: Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
No cookie consent banner to accept.
Scraping page 1 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 2 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 3 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 4 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 5 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 6 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 7 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 8 of reviews for Toppling Goliath Kentucky Brunch
🇺🇸Stout - Imperial Flavored / Pastry
Scraping page 9 of reviews for Top

ElementClickInterceptedException: Message: element click intercepted: Element <button class="MuiButtonBase-root MuiIconButton-root MuiIconButton-colorInherit Mui-disabled Mui-disabled" tabindex="-1" type="button" title="Next page" aria-label="Next page" disabled="">...</button> is not clickable at point (1423, 910). Other element would receive the click: <div class="MuiTablePagination-actions">...</div>
  (Session info: chrome=128.0.6613.138)
Stacktrace:
	GetHandleVerifier [0x0038D933+25811]
	(No symbol) [0x0031E314]
	(No symbol) [0x00212523]
	(No symbol) [0x0025D397]
	(No symbol) [0x0025B799]
	(No symbol) [0x0025967B]
	(No symbol) [0x00258C7B]
	(No symbol) [0x0024D9DF]
	(No symbol) [0x0027AD2C]
	(No symbol) [0x0024D475]
	(No symbol) [0x0027AFC4]
	(No symbol) [0x002946F0]
	(No symbol) [0x0027AAC6]
	(No symbol) [0x0024BEFD]
	(No symbol) [0x0024C8FD]
	GetHandleVerifier [0x0065F143+2981091]
	GetHandleVerifier [0x006B2FF9+3324825]
	GetHandleVerifier [0x0041B32F+605903]
	GetHandleVerifier [0x00422CBC+637020]
	(No symbol) [0x00326F4D]
	(No symbol) [0x00323DD8]
	(No symbol) [0x00323F75]
	(No symbol) [0x00316406]
	BaseThreadInitThunk [0x76C6FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x76FF80CE+286]
	RtlGetAppContainerNamedObjectPath [0x76FF809E+238]


# Task B: Develop a system where a customer specifies three attributes they desire in a beer, and the system recommends three beers that best match those attributes based on the extracted reviews

In [18]:
# Choose three attributes from the actual data
# Consider a word frequency analsysi of beer reviews to find important attributes in the actual data.
# Ensure attributes specified are likely to be mentioned together in a review (lift analysis) 

# Task C: Perform a similarity analysis using cosine similarity with the three attributes specified by the customer and the reviews

**Note: Used the bag-of-words model as opposed to word embeddings**

In [19]:
# Script should accept a file as input that has the product attributes and calculate similarity scores (0-1) 
    # between these attributes and each review
# Output fule should have three columns: product_name (will have a row for each review), product_review, 
    # and similarity_score

# Task D: Perform a sentiment analysis for every review

In [20]:
# Use VADER or any LLM

# Task E: Create an evaluation score for each beer that uses both similarity and sentiment scores and recommended three products to the customer

In [21]:
# Add sentiment and similarity scores for the three products recommended.

# Task F: How would our recommendations change if we use word vectors instead of the standard bag-of-words cosine similarity?

In [22]:
# Consider the % of reviews that mention a preferred attribute
    # For a recommended product, what % of its reviews mention an attribute specified by the customer
# Differences between bag-of-words and word vector approaches
# Bag of words, under certain conditions, will likely be better than word embeddings for recommender systems
# Show rating, similarity scores, and sentiments for the products recommended in this task and the previous one

# Task G: How would our recommendations differ if we ignored the similarity and feature sentiment scores and just chose the 3 highest rated products fro the entire data set? Would these products meet the requirements of the user looking for recommendations? Why or why not?

In [23]:
# Justify with analysis - use similarity and sentiment scores as well as overall ratings to answer this questions

# Task H: From 10 beers in the data, choose 1, and find the most similar beer among the remaining 9

In [24]:
# Show logic we are using in addition to finding the most similar product with markdown; explain method/logic