# Code for pulling the Reviews from the two websites we are comparing prices for

## Code for the price comparison is [HERE](https://github.com/CameronCSS/Programming-Languages/blob/main/Data%20Notebooks/Price%20Comparison.ipynb)

In [None]:
import time
from datetime import date
from datetime import datetime
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from gspread.exceptions import WorksheetNotFound

# Today's date as MM/DD/YYYY; used as the "Date Added" value on review rows
current_date = format(date.today(), '%m/%d/%Y')

# API scopes requested for the service account
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Load the Service Account key file for the scopes above
creds = ServiceAccountCredentials.from_json_keyfile_name(r"PATH_TO_CREDENTIALS", scope)

# Shared, authorized gspread client used by the functions below
client = gspread.authorize(creds)

def get_search_word_from_sheet():
    """Return the search word scheduled for today, or None if there is none.

    Reads every record of the first worksheet of the spreadsheet and looks
    for a row whose 'date' column (formatted MM/DD/YYYY) equals today's date.

    Rows whose date cell is blank or not a valid MM/DD/YYYY date are skipped
    rather than aborting the whole lookup, and a row with a blank search word
    is treated as "nothing scheduled". A missing 'date' or 'search_word'
    column still raises KeyError so a misconfigured sheet is noticed.

    Returns:
        The 'search_word' value of the first matching row, or None.
    """
    # Open the Google Spreadsheet by its URL (make sure you have access to it)
    sheet = client.open_by_url('GOOGLE_SHEETS_URL').sheet1

    today = date.today()

    for row in sheet.get_all_records():
        # str() guards against cells that arrive as numbers instead of text
        raw_date = str(row['date']).strip()
        if not raw_date:
            continue

        try:
            date_from_sheet = datetime.strptime(raw_date, '%m/%d/%Y').date()
        except ValueError:
            # Malformed date cell: ignore this row, keep checking the others
            continue

        if date_from_sheet != today:
            continue

        search_word = row['search_word']
        if search_word == '' or search_word is None:
            # Today's row exists but has no word; keep looking
            continue
        return search_word

    return None

def _collect_livingspaces_reviews(driver):
    """Collect review rows for the rated products on the Living Spaces results page.

    Expects `driver` to already be showing the search results. Returns a list
    of [sku, name, rating_score, review_text, current_date] rows, one per
    individual review.
    """
    reviews = []
    product_urls = []  # (product_link, name, sku) for every rated product

    # Collect all the product URLs that have a rating
    for item in driver.find_elements(By.CLASS_NAME, 'product-item-container'):
        try:
            name = item.find_element(By.CLASS_NAME, 'name').text.strip()
            # Looked up only as a filter: products without a ratings element
            # raise here and are skipped.
            item.find_element(By.CLASS_NAME, 'ratings')
            product_link = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        except Exception:
            # Best effort: any Selenium failure on one tile skips that tile.
            continue

        if not product_link:
            continue

        # SKU is taken from the last six characters of the link, digits only
        sku = ''.join(filter(str.isdigit, product_link[-6:]))
        product_urls.append((product_link, name, sku))

    # Visit each product page and read its individual reviews
    for product_url, name, sku in product_urls:
        driver.get(product_url)
        time.sleep(3)

        # Wait until the ratings button is clickable, then click it.
        try:
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//div[@role='button' and contains(@class, 'ratings')]")
                )
            )
            button.click()
            time.sleep(2)
        except Exception:
            # Button missing, timed out, or not clickable: skip this product.
            continue

        # The review markup is parsed with BeautifulSoup from the page source
        page = BeautifulSoup(driver.page_source, 'html.parser')

        for review in page.select('div.bvseo-review'):
            try:
                review_text = review.find('span', itemprop='description').text.strip()
                rating_score = int(review.find('span', itemprop='ratingValue').text.strip())
            except (AttributeError, ValueError):
                # Review without a description or a numeric rating
                continue
            # Store the per-review score, matching what the RC Willey
            # collector stores in the same column.
            reviews.append([sku, name, rating_score, review_text, current_date])

    return reviews


def _collect_rcwilley_reviews(driver, results_soup):
    """Collect review rows for the rated products on the RC Willey results page.

    `results_soup` is the parsed search-results page; `driver` is used to open
    each product page. Returns a list of
    [sku, name, rating_score, review_body, current_date] rows.
    """
    reviews = []

    for item in results_soup.find_all('div', class_='productContent'):
        name_element = item.find('div', class_='productName')
        product_link_element = item.find_parent('a')  # enclosing `a` tag of the tile
        if name_element is None or product_link_element is None:
            continue

        href = product_link_element.get('href')
        if not href:
            continue

        name = name_element.text.strip()

        # Listing-level rating; only used to decide whether the product has reviews
        rating = 0
        rating_element = item.find('div', class_=lambda value: value and value.startswith('rating'))
        if rating_element is not None:
            rating_span = rating_element.find('span', class_='sr-only')
            if rating_span is not None:
                rating_text = rating_span.text.strip()
                try:
                    rating = float(''.join([i for i in rating_text if i.isdigit() or i == '.']))
                except ValueError:
                    # No parsable number in the label: treat as unrated
                    rating = 0

        # Check if the product has a rating
        if rating <= 0:
            continue

        # SKU comes from the link's id attribute, minus an optional 'sku-' prefix
        sku = product_link_element.get('id') or ''
        if sku.startswith('sku-'):
            sku = sku[4:]

        product_link = "https://www.rcwilley.com" + href

        # Navigate to the product page, then to the same page with the reviews tab
        driver.get(product_link)
        time.sleep(3)
        driver.get(product_link + "#reviews-tab")
        time.sleep(2)

        try:
            reviews_tab = driver.find_element(By.XPATH, "//a[@href='#reviews-tab']")
        except NoSuchElementException:
            # No reviews tab on this page: skip the product instead of crashing
            continue

        # Scroll to the reviews tab and click it using JavaScript
        driver.execute_script("arguments[0].scrollIntoView(true);", reviews_tab)
        driver.execute_script("arguments[0].click();", reviews_tab)

        # Wait for the reviews to load
        time.sleep(3)

        # The reviews are read from a JSON-LD script embedded in the HTML
        page = BeautifulSoup(driver.page_source, 'html.parser')
        script_tag = page.find('script', {'id': 'bv-jsonld-reviews-data'})
        if script_tag is None or not script_tag.string:
            continue

        try:
            jsonld_data = json.loads(script_tag.string)
        except ValueError:
            # Invalid JSON payload (json.JSONDecodeError is a ValueError)
            continue
        if not isinstance(jsonld_data, dict):
            continue

        # Extract the review body text and rating score for each review
        for review_data in jsonld_data.get('review') or []:
            try:
                review_body = review_data['reviewBody']
                rating_score = review_data['reviewRating']['ratingValue']
            except (KeyError, TypeError):
                continue
            reviews.append([sku, name, rating_score, review_body, current_date])

    return reviews


def scrape_website(url, word):
    """Search `url` for `word` and return the reviews of the rated products found.

    Supports URLs containing "livingspaces.com" or "rcwilley.com"; for any
    other URL the page is opened and an empty list is returned.

    Args:
        url: Home page of the site to search.
        word: Search term typed into the site's search box.

    Returns:
        A list of rows [sku, name, rating, review_text, date_added], one per
        individual review, where `rating` is that review's own score.

    The browser is always closed, even when scraping fails part-way.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        if "livingspaces.com" in url:
            input_field = driver.find_element(By.ID, 'search')
            input_field.send_keys(word)
            input_field.find_element(By.XPATH, './ancestor::form').submit()

        elif "rcwilley.com" in url:
            input_field = driver.find_element(By.ID, 'searchBox')
            input_field.send_keys(word)
            driver.find_element(By.ID, 'searchSubmit').click()

        # Give the results page time to load
        time.sleep(5)

        reviews = []

        if "livingspaces.com" in url:
            reviews = _collect_livingspaces_reviews(driver)
            print("Livingspaces Reviews Added")

        elif "rcwilley.com" in url:
            results_soup = BeautifulSoup(driver.page_source, 'html.parser')
            reviews = _collect_rcwilley_reviews(driver, results_soup)
            print("RC Willey Reviews Added")

        return reviews
    finally:
        # Always release the browser, including on unexpected errors
        driver.quit()

# Double check the IDs and continue counting
def get_max_id(sheet_name):
    """Return the largest numeric ID in the first column of a worksheet.

    The first row is treated as a header and ignored. Returns 0 when the
    worksheet does not exist, is empty, or has no all-digit first-column value.
    """
    # Authorize a client and open the Google Spreadsheet by its URL
    client = gspread.authorize(creds)
    spreadsheet = client.open_by_url('GOOGLE_SHEETS_URL')

    try:
        rows = spreadsheet.worksheet(sheet_name).get_all_values()
    except WorksheetNotFound:
        return 0

    # Skip the header row; keep only first-column cells made entirely of digits
    numeric_ids = (int(cells[0]) for cells in rows[1:] if cells[0].isdigit())
    return max(numeric_ids, default=0)

def compare_reviews(word):
    """Scrape reviews for `word` from both sites and append them to the 'reviews' sheet.

    Each scraped review becomes one row:
    [Website, Sku, Rating, Review, Date Added]. The 'reviews' worksheet is
    created if missing and gets a header row if it is empty. Living Spaces
    rows are written before RC Willey rows.

    Args:
        word: Search term passed to scrape_website for each site.
    """
    website1_url = 'https://www.livingspaces.com/'
    website2_url = 'https://www.rcwilley.com/'

    # Scrape both sites first so the sheet is only touched once scraping is done.
    # No row ID is generated: the sheet's columns do not include one.
    reviews_rows = []
    for site_url in (website1_url, website2_url):
        # 'https://www.livingspaces.com/' -> 'Livingspaces'
        site_name = site_url.replace('https://www.', '').replace('.com/', '').capitalize()
        for sku, _name, rating, review_text, _scraped_on in scrape_website(site_url, word):
            reviews_rows.append([site_name, sku, rating, review_text, current_date])

    sheet = client.open_by_url('GOOGLE_SHEETS_URL')

    # Check if reviews sheet exists
    try:
        reviews_ws = sheet.worksheet('reviews')
    except WorksheetNotFound:
        reviews_ws = sheet.add_worksheet(title="reviews", rows="1", cols="5")

    # Check if the reviews sheet is empty before writing headers
    if not reviews_ws.get_all_values():
        reviews_ws.append_row(['Website', 'Sku', 'Rating', 'Review', 'Date Added'])

    # One batched write; skipped entirely when nothing was scraped
    if reviews_rows:
        reviews_ws.append_rows(reviews_rows)

    print("Google Sheets Updated")

# Look up today's search word in the Google Sheet and run the scrape if there is one
search_word = get_search_word_from_sheet()

if search_word is None:
    print("No search word for today was found.")
else:
    compare_reviews(search_word)
