In [101]:
import gc
import json
import re

In [102]:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [103]:
from selenium.common.exceptions import NoSuchElementException

In [None]:
# Output location for the scraped reviews (relative path).
# The save cell builds the target as f"{save_path}{file_to_save}", i.e. plain
# concatenation, so the trailing "/" here is required.
save_path = "../data/bronze/"
# File name of the JSON document the collected reviews are dumped into.
file_to_save = "user_reviews.json"

In [104]:
# Load the per-supplement details into `data`.
# The scraping loop iterates `data.items()` and reads
# information["Reviews"]["NumberOfReviews"] and information["Reviews"]["Url"].
# The encoding is given explicitly: JSON text is UTF-8 by specification, and
# without it open() falls back to the platform locale encoding (e.g. cp1252 on
# Windows), which can mis-decode or reject non-ASCII characters.
with open("../data/bronze/Details.json", 'r', encoding="utf-8") as json_file:
    data = json.load(json_file)

## FUNCTIONS

In [105]:
def get_number(text: str) -> int:
    """Return the first integer that appears in ``text``.

    A number written with comma thousands separators (``"1,234 reviews"``) is
    read as a whole (``1234``) rather than being cut off at the first comma.

    Args:
        text: Free-form text containing a number, e.g. ``"12 Reviews"``.

    Returns:
        The first number found, as an ``int``.

    Raises:
        ValueError: If ``text`` contains no digits at all.
    """
    # First alternative: properly grouped thousands ("1,234", "12,345,678") that
    # are not followed by another digit. Second alternative: a plain digit run.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', text)
    if match is None:
        raise ValueError(f"No number found in {text!r}")
    return int(match.group().replace(",", ""))

In [106]:
def get_header(parent_container, supplement):
    """Collect the header fields of one review card.

    Args:
        parent_container: Element of a single review; must expose
            ``find_element``.
        supplement: Supplement name, used only in the error message.

    Returns:
        A dict with the keys ``"Information"``, ``"Date"`` and ``"Condition"``
        when every lookup succeeds, otherwise an empty dict (the error is
        printed, not raised).
    """
    try:
        header = parent_container.find_element(By.CLASS_NAME, "card-header")
        details_text = header.find_element(By.CLASS_NAME, "details").text
        date_text = header.find_element(By.CLASS_NAME, "date").text
        # The condition is taken from the first <strong> under the review
        # container itself, not under the card header.
        condition_text = parent_container.find_element(By.TAG_NAME, "strong").text
    except Exception as exc:
        print(f"Exception error encountered getting information for supplement {supplement} - {exc}")
        return {}

    return {
        "Information": details_text,
        "Date": date_text,
        "Condition": condition_text,
    }

In [107]:
def get_rating(parent_container, supplement):
    """Extract the overall rating of one review.

    Args:
        parent_container: Element of a single review; must expose
            ``find_element``.
        supplement: Supplement name, used only in the error message.

    Returns:
        ``{"Rating": <text>}`` where the text comes from the first ``<strong>``
        inside the element with class ``overall-rating``; an empty dict if any
        lookup fails (the error is printed, not raised).
    """
    rating = {}
    try:
        rating_container = parent_container.find_element(By.CLASS_NAME, "overall-rating")
        rating_text = rating_container.find_element(By.TAG_NAME, "strong").text
        rating = {
            "Rating": rating_text
        }
    except Exception as e:
        print(f"Exception error encountered getting rating for supplement {supplement} - {e}")
    # Plain return instead of `finally: return`: returning from a finally block
    # discards any in-flight exception, including KeyboardInterrupt, which made
    # the scrape impossible to interrupt while inside this function.
    return rating

In [108]:
def get_categories(parent_container, supplement):
    """Extract the per-category scores of one review.

    Each ``<section>`` under the element with class ``categories`` contributes
    one entry: the text of its first ``<strong>`` is the key, and the
    ``aria-valuenow`` attribute of its ``.webmd-rate.on-mobile`` element is the
    value (stored exactly as returned by ``get_attribute``).

    Args:
        parent_container: Element of a single review; must expose
            ``find_element``.
        supplement: Supplement name, used only in the error message.

    Returns:
        A dict mapping category label to score; empty if the categories
        container cannot be read (the error is printed, not raised).
    """
    categories = {}
    try:
        categories_container = parent_container.find_element(By.CLASS_NAME, "categories")
        for section in categories_container.find_elements(By.TAG_NAME, "section"):
            try:
                label = section.find_element(By.TAG_NAME, "strong").text
                rate_widget = section.find_element(By.CSS_SELECTOR, ".webmd-rate.on-mobile")
                categories[label] = rate_widget.get_attribute("aria-valuenow")
            except Exception:
                # Deliberate best effort: a section without a label or rating
                # widget is skipped so the other categories are still collected.
                continue
    except Exception as e:
        print(f"Exception error encountered getting categories for supplement {supplement} - {e}")
    # Plain return instead of `finally: return`, which would swallow
    # KeyboardInterrupt/SystemExit raised while this function is running.
    return categories

In [109]:
def get_comment(parent_container, supplement):
    """Extract the free-text comment of one review.

    If the description contains a ``span.readMore`` element it is clicked
    before the text is read, and the literal ``" Read Less"`` is stripped from
    the result (presumably the collapse label that appears once the comment is
    expanded — confirm against the page).

    Args:
        parent_container: Element of a single review; must expose
            ``find_element``.
        supplement: Supplement name, used only in the error message.

    Returns:
        ``{"Comment": <text>}``; the text is an empty string when the comment
        cannot be read (the error is printed, not raised).
    """
    description = {"Comment": ""}
    try:
        desc_container = parent_container.find_element(By.CLASS_NAME, "description")
        read_more_spans = desc_container.find_elements(By.CSS_SELECTOR, "span.readMore")
        if read_more_spans:
            read_more_spans[0].click()

        desc_text = desc_container.find_element(By.CLASS_NAME, "description-text")
        description = {
            "Comment": desc_text.text.replace(" Read Less", "")
        }
    except NoSuchElementException:
        print("Element not found in comments")
    except Exception as e:
        print(f"Exception error encountered getting comment for supplement {supplement} - {e}")
    # Plain return instead of `finally: return`, which would swallow
    # KeyboardInterrupt/SystemExit raised while this function is running.
    return description

In [110]:
def get_reviews(url, driver, waitTime, lastCount, supplement):
    """Scrape every review card on one reviews page.

    Args:
        url: Address of the reviews page to load.
        driver: Selenium WebDriver used to load ``url``.
        waitTime: ``WebDriverWait`` used to wait for the reviews container.
        lastCount: Number to give the first review found on this page;
            numbering continues from there.
        supplement: Supplement name, used only in error messages.

    Returns:
        A tuple ``(reviews_by_page, ctr)``: ``reviews_by_page`` maps
        ``"Review <n>"`` to the merged header/rating/categories/comment dict of
        that review, and ``ctr`` is the number the next review should receive.
        Errors are printed, not raised; whatever was collected before the error
        is still returned.
    """
    # Initialised before the try block so the return statement below can never
    # reference an unbound name.
    ctr = lastCount
    reviews_by_page = {}

    try:
        driver.get(url)

        container = waitTime.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".reviews-page .shared-reviews-container")))
        review_card_holder = container.find_elements(By.CSS_SELECTOR, ".review-details-holder")

        for holder in review_card_holder:
            print(f"Getting REVIEW: {ctr}")
            try:
                review_details = holder.find_element(By.CSS_SELECTOR, ".review-details")
            except NoSuchElementException:
                # One malformed card must not discard the remaining reviews on
                # the page; skip it and keep the counter unchanged.
                print("Element not found in reviews")
                continue

            headers = get_header(review_details, supplement)
            rating = get_rating(review_details, supplement)
            categories = get_categories(review_details, supplement)
            comment = get_comment(review_details, supplement)

            # Later dicts win on key clashes (e.g. a category literally named "Rating").
            reviews_by_page[f"Review {ctr}"] = {**headers, **rating, **categories, **comment}
            ctr += 1
    except NoSuchElementException:
        print("Element not found in reviews")
    except Exception as e:
        print(f"Exception error encountered getting reviews for supplement {supplement} - {e}")

    return reviews_by_page, ctr

In [None]:
# Scrape the reviews of every supplement in `data` into `reviews`
# (supplement name -> {"Review <n>": {...}}).

# Pagination stops for a supplement once the running review counter passes
# this value.
MAX_REVIEWS_PER_SUPPLEMENT = 100

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
waitTime = WebDriverWait(driver, 10)
reviews = {}

# try/finally guarantees the browser is shut down even when the loop is
# interrupted (KeyboardInterrupt) or fails in an unexpected way.
try:
    for supplement, information in data.items():
        try:
            numReviews = get_number(information["Reviews"]['NumberOfReviews'])
            url = information["Reviews"]['Url']
        except Exception as e:
            # Report instead of skipping silently, so a broken record (or a
            # broken helper) is visible in the cell output.
            print(f"Skipping supplement {supplement}: could not read review metadata - {e!r}")
            continue

        if numReviews == 0:
            continue

        try:
            print(f"Processing for SUPPLEMENT: {supplement}")
            driver.get(url)

            container = waitTime.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".reviews-page .shared-reviews-container")))

            # find_elements returns an empty list instead of raising, so a page
            # without a pagination bar no longer aborts the supplement.
            paginations = container.find_elements(By.CSS_SELECTOR, ".pagination-holder .pagination-holder .pagination")
            li_items = paginations[0].find_elements(By.CLASS_NAME, 'page-item') if paginations else []

            pagination_urls = []
            for li in li_items:
                li_class = li.get_attribute("class") or ""
                if 'disabled' in li_class:
                    continue

                links = li.find_elements(By.TAG_NAME, 'a')
                if not links:
                    continue

                href = links[0].get_attribute("href")
                # Skip anchors without a target, and targets already queued
                # (several items may point at the same page), so no page is
                # scraped twice.
                if href and href not in pagination_urls:
                    pagination_urls.append(href)

            if not pagination_urls:
                # No usable pagination links: the reviews are expected to sit
                # on the landing page itself, so scrape just that page.
                pagination_urls = [url]

            supplement_review = {}
            lastCount = 1

            for page_url in pagination_urls:
                page_reviews, lastCount = get_reviews(page_url, driver, waitTime, lastCount, supplement)
                supplement_review.update(page_reviews)

                if lastCount > MAX_REVIEWS_PER_SUPPLEMENT:
                    break

            reviews[supplement] = supplement_review
            # Pause two seconds after each scraped supplement (presumably to
            # avoid hammering the site).
            sleep(2)

        except Exception as e:
            print(f"Exception error encountered for supplement {supplement} - {e}")
finally:
    driver.quit()
    del driver
    gc.collect()
    print("Resources cleaned up.")

In [None]:
# Target file: the configured directory immediately followed by the file name.
file_path = save_path + file_to_save

# Write everything that was scraped as pretty-printed JSON (4-space indent).
with open(file_path, mode="w") as json_file:
    json.dump(reviews, json_file, indent=4)