In [5]:
import pandas as pd
from time import sleep
import json
import pprint

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [78]:
# Load the supplement catalogue from the CSV sitting next to the notebook.
supplements = pd.read_csv(filepath_or_buffer="Supplements.csv")

In [79]:
# Keep only the rows whose "Name" column is exactly "Aloe".
supplements = supplements.loc[supplements["Name"].eq("Aloe")]

## FUNCTIONS

In [41]:
def get_information(waitTime, tabName, supplementName, cssSelector):
    """Return the text of the first element matching ``cssSelector``.

    Args:
        waitTime: Wait object whose ``until`` method is called with a
            presence-of-element condition for the selector.
        tabName: Section label; only used in the printed error message.
        supplementName: Supplement being scraped; only used in the printed
            error message.
        cssSelector: CSS selector of the element whose text is wanted.

    Returns:
        The element's ``.text``, or ``""`` when locating or reading it raised
        an ``Exception`` (the error is printed, not re-raised).
    """
    try:
        container = waitTime.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, cssSelector))
        )
        return container.text
    except Exception as e:
        print(f"Error in getting {tabName} for: {supplementName} {e}")
        # The fallback is returned here rather than from a ``finally`` block:
        # a ``return`` inside ``finally`` would also discard KeyboardInterrupt
        # and SystemExit, making the scrape impossible to interrupt.
        return ""

In [42]:
def get_uses(waitTime, supplementName):
    """Collect the "uses" lists of the current page, keyed by their heading.

    Args:
        waitTime: Wait object whose ``until`` method is called with a
            presence-of-element condition for the ``uses-container`` id.
        supplementName: Supplement being scraped; only used in the printed
            error message.

    Returns:
        A dict mapping each ``<h3>`` heading text to a list of single-element
        lists, one per ``<li>`` (``[[li_text], ...]``). Empty when the page
        shows its "no data" paragraph, when there are no headings, or when an
        ``Exception`` occurred before anything was collected (the error is
        printed, not re-raised).
    """
    uses = {}
    try:
        container = waitTime.until(EC.presence_of_element_located((By.ID, "uses-container")))

        # find_elements yields an empty list when nothing matches, so the
        # "no data" probe needs no exception handling at all.
        if container.find_elements(By.CSS_SELECTOR, "p.no-data-text"):
            return uses

        parent_div = container.find_element(By.XPATH, "./div")
        child_divs = parent_div.find_elements(By.CSS_SELECTOR, ".vitamins-monograph-content.uses-content")
        headings = parent_div.find_elements(By.TAG_NAME, 'h3')

        if headings:
            heading_idx = 0
            for child in child_divs:
                ul_elements = child.find_elements(By.TAG_NAME, 'ul')

                # Content blocks without a list are skipped and do not
                # consume a heading.
                if not ul_elements:
                    continue

                li_list = [
                    [li.text]
                    for ul in ul_elements
                    for li in ul.find_elements(By.TAG_NAME, 'li')
                ]

                uses[headings[heading_idx].text] = li_list
                heading_idx += 1

    except Exception as e:
        print(f"Error in getting uses for: {supplementName} {e}")

    # Returned after the try block instead of from ``finally`` so that
    # KeyboardInterrupt / SystemExit are not silently discarded.
    return uses


In [43]:
def get_interactions(waitTime, supplementName):
    """Collect the interactions listed on the current page.

    Args:
        waitTime: Wait object whose ``until`` method is called with
            presence-of-element conditions.
        supplementName: Supplement being scraped; only used in the printed
            error message.

    Returns:
        A dict mapping each ``<h3>`` text found inside a list item to the text
        of the ``<p>`` paired with it. Empty when the "no data" element is
        present or when an ``Exception`` occurred before anything was
        collected (the error is printed, not re-raised).
    """
    interactions = {}

    try:
        # NOTE(review): when the page does have interactions, this probe runs
        # until the wait gives up before the content is read - confirm whether
        # a non-waiting lookup would be acceptable here.
        try:
            no_data = waitTime.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#interactions-container .no-data-text")
            ))
        except Exception:
            # Only ordinary failures mean "placeholder absent"; a bare
            # ``except`` would also swallow KeyboardInterrupt.
            no_data = None

        if no_data:
            return interactions

        container = waitTime.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, ".vitamins-monograph-content.interactions-content")
        ))

        for ul in container.find_elements(By.TAG_NAME, 'ul'):
            for li in ul.find_elements(By.TAG_NAME, 'li'):
                h3_elements = li.find_elements(By.TAG_NAME, 'h3')
                p_elements = li.find_elements(By.TAG_NAME, 'p')

                # Headings and paragraphs are paired positionally; zip stops
                # at the shorter of the two lists.
                for h3, p in zip(h3_elements, p_elements):
                    interactions[h3.text] = p.text

    except Exception as e:
        print(f"Error in getting interactions for: {supplementName} - {e}")

    # Returned after the try block instead of from ``finally`` so that
    # KeyboardInterrupt / SystemExit are not silently discarded.
    return interactions


In [80]:
def get_reviewURL(driver, waitTime, supplementName):
    """Read the link text and href of the last tab in the page's tab list.

    The tab list is looked up with an immediate ``driver.find_element`` first;
    only when that fails is the alternative selector waited for.

    Args:
        driver: Object whose ``find_element`` is used for the first lookup.
        waitTime: Wait object whose ``until`` method is used for the fallback
            lookup.
        supplementName: Supplement being scraped; only used in the printed
            error message.

    Returns:
        ``{'NumberOfReviews': <link text>, 'Url': <href>}`` taken from the
        ``<a>`` inside the last ``<li>``, or ``{}`` when no tab list or no list
        item was found, or when an ``Exception`` occurred (the error is
        printed, not re-raised).
    """
    reviews = {}
    try:
        try:
            ul = driver.find_element(By.CSS_SELECTOR, ".tabs-container-holder .tabs-container .auto-tabs")
        except Exception:
            ul = None

        # Only pay for the (potentially full-timeout) wait when the first
        # lookup found nothing; its result was never used otherwise.
        if not ul:
            try:
                ul = waitTime.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".auto-container-holder .auto-tabs-container .auto-tabs")
                ))
            except Exception:
                ul = None

        if not ul:
            return reviews

        li_items = ul.find_elements(By.TAG_NAME, 'li')
        if li_items:
            a_tag = li_items[-1].find_element(By.TAG_NAME, 'a')

            href_value = a_tag.get_attribute('href')
            a_text = a_tag.text

            reviews = {
                'NumberOfReviews': a_text,
                'Url': href_value
            }

    except Exception as e:
        print(f"Error in getting review details for: {supplementName} - {e}")

    # Returned after the try block instead of from ``finally`` so that
    # KeyboardInterrupt / SystemExit are not silently discarded.
    return reviews


In [None]:
# Scrape every section of each supplement page into ``supplement_info``,
# keyed by supplement name.
supplement_info = {}

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
waitTime = WebDriverWait(driver,10)

try:
    for _, info in supplements.iterrows():

        supplementName = info["Name"]
        supplementURL = info["Url"]

        driver.get(supplementURL)

        overview = get_information(waitTime, "overview", supplementName, ".vitamins-monograph-content.overview-content")
        uses = get_uses(waitTime, supplementName)
        sideEffects = get_information(waitTime, "side effects", supplementName, ".vitamins-monograph-content.side-effects-content")
        preCautions = get_information(waitTime, "precautions", supplementName, ".vitamins-monograph-content.precautions-content")
        interactions = get_interactions(waitTime, supplementName)
        # The leading "." makes this a class selector like the other sections;
        # without it the selector looks for a <vitamins-monograph-content> tag.
        dosing = get_information(waitTime, "dosing", supplementName, ".vitamins-monograph-content.dosage-content")
        reviews = get_reviewURL(driver, waitTime, supplementName)

        supplement_info[supplementName] = {
             "Overview": overview,
             "Uses": uses,
             "SideEffects": sideEffects,
             "Precautions": preCautions,
             "Interactions": interactions,
             "Dosing": dosing,
             "Reviews" : reviews
        }
        # Pause between page loads.
        sleep(2)
finally:
    # Always close the browser, even if a page load or an interrupt aborts
    # the loop part-way through.
    driver.quit()

In [82]:
# Display the dict of scraped sections built by the scraping loop.
supplement_info

{'Aloe': {'Reviews': {'NumberOfReviews': 'Reviews (56)',
   'Url': 'https://reviews.webmd.com/vitamins-supplements/ingredientreview-607-ALOE'}}}