**1. Install Selenium**

Open a terminal (e.g. bash) and run:
```bash
pip install selenium
```
**2. Install driver**

1. [Download browser driver](https://selenium-python.readthedocs.io/installation.html#drivers)
2. Unzip the download and move the driver executable to a location of your choice; you will need its full path in the code below.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import time
import pandas as pd

# Scrape product details and customer reviews for iHerb's best-selling
# vitamin products, collect one row per review, and save the result as CSV.

# Use the link path to your driver:
chrome_path = '/Users/data/chromedriver'  # copy and paste the path to your driver here

URL = "https://ca.iherb.com/c/Vitamins?sr=2&noi=48&p="

# Inclusive range of listing pages to scrape. The goal is the first 3 pages
# of the Best Selling vitamin products; the default below covers page 1 only,
# so raise LAST_PAGE (or run again with another range) to cover the rest.
FIRST_PAGE = 1
LAST_PAGE = 1

MAX_PAGE_NUM = 60   # Scrape maximum 60 pages in the review section

# Create lists for the dataframe (every list gets one entry per review):
item_names = []
item_description = []
item_brand = []
review_titles = []  # currently not filled by the scraper
review_contents = []
product_helpful = []
product_not_helpful = []
member_rating = []
total_rate = []
item_prices = []
item_images = []

# `driver` browses the listing and product pages, `driver1` the review pages.
driver = webdriver.Chrome(chrome_path)
driver1 = None
try:
    driver1 = webdriver.Chrome(chrome_path)

    # Give the product page elements time to appear; the setting persists for
    # the whole session, so it only has to be issued once.
    driver.implicitly_wait(5)
    wait = WebDriverWait(driver, 10)

    for n in range(FIRST_PAGE, LAST_PAGE + 1):
        driver.get(f"{URL}{n}")

        # Store all the product links of this listing page in a list
        item_links = [
            item.get_attribute("href")
            for item in wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, ".absolute-link-wrapper > a.product-link")))
        ]

        # Iterate over the links
        for item_link in item_links:
            driver.get(item_link)

            # Locate the `View All Reviews` link and read its target URL
            all_reviews_link = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "span.all-reviews-link > a")))
            time.sleep(2)
            reviews_url = all_reviews_link.get_attribute("href")

            # Product-level details are identical for every review of this
            # product, so read them from the product page only once.
            # The description is a bullet list; join the bullets with commas.
            description = ', '.join(
                bullet.text
                for bullet in driver.find_elements(
                    By.CSS_SELECTOR,
                    "[itemprop='description'] > ul:nth-of-type(1) > li")
            )
            image_link = driver.find_element(
                By.XPATH, '//*[@id="product-image"]/div[1]/a').get_attribute('href')
            brand = driver.find_element(
                By.XPATH, './/*[@id="brand"]/a/span/bdi').get_attribute('textContent')
            price = driver.find_element(By.CSS_SELECTOR, '[id="price"]').text

            for page_num in range(1, MAX_PAGE_NUM + 1):
                url = reviews_url + '?&p=' + str(page_num)
                print(url)
                driver1.get(url)
                review_containers = driver1.find_elements(By.CLASS_NAME, 'review-row')

                if review_containers:
                    # Page-level values shared by every review on this page
                    name = driver1.find_element(
                        By.CSS_SELECTOR, '[class="nav-product-link-text"] span').text
                    total_rating_text = driver1.find_element(
                        By.CLASS_NAME, 'css-i36p8g').text

                    for container in review_containers:
                        item_description.append(description)
                        item_images.append(image_link)
                        item_brand.append(brand)
                        item_prices.append(price)
                        item_names.append(name)
                        total_rate.append(total_rating_text)
                        # append the review content
                        review_contents.append(
                            container.find_element(By.CLASS_NAME, 'review-text').text)
                        # append the number of "helpful" ratings
                        product_helpful.append(
                            container.find_element(
                                By.CSS_SELECTOR, '[title="Helpful"] span').text)
                        # append the number of "not helpful" ratings
                        product_not_helpful.append(
                            container.find_element(
                                By.CSS_SELECTOR, '[title="Unhelpful"] span').text)
                        # the star rating is the number of stars whose fill
                        # colour is not transparent
                        rating = sum(
                            1
                            for star in container.find_elements(By.CLASS_NAME, "css-172co2l")
                            if star.find_element(By.TAG_NAME, "path").get_attribute("fill") != "transparent"
                        )
                        member_rating.append(rating)

                time.sleep(2)  # Slow the script down
finally:
    # Always close both browsers, even when the scrape fails part-way.
    driver.quit()
    if driver1 is not None:
        driver1.quit()

data = {'item_image_link': item_images,
        'item_brand': item_brand,
        'item_name': item_names,
        'item_description': item_description,
        'item_price': item_prices,
        'total_rating': total_rate,
        'review_contents': review_contents,
        'individual_rating': member_rating,
        'product_helpful': product_helpful,
        'product_not_helpful': product_not_helpful}

# Creates a dataframe

df_product = pd.DataFrame(data)

# Creates a csv file to the destination path on your computer
# (the target folder is created first if it does not exist yet)

output_path = r'data/iherb_best_selling_products_raw_dataset.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_product.to_csv(output_path, index=False, header=True)

# Checks the dataframe

df_product.head()

In [None]:
# The scraper was run twice to keep it stable, and each run was saved to its
# own csv file (1st run: page 1 to page 3; 2nd run: page 2 to page 3).
# This cell loads both partial datasets and merges them into one raw dataset.

data1, data2 = (
    pd.read_csv(part_path)
    for part_path in (
        'data/iherb_best_selling_product_part1.csv',
        'data/iherb_best_selling_product_part2.csv',
    )
)

# Stack the two partial datasets on top of each other:
frames = [data1, data2]
df = pd.concat(frames)

# Write the combined dataset to a csv file (row index omitted, header kept):
df.to_csv(
    r'data/iherb_best_selling_products_raw_dataset.csv',
    index=False,
    header=True,
)
