In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import chromedriver_binary

# 1. Data Collection

## 1.1 Sephora web scraping to extract product URLs

1.	First of all, I need to generate a list of product URLs to retrieve information from. Basically, when we enter Sephora’s website, we need to (i) look under ‘Skincare’ to locate all the categories, (ii) click on a specific category to get to its list of products, and (iii) click on each specific product to retrieve the product's information.
2.  Because Sephora uses lazy loading, which doesn't load content until the user scrolls down the page, it is very important to use the scrollDown function below instead of the built-in function inside WebDriver. The scrollDown function was found at: https://www.hackerearth.com/fr/practice/notes/praveen97uma/crawling-a-website-that-loads-content-using-javascript-with-selenium-webdriver-in-python/
3. To install chromedriver_binary from the terminal, use chromedriver-binary-auto, which detects your current Chrome version and installs the compatible chromedriver version: pip install --upgrade --force-reinstall chromedriver-binary-auto
4. WebDriver opens a new window on your computer, which is controlled by the code; this browser has to remain open while the code is running.
5. Credit goes to this tutorial for web scraping, which I followed pretty much step by step.
7. Since web scraping depends heavily on the website's features and implementation, any changes to the site after Dec 2021 will require some tuning of this code.
8. The XPath and CSS classes for the products were hard-coded as ['css-1o0t476', 'css-1h1spyg']; it seemed that the first class was added recently to contain new and seasonal products (less than 5% of the total products). If reusing this code, one has to pay special attention to the website's feature changes to make sure that it still works.

### 1.1.1 Functions

In [None]:
# Helper that scrolls the browser page down so that lazily loaded content is requested.
def scrollDown(driver, n_scroll):
    """Send PAGE_DOWN key presses to the page body and hand the driver back.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session whose current page should be scrolled.
    n_scroll : int
        Scroll counter. The key is sent ``n_scroll + 1`` times, because the
        countdown runs from ``n_scroll`` down to 0 inclusive. A negative
        value sends nothing.

    Returns
    -------
    The same ``driver`` object that was passed in.
    """
    page_body = driver.find_element(By.TAG_NAME, "body")
    for _ in range(n_scroll + 1):
        page_body.send_keys(Keys.PAGE_DOWN)
    return driver

#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # scroll to the bottom of the page

In [None]:
# Closes the sign-in pop-up window.
def close_sign_in(driver, pause=10):
    """Dismiss the sign-in modal dialog, then wait before continuing.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session that is currently showing the pop-up.
    pause : int or float, optional
        Seconds to sleep after clicking the close button. Defaults to 10,
        the delay that used to be hard-coded.

    Raises
    ------
    Whatever ``driver.find_element`` raises when the close button is not on
    the page; this function does not handle that case.
    """
    close_button_xpath = '//*[@id="modalDialog"]/button'
    driver.find_element(By.XPATH, close_button_xpath).click()
    time.sleep(pause)  # give the page time to settle once the dialog is gone

In [None]:
# Opens a new browser, extracts all product URLs found on one listing page and returns them as a DataFrame.
def extract_products_url(url, cSS_Class, class_code, current_page, cat,
                         load_wait=40, scroll_wait=20, n_scroll=15, timeout=60):
    """Scrape the product links of a single category listing page.

    Parameters
    ----------
    url : str
        Address of the listing page to open.
    cSS_Class : str
        XPath expression (despite the name) locating the product ``<a>`` elements.
    class_code : str
        Label stored in the ``CSS-class`` column of the result.
    current_page : int
        Zero-based page index; ``current_page + 1`` is stored in the ``page`` column.
    cat : str
        Category label stored in the ``category`` column.
    load_wait, scroll_wait : int or float, optional
        Seconds to sleep after opening the page and after scrolling
        (defaults 40 and 20, the previously hard-coded values).
    n_scroll : int, optional
        Value forwarded to ``scrollDown`` (default 15).
    timeout : int or float, optional
        Seconds ``WebDriverWait`` waits for the links to become visible (default 60).

    Returns
    -------
    pandas.DataFrame
        Columns ``category``, ``CSS-class``, ``page`` and ``url``, one row per
        link found. If the wait times out, a message is printed and an empty
        frame with the same columns is returned.
    """
    columns = ['category', 'CSS-class', 'page', 'url']

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(load_wait)  # let the page load

        scrollDown(driver, n_scroll)  # scroll so lazily loaded products are requested
        time.sleep(scroll_wait)  # let the whole page load

        try:
            elements = WebDriverWait(driver, timeout).until(
                EC.visibility_of_all_elements_located((By.XPATH, cSS_Class))
            )
        except TimeoutException:
            print("Timed out waiting for page to load")
            # Previously execution fell through to code that used the unbound
            # `elements`; return an empty page instead.
            return pd.DataFrame(columns=columns)

        # Read the hrefs while the browser session is still alive.
        hrefs = [element.get_attribute('href') for element in elements]
    finally:
        # Always end the session, even on errors, so no browser is left running.
        driver.quit()

    n_links = len(hrefs)
    return pd.DataFrame({'category': [cat] * n_links,
                         'CSS-class': [class_code] * n_links,
                         'page': [current_page + 1] * n_links,
                         'url': hrefs},
                        columns=columns)

### 1.1.2 Skincare product categories

1. Moisturizers: 'moisturizing-cream-oils-mists' 697 products
2. Cleansers: 'cleanser' 414 products
3. Treatments: 'facial-treatments' 533 products
4. Masks: 'face-mask' 209 products
5. Eye treatment: 'eye-treatment-dark-circle-treatment' 197 products
6. Lip Balms & Treatments: 'lip-balm-lip-care' 164 products
7. Sunscreen: 'sunscreen-sun-protection' 155 products

8. Vegan: 'vegan-skin-care' 803 products (overlapping with previous categories; the vegan label isn't always included in the product's highlights)
9. Clean: 'clean-skin-care' 1106 products (overlapping with previous categories; the clean label is included in the product's highlights, so we won't have to scrape this category)


In [None]:
# Skincare categories to scrape: URL slug -> number of products listed in that category.
categories = dict([
    ('moisturizing-cream-oils-mists', 697),
    ('cleanser', 414),
    ('facial-treatments', 533),
    ('face-mask', 209),
    ('eye-treatment-dark-circle-treatment', 197),
    ('lip-balm-lip-care', 164),
    ('sunscreen-sun-protection', 155),
    ('vegan-skin-care', 803),
])

# CSS classes of the <div> containers whose links are scraped in the sections below.
cat_CSSClass = ['css-1o0t476', 'css-1h1spyg']

# Page size used to work out how many listing pages each category spans.
products_per_page = 100

# Show the number of listing pages per skincare category.
for cat, num_prods in categories.items():
    print(f"{cat}\t{math.ceil(num_prods / products_per_page)}")

### 1.1.3 "css-1o0t476": New and seasonal items
Once complete, inspect the specific links to make sure each is a real product and not a promotional link.

In [None]:
# Zero-based page index: extract_products_url stores current_page + 1 in the
# 'page' column, so 0 here records page 1, matching the page requested below.
current_page = 0

page_frames = []  # one DataFrame per category, combined once after the loop

for cat, num_prods in categories.items():
    print(cat)

    # category url (only the first listing page is requested, 60 products per page)
    url = 'https://www.sephora.com/shop/' + cat + '?pageSize=' + str(60) + '&currentPage=' + str(current_page + 1)

    url_per_page = extract_products_url(url, '//div[@class = "css-1o0t476"]//a', "css-1o0t476", current_page, cat)

    # collect the new page; DataFrame.append no longer exists in pandas >= 2.0
    page_frames.append(url_per_page)

# DF holding all extracted info
products_url1 = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

print(products_url1.shape[0])

### 1.1.4 "css-1h1spyg": Majority of the products

In [None]:
page_frames = []  # one DataFrame per scraped listing page, combined once after the loops

for cat, num_prods in categories.items():
    no_pages = math.ceil(num_prods/products_per_page)
    for current_page in range(no_pages):
        print(cat + '\t current page ' + str(current_page+1))

        # category url
        url = 'https://www.sephora.com/shop/' + cat + '?pageSize=' + str(products_per_page) + '&currentPage=' + str(current_page+1)

        url_per_page = extract_products_url(url, '//div[@class = "css-1h1spyg"]//a',"css-1h1spyg", current_page, cat)

        # collect the new page; DataFrame.append no longer exists in pandas >= 2.0
        page_frames.append(url_per_page)

# DF holding all extracted info for this CSS class
products_url2 = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

# `elements` only exists inside extract_products_url, so report the number of collected rows instead
print(products_url2.shape[0])

In [None]:
# Concatenate the URLs collected for the two CSS classes into one table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
products_url_final = pd.concat([products_url1, products_url2], ignore_index=True)

In [None]:
# Write the combined URL table (with its header row) to a CSV file.
products_url_final.to_csv('Raw_data/products_url_sephora.csv', header=True)

In [None]:
# NOTE: if the extract_products_url function can't be fixed, revert to this inlined version
# products_url2 =pd.DataFrame()

# for cat, num_prods in categories.items():
#     no_pages = math.ceil(num_prods/products_per_page)
#     for current_page in range(no_pages):
#         print(cat + '\t current page ' + str(current_page+1))
    
#         # category url
#         url = 'https://www.sephora.com/shop/' + cat + '?pageSize=' + str(products_per_page) + '&currentPage=' + str(current_page+1)
    
#         driver = webdriver.Chrome()
#         driver.get(url)    
#         time.sleep(40) # let the page load
        
#         browser = scrollDown(driver, 15) #scroll down the page
#         time.sleep(20) # let the whole page load
    
#         try:
#             elements = WebDriverWait(driver, 60).until(
#                 EC.visibility_of_all_elements_located((By.XPATH, '//div[@class = "css-1h1spyg"]//a'))
#             )
#         except TimeoutException:
#             print("Timed out waiting for page to load")
#             driver.quit()
            
#         url_per_page = pd.DataFrame({'category': [cat for i in range(len(elements))],
#                     'CSS-class': ["css-1h1spyg" for j in range(len(elements))],
#                     'page': [current_page+1 for k in range(len(elements))], 
#                     'url': [element.get_attribute('href') for element in elements]})
#         driver.close()
    
#         #append new page to the main DF
#         products_url2 = products_url2.append(url_per_page, ignore_index = True)
    
# print(len(elements))

In [None]:
# This code works, but using find_element_by_class_name only returns the product price.
# driver = webdriver.Chrome()

# driver.get(url)
# print(driver.title)

# time.sleep(20)
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# time.sleep(20)
# elements = driver.find_elements(By.CLASS_NAME, "css-1o0t476")

# product_url = [ele.get_attribute('href') for ele in elements]
# print(len(elements))
# print(elements.text)
# driver.close()

### 1.1.5 Extract product's information from url

In [6]:
# Load the list of scraped URLs (n = 3059); the first CSV column becomes the index.
data = pd.read_csv(filepath_or_buffer='Raw_data/products_url_sephora.csv', index_col=0)

In [41]:
# Preview the first five rows of the URL table.
data.head(n=5)

Unnamed: 0,category,CSS-class,url,classification,sku
0,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/ultra-repair-c...,product,1217744
1,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/water-drench-h...,product,1898550
2,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/ceramidin-tm-c...,product,2077840
3,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/ultra-facial-c...,product,2172526
4,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/confidence-in-...,product,1868256


In [24]:
# Show the full URL stored under index label 0 as an example of the format parsed below.
data.loc[0, 'url']

'https://www.sephora.com/product/ultra-repair-cream-intense-hydration-P248407?skuId=1217744'

In [73]:
# Extract classification, product sku, P_id (not sure what it is yet) and product name from each url.
# Example: https://www.sephora.com/product/ultra-repair-cream-intense-hydration-P248407?skuId=1217744
URL_FIELDS = ['classification', 'sku', 'p_id', 'product_name']


def _parse_product_url(url):
    """Split one URL into the fields listed in ``URL_FIELDS``.

    Parameters
    ----------
    url : str
        Absolute URL. Any other value (for example a missing href read back
        as NaN) yields all-``None`` fields.

    Returns
    -------
    dict
        ``classification`` is the first path segment after the host.
        ``p_id`` is the text after the last '-' of the second path segment and
        ``product_name`` is everything before it. ``sku`` is the value of the
        ``skuId`` query parameter, kept as a string. A field that is not
        present in the URL is ``None``; nothing is raised.
    """
    fields = dict.fromkeys(URL_FIELDS)
    if not isinstance(url, str):
        return fields

    # Separate the query string first so extra parameters cannot leak into the sku.
    path, _, query = url.partition('?')
    segments = path.split('/')  # ['https:', '', host, classification, slug, ...]

    if len(segments) > 3:
        fields['classification'] = segments[3]

    if len(segments) > 4:
        slug_parts = segments[4].split('-')
        fields['p_id'] = slug_parts[-1]
        fields['product_name'] = '-'.join(slug_parts[:-1])

    for pair in query.split('&'):
        key, _, value = pair.partition('=')
        if key == 'skuId':
            fields['sku'] = value
            break

    return fields


url_fields = pd.DataFrame([_parse_product_url(url) for url in data['url']],
                          index=data.index, columns=URL_FIELDS)
for column in URL_FIELDS:
    data[column] = url_fields[column]

In [74]:
# Keep only the rows whose URL is classified as a product page (n = 3052).
product = data.loc[data['classification'].eq('product')]

In [75]:
# Preview the first five product rows together with the parsed URL fields.
product.head(n=5)

Unnamed: 0,category,CSS-class,url,classification,sku,p_id,product_name
0,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/ultra-repair-c...,product,1217744,P248407,ultra-repair-cream-intense-hydration
1,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/water-drench-h...,product,1898550,P415701,water-drench-hyaluronic-cloud-cream
2,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/ceramidin-tm-c...,product,2077840,P434363,ceramidin-tm-cream
3,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/ultra-facial-c...,product,2172526,P421996,ultra-facial-cream
4,moisturizing-cream-oils-mists,css-1o0t476,https://www.sephora.com/product/confidence-in-...,product,1868256,P411403,confidence-in-cream-transforming-moisturizing-...


In [76]:
# Write the product table, including the parsed name columns, to a CSV file.
product.to_csv('Raw_data/products_url_with_name_sephora.csv', header=True)