In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### get all urls for all page results

In [178]:
import re
import time
import random
import warnings
import pandas as pd
from typing import List
from lxml import etree as et
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import requests

warnings.filterwarnings('ignore')
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq"
driver.get(url)

def get_page_urls(url):
    page_urls = [url]
    while url:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        next_page = soup.find('li', class_='next-prev floatl-span').find('a', class_='next')
        if next_page:
            url = "https://www.ulta.com" + next_page['href']
            page_urls.append(url)
        else:
            url = None
    driver.quit()
    return set(page_urls)
    

page_urls = get_page_urls(url)
page_urls


{'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq',
 'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq&No=192&Nrpp=96',
 'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq&No=288&Nrpp=96',
 'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq&No=96&Nrpp=96'}

### get all product links from page result urls

In [179]:
def get_product_links(page_urls: List[str]) -> List[str]:
    product_links = []
    for url in page_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = ["https://www.ulta.com" + row.a['href'] for row in soup.find_all('p', class_='prod-desc')]
        product_links.extend(links)
    return product_links

product_links = get_product_links(page_urls)

### initiate dataframe

In [306]:
data = {
    'Product_url': [], 

}
# Creating a dataframe with those columns


df = pd.DataFrame(data)

### loop through all product links and insert as first column in dataframe

In [307]:
for i in product_links:
    df = df.append({'Product_url': i}, ignore_index=True)

### loop through all urls and get brand names, product names, and product descriptions. Insert into dataframe as new columns

In [308]:
brands = []
product_names = []
product_descriptions = []


for i in range(len(product_links)):
    product = requests.get(product_links[i])
    soup = BeautifulSoup(product.text, 'lxml')
    brand = soup.find("span", attrs={"class": "Text-ds Text-ds--body-1 Text-ds--left"}).find("a").string
    product_name = soup.find("span", attrs={"class": "Text-ds Text-ds--title-5 Text-ds--left"}).string
    product_desc = soup.find("p", attrs={"class": "Text-ds Text-ds--subtitle-1 Text-ds--left"}).string
    brands.append(brand)
    product_names.append(product_name)
    product_descriptions.append(product_desc)

df['Brand'] = brands
df['Product Name'] = product_names
df['Product Description'] = product_descriptions


### loop through all urls and get fragrance family, scent type, top notes, middle notes, bottom notes, and combined notes

<p>Note: not all fragrances have all of these, so I needed to test for their existence first</p>

In [304]:
fragrance_families_list = []
scent_types_list = []
top_notes_list = []
middle_notes_list = []
base_notes_list = []
combined_notes_list = []


for i in range(len(product_links)):
    product = requests.get(product_links[i])
    soup = BeautifulSoup(product.text, 'lxml')
    # find soup for Fragrance Info
    FragInfo = soup.find("div", attrs={"class": "Markdown Markdown--body-2"})
    frag_fam_result = FragInfo.find(string = 'Fragrance Family')
    scent_type_result = FragInfo.find(string = 'Scent Type')
    key_notes_result = FragInfo.find(string = 'Key Notes')
    # test for existence of fragrance family. If none, assign "none". if exists, assign to frgrnc_fmily_cleaned. 
    if frag_fam_result:
        frag_family = frag_fam_result.find_next('ul').find_all('li')
        frgrnc_fmly_cleaned = ''
        for j in range(len(frag_family)):
            if j < len(frag_family)-1:
                frgrnc_fmly_cleaned = frgrnc_fmly_cleaned + frag_family[j].text + ", "
            if j == len(frag_family)-1:
                frgrnc_fmly_cleaned = frgrnc_fmly_cleaned + frag_family[j].text
    else:
        frgrnc_fmly_cleaned = "none"

    # test for existence of scent type. If none, assign "none". if exists, assign to scnt_type_cleaned.
    if scent_type_result:
        scent_type = scent_type_result.find_next('ul').find_all('li')
        scnt_typ_cleaned = ''
        for j in range(len(scent_type)):
            if j < len(scent_type)-1:
                scnt_typ_cleaned = scnt_typ_cleaned + scent_type[j].text + ", "
            if j == len(scent_type)-1:
                scnt_typ_cleaned = scnt_typ_cleaned + scent_type[j].text
    else:
        scnt_typ_cleaned = "none"
        

    # test for existence of key notes. if none, assign "none" to all. If top / middle / bottom, assign respectively. if only combined, assign combined.
    if key_notes_result:
        key_notes = key_notes_result.find_next('ul').find_all('li')
        if len(key_notes) == 3:
            top_notes = key_notes[0].text
            middle_notes = key_notes[1].text
            base_notes = key_notes[2].text
            combined_notes = "none"
        if len(key_notes) == 1:
            combined_notes = key_notes[0].text
            top_notes = "none"
            middle_notes = "none"
            base_notes = "none"  
    else:
        combined_notes = "none"
        top_notes = "none"
        middle_notes = "none"
        base_notes = "none" 

    #append all variables to lists
    fragrance_families_list.append(frgrnc_fmly_cleaned)
    scent_types_list.append(scnt_typ_cleaned)
    top_notes_list.append(top_notes)
    middle_notes_list.append(middle_notes)
    base_notes_list.append(base_notes)
    combined_notes_list.append(combined_notes)

### Insert new columns into dataframe

In [309]:
df['Fragrance Family'] = fragrance_families_list
df['Scent Type'] = scent_types_list
df['Top Notes'] = top_notes_list
df['Middle Notes'] = middle_notes_list
df['Base Notes'] = base_notes_list
df['Combined Notes'] = combined_notes_list

In [310]:
df

Unnamed: 0,Product_url,Brand,Product Name,Product Description,Fragrance Family,Scent Type,Top Notes,Middle Notes,Base Notes,Combined Notes
0,https://www.ulta.com/p/cloud-eau-de-parfum-pim...,Ariana Grande,Cloud Eau de Parfum,Ariana Grande Cloud Eau de Parfum is an uplift...,Warm & Spicy,Gourmand,"Top - lavender blossom, juicy pear, bergamot","Middle - crÃ¨me de coconut, indulgent praline,...","Base - sensual musks, creamy blonde woods",none
1,https://www.ulta.com/p/eilish-eau-de-parfum-pi...,Billie Eilish,Eilish Eau de Parfum,Eilish Eau de Parfum is a captivating Amber Go...,Amber Gourmand,none,"Top - sugared petals, mandarin, warm berries","Middle - creamy vanilla, soft spices, cocoa","Base - sleek woods, warm musks, tonka beans",none
2,https://www.ulta.com/p/her-eau-de-parfum-pimpr...,Burberry,Her Eau de Parfum,Burberry Her Eau de Parfum for Women is an art...,Gourmand,Fruity Gourmand,Top - red and dark berries,Middle - floral with a scent of jasmine,Base - sensual musk-amber,none
3,https://www.ulta.com/p/donna-born-in-roma-eau-...,Valentino,Donna Born In Roma Eau de Parfum,Valentino Donna Born In Roma Eau de Parfum is ...,Warm Floral,none,Top - blackcurrant,Middle - jasmine grandiflorum absolute,Base - bourbon vanilla,none
4,https://www.ulta.com/p/black-opium-eau-de-parf...,Yves Saint Laurent,Black Opium Eau de Parfum,Yves Saint Laurent Black Opium Eau de Parfum i...,"Warm & Spicy, Sweet Gourmand",none,Top - coffee,Middle - white flowers,Base - vanilla,none
...,...,...,...,...,...,...,...,...,...,...
325,https://www.ulta.com/p/hot-couture-eau-de-toil...,Givenchy,Hot Couture Eau de Toilette,"A glamorous Eau de Toilette, Hot Couture creat...",Floral,Fruity Florals,Top - raspberry,Middle - magnolia,Base - ambered vetiver,none
326,https://www.ulta.com/p/beautiful-magnolia-inte...,EstÃ©e Lauder,Beautiful Magnolia Intense Eau de Parfum,EstÃ©e Lauder's Beautiful Magnolia Intense Eau...,Chypre Floral,none,"Top - mandarin oil, cardamom oil, violet leaf ...","Middle - chinese magnolia oil, iris concrete, ...","Base - virginia cedarwood, illumina musk, vani...",none
327,https://www.ulta.com/p/i-love-juicy-couture-ea...,Juicy Couture,I Love Juicy Couture Eau de Parfum,I don't love lightly. I don't love half-hearte...,"Fruity, Floral",none,none,none,none,"Taffy Apple, Tiger Lily, Patchouli"
328,https://www.ulta.com/p/sun-saint-eau-de-parfum...,Pinrose,Sun Saint Eau de Parfum,Pinrose Sun Saint Eau de Parfum. Effortlessly ...,Fresh Citrus & Fruits,none,"Top - lime, eucalyptus, mandarin, yuzu","Middle - sea salt, cashmere musk, violet","Base - coconut wood, sandalwood",none


In [None]:
##dynamic content that needs selenium
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(product_links[0])
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
num_reviews = soup.find("span", attrs={"class": "pr-snippet-review-count"}).string
avg_rating = soup.find("div", attrs={"class": "pr-snippet-rating-decimal"}).string
price = soup.find("span", attrs={"class": "Text-ds Text-ds--title-6 Text-ds--left Text-ds--black"}).string