In [15]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### get all urls for all page results

<p> on Ulta's website, there were 4 page results of Women's Fragrances. The following code retrieves the URL for all page results. </p>

In [33]:
import re
import time
import random
import warnings
import pandas as pd
from typing import List
from lxml import etree as et
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import requests

warnings.filterwarnings('ignore')
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq"
driver.get(url)

def get_page_urls(url):
    page_urls = [url]
    while url:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        next_page = soup.find('li', class_='next-prev floatl-span').find('a', class_='next')
        if next_page:
            url = "https://www.ulta.com" + next_page['href']
            page_urls.append(url)
        else:
            url = None
    driver.quit()
    return set(page_urls)
    

page_urls = get_page_urls(url)
page_urls


{'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq',
 'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq&No=192&Nrpp=96',
 'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq&No=288&Nrpp=96',
 'https://www.ulta.com/shop/fragrance/womens-fragrance/perfume?N=26wq&No=96&Nrpp=96'}

### get all product links from page result urls

<p>From the 4 page results, I needed to retrieve all the individual urls for each women's fragrance.</p>

In [36]:
def get_product_links(page_urls: List[str]) -> List[str]:
    product_links = []
    for url in page_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = ["https://www.ulta.com" + row.a['href'] for row in soup.find_all('p', class_='prod-desc')]
        product_links.extend(links)
    return product_links

product_links = get_product_links(page_urls)
len(product_links)

329

### initiate dataframe

In [82]:
data = {
    'Product_url': [], 

}

df = pd.DataFrame(data)

### insert product links as first column in dataframe

In [83]:
for i in product_links:
    df = df.append({'Product_url': i}, ignore_index=True)

In [84]:
df.head()
df.nunique(axis=0)

Unnamed: 0,Product_url
0,https://www.ulta.com/p/sweet-eau-de-parfum-pim...
1,https://www.ulta.com/p/linterdit-rouge-eau-de-...
2,https://www.ulta.com/p/lady-million-eau-de-par...
3,https://www.ulta.com/p/bee-eau-de-parfum-pimpr...
4,https://www.ulta.com/p/amazing-grace-bergamot-...


Product_url    329
dtype: int64

### loop through all urls and get brand names, product names, product descriptions, average ratings, and sizes. Insert into dataframe as new columns
<p>NOTE: Not all fragrances had a size listed so I needed to check if it existed first (if statement within the loop)</p>

In [85]:
brands = []
product_names = []
product_descriptions = []
avg_ratings = []
sizes = []

for i in range(len(product_links)):
    product = requests.get(product_links[i])
    soup = BeautifulSoup(product.text, 'lxml')
    brand = soup.find("span", attrs={"class": "Text-ds Text-ds--body-1 Text-ds--left"}).find("a").string
    product_name = soup.find("span", attrs={"class": "Text-ds Text-ds--title-5 Text-ds--left"}).string
    product_desc = soup.find("p", attrs={"class": "Text-ds Text-ds--subtitle-1 Text-ds--left"}).string
    avg_rating = soup.find("div", attrs={"class": "ReviewStars"}).find_next("span").string
    size = soup.find("span", attrs={"class": "Text-ds Text-ds--body-3 Text-ds--left Text-ds--black"})
    if size:
        size_clean = size.string
    else:
        size_clean = "N/A"
    brands.append(brand)
    product_names.append(product_name)
    product_descriptions.append(product_desc)
    avg_ratings.append(avg_rating)
    sizes.append(size_clean)

df['Brand'] = brands
df['Product Name'] = product_names
df['Product Description'] = product_descriptions
df['Size'] = sizes



In [86]:
df

Unnamed: 0,Product_url,Brand,Product Name,Product Description,Size
0,https://www.ulta.com/p/sweet-eau-de-parfum-pim...,Ellis Brooklyn,SWEET Eau de Parfum,A sweet memory of simpler times when living in...,1.6 oz
1,https://www.ulta.com/p/linterdit-rouge-eau-de-...,Givenchy,L'Interdit Rouge Eau de Parfum,Embrace the thrill of the spicy next chapter o...,2.7 oz
2,https://www.ulta.com/p/lady-million-eau-de-par...,Paco Rabanne,Lady Million Eau de Parfum,Lady Million Eau de Parfum is a warm & spicy f...,1.7 oz
3,https://www.ulta.com/p/bee-eau-de-parfum-pimpr...,Ellis Brooklyn,BEE Eau de Parfum,The magical journey of finding and falling in ...,1.6 oz
4,https://www.ulta.com/p/amazing-grace-bergamot-...,Philosophy,Amazing Grace Bergamot Eau de Toilette,"Bright, buoyant, inspired. Philosophy's limite...",2.0 oz
...,...,...,...,...,...
324,https://www.ulta.com/p/daisy-love-paradise-eau...,Marc Jacobs,Daisy Love Paradise Eau de Toilette,"Floral. Free-spirited. Inspired by rare, natur...",1.7 oz
325,https://www.ulta.com/p/irresistible-rose-velve...,Givenchy,Irresistible Rose Velvet Eau de Parfum,Velvety Rose is enveloped in White Patchouli t...,2.7 oz
326,https://www.ulta.com/p/daisy-paradise-eau-de-t...,Marc Jacobs,Daisy Paradise Eau de Toilette,"Floral. Free-spirited. Inspired by rare, natur...",1.6 oz
327,https://www.ulta.com/p/angel-eau-de-parfum-ref...,MUGLER,Angel Eau de Parfum Refill,Celestial. Delicious. Fearless. Indulge in MUG...,3.4 oz


### loop through all urls and get fragrance family, scent type, top notes, middle notes, bottom notes, and combined notes

<p>Note: not all fragrances have all of these, so I needed to test for their existence first</p>
<p>Also, if a fragrance did not have the notes subdivided into TOP, MIDDLE, and BASE notes (and only one blurb was available for the notes in the fragrance), then I called that "combined notes". </P>

In [87]:
fragrance_families_list = []
scent_types_list = []
top_notes_list = []
middle_notes_list = []
base_notes_list = []
combined_notes_list = []


for i in range(len(product_links)):
    product = requests.get(product_links[i])
    soup = BeautifulSoup(product.text, 'lxml')
    # find soup for Fragrance Info
    FragInfo = soup.find("div", attrs={"class": "Markdown Markdown--body-2"})
    frag_fam_result = FragInfo.find(string = 'Fragrance Family')
    scent_type_result = FragInfo.find(string = 'Scent Type')
    key_notes_result = FragInfo.find(string = 'Key Notes')
    # test for existence of fragrance family. If none, assign "none". if exists, assign to frgrnc_fmily_cleaned. 
    if frag_fam_result:
        frag_family = frag_fam_result.find_next('ul').find_all('li')
        frgrnc_fmly_cleaned = ''
        for j in range(len(frag_family)):
            if j < len(frag_family)-1:
                frgrnc_fmly_cleaned = frgrnc_fmly_cleaned + frag_family[j].text + ", "
            if j == len(frag_family)-1:
                frgrnc_fmly_cleaned = frgrnc_fmly_cleaned + frag_family[j].text
    else:
        frgrnc_fmly_cleaned = "none"

    # test for existence of scent type. If none, assign "none". if exists, assign to scnt_type_cleaned.
    if scent_type_result:
        scent_type = scent_type_result.find_next('ul').find_all('li')
        scnt_typ_cleaned = ''
        for j in range(len(scent_type)):
            if j < len(scent_type)-1:
                scnt_typ_cleaned = scnt_typ_cleaned + scent_type[j].text + ", "
            if j == len(scent_type)-1:
                scnt_typ_cleaned = scnt_typ_cleaned + scent_type[j].text
    else:
        scnt_typ_cleaned = "none"
        

    # test for existence of key notes. if none, assign "none" to all. If top / middle / bottom, assign respectively. if only combined, assign combined.
    if key_notes_result:
        key_notes = key_notes_result.find_next('ul').find_all('li')
        if len(key_notes) == 3:
            top_notes = key_notes[0].text
            middle_notes = key_notes[1].text
            base_notes = key_notes[2].text
            combined_notes = "none"
        if len(key_notes) == 1:
            combined_notes = key_notes[0].text
            top_notes = "none"
            middle_notes = "none"
            base_notes = "none"  
    else:
        combined_notes = "none"
        top_notes = "none"
        middle_notes = "none"
        base_notes = "none" 

    #append all variables to lists
    fragrance_families_list.append(frgrnc_fmly_cleaned)
    scent_types_list.append(scnt_typ_cleaned)
    top_notes_list.append(top_notes)
    middle_notes_list.append(middle_notes)
    base_notes_list.append(base_notes)
    combined_notes_list.append(combined_notes)

### Insert new columns into dataframe

In [88]:
df['Fragrance Family'] = fragrance_families_list
df['Scent Type'] = scent_types_list
df['Top Notes'] = top_notes_list
df['Middle Notes'] = middle_notes_list
df['Base Notes'] = base_notes_list
df['Combined Notes'] = combined_notes_list

### Get number of reviews and price from script tag in product url soup

<p>There were some variables that were generated dynamically and could not be scraped using the above methods. I took some time to inspect script tags of the HTML to find the desired info I was looking for. Then, I searched for their position in the script tag and extracted the data points.</p>

In [89]:

prices = []
num_reviews_list = []

for i in range(len(product_links)):
    product = requests.get(product_links[i])
    soup = BeautifulSoup(product.text, 'lxml')
    script_tag = soup.find_all("script", attrs={"id": "apollo_state"})
    script_tag = str(script_tag)

    price = script_tag.partition("productListPrice")[2]
    price_find_period = price.find('.')
    price_clean = price[4:price_find_period+3]

    num_reviews = script_tag.partition("product_reviews_count")[2]
    num_reviews_first = num_reviews.find('[')+2
    num_reviews_last = num_reviews.find(']')
    num_reviews_clean = num_reviews[num_reviews_first:num_reviews_last-1]

    prices.append(price_clean)
    num_reviews_list.append(num_reviews_clean)


### Insert Price and Number of Reviews in Dataframe

In [90]:
df['Price'] = prices
df['Number of Reviews'] = num_reviews_list

In [91]:
df

Unnamed: 0,Product_url,Brand,Product Name,Product Description,Size,Fragrance Family,Scent Type,Top Notes,Middle Notes,Base Notes,Combined Notes,Price,Number of Reviews
0,https://www.ulta.com/p/sweet-eau-de-parfum-pim...,Ellis Brooklyn,SWEET Eau de Parfum,A sweet memory of simpler times when living in...,1.6 oz,Fresh,none,"Top - Pear, Ambrette, Bergamot","Middle - Violet, Orris, Heliotrope","Base - Marshmallow, White Amber, Cashmere",none,108.00,41
1,https://www.ulta.com/p/linterdit-rouge-eau-de-...,Givenchy,L'Interdit Rouge Eau de Parfum,Embrace the thrill of the spicy next chapter o...,2.7 oz,Floral,Warm & Spicy,"Top - blood orange, spicy red accord","Middle - orange blossom, jasmine, tuberose","Base - patchouli, sandalwood",none,135.00,387
2,https://www.ulta.com/p/lady-million-eau-de-par...,Paco Rabanne,Lady Million Eau de Parfum,Lady Million Eau de Parfum is a warm & spicy f...,1.7 oz,Warm & Spicy,none,"Top - bitter orange, neroli","Middle - jasmine sambac, orange flower absolute","Base - honey, patchouli",none,101.00,38
3,https://www.ulta.com/p/bee-eau-de-parfum-pimpr...,Ellis Brooklyn,BEE Eau de Parfum,The magical journey of finding and falling in ...,1.6 oz,Warm & Spicy,none,"Top - Dark Rum, Davana","Middle - Honey, Bran Absolute, Cinnamon","Base - Sandalwood, Vanilla Bean, Cocoa Absolut...",none,108.00,31
4,https://www.ulta.com/p/amazing-grace-bergamot-...,Philosophy,Amazing Grace Bergamot Eau de Toilette,"Bright, buoyant, inspired. Philosophy's limite...",2.0 oz,Floral,Delicate Floral,"Top - bergamot, cedrat, neroli, grapefruit","Middle - rose, jasmine, muguet, orange blossom",Base - musk blend,none,58.00,653
...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,https://www.ulta.com/p/daisy-love-paradise-eau...,Marc Jacobs,Daisy Love Paradise Eau de Toilette,"Floral. Free-spirited. Inspired by rare, natur...",1.7 oz,Floral,"Gourmand, Fruity, Floral",Top - iris firabs,Middle - patchouli coeur,Base - chantilly,none,90.00,168
325,https://www.ulta.com/p/irresistible-rose-velve...,Givenchy,Irresistible Rose Velvet Eau de Parfum,Velvety Rose is enveloped in White Patchouli t...,2.7 oz,Floral,Velvety Floral,"Top - pink peppercorn, blackcurrant, neroli","Middle - luscious rose, iris","Base - patchouli, sandalwood, musk",none,135.00,138
326,https://www.ulta.com/p/daisy-paradise-eau-de-t...,Marc Jacobs,Daisy Paradise Eau de Toilette,"Floral. Free-spirited. Inspired by rare, natur...",1.6 oz,Floral,"Floral, Woody",Top - poppy natureprint,Middle - oak chips,Base - ambrox,none,90.00,300
327,https://www.ulta.com/p/angel-eau-de-parfum-ref...,MUGLER,Angel Eau de Parfum Refill,Celestial. Delicious. Fearless. Indulge in MUG...,3.4 oz,none,none,none,none,none,none,142.00,1256


In [92]:
df.to_csv(r"C:\Users\Me\Desktop\desktop stuff\Code Projects\Ulta\UltaWomensFragrance.csv")
