# Arabian Perfumes Scraping
I'll be scraping the data from [Middle Eastern Scents](https://middleeasternscents.co.ke/). 

In [1]:
%pip install pandas webdriver_manager selenium


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
chrome_options = Options()
# Important Arguments won't eun without them in Gitpod
chrome_options.add_argument("--disable-dev-shm-usage") 
chrome_options.add_argument("--headless")  

# Setup ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

## 1. Get perfume names and URLs

In [4]:
def get_perfume_links(base_url, max_pages):
    perfume_links = []
    
    for page in range(1, max_pages + 1):
        url = f"{base_url}/page/{page}/"
        print(f"Scraping: {url}")
        driver.get(url)
        time.sleep(3)

        perfumes = driver.find_elements(By.CSS_SELECTOR, "h4.mkdf-product-list-title a")

        for perfume in perfumes:
            name = perfume.text
            link = perfume.get_attribute("href")
            perfume_links.append({"Name": name, "URL": link})

    return perfume_links

In [5]:
shop_url = "https://middleeasternscents.co.ke/shop/"
max_pages = 42
perfume_data = []

perfume_links = get_perfume_links(shop_url,max_pages)


Scraping: https://middleeasternscents.co.ke/shop//page/1/


Scraping: https://middleeasternscents.co.ke/shop//page/2/
Scraping: https://middleeasternscents.co.ke/shop//page/3/
Scraping: https://middleeasternscents.co.ke/shop//page/4/
Scraping: https://middleeasternscents.co.ke/shop//page/5/
Scraping: https://middleeasternscents.co.ke/shop//page/6/
Scraping: https://middleeasternscents.co.ke/shop//page/7/
Scraping: https://middleeasternscents.co.ke/shop//page/8/
Scraping: https://middleeasternscents.co.ke/shop//page/9/
Scraping: https://middleeasternscents.co.ke/shop//page/10/
Scraping: https://middleeasternscents.co.ke/shop//page/11/
Scraping: https://middleeasternscents.co.ke/shop//page/12/
Scraping: https://middleeasternscents.co.ke/shop//page/13/
Scraping: https://middleeasternscents.co.ke/shop//page/14/
Scraping: https://middleeasternscents.co.ke/shop//page/15/
Scraping: https://middleeasternscents.co.ke/shop//page/16/
Scraping: https://middleeasternscents.co.ke/shop//page/17/
Scraping: https://middleeasternscents.co.ke/shop//page/18/
Scrap

## 2. Scrap for the details of the perfume


### a) Get the prices of the perfumes

In [6]:
def extract_price():
    try:
        price = driver.find_element(By.TAG_NAME, "bdi").text
        return price
    except:
        return "Not Available"

In [7]:
def extract_brand():
    try:
        brand_tab = driver.find_element(By.XPATH, "//a[@href='#tab-pwb_tab']")
        driver.execute_script("arguments[0].click();", brand_tab)
        time.sleep(2)

        brand_element = driver.find_element(By.CSS_SELECTOR, "#tab-pwb_tab-content h3")  
        return brand_element.text.strip()
    except:
        try:
            brand_element = driver.find_element(By.TAG_NAME, 'h3')
            return brand_element.text.strip()
        except Exception as e:
            print("Error extracting brand:", e)
            return "Unknown"


In [8]:
def extract_category():
    try:
        category_element = driver.find_element(By.CSS_SELECTOR, "div.product_meta span.posted_in a")
        return category_element.text.strip()
    except Exception as e:
        print("Error extracting category:", e)
        return "Unknown"

In [9]:
# def extract_notes():
    
#     top_notes= middle_notes =base_notes = "Not Available"

#     try:
#         notes_element = driver.find_element(By.CLASS_NAME, "woocommerce-product-details__short-description")
#         notes = notes_element.find_element(By.TAG_NAME, "p").get_attribute("innerHTML")
#         notes_list = notes.split("<br>")
        

#         top_notes = next((line.replace("Top notes:", "").strip() for line in notes_list if "Top notes:" in line), "Not Available")
#         middle_notes = next((line.replace("Middle notes:", "").strip() for line in notes_list if "Middle notes:" in line), "Not Available")
#         base_notes = next((line.replace("Base notes:", "").strip() for line in notes_list if "Base notes:" in line), "Not Available")

#     except:
#         try:
#             top_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_top-notes')]/td")
#             top_notes = ", ".join([a.text.strip() for a in top_notes_element.find_elements(By.TAG_NAME, "a")])
#         except:
#             top_notes = "Not Available"

#         try:
#             middle_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_middle-notes')]/td")
#             middle_notes = ", ".join([a.text.strip() for a in middle_notes_element.find_elements(By.TAG_NAME, "a")])
#         except:
#             middle_notes = "Not Available"
#         try:
#             base_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_base-notes')]/td")
#             base_notes = ", ".join([a.text.strip() for a in base_notes_element.find_elements(By.TAG_NAME, "a")])
#         except:
#             base_notes = "Not Available"
#         try:
#             notes_element = driver.find_element(By.XPATH, "//*[@id='tab-description' or contains(@class, 'woocommerce-Tabs-panel') or contains(@class, 'woocommerce-product-details__short-description')]")
#             notes = notes_element.find_elements(By.TAG_NAME, "p")

#             notes_list= [p.get_attribute("innerHTML").strip() for p in notes]
    
#             for line in notes_list:
#                 if "Top notes" in line:
#                     top_notes = re.sub(r"Top notes (are|:)", "", line).strip()
#                 elif "Middle notes" in line:
#                     middle_notes = re.sub(r"Middle notes (are|:)", "", line).strip()
#                 elif "Base notes" in line:
#                     base_notes = re.sub(r"Base notes (are|:)", "", line).strip()
#         except:
#             top_notes= middle_notes =base_notes = "Not Available"
#         try:
#             notes_element = driver.find_elements(By.CSS_SELECTOR, "div.woocommerce-product-details__short-description p")

#             for note in notes_element:
#                 text = note.text.strip()
#                 if "Top Notes:" in text:
#                     top_notes = text.replace("Top Notes:", "").strip()
#                 elif "Middle Notes:" in text:
#                     middle_notes = text.replace("Middle Notes:", "").strip()
#                 elif "Base Notes:" in text:
#                     base_notes = text.replace("Base Notes:", "").strip()
#         except:
#             top_notes= middle_notes =base_notes = "Not Available"

#     return notes

In [10]:
def extract_notes():
    """
    Extracts top, middle, and base notes from the perfume's webpage.
    Returns a dictionary containing the notes.
    """
    notes_data = {
        "top_notes": "Not Available",
        "middle_notes": "Not Available",
        "base_notes": "Not Available"
    }

    try:
        notes_element = driver.find_element(By.CLASS_NAME, "woocommerce-product-details__short-description")
        notes = notes_element.find_element(By.TAG_NAME, "p").get_attribute("innerHTML")
        notes_list = notes.split("<br>")

        notes_data["top_notes"] = next((line.replace("Top notes:", "").strip() for line in notes_list if "Top notes:" in line), "Not Available")
        notes_data["middle_notes"] = next((line.replace("Middle notes:", "").strip() for line in notes_list if "Middle notes:" in line), "Not Available")
        notes_data["base_notes"] = next((line.replace("Base notes:", "").strip() for line in notes_list if "Base notes:" in line), "Not Available")

    except:
        try:
            top_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_top-notes')]/td")
            notes_data["top_notes"] = ", ".join([a.text.strip() for a in top_notes_element.find_elements(By.TAG_NAME, "a")])
        except:
            pass

        try:
            middle_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_middle-notes')]/td")
            notes_data["middle_notes"] = ", ".join([a.text.strip() for a in middle_notes_element.find_elements(By.TAG_NAME, "a")])
        except:
            pass

        try:
            base_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_base-notes')]/td")
            notes_data["base_notes"] = ", ".join([a.text.strip() for a in base_notes_element.find_elements(By.TAG_NAME, "a")])
        except:
            pass

        try:
            notes_element = driver.find_element(By.XPATH, "//*[@id='tab-description' or contains(@class, 'woocommerce-Tabs-panel') or contains(@class, 'woocommerce-product-details__short-description')]")
            notes_list = [p.get_attribute("innerHTML").strip() for p in notes_element.find_elements(By.TAG_NAME, "p")]

            for line in notes_list:
                if "Top notes" in line:
                    notes_data["top_notes"] = re.sub(r"Top notes (are|:)", "", line).strip()
                elif "Middle notes" in line:
                    notes_data["middle_notes"] = re.sub(r"Middle notes (are|:)", "", line).strip()
                elif "Base notes" in line:
                    notes_data["base_notes"] = re.sub(r"Base notes (are|:)", "", line).strip()
        except:
            pass

        try:
            notes_elements = driver.find_elements(By.CSS_SELECTOR, "div.woocommerce-product-details__short-description p")
            for note in notes_elements:
                text = note.text.strip()
                if "Top Notes:" in text:
                    notes_data["top_notes"] = text.replace("Top Notes:", "").strip()
                elif "Middle Notes:" in text:
                    notes_data["middle_notes"] = text.replace("Middle Notes:", "").strip()
                elif "Base Notes:" in text:
                    notes_data["base_notes"] = text.replace("Base Notes:", "").strip()
        except:
            pass
        try:
            notes_element = driver.find_element(By.ID, "tab-description")
            notes_list = notes_element.find_elements(By.TAG_NAME, "p")

            for note in notes_list:
                text = note.text.strip()
            if "Top notes:" in text:
                top_notes= text.replace("Top notes:", "").strip()
            elif "Middle notes:" in text:
                middle_notes = text.replace("Middle notes:", "").strip()
            elif "Base notes:" in text:
                base_notes = text.replace("Base notes:", "").strip()
        except:
            pass
    return notes_data

In [11]:
def extract_reviews():
    try:
        reviews_tab = driver.find_element(By.XPATH, '//a[@href="#tab-reviews"]')
        driver.execute_script("arguments[0].click();", reviews_tab)
        time.sleep(2)

        reviews_section = driver.find_element(By.ID, "tab-reviews")
        reviews_text = reviews_section.text.strip()

        if "There are no reviews yet." in reviews_text:
            return "No Reviews Available"
        else:
            return reviews_text

    except Exception as e:
        print("Error extracting reviews:", e)
        return "No Reviews Available"

In [12]:
def get_perfume_details(url):
    """
    Extracts perfume details including name, price, brand, category, notes, and reviews.
    Returns a dictionary with perfume details.
    """
    driver.get(perfume['URL'])
    time.sleep(2)  # Allow time for the page to load

    name = perfume['Name']
    price = extract_price()
    brand = extract_brand()
    category = extract_category()
    notes = extract_notes()  # Returns a dictionary
    reviews = extract_reviews()
    url = perfume['URL']

    return {
        "name": name,
        "price": price,
        "brand": brand,
        "category": category,
        "top_notes": notes.get("top_notes", "Not Available"),
        "middle_notes": notes.get("middle_notes", "Not Available"),
        "base_notes": notes.get("base_notes", "Not Available"),
        "reviews": reviews,
        "url": url
    }

In [13]:
for perfume in perfume_links:
    details = get_perfume_details(perfume)
    perfume_data.append(details)
    print(details)

{'name': '9AM DIVE', 'price': '4,500/=', 'brand': 'AFNAN', 'category': 'FRAGRANCE', 'top_notes': 'Mint, Lemon, Pink Pepper and Black Currant.', 'middle_notes': 'Apple, Incense and Cedar.', 'base_notes': 'Patchouli, Ginger, Jasmine and Sandalwood.', 'reviews': 'No Reviews Available', 'url': 'https://middleeasternscents.co.ke/product/9am-dive/'}
{'name': '9PM', 'price': '4,500/=', 'brand': 'AFNAN', 'category': 'FRAGRANCE', 'top_notes': 'Top notes: Apple, Cinnamon, Wild Lavender and Bergamot.<br>\nMiddle notes: Orange Blossom and Lily-of-the-Valley.<br>\nBase notes: Vanilla, Tonka Bean, Amber and Patchouli.', 'middle_notes': 'Not Available', 'base_notes': 'Not Available', 'reviews': 'No Reviews Available', 'url': 'https://middleeasternscents.co.ke/product/9pm/'}
{'name': 'ABEER', 'price': '3,000/=', 'brand': 'RIIFFS', 'category': 'FRAGRANCE', 'top_notes': 'Not Available', 'middle_notes': 'Not Available', 'base_notes': 'Not Available', 'reviews': 'No Reviews Available', 'url': 'https://mid

In [31]:
url = 'https://middleeasternscents.co.ke/product/abeer/'

driver.get(url)
time.sleep(2)

brand_tab = driver.find_element(By.XPATH, "//a[@href='#tab-pwb_tab']")
driver.execute_script("arguments[0].click();", brand_tab)  # Click via JavaScript in case of issues
time.sleep(2)  # Wait for the content to load

# Step 2: Extract Brand Name from the Opened Tab
brand_element = driver.find_element(By.CSS_SELECTOR, "#tab-pwb_tab-content h3")  # Adjust if needed
brand_name = brand_element.text.strip()

print("Brand Name:", brand_name)


Brand Name: RIIFFS


In [14]:
import pandas as pd
df = pd.DataFrame(perfume_data)
df.to_csv("perfumes.csv", index=False)

print("Scraping complete! Data saved in perfumes.csv")

driver.quit()

Scraping complete! Data saved in perfumes.csv
