In [1]:
%pip install pandas webdriver_manager selenium

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting selenium
  Downloading selenium-4.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
# Important Arguments won't eun without them in Gitpod
chrome_options.add_argument("--disable-dev-shm-usage") 
chrome_options.add_argument("--headless")  

# Setup ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [3]:
def get_perfume_links(base_url, max_pages):
    perfume_links = []
    
    for page in range(1, max_pages + 1):
        url = f"{base_url}/page/{page}/"
        print(f"Scraping: {url}")
        driver.get(url)
        time.sleep(3)

        perfumes = driver.find_elements(By.CSS_SELECTOR, "h4.mkdf-product-list-title a")

        for perfume in perfumes:
            name = perfume.text
            link = perfume.get_attribute("href")
            perfume_links.append({"Name": name, "URL": link})

    return perfume_links

In [39]:
def get_perfume_details(url):
    driver.get(perfume["URL"])
    time.sleep(2)

    name = perfume["Name"]

    # Extract price
    try:
        price = driver.find_element(By.TAG_NAME, "bdi").text
    except:
        price = "Not Available"

    # Extract notes
    try:
        notes_element = driver.find_element(By.CLASS_NAME, "woocommerce-product-details__short-description")
        notes = notes_element.find_element(By.TAG_NAME, "p").get_attribute("innerHTML")
        notes_list = notes.split("<br>")
        

        top_notes = next((line.replace("Top notes:", "").strip() for line in notes_list if "Top notes:" in line), "Not Available")
        middle_notes = next((line.replace("Middle notes:", "").strip() for line in notes_list if "Middle notes:" in line), "Not Available")
        base_notes = next((line.replace("Base notes:", "").strip() for line in notes_list if "Base notes:" in line), "Not Available")

    except:
        try:
            top_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_top-notes')]/td")
            top_notes = ", ".join([a.text.strip() for a in top_notes_element.find_elements(By.TAG_NAME, "a")])
        except:
            top_notes = "Not Available"

        try:
            middle_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_middle-notes')]/td")
            middle_notes = ", ".join([a.text.strip() for a in middle_notes_element.find_elements(By.TAG_NAME, "a")])
        except:
            middle_notes = "Not Available"
        try:
            base_notes_element = driver.find_element(By.XPATH, "//tr[contains(@class, 'attribute_pa_base-notes')]/td")
            base_notes = ", ".join([a.text.strip() for a in base_notes_element.find_elements(By.TAG_NAME, "a")])
        except:
            base_notes = "Not Available"

    # Extract brand
    try:
        brand = driver.find_element(By.TAG_NAME, 'h3').text
    except:
        brand = "Unknown"

    # Extract reviews
    try:
        reviews_tab = driver.find_element(By.XPATH, '//a[@href="#tab-reviews"]')
        driver.execute_script("arguments[0].click();", reviews_tab)
        time.sleep(2)

        reviews_section = driver.find_element(By.ID, "tab-reviews")
        reviews = reviews_section.text.strip()

        if "There are no reviews yet." in reviews:
            reviews = "No Reviews Available"

    except Exception as e:
        print("Error extracting reviews:", e)
        reviews = "No Reviews Available"

    return {
        "name": name,
        "price": price,
        "brand": brand,
        "top_notes": top_notes,
        "middle_notes": middle_notes,
        "base_notes": base_notes,
        "reviews": reviews,
        "url": perfume["URL"]
    }


In [53]:
url = 'https://middleeasternscents.co.ke/product/ajwad-60ml-edp-by-lattafa/'

driver.get(url)
time.sleep(2)
try:
    brand = driver.find_element(By.TAG_NAME, 'h3').text
except:
    try:
        brand_tab = driver.find_element(By.XPATH, "//a[@href='#tab-pwb_tab']")
        print("Brand tab found!")
    except Exception as e:
        brand = "Unknown"
        print(f"Error extracting brand: {e}")
print(brand)





In [40]:
shop_url = "https://middleeasternscents.co.ke/shop/"
max_pages = 42
perfume_data = []

perfume_links = get_perfume_links(shop_url,max_pages)
perfume_links
for perfume in perfume_links:
    details = get_perfume_details(perfume)
    perfume_data.append(details)
    print(details)


Scraping: https://middleeasternscents.co.ke/shop//page/1/


Scraping: https://middleeasternscents.co.ke/shop//page/2/
Scraping: https://middleeasternscents.co.ke/shop//page/3/
Scraping: https://middleeasternscents.co.ke/shop//page/4/
Scraping: https://middleeasternscents.co.ke/shop//page/5/
Scraping: https://middleeasternscents.co.ke/shop//page/6/
Scraping: https://middleeasternscents.co.ke/shop//page/7/
Scraping: https://middleeasternscents.co.ke/shop//page/8/
Scraping: https://middleeasternscents.co.ke/shop//page/9/
Scraping: https://middleeasternscents.co.ke/shop//page/10/
Scraping: https://middleeasternscents.co.ke/shop//page/11/
Scraping: https://middleeasternscents.co.ke/shop//page/12/
Scraping: https://middleeasternscents.co.ke/shop//page/13/
Scraping: https://middleeasternscents.co.ke/shop//page/14/
Scraping: https://middleeasternscents.co.ke/shop//page/15/
Scraping: https://middleeasternscents.co.ke/shop//page/16/
Scraping: https://middleeasternscents.co.ke/shop//page/17/
Scraping: https://middleeasternscents.co.ke/shop//page/18/
Scrap

KeyboardInterrupt: 

In [None]:
import pandas as pd
df = pd.DataFrame(perfume_data)
df.to_csv("perfumes.csv", index=False)

print("Scraping complete! Data saved in perfumes.csv")

driver.quit()