# Dependencies

In [5]:
#one-time setup: uncomment to install Selenium; in a notebook, `%pip install selenium` targets the running kernel's environment
#!pip install selenium

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post

# Setup ChromeDriver

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [8]:
#Chrome command-line switches, applied in the order listed
chrome_flags = [
    "--headless",  #no visible browser window; Chrome runs in the background
    "--disable-gpu",
    "--window-size=1920,1080",
    "--no-sandbox",  #for Linux environments like Colab
    #present a regular desktop user-agent to reduce bot detection
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/115.0 Safari/537.36",
]

options = Options()
for flag in chrome_flags:
    options.add_argument(flag)

#start the Chrome session configured above
driver = webdriver.Chrome(options=options)

# URL and Product Storage

In [9]:
#listing URL template; {page} is replaced with the page number via str.format
BASE_URL = "https://www.jumia.com.eg/laptops/?page={page}"
#accumulator that receives one entry per scraped product
products = list()

# Extract Product Data

`extract_data` extracts structured product info from a single product card on the page.
Each card is an HTML `<article>` element.

In [10]:
from bs4 import BeautifulSoup

In [11]:
def extract_data(card):
    """Extract structured product info from a single product card.

    Extraction is best-effort: a missing element or attribute yields ``None``
    for that field instead of raising. Lookups are guarded with explicit
    ``None`` checks rather than bare ``except:`` clauses, so unrelated errors
    (and KeyboardInterrupt) are no longer silently swallowed.

    Parameters
    ----------
    card : bs4.element.Tag
        Parsed product card; only ``card.find(...)`` is called on it.

    Returns
    -------
    dict
        Keys: 'Title', 'Price', 'Rating', 'Number of Reviews', 'Product URL',
        'Image URL', 'Brand'. Each value is a string or ``None``.
    """
    #local import keeps this cell runnable on its own
    from urllib.parse import urljoin

    def _text(name, **attrs):
        #stripped text of the first matching descendant, or None when absent
        tag = card.find(name, **attrs)
        return tag.text.strip() if tag is not None else None

    title = _text('h3')
    price = _text('div', class_='prc')
    #number of reviews as visible on product card
    reviews = _text('div', class_='rev')

    #rating is whatever follows the last ':' in the tag's inline style
    rating_tag = card.find('div', class_='stars')
    style = rating_tag.get('style') if rating_tag is not None else None
    rating = style.split(':')[-1] if style else None

    #full product page URL; urljoin leaves absolute hrefs untouched and
    #inserts the missing '/' for relative ones
    link = card.find('a')
    href = link.get('href') if link is not None else None
    product_url = urljoin("https://www.jumia.com.eg", href) if href else None

    #image source URL, preferring 'data-src' over 'src'
    image = card.find('img')
    if image is not None:
        image_url = image.get('data-src') or image.get('src')
    else:
        image_url = None

    #brand extracted as first word in title
    brand = title.split()[0] if title else None

    return {
        'Title': title,
        'Price': price,
        'Rating': rating,
        'Number of Reviews': reviews,
        'Product URL': product_url,
        'Image URL': image_url,
        'Brand': brand
    }

# Scrape Multiple Pages Until 100 Products

In [12]:
import time

In [13]:
page = 1  #page number substituted into BASE_URL; the scraping loop increments it and never resets it, so re-run this cell to start over from page 1

In [14]:
while len(products) < 100:  #keep going until 100 products are collected
    print(f"Scraping page {page}...")

    #open the listing page for the current page number in the browser
    driver.get(BASE_URL.format(page=page))

    #fixed pause so JavaScript-rendered content has time to appear
    time.sleep(3)

    #parse the rendered HTML and pick out every product card
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    cards = soup.find_all('article', class_='prd')

    #an empty page means the listing ended or the request was blocked
    if len(cards) == 0:
        print("No products found or blocked.")
        break

    #only take as many cards as are still needed to reach 100
    remaining = 100 - len(products)
    for card in cards[:remaining]:
        data = extract_data(card)
        products.append(data)

    #advance to the following listing page
    page += 1

Scraping page 1...
Scraping page 2...
Scraping page 3...


# Save Data to CSV and Close Browser

In [15]:
import pandas as pd

In [16]:
#close the browser instance created by webdriver.Chrome; scraping is finished, so it is no longer needed
driver.quit()

In [17]:
#turn the list of product dictionaries into a table, one row per product
df = pd.DataFrame(data=products)

In [18]:
#save the data into a CSV file (row index omitted)
df.to_csv("jumia_laptops.csv", index=False)
#report the real row count: the scraping loop can stop early with fewer than 100 products
print(f"✅ Done — saved {len(df)} laptops to jumia_laptops.csv")

✅ Done — saved 100 laptops to jumia_laptops.csv
