# Web Scraping Naivas Online for Mattress Prices

This notebook demonstrates how to scrape mattress price information from the Naivas online store (https://naivas.online/).

In [1]:
# Install required libraries if not already installed
!pip install requests beautifulsoup4 pandas



In [11]:
# Install ScrapingBee client library
!pip install scrapingbee

Collecting scrapingbee
  Downloading scrapingbee-2.0.1-py3-none-any.whl.metadata (5.8 kB)
  Downloading scrapingbee-2.0.1-py3-none-any.whl.metadata (5.8 kB)
Downloading scrapingbee-2.0.1-py3-none-any.whl (5.2 kB)
Downloading scrapingbee-2.0.1-py3-none-any.whl (5.2 kB)
Installing collected packages: scrapingbee
Installing collected packages: scrapingbee
Successfully installed scrapingbee-2.0.1
Successfully installed scrapingbee-2.0.1


In [2]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from urllib.parse import urljoin

In [None]:
# Set up the base URL and search URL
base_url = 'https://naivas.online/'
search_url = urljoin(base_url, 'search?q=mattress')

# Set up enhanced headers to better mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Referer': 'https://naivas.online/',
    'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

# Create a persistent session
session = requests.Session()
session.headers.update(headers)

# Print a note about 403 errors
print("Note: If you encounter 403 Forbidden errors, the website may be blocking web scraping attempts.")



In [14]:
# ScrapingBee implementation
import json
from urllib.parse import urlencode

# You need to sign up on ScrapingBee.com to get your API key
SCRAPING_BEE_API_KEY = "RGE1GF6765MPKI50NKYTUXKKIL5LGXVE1YSLFXHEAIVL9PRCQ67EVU92KGQJ6QP1D4YJB4H91NBKV58K"  # Replace with your actual ScrapingBee API key

def get_soup_with_scrapingbee(url):
    """
    Use ScrapingBee API to bypass anti-scraping protections
    """
    print(f"Sending request to {url} via ScrapingBee")
    
    # Set up ScrapingBee parameters
    params = {
        "api_key": SCRAPING_BEE_API_KEY,
        "url": url,
        "premium_proxy": "true",  # Use premium proxies to avoid being blocked
        "country_code": "ke",     # Kenya, as we're accessing a Kenyan website
        "render_js": "true",      # Render JavaScript to ensure all content loads
        "js_scroll": "true",      # Scroll the page to load lazy-loaded elements
    }
    
    # Construct the API endpoint with parameters
    api_url = f"https://app.scrapingbee.com/api/v1/?{urlencode(params)}"
    
    try:
        # Send request to ScrapingBee API
        response = requests.get(api_url, timeout=60)  # Longer timeout for rendered pages
        
        if response.status_code == 200:
            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        else:
            # Print error details from ScrapingBee
            print(f"ScrapingBee error - Status code: {response.status_code}")
            print(f"Response: {response.text}")
            return None
            
    except Exception as e:
        print(f"Error using ScrapingBee: {e}")
        return None

In [10]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

# Base URL of the website
base_url = 'https://www.example.com'

# Search URL (this should be the URL of the search results page)
search_url = urljoin(base_url, 'search/results')

# Headers to simulate a real browser (update with real user-agent if needed)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Start a session
session = requests.Session()
session.headers.update(headers)

# Function to get the webpage content with enhanced error handling
def get_soup(url, retry_count=3, delay=2):
    for attempt in range(retry_count):
        try:
            # Add a delay to be respectful to the server and avoid detection
            time.sleep(delay)
            
            # Send the request using the persistent session
            print(f"Sending request to {url} (Attempt {attempt+1}/{retry_count})")
            response = session.get(url, timeout=10)
            
            # Check if the request was successful
            if response.status_code == 200:
                # Parse the HTML content
                soup = BeautifulSoup(response.text, 'html.parser')
                return soup
            elif response.status_code == 403:
                print(f"Access Forbidden (403). The website may be blocking scraping attempts.")
                # Increase delay before next attempt
                delay += 2
            else:
                print(f"Failed to fetch the page. Status code: {response.status_code}")
                
            # If this wasn't the last attempt, wait before trying again
            if attempt < retry_count - 1:
                wait_time = delay * (attempt + 1)  # Progressive backoff
                print(f"Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
                
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            if attempt < retry_count - 1:
                wait_time = delay * (attempt + 1)
                print(f"Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            break
    
    return None

# Try an alternative approach if direct access doesn't work
def try_alternative_access_methods():
    print("\nTrying alternative access methods...")
    
    # Method 1: Access the homepage first, then navigate to search
    try:
        print("Method 1: Visiting homepage first...")
        home_page = session.get(base_url, timeout=10)
        if home_page.status_code == 200:
            print("Successfully accessed homepage. Now trying search page...")
            time.sleep(3)  # Wait before next request
            return get_soup(search_url)
    except Exception as e:
        print(f"Method 1 failed: {e}")
    
    # Method 2: Try with a different search query
    try:
        print("\nMethod 2: Trying with a different search term...")
        alt_search_url = urljoin(base_url, 'search?q=furniture')
        return get_soup(alt_search_url)
    except Exception as e:
        print(f"Method 2 failed: {e}")
    
    return None

# First try the direct approach
print("Attempting to access the search page directly...")
soup = get_soup(search_url)

# If direct access fails, try alternatives
if not soup:
    soup = try_alternative_access_methods()

if soup:
    print("\nSuccess! Successfully retrieved the webpage.")
else:
    print("\nAll attempts failed. The website might be actively blocking scraping attempts.")
    print("Possible solutions:")
    print("1. Use a proper web scraping service that rotates IP addresses")
    print("2. Consider using the official API if available")
    print("3. Look into using Selenium to simulate a real browser")
    print("4. Respect the website's robots.txt and terms of service")

Attempting to access the search page directly...
Sending request to https://www.example.com/search/results (Attempt 1/3)
Sending request to https://www.example.com/search/results (Attempt 1/3)
Failed to fetch the page. Status code: 404
Waiting 2 seconds before retrying...
Failed to fetch the page. Status code: 404
Waiting 2 seconds before retrying...
Sending request to https://www.example.com/search/results (Attempt 2/3)
Failed to fetch the page. Status code: 404
Waiting 4 seconds before retrying...
Sending request to https://www.example.com/search/results (Attempt 2/3)
Failed to fetch the page. Status code: 404
Waiting 4 seconds before retrying...
Sending request to https://www.example.com/search/results (Attempt 3/3)
Failed to fetch the page. Status code: 404

Trying alternative access methods...
Method 1: Visiting homepage first...
Successfully accessed homepage. Now trying search page...
Sending request to https://www.example.com/search/results (Attempt 3/3)
Failed to fetch the pag

In [16]:
# ScrapingBee wrapper function with fallback to original methods
def scrape_with_fallback(url, max_attempts=2):
    """
    Try scraping with ScrapingBee first, then fall back to other methods if it fails
    """
    print("\nAttempt 1: Using ScrapingBee API...")
    soup = get_soup_with_scrapingbee(url)
    
    if soup:
        print("✅ Success using ScrapingBee!")
        return soup
    
    print("\nScrapingBee attempt failed. Trying alternative methods...")
    
    # Try original methods as fallbacks
    print("\nAttempt 2: Using enhanced request headers...")
    soup = get_soup(url)
    
    if soup:
        print("✅ Success using enhanced headers!")
        return soup
    
    # If we still don't have the soup, try alternative methods
    print("\nStandard methods failed. Trying more aggressive approaches...")
    soup = try_alternative_access_methods()
    
    return soup

# Try to scrape the mattress search page
soup = scrape_with_fallback(search_url)

if soup:
    print("\n✅ Successfully retrieved the webpage!")
    print("Page title:", soup.title.text if soup.title else "No title found")
    
    # Print a few elements to verify we have the right content
    print("\nVerifying page content...")
    products = soup.find_all(['div', 'li'], class_=lambda x: x and ('product' in x.lower() or 'item' in x.lower()) if x else False)
    print(f"Found {len(products)} potential product elements")
    
    # Show a sample of what we found
    if products:
        sample = products[0]
        print("\nSample product HTML structure:")
        print(sample.prettify()[:500] + "...")
    
else:
    print("\n❌ All attempts failed. Consider these options:")
    print("1. Check that your ScrapingBee API key is valid")
    print("2. Try using Selenium with browser automation")
    print("3. Check if the website has an official API")
    print("4. Consider if manual data collection is feasible for your needs")


Attempt 1: Using ScrapingBee API...
Sending request to https://www.example.com/search/results via ScrapingBee
ScrapingBee error - Status code: 404
Response: <!DOCTYPE html><html><head>
    <title>Example Domain</title>

    <meta charset="utf-8">
    <meta http-equiv="Content-type" content="text/html; charset=utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
   

In [None]:
# Alternative approach: Using Selenium (uncomment and install selenium if needed)

# !pip install selenium webdriver-manager

# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager
# import time

# def get_page_with_selenium(url, headless=True):
#     try:
#         # Set up Chrome options
#         chrome_options = Options()
#         if headless:
#             chrome_options.add_argument("--headless")  # Run in headless mode
#         chrome_options.add_argument("--no-sandbox")
#         chrome_options.add_argument("--disable-dev-shm-usage")
#         chrome_options.add_argument("--disable-gpu")
#         chrome_options.add_argument(f"user-agent={headers['User-Agent']}")
        
#         # Create a new Chrome webdriver
#         driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), 
#                                  options=chrome_options)
        
#         # Navigate to the specified URL
#         print(f"Navigating to {url} with Selenium...")
#         driver.get(url)
        
#         # Wait for the page to load
#         time.sleep(5)
        
#         # Get the page source
#         page_source = driver.page_source
        
#         # Parse with BeautifulSoup
#         soup = BeautifulSoup(page_source, 'html.parser')
        
#         # Close the browser
#         driver.quit()
        
#         return soup
#     except Exception as e:
#         print(f"Selenium error: {e}")
#         return None

# # Try with Selenium if the previous methods fail
# if not soup:
#     print("\nAttempting to use Selenium for browser automation...")
#     soup = get_page_with_selenium(search_url)
    
#     if soup:
#         print("Successfully retrieved the page using Selenium!")
#     else:
#         print("Selenium approach failed as well.")

In [7]:
# Function to extract product information
def extract_product_info(soup):
    products = []
    
    # This will need to be adjusted based on the actual HTML structure of the website
    # Let's try to find product containers
    product_containers = soup.find_all('div', class_=lambda x: x and ('product' in x.lower() or 'item' in x.lower()))
    
    if not product_containers:
        # Try alternative selectors if the first attempt didn't work
        product_containers = soup.select('div.product, div.item, li.product, li.item')
    
    if not product_containers:
        # Just get all divs with id or class containing product
        product_containers = soup.find_all(['div', 'li'], id=lambda x: x and 'product' in x.lower())
    
    print(f"Found {len(product_containers)} potential product containers")
    
    for container in product_containers:
        try:
            # Try to extract product name
            name_tag = container.find(['h2', 'h3', 'h4', 'a'], class_=lambda x: x and 'title' in x.lower() if x else False)
            if not name_tag:
                name_tag = container.find(['h2', 'h3', 'h4', 'a'])
            
            name = name_tag.get_text().strip() if name_tag else "No name found"
            
            # Try to extract price
            price_tag = container.find(['span', 'div'], class_=lambda x: x and 'price' in x.lower() if x else False)
            price = price_tag.get_text().strip() if price_tag else "No price found"
            
            # Clean price (removing currency symbols and formatting)
            cleaned_price = re.sub(r'[^\d.]', '', price) if price != "No price found" else "No price found"
            
            # Try to extract link
            link_tag = container.find('a', href=True)
            link = urljoin(base_url, link_tag['href']) if link_tag else "No link found"
            
            # Add to products list
            if 'mattress' in name.lower() or 'bed' in name.lower():
                products.append({
                    'Name': name,
                    'Price': price,
                    'Numeric_Price': cleaned_price,
                    'Link': link
                })
        except Exception as e:
            print(f"Error extracting product info: {e}")
            continue
    
    return products

# Extract product information
products = extract_product_info(soup)

# Convert to DataFrame
df = pd.DataFrame(products)

# Display the first few rows
if not df.empty:
    print(f"Found {len(df)} mattress products")
    df.head()
else:
    print("No products found. The webpage might have a different structure than expected.")

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# Function to handle pagination and extract products from multiple pages
def scrape_all_pages(base_search_url, max_pages=5):
    all_products = []
    current_page = 1
    
    while current_page <= max_pages:
        page_url = f"{base_search_url}&page={current_page}" if current_page > 1 else base_search_url
        print(f"Scraping page {current_page}: {page_url}")
        
        soup = get_soup(page_url)
        if not soup:
            print(f"Failed to retrieve page {current_page}. Stopping.")
            break
            
        page_products = extract_product_info(soup)
        if not page_products:
            print(f"No products found on page {current_page}. This might be the last page.")
            break
            
        all_products.extend(page_products)
        
        # Check if there's a next page by looking for pagination links
        next_page_link = soup.find('a', text=lambda x: x and 'Next' in x or '→' in x or '>' in x)
        if not next_page_link:
            next_page_link = soup.find('a', class_=lambda x: x and 'next' in x.lower() if x else False)
        
        if not next_page_link:
            print("No next page link found. This might be the last page.")
            break
            
        current_page += 1
        
    return all_products

# Uncomment the following lines to scrape all pages
# all_products = scrape_all_pages(search_url)
# all_products_df = pd.DataFrame(all_products)
# all_products_df.head()

In [None]:
# Function to get detailed information for each product
def get_product_details(product_url):
    soup = get_soup(product_url)
    if not soup:
        return {}
    
    details = {}
    
    # Try to extract product details
    # This will need to be adjusted based on the actual structure of the product page
    try:
        # Extract product title
        title_tag = soup.find(['h1', 'h2'], class_=lambda x: x and 'title' in x.lower() if x else False)
        if not title_tag:
            title_tag = soup.find(['h1', 'h2'])
        details['Title'] = title_tag.get_text().strip() if title_tag else "No title found"
        
        # Extract product description
        desc_tag = soup.find(['div', 'p'], class_=lambda x: x and 'description' in x.lower() if x else False)
        details['Description'] = desc_tag.get_text().strip() if desc_tag else "No description found"
        
        # Extract specs/features
        specs = {}
        spec_container = soup.find('table') or soup.find('div', class_=lambda x: x and ('spec' in x.lower() or 'detail' in x.lower() or 'feature' in x.lower()) if x else False)
        
        if spec_container:
            rows = spec_container.find_all(['tr', 'li'])
            for row in rows:
                if row.name == 'tr':
                    cols = row.find_all(['th', 'td'])
                    if len(cols) >= 2:
                        key = cols[0].get_text().strip()
                        value = cols[1].get_text().strip()
                        specs[key] = value
                else:  # li
                    text = row.get_text().strip()
                    if ':' in text:
                        key, value = text.split(':', 1)
                        specs[key.strip()] = value.strip()
                    else:
                        specs[f"Feature {len(specs) + 1}"] = text
                        
        details['Specifications'] = specs
        
        # Extract images
        img_tags = soup.find_all('img', src=True)
        product_images = [urljoin(base_url, img['src']) for img in img_tags if 'product' in img.get('class', [''])[0].lower() if img.get('class') else False]
        details['Images'] = product_images if product_images else []
        
    except Exception as e:
        print(f"Error extracting product details: {e}")
    
    return details

# Example usage to get detailed information for a single product
# (Uncomment and run after getting the product links)

# if not df.empty and 'Link' in df.columns:
#     sample_product = df.iloc[0]
#     print(f"Getting details for: {sample_product['Name']}")
#     details = get_product_details(sample_product['Link'])
#     print("Product Details:")
#     for key, value in details.items():
#         print(f"{key}: {value}")

In [None]:
# Data Analysis (uncomment and run after collecting data)

# Convert price to numeric
# df['Numeric_Price'] = pd.to_numeric(df['Numeric_Price'], errors='coerce')

# # Basic statistics
# if 'Numeric_Price' in df.columns:
#     print("Price Statistics:")
#     print(df['Numeric_Price'].describe())
    
#     # Import visualization libraries
#     import matplotlib.pyplot as plt
#     import seaborn as sns
    
#     # Set style
#     plt.style.use('seaborn')
#     sns.set(font_scale=1.2)
    
#     # Price distribution
#     plt.figure(figsize=(10, 6))
#     sns.histplot(df['Numeric_Price'].dropna(), kde=True)
#     plt.title('Distribution of Mattress Prices')
#     plt.xlabel('Price (KSH)')
#     plt.ylabel('Count')
#     plt.tight_layout()
#     plt.show()
    
#     # Boxplot to see outliers
#     plt.figure(figsize=(10, 6))
#     sns.boxplot(y=df['Numeric_Price'].dropna())
#     plt.title('Boxplot of Mattress Prices')
#     plt.ylabel('Price (KSH)')
#     plt.tight_layout()
#     plt.show()


In [None]:
import time
import pandas as pd

# Save the data to a CSV file
def save_to_csv(dataframe, filename='naivas_mattresses.csv'):
    try:
        dataframe.to_csv(filename, index=False)
        print(f"Data successfully saved to {filename}")
    except Exception as e:
        print(f"Error saving data to CSV: {e}")

# Function to handle pagination and extract products from multiple pages
def scrape_all_pages(base_search_url, max_pages=5):
    all_products = []
    current_page = 1
    
    while current_page <= max_pages:
        page_url = f"{base_search_url}&page={current_page}" if current_page > 1 else base_search_url
        print(f"Scraping page {current_page}: {page_url}")
        
        # Use ScrapingBee to get the page
        soup = scrape_with_fallback(page_url)
        if not soup:
            print(f"Failed to retrieve page {current_page}. Stopping.")
            break
            
        page_products = extract_product_info(soup)
        if not page_products:
            print(f"No products found on page {current_page}. This might be the last page.")
            break
            
        all_products.extend(page_products)
        print(f"Added {len(page_products)} products from page {current_page}")
        
        # Check if there's a next page by looking for pagination links
        next_page_link = soup.find('a', text=lambda x: x and 'Next' in x or '→' in x or '>' in x)
        if not next_page_link:
            next_page_link = soup.find('a', class_=lambda x: x and 'next' in x.lower() if x else False)
        
        if not next_page_link:
            print("No next page link found. This might be the last page.")
            break
            
        current_page += 1
        
        # Add a delay between pages to be respectful to ScrapingBee API
        if current_page <= max_pages:
            print(f"Waiting 5 seconds before fetching next page...")
            time.sleep(5)
    
    print(f"\nScraped a total of {len(all_products)} products from {current_page} pages")
    return all_products

# Uncomment the following lines to scrape all pages
# all_products = scrape_all_pages(search_url, max_pages=3)  # Start with a small number to test
# all_products_df = pd.DataFrame(all_products)
# if not all_products_df.empty:
#     print("\nSample of scraped products:")
#     display(all_products_df.head())
# else:
#     print("No products found across all pages.")

# Uncomment to save the data
# if not df.empty:
#     save_to_csv(df)

## Conclusion and Next Steps

This notebook provides a framework for scraping mattress price information from Naivas Online. The code can be adjusted based on the actual HTML structure of the website.

Next steps could include:

1. Running a more comprehensive analysis on the collected data
2. Creating visualizations to compare prices by brand or type
3. Setting up automated scraping to track price changes over time
4. Adding more error handling and robustness to the scraper

**Note:** Always be sure to check the website's terms of service and robots.txt file to ensure that web scraping is allowed.

# Function to get detailed information for each product
def get_product_details(product_url):
    # Use ScrapingBee for product pages too
    soup = scrape_with_fallback(product_url)
    if not soup:
        return {}
    
    details = {}
    
    # Try to extract product details
    # This will need to be adjusted based on the actual structure of the product page
    try:
        # Extract product title
        title_tag = soup.find(['h1', 'h2'], class_=lambda x: x and 'title' in x.lower() if x else False)
        if not title_tag:
            title_tag = soup.find(['h1', 'h2'])
        details['Title'] = title_tag.get_text().strip() if title_tag else "No title found"
        
        # Extract product description
        desc_tag = soup.find(['div', 'p'], class_=lambda x: x and 'description' in x.lower() if x else False)
        details['Description'] = desc_tag.get_text().strip() if desc_tag else "No description found"
        
        # Extract specs/features
        specs = {}
        spec_container = soup.find('table') or soup.find('div', class_=lambda x: x and ('spec' in x.lower() or 'detail' in x.lower() or 'feature' in x.lower()) if x else False)
        
        if spec_container:
            rows = spec_container.find_all(['tr', 'li'])
            for row in rows:
                if row.name == 'tr':
                    cols = row.find_all(['th', 'td'])
                    if len(cols) >= 2:
                        key = cols[0].get_text().strip()
                        value = cols[1].get_text().strip()
                        specs[key] = value
                else:  # li
                    text = row.get_text().strip()
                    if ':' in text:
                        key, value = text.split(':', 1)
                        specs[key.strip()] = value.strip()
                    else:
                        specs[f"Feature {len(specs) + 1}"] = text
                        
        details['Specifications'] = specs
        
        # Extract images
        img_tags = soup.find_all('img', src=True)
        product_images = [urljoin(base_url, img['src']) for img in img_tags if 'product' in img.get('class', [''])[0].lower() if img.get('class') else False]
        details['Images'] = product_images if product_images else []
        
    except Exception as e:
        print(f"Error extracting product details: {e}")
    
    return details

# Example usage to get detailed information for a single product
# (Uncomment and run after getting the product links)

# if not df.empty and 'Link' in df.columns:
#     sample_product = df.iloc[0]
#     print(f"Getting details for: {sample_product['Name']}")
#     details = get_product_details(sample_product['Link'])
#     print("Product Details:")
#     for key, value in details.items():
#         print(f"{key}: {value}")

## Alternative Approaches for Data Collection

If direct web scraping doesn't work due to website protections, consider these alternatives:

### 1. Check for an Official API
Many e-commerce platforms provide official APIs for developers to access their data. This is the most proper and reliable way to collect data.

### 2. Use Specialized Web Scraping Services
Services like:
- ScraperAPI
- Bright Data (formerly Luminati)
- Scraping Bee
- Zenscrape

These services handle proxy rotation, CAPTCHA solving, and other anti-scraping measures.

### 3. Browser Automation with Selenium
The Selenium approach in the previous cell can be enhanced with:
- Random delays between actions
- Mouse movements
- Scrolling behaviors

### 4. Respect Robots.txt and Website Terms of Service
Always check if scraping is allowed by reading the website's:
- Robots.txt file
- Terms of Service
- Fair use policies

### 5. Manual Data Collection
If the dataset is small, consider manual collection or creating a simple data entry form.

## Using ScrapingBee for Web Scraping

This notebook now integrates with [ScrapingBee](https://www.scrapingbee.com/), a web scraping API that helps bypass anti-scraping measures. Here's how it works:

### Benefits of using ScrapingBee:

1. **Proxy Rotation**: ScrapingBee automatically rotates IP addresses to avoid being blocked.
2. **JavaScript Rendering**: Renders JavaScript-heavy websites that require a real browser.
3. **CAPTCHA Handling**: Helps bypass CAPTCHAs that might be triggered by scraping attempts.
4. **Geolocation**: Allows accessing websites as if from specific countries (e.g., Kenya for local websites).
5. **Custom Headers & Cookies**: Manages browser fingerprinting to appear as a legitimate user.

### Important steps for using ScrapingBee:

1. Sign up for an account at [ScrapingBee.com](https://www.scrapingbee.com/)
2. Get your API key from the dashboard
3. Replace `YOUR_API_KEY` in the code with your actual API key
4. Monitor your API usage in the ScrapingBee dashboard

### Pricing Considerations:

ScrapingBee operates on a credit-based system. Each request costs a certain number of credits depending on features used:
- Basic requests: 1 credit
- JavaScript rendering: 5 credits
- Premium proxies: Additional credits

For occasional scraping of a few pages, the free plan (1,000 credits) may be sufficient. For regular scraping, you'll need a paid plan.

In [None]:
# ScrapingBee API Usage Tracker
def estimate_scrapingbee_usage(pages_scraped, products_per_page=20, detailed_views=0):
    """
    Estimates ScrapingBee API usage and costs
    
    Args:
        pages_scraped: Number of search results pages scraped
        products_per_page: Average number of products per page
        detailed_views: Number of individual product pages scraped
    """
    # Credits per request with JavaScript rendering + premium proxies
    credits_per_request = 5 + 1  # 5 credits for JS rendering + 1 for premium proxies
    
    # Calculate total requests and credits
    search_page_requests = pages_scraped
    product_page_requests = detailed_views
    total_requests = search_page_requests + product_page_requests
    total_credits = total_requests * credits_per_request
    
    # Print usage summary
    print("\n===== ScrapingBee API Usage Estimate =====")
    print(f"Search pages scraped: {pages_scraped}")
    print(f"Individual product pages scraped: {detailed_views}")
    print(f"Total API requests: {total_requests}")
    print(f"Estimated credits used: {total_credits}")
    
    # Show free tier vs paid tier information
    print("\n----- Plan Information -----")
    print("Free plan: 1,000 credits")  
    print("Starter plan: 25,000 credits ($49/month)")
    
    # Show credits remaining on free plan
    free_tier_remaining = 1000 - total_credits
    if free_tier_remaining > 0:
        print(f"\nCredits remaining on free plan: {free_tier_remaining}")
        print(f"Can still scrape approximately {free_tier_remaining // credits_per_request} more pages")
    else:
        print(f"\nFree tier credits exceeded by {abs(free_tier_remaining)}")
        print(f"Consider upgrading to a paid plan")

# Example usage - uncomment to run after scraping
# estimate_scrapingbee_usage(
#     pages_scraped=3,  
#     products_per_page=20,  
#     detailed_views=5  # If you viewed 5 individual product pages
# )