In [None]:
# Import necessary libraries
import requests # make HTTP requests to fetch web pages content
from bs4 import BeautifulSoup # parse HTML and XML docs for easier data extraction
import csv # write scraped data into CSV file
import time # introduce delays between requests to avoid server overload
import re #regular expression
import random # vary time delays to simulate human-like behaviour
import psycopg2 #py package to interact with PostgreSQL
from psycopg2 import sql
import logging
#import json

# User-Agent headers for automating requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Function to check URL response
def check_response(urls, headers):
    for url in urls:
        try:
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                logging.warning(f"Failed to retrieve {url}. Status code: {response.status_code}")
                return False
        except requests.exceptions.RequestException as e:
            logging.error(f"Error requesting {url}: {str(e)}")
            return False
    return True

# Function to scrape product details
def scrape_product_details(url, headers):
    products = []
    #try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    catalog_listing = soup.find('div', {'id': 'catalog-listing'})
    
    products = []
    for product in catalog_listing:
        # Loop through each product inside the catalog listing
        prod = product.find('a', {'class': 'core'})
    
        if product:
            name = prod.find('h3', class_='name').get_text(strip=True) if prod.find('h3', class_='name') else "N/A"
            price = prod.find('div', class_='prc').get_text(strip=True) if prod.find('div', class_='prc') else "N/A"
            old_price = prod.find('div', class_='old').get_text(strip=True) if prod.find('div', class_='old') else "N/A"
            discount = prod.find('div', class_='bdg _dsct _sm').get_text(strip=True) if prod.find('div', class_='bdg _dsct _sm') else "N/A"
            
            rating = prod.find('div', class_='rev')
            if rating:
                stars = rating.find('div', class_='stars _s').get_text(strip=True) if rating.find('div', class_='stars _s') else "N/A"
                reviews_count = rating.get_text(strip=True).split('(')[-1].strip(')') if '(' in rating.get_text() else "N/A"
            else:
                stars = "N/A"
                reviews_count = "N/A"

            item_id = prod.get('data-gtm-id', "N/A")
            item_brand = prod.get('data-gtm-brand', "N/A")
            product_url = f"https://www.jumia.co.ke{product['href']}" if prod.find('a') else "N/A"
                    
            product_info = {
                "name": name,
                "price": price,
                "old_price": old_price,
                "rating": rating,
                "reviews": reviews_count,
                "product_id": item_id,
                "brand": item_brand,
                "discount": discount,
                "product_url": product_url,
            }

            products.append(product_info)
    
        # Optional delay between requests
        time.sleep(random.uniform(1, 3))  # Random delay between 1 and 3 seconds

    return products

# Function to get the last page number
def get_last_page_number(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the last page link
        last_page_link = soup.find('a', {'aria-label': 'Last Page'})
        
        if last_page_link:
            last_page_url = last_page_link.get('href', '')
            
            # Extract the page number from the URL
            page_number_str = last_page_url.split('=')[-1]
            page_number_str = page_number_str.split('#')[0]  # Remove any fragment identifier (#catalog-listing)
            
            page_number = int(page_number_str) if page_number_str.isdigit() else 1
            return page_number
        else:
            logging.warning(f"Last page link not found for {url}. Defaulting to page 1.")
            return 1
    except Exception as e:
        logging.error(f"Error fetching last page number for {url}: {str(e)}")
        return 1


# Function to save data to CSV
def save_to_csv(products, filename):
    try:
        keys = products[0].keys()
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=keys)
            writer.writeheader()
            writer.writerows(products)
        logging.info(f"Data saved to {filename}")
    except Exception as e:
        logging.error(f"Error saving data to CSV: {str(e)}")

# Function to save data to PostgreSQL
def save_to_postgresql(products, db_params, table_name):
    try:
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        
        create_table_query = f"""
            CREATE TABLE IF NOT EXISTS {table_name} (
                id SERIAL PRIMARY KEY,
                product_name TEXT,
                product_id TEXT UNIQUE,
                price TEXT,
                old_price TEXT,
                discount TEXT,
                brand TEXT,
                rating TEXT,
                reviews TEXT,
                product_url TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """
        cursor.execute(create_table_query)
        conn.commit()
        
        insert_query = f"""
            INSERT INTO {table_name} (product_name, product_id, price, old_price, discount, brand, rating, reviews, product_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        
        for product in products:
            cursor.execute(insert_query, (
                product["name"],
                product["product_id"],
                product["price"],
                product["old_price"],
                product["discount"],
                product["brand"],
                product["rating"],
                product["reviews_count"],
                product["product_url"]
            ))

        conn.commit()
        logging.info(f"Successfully inserted {len(products)} products into {table_name}.")
    except Exception as e:
        logging.error(f"Error inserting data into PostgreSQL: {str(e)}")
        if conn:
            conn.rollback()
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()

# Main function to scrape and save data
def scrape_and_save(url, headers, db_params, table_name, output_filename):
    logging.info(f"Starting scrape for {url}...")

    last_page = get_last_page_number(url, headers)
    logging.info(f"Found {last_page} pages to scrape.")

    products = []
    
    for page_num in range(1, last_page + 1):
        page_url = f"{url}?page={page_num}#catalog-listing"
        logging.info(f"Scraping page {page_num} of {last_page}...")
        page_products = scrape_product_details(page_url, headers)
        products.extend(page_products)

        time.sleep(random.uniform(1, 3))  # Random delay to avoid overloading the server
    
    logging.info(f"Scraping complete. Total products scraped: {len(products)}.")

    # Save products to PostgreSQL and CSV
    if products:
        save_to_postgresql(products, db_params, table_name)
        save_to_csv(products, output_filename)
    else:
        logging.warning("No products found to save.")

# Example URLs and Database Params
url1 = "https://www.jumia.co.ke/televisions/#catalog-listing"
url2 = "https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing"

db_params = {
    "host": "localhost",
    "database": "e_analytics_db",
    "user": "postgres",
    "password": "password"
}

# Output CSV files
output_file_1 = "jumia_televisions.csv"
output_file_2 = "jumia_cookers.csv"

# Scrape and save data for both URLs
if check_response([url1, url2], headers):
    scrape_and_save(url1, headers, db_params, "jumia_televisions", output_file_1)
    scrape_and_save(url2, headers, db_params, "jumia_cookers", output_file_2)
else:
    logging.error("One or more URLs failed to load.")


In [None]:
def scrape_product_details(url, headers):
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        logging.error(f"Failed to retrieve page: {url} (Status code: {response.status_code})")
        return []

    # Optionally print part of the response to debug
    print(f"Response content from {url}: {response.text[:500]}")  # Print the first 500 characters of the response
    
    soup = BeautifulSoup(response.text, 'html.parser')
    catalog_listing = soup.find('div', {'id': 'catalog-listing'})

    if not catalog_listing:
        logging.warning(f"Could not find catalog listing on page: {url}")
        return []

    products = []
    for catalog in catalog_listing.find_all('a', class_='core'):
        # Extract product details here
        product = {
            'name': catalog.text.strip(),
            'url': catalog['href']
            # Add other product details as needed
        }
        products.append(product)

    # Print the list of products
    print("Scraped Products:", products)

    return products

# URL to scrape
url = "https://www.jumia.co.ke/televisions/#catalog-listing"

# Headers to mimic a real browser request (you can modify headers as needed)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Call the scrape_product_details function with the URL and headers
scrape_product_details(url, headers)

    

In [None]:
import requests
from bs4 import BeautifulSoup
import logging

def scrape_product_details(url, headers):
    # Send a GET request to the URL
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        logging.error(f"Failed to retrieve page: {url} (Status code: {response.status_code})")
        return []

    # Optionally print part of the response to debug
    print(f"Response content from {url}: {response.text[:500]}")  # Print the first 500 characters of the response
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Attempt to find the 'catalog-listing' div by its ID
    catalog_listing = soup.find('div', {'id': 'catalog-listing'})

    if not catalog_listing:
        logging.warning(f"Could not find catalog listing on page: {url}")
        return []

    # Create an empty list to store product data
    products = []
    # Find all product elements within the catalog listing
    for catalog in catalog_listing.find_all('a', class_='core'):
        # Extract product details
        product = {
            'name': catalog.text.strip(),
            'url': catalog['href']
            # Add more product details as needed
        }
        products.append(product)

    # Print the list of products
    print("Scraped Products:", products)

    return products

# URL to scrape
url = "https://www.jumia.co.ke/televisions/#catalog-listing"

# Headers to mimic a real browser request (you can modify headers as needed)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Call the scrape_product_details function with the URL and headers
scrape_product_details(url, headers)



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime
import re
import time
import random

def clean_price(price_str):
    if not price_str:
        return None
    return float(re.sub(r'[^\d.]', '', price_str))

def extract_product_info(product):
    try:
        # Basic product info
        name = product.find('h3', class_='name').text.strip()
        
        # Extract product URL and ID
        product_url = product.find('a', class_='core')['href']
        item_id = product_url.split('/')[-1].split('-')[-1]
        
        # Price information
        price_container = product.find('div', class_='prc')
        current_price = clean_price(price_container.text.strip()) if price_container else None
        
        old_price_container = product.find('div', class_='old')
        old_price = clean_price(old_price_container.text.strip()) if old_price_container else None
        
        # Discount
        discount_container = product.find('div', class_='bdg _dsct')
        discount = discount_container.text.strip() if discount_container else None
        if discount:
            discount = int(discount.replace('%', '').replace('-', ''))
        
        # Brand (extract from product name)
        brand = name.split()[0]
        
        # Additional features
        express_badge = product.find('div', class_='bdg _express')
        has_express_shipping = bool(express_badge)
        
        official_store_badge = product.find('div', class_='bdg _mall')
        is_official_store = bool(official_store_badge)
        
        return {
            'name': name,
            'item_id': item_id,
            'price': current_price,
            'old_price': old_price,
            'discount': discount,
            'brand': brand,
            'category': 'Electronics',
            'subcategory': 'Televisions',
            'source': 'Jumia',
            'has_express_shipping': has_express_shipping,
            'is_official_store': is_official_store,
            'url': f'https://www.jumia.co.ke{product_url}',
            'scraping_timestamp': datetime.now().isoformat()
        }
    except Exception as e:
        print(f"Error processing product: {str(e)}")
        return None

def scrape_jumia_tvs():
    base_url = 'https://www.jumia.co.ke/televisions/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    all_products = []
    page = 1
    
    while True:
        try:
            url = f"{base_url}?page={page}#catalog-listing"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            products = soup.find_all('article', class_='prd _fb col c-prd')
            
            if not products:
                break
                
            for product in products:
                product_info = extract_product_info(product)
                if product_info:
                    all_products.append(product_info)
            
            print(f"Scraped page {page}, found {len(products)} products")
            page += 1
            
            # Add delay to be respectful to the server
            time.sleep(random.uniform(1, 3))
            
        except Exception as e:
            print(f"Error scraping page {page}: {str(e)}")
            break
    
    return all_products

def main():
    # Create data directories if they don't exist
    os.makedirs('data/scraped', exist_ok=True)
    
    # Scrape products
    print("Starting scraping process...")
    products = scrape_jumia_tvs()
    
    # Save to CSV
    if products:
        df = pd.DataFrame(products)
        output_file = 'data/scraped/jumia_tvs.csv'
        df.to_csv(output_file, index=False)
        print(f"Successfully scraped {len(products)} products and saved to {output_file}")
    else:
        print("No products were scraped")

if __name__ == "__main__":
    main()


In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime
import re
import time
import random

def clean_price(price_str):
    if not price_str:
        return None
    return float(re.sub(r'[^\d.]', '', price_str))

def get_category_hierarchy():
    base_url = 'https://www.jumia.co.ke/televisions/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(base_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        categories = []
        # Find all filter boxes
        filter_boxes = soup.find_all('div', class_='fbox')
        
        for box in filter_boxes:
            title = box.find('div', class_='title')
            if title:
                category_name = title.text.strip()
                items = box.find_all('a', class_='fil')
                
                subcategories = []
                for item in items:
                    label = item.find('label')
                    count = item.find('span')
                    if label and count:
                        subcategories.append({
                            'name': label.text.strip(),
                            'count': count.text.strip('()'),
                            'url': item.get('href', '')
                        })
                
                if subcategories:
                    categories.append({
                        'category': category_name,
                        'subcategories': subcategories
                    })
        
        return categories
    except Exception as e:
        print(f"Error getting category hierarchy: {str(e)}")
        return []

def extract_product_info(product):
    try:
        # Extract item_id and brand using data attributes
        item_id = product.get('data-gtm-id', "N/A")
        item_brand = product.get('data-gtm-brand', "N/A")
        
        # Basic product info
        name = product.find('h3', class_='name').text.strip()
        
        # Extract product URL
        product_url = product.find('a', class_='core')['href']
        
        # Price information
        price_container = product.find('div', class_='prc')
        current_price = clean_price(price_container.text.strip()) if price_container else None
        
        old_price_container = product.find('div', class_='old')
        old_price = clean_price(old_price_container.text.strip()) if old_price_container else None
        
        # Discount with updated class
        discount_container = product.find('div', class_='bdg _dsct _sm')
        discount = discount_container.text.strip() if discount_container else None
        if discount:
            discount = int(discount.replace('%', '').replace('-', ''))
        
        # Extract ratings and reviews
        rating = product.find('div', class_='rev')
        if rating:
            stars = rating.find('div', class_='stars _s').get_text(strip=True) if rating.find('div', class_='stars _s') else "N/A"
            reviews_count = rating.get_text(strip=True).split('(')[-1].strip(')') if '(' in rating.get_text() else "N/A"
        else:
            stars = "N/A"
            reviews_count = "N/A"
        
        # Additional features
        express_badge = product.find('div', class_='bdg _express')
        has_express_shipping = bool(express_badge)
        
        official_store_badge = product.find('div', class_='bdg _mall')
        is_official_store = bool(official_store_badge)
        
        # Get breadcrumb navigation for detailed categorization
        breadcrumb = product.find_parent('div', class_='content').find('nav', class_='brcbs')
        categories = []
        if breadcrumb:
            category_links = breadcrumb.find_all('a')
            categories = [link.text.strip() for link in category_links]
        
        return {
            'name': name,
            'item_id': item_id,
            'brand': item_brand,
            'price': current_price,
            'old_price': old_price,
            'discount': discount,
            'stars_rating': stars,
            'reviews_count': reviews_count,
            'category': categories[1] if len(categories) > 1 else 'Electronics',
            'subcategory': categories[2] if len(categories) > 2 else 'Televisions',
            'sub_subcategory': categories[3] if len(categories) > 3 else '',
            'source': 'Jumia',
            'url': f'https://www.jumia.co.ke{product_url}',
            'has_express_shipping': has_express_shipping,
            'is_official_store': is_official_store,
            'scraping_timestamp': datetime.now().isoformat()
        }
    except Exception as e:
        print(f"Error processing product: {str(e)}")
        return None

def scrape_jumia_tvs():
    base_url = 'https://www.jumia.co.ke/televisions/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    all_products = []
    page = 1
    
    while True:
        try:
            url = f"{base_url}?page={page}#catalog-listing"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            products = soup.find_all('article', class_='prd _fb col c-prd')
            
            if not products:
                break
                
            for product in products:
                product_info = extract_product_info(product)
                if product_info:
                    all_products.append(product_info)
            
            print(f"Scraped page {page}, found {len(products)} products")
            page += 1
            
            # Add delay to be respectful to the server
            time.sleep(random.uniform(1, 3))
            
        except Exception as e:
            print(f"Error scraping page {page}: {str(e)}")
            break
    
    return all_products

def main():
    # Create data directories if they don't exist
    os.makedirs('data/scraped', exist_ok=True)
    
    # Get category hierarchy first
    print("Getting category hierarchy...")
    categories = get_category_hierarchy()
    if categories:
        # Save category hierarchy to CSV
        hierarchy_data = []
        for cat in categories:
            for subcat in cat['subcategories']:
                hierarchy_data.append({
                    'main_category': 'Electronics',
                    'category': cat['category'],
                    'subcategory': subcat['name'],
                    'product_count': subcat['count'],
                    'url': subcat['url']
                })
        
        df_hierarchy = pd.DataFrame(hierarchy_data)
        df_hierarchy.to_csv('data/scraped/jumia_tv_categories_hierarchy.csv', index=False)
        print("\nCategory hierarchy saved to data/scraped/jumia_tv_categories_hierarchy.csv")
        
        # Print hierarchy for visibility
        print("\nCategory Hierarchy:")
        for cat in categories:
            print(f"\n{cat['category']}:")
            for subcat in cat['subcategories']:
                print(f"  - {subcat['name']} ({subcat['count']} products)")
    
    # Scrape products
    print("\nStarting product scraping process...")
    products = scrape_jumia_tvs()
    
    # Save to CSV
    if products:
        df = pd.DataFrame(products)
        output_file = 'data/scraped/jumia_tvs.csv'
        df.to_csv(output_file, index=False)
        print(f"Successfully scraped {len(products)} products and saved to {output_file}")
    else:
        print("No products were scraped")

if __name__ == "__main__":
    main()


Getting category hierarchy...

Starting product scraping process...
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has no attribute 'find'
Error processing product: 'NoneType' object has 

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime
import re
import time
import random

def clean_price(price_str):
    if not price_str:
        return None
    return float(re.sub(r'[^\d.]', '', price_str))

def extract_product_info(product):
    try:
        # Find the core link element that contains all data attributes
        core_link = product.find('a', class_='core')
        if not core_link:
            return None
            
        # Extract item_id, brand, and categories using data-ga4 attributes
        item_id = core_link.get('data-ga4-item_id', "N/A")
        item_brand = core_link.get('data-ga4-item_brand', "N/A")
        item_name = core_link.get('data-ga4-item_name', "N/A")
        
        # Categories
        category = core_link.get('data-ga4-item_category', "Electronics")
        subcategory = core_link.get('data-ga4-item_category2', "Television & Video")
        subcategory2 = core_link.get('data-ga4-item_category3', "Televisions")
        subcategory3 = core_link.get('data-ga4-item_category4', "")
        
        # Extract product URL
        product_url = core_link['href']
        
        # Price information
        price_container = product.find('div', class_='prc')
        current_price = clean_price(price_container.text.strip()) if price_container else None
        
        old_price_container = product.find('div', class_='old')
        old_price = clean_price(old_price_container.text.strip()) if old_price_container else None
        
        # Discount with updated class
        discount_container = product.find('div', class_='bdg _dsct _sm')
        discount = discount_container.text.strip() if discount_container else None
        if discount:
            discount = int(discount.replace('%', '').replace('-', ''))
        
        # Extract ratings and reviews
        rating = product.find('div', class_='rev')
        if rating:
            stars = rating.find('div', class_='stars _s').get_text(strip=True) if rating.find('div', class_='stars _s') else "N/A"
            reviews_count = rating.get_text(strip=True).split('(')[-1].strip(')') if '(' in rating.get_text() else "N/A"
        else:
            stars = "N/A"
            reviews_count = "N/A"
        
        # Additional features
        express_badge = product.find('div', class_='bdg _express')
        has_express_shipping = bool(express_badge)
        
        official_store_badge = product.find('div', class_='bdg _mall')
        is_official_store = bool(official_store_badge)
        
        return {
            'name': item_name,
            'item_id': item_id,
            'brand': item_brand,
            'price': current_price,
            'old_price': old_price,
            'discount': discount,
            'stars_rating': stars,
            'reviews_count': reviews_count,
            'category': category,
            'subcategory': subcategory,
            'subcategory2': subcategory2,
            'subcategory3': subcategory3,
            'source': 'Jumia',
            'url': f'https://www.jumia.co.ke{product_url}',
            'has_express_shipping': has_express_shipping,
            'is_official_store': is_official_store,
            'scraping_timestamp': datetime.now().isoformat()
        }
    except Exception as e:
        print(f"Error processing product: {str(e)}")
        return None

def scrape_jumia_tvs():
    base_url = 'https://www.jumia.co.ke/televisions/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    all_products = []
    page = 1
    
    while True:
        try:
            url = f"{base_url}?page={page}#catalog-listing"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            products = soup.find_all('article', class_='prd _fb col c-prd')
            
            if not products:
                break
                
            for product in products:
                product_info = extract_product_info(product)
                if product_info:
                    all_products.append(product_info)
            
            print(f"Scraped page {page}, found {len(products)} products")
            page += 1
            
            # Add delay to be respectful to the server
            time.sleep(random.uniform(1, 3))
            
        except Exception as e:
            print(f"Error scraping page {page}: {str(e)}")
            break
    
    return all_products

def main():
    # Create data directories if they don't exist
    os.makedirs('data/scraped', exist_ok=True)
    
    # Scrape products
    print("Starting scraping process...")
    products = scrape_jumia_tvs()
    
    # Save to CSV
    if products:
        df = pd.DataFrame(products)
        output_file = 'data/scraped/jumia_tvs.csv'
        df.to_csv(output_file, index=False)
        print(f"Successfully scraped {len(products)} products and saved to {output_file}")
    else:
        print("No products were scraped")

if __name__ == "__main__":
    main()


Starting scraping process...
Scraped page 1, found 40 products
Scraped page 2, found 40 products
Scraped page 3, found 40 products
Scraped page 4, found 40 products
Scraped page 5, found 40 products
Scraped page 6, found 40 products
Scraped page 7, found 40 products
Scraped page 8, found 40 products
Scraped page 9, found 40 products
Scraped page 10, found 40 products
Scraped page 11, found 40 products
Scraped page 12, found 40 products
Scraped page 13, found 40 products
Scraped page 14, found 40 products
Scraped page 15, found 40 products
Scraped page 16, found 40 products
Scraped page 17, found 40 products
Scraped page 18, found 40 products
Scraped page 19, found 40 products
Scraped page 20, found 40 products
Scraped page 21, found 40 products
Scraped page 22, found 40 products
Scraped page 23, found 40 products
Scraped page 24, found 40 products
Scraped page 25, found 40 products
Scraped page 26, found 40 products
Scraped page 27, found 40 products
Scraped page 28, found 40 products


In [20]:
def scrape_jumia_cookers():
    base_url = 'https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    all_cookers = []
    page = 1
    
    while True:
        try:
            url = f"{base_url}?page={page}#catalog-listing"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            cookers = soup.find_all('article', class_='prd _fb col c-prd')
            
            if not products:
                break
                
            for product in products:
                product_info = extract_product_info(product)
                if product_info:
                    all_cookers.append(product_info)
            
            print(f"Scraped page {page}, found {len(products)} products")
            page += 1
            
            # Add delay to be respectful to the server
            time.sleep(random.uniform(1, 3))
            
        except Exception as e:
            print(f"Error scraping page {page}: {str(e)}")
            break
    
    return cookers

def main():
    # Create data directories if they don't exist
    os.makedirs('data/scraped', exist_ok=True)
    
    # Scrape products
    print("Starting scraping process...")
    products = scrape_jumia_cookers()
    
    # Save to CSV
    if products:
        df = pd.DataFrame(products)
        output_file = 'data/scraped/jumia_cookers.csv'
        df.to_csv(output_file, index=False)
        print(f"Successfully scraped {len(products)} products and saved to {output_file}")
    else:
        print("No products were scraped")

if __name__ == "__main__":
    main()

Starting scraping process...
Scraped page 1, found 40 products
Scraped page 2, found 40 products
Scraped page 3, found 40 products
Scraped page 4, found 40 products
Scraped page 5, found 40 products
Scraped page 6, found 40 products
Scraped page 7, found 40 products
Scraped page 8, found 40 products
Scraped page 9, found 40 products
Scraped page 10, found 40 products
Scraped page 11, found 40 products
Scraped page 12, found 40 products
Scraped page 13, found 40 products
Scraped page 14, found 40 products
Scraped page 15, found 40 products
Scraped page 16, found 40 products
Scraped page 17, found 40 products
Scraped page 18, found 40 products
Scraped page 19, found 40 products
Scraped page 20, found 40 products
Scraped page 21, found 40 products
Scraped page 22, found 40 products
Scraped page 23, found 40 products
Scraped page 24, found 40 products
Scraped page 25, found 40 products
Scraped page 26, found 40 products
Scraped page 27, found 40 products
Scraped page 28, found 40 products


KeyboardInterrupt: 

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime
import re
import time
import random

def clean_price(price_str):
    if not price_str:
        return None
    return float(re.sub(r'[^\d.]', '', price_str))

def extract_product_info(product):
    try:
        # Find the core link element that contains all data attributes
        core_link = product.find('a', class_='core')
        if not core_link:
            return None
            
        # Extract item_id, brand, and categories using data-ga4 attributes
        item_id = core_link.get('data-ga4-item_id', "N/A")
        item_brand = core_link.get('data-ga4-item_brand', "N/A")
        item_name = core_link.get('data-ga4-item_name', "N/A")
        
        # Categories
        category = core_link.get('data-ga4-item_category', "Electronics")
        subcategory = core_link.get('data-ga4-item_category2', "Television & Video")
        subcategory2 = core_link.get('data-ga4-item_category3', "Televisions")
        subcategory3 = core_link.get('data-ga4-item_category4', "")
        
        # Extract product URL
        product_url = core_link['href']
        
        # Price information
        price_container = product.find('div', class_='prc')
        current_price = clean_price(price_container.text.strip()) if price_container else None
        
        old_price_container = product.find('div', class_='old')
        old_price = clean_price(old_price_container.text.strip()) if old_price_container else None
        
        # Discount with updated class
        discount_container = product.find('div', class_='bdg _dsct _sm')
        discount = discount_container.text.strip() if discount_container else None
        if discount:
            discount = int(discount.replace('%', '').replace('-', ''))
        
        # Extract ratings and reviews
        rating = product.find('div', class_='rev')
        if rating:
            stars = rating.find('div', class_='stars _s').get_text(strip=True) if rating.find('div', class_='stars _s') else "N/A"
            reviews_count = rating.get_text(strip=True).split('(')[-1].strip(')') if '(' in rating.get_text() else "N/A"
        else:
            stars = "N/A"
            reviews_count = "N/A"
        
                
        return {
            'name': item_name,
            'item_id': item_id,
            'brand': item_brand,
            'price': current_price,
            'old_price': old_price,
            'discount': discount,
            'stars_rating': stars,
            'reviews_count': reviews_count,
            'category': category,
            'subcategory': subcategory,
            'subcategory2': subcategory2,
            'subcategory3': subcategory3,
            'source': 'Jumia',
            'url': f'https://www.jumia.co.ke{product_url}',
            'scraping_timestamp': datetime.now().isoformat()
        }
    except Exception as e:
        print(f"Error processing product: {str(e)}")
        return None

def scrape_jumia_tvs():
    base_url = 'https://www.jumia.co.ke/televisions/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    all_products = []
    page = 1
    
    while True:
        try:
            url = f"{base_url}?page={page}#catalog-listing"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            products = soup.find_all('article', class_='prd _fb col c-prd')
            
            if not products:
                break
                
            for product in products:
                product_info = extract_product_info(product)
                if product_info:
                    all_products.append(product_info)
            
            print(f"Scraped page {page}, found {len(products)} products")
            page += 1
            
            # Add delay to be respectful to the server
            time.sleep(random.uniform(1, 3))
            
        except Exception as e:
            print(f"Error scraping page {page}: {str(e)}")
            break
    
    return all_products

def scrape_jumia_cookers():
    base_url = 'https://www.jumia.co.ke/home-cooking-appliances-cookers/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    all_products = []
    page = 1
    
    while True:
        try:
            url = f"{base_url}?page={page}#catalog-listing"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            products = soup.find_all('article', class_='prd _fb col c-prd')
            
            if not products:
                break
                
            for product in products:
                product_info = extract_product_info(product)
                if product_info:
                    # Update source and category for cookers
                    product_info['source'] = 'Jumia Cookers'
                    product_info['category'] = 'Home Appliances'
                    product_info['subcategory'] = 'Cooking Appliances'
                    all_products.append(product_info)
            
            print(f"Scraped cookers page {page}, found {len(products)} products")
            page += 1
            
            # Add delay to be respectful to the server
            time.sleep(random.uniform(1, 3))
            
        except Exception as e:
            print(f"Error scraping cookers page {page}: {str(e)}")
            break
    
    return all_products

def main():
    # Create data directories if they don't exist
    os.makedirs('data/scraped', exist_ok=True)
    
    # Scrape TVs
    print("Starting TV scraping process...")
    tv_products = scrape_jumia_tvs()
    
    # Save TV products to CSV
    if tv_products:
        df_tvs = pd.DataFrame(tv_products)
        tv_output_file = 'data/scraped/jumia_tvs.csv'
        df_tvs.to_csv(tv_output_file, index=False)
        print(f"Successfully scraped {len(tv_products)} TV products and saved to {tv_output_file}")
    else:
        print("No TV products were scraped")
    
    # Scrape Cookers
    print("\nStarting cookers scraping process...")
    cooker_products = scrape_jumia_cookers()
    
    # Save Cooker products to CSV
    if cooker_products:
        df_cookers = pd.DataFrame(cooker_products)
        cookers_output_file = 'data/scraped/jumia_cookers.csv'
        df_cookers.to_csv(cookers_output_file, index=False)
        print(f"Successfully scraped {len(cooker_products)} cooker products and saved to {cookers_output_file}")
    else:
        print("No cooker products were scraped")

if __name__ == "__main__":
    main()


Starting TV scraping process...
Scraped page 1, found 40 products
Scraped page 2, found 40 products
Scraped page 3, found 40 products
Scraped page 4, found 40 products
Scraped page 5, found 40 products
Scraped page 6, found 40 products
Scraped page 7, found 40 products
Scraped page 8, found 40 products
Scraped page 9, found 40 products
Scraped page 10, found 40 products
Scraped page 11, found 40 products
Scraped page 12, found 40 products
Scraped page 13, found 40 products
Scraped page 14, found 40 products
Scraped page 15, found 40 products
Scraped page 16, found 40 products
Scraped page 17, found 40 products
Scraped page 18, found 40 products
Scraped page 19, found 40 products
Scraped page 20, found 40 products
Scraped page 21, found 40 products
Scraped page 22, found 40 products
Scraped page 23, found 40 products
Scraped page 24, found 40 products
Scraped page 25, found 40 products
Scraped page 26, found 40 products
Scraped page 27, found 40 products
Scraped page 28, found 40 produc