In [109]:
from config import *

In [110]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
import pymongo
import re

In [111]:
import os
from time import sleep
from math import ceil
from datetime import datetime
from pathlib import Path
import json
import random

In [112]:
# Open the MongoDB connection used by the rest of the notebook.
# mongo_username / mongo_password / mongo_url are not defined in the visible
# cells; presumably they come from the star-import of config -- verify.
client = pymongo.MongoClient(
    f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_url}"
)
# Handle on the "jumia" database; get_products reads and writes its
# `products` collection through this name.
db = client["jumia"]

In [113]:
# Directory used by init_driver to resolve the geckodriver location.
# __file__ is not defined when this runs as a notebook cell, which raises
# NameError; only that case falls back to the working directory so that any
# other failure is not silently hidden.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = '.'

In [114]:
def init_driver(gecko_driver='', user_agent='', load_images=True, is_headless=False):
    """Start a Firefox WebDriver with the requested browser preferences.

    Args:
        gecko_driver: geckodriver location, joined onto the module-level
            ``current_path``. When empty, no explicit executable path is
            passed to ``Service`` and driver discovery is left to Selenium.
        user_agent: value for ``general.useragent.override``; an empty value
            leaves the browser's own user agent in place.
        load_images: when False, images are blocked
            (``permissions.default.image`` set to 2).
        is_headless: when True, Firefox is started with ``-headless``.

    Returns:
        The ``webdriver.Firefox`` instance.
    """
    options = Options()

    # Preferences are set on the Options object that is actually passed to
    # the driver. Setting them on a separate FirefoxProfile that is never
    # attached to the driver has no effect on the launched browser.
    options.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
    options.set_preference("media.volume_scale", "0.0")
    options.set_preference("dom.webnotifications.enabled", False)
    if user_agent:
        options.set_preference("general.useragent.override", user_agent)
    if not load_images:
        options.set_preference('permissions.default.image', 2)

    # The command-line flag is used instead of the deprecated
    # `options.headless` property setter.
    if is_headless:
        options.add_argument('-headless')

    if gecko_driver:
        service = Service(executable_path=os.path.join(current_path, gecko_driver))
    else:
        service = Service()

    # Launch the driver with the service and the configured options.
    driver = webdriver.Firefox(service=service, options=options)

    return driver

In [115]:
def get_url(page_url, driver):
    """Navigate ``driver`` to ``page_url`` and dismiss the popup if one is shown.

    After navigating, waits ``page_load_timeout`` seconds (a name not defined
    in the visible cells; presumably supplied by config), then clicks the
    first element matching ``button.cls`` when at least one exists.

    Returns:
        Always ``True``.
    """
    driver.get(page_url)
    sleep(page_load_timeout)

    # Slicing to at most one element clicks only the first match and does
    # nothing when the page has no such button.
    for dismiss_button in driver.find_elements(By.CSS_SELECTOR, 'button.cls')[:1]:
        dismiss_button.click()

    return True

In [131]:
# First number in a price label. The leading run is `\d+` (not `\d{1,3}`) so
# an unseparated value such as "1500" is matched whole instead of being cut
# to "150"; thousands groups may be separated by a comma, a space, or a
# (narrow) no-break space.
_PRICE_PATTERN = re.compile(r'\d+(?:[ ,\u00a0\u202f]\d{3})*(?:\.\d+)?')
_GROUP_SEPARATORS = re.compile(r'[ ,\u00a0\u202f]')


def _first_match(parent, css_selector):
    """Return the first element under ``parent`` matching ``css_selector``, or None."""
    matches = parent.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0] if matches else None


def _parse_price(price_text):
    """Return the first price in ``price_text`` rounded up to an int, or 0 if none."""
    found = _PRICE_PATTERN.search(price_text or '')
    if not found:
        return 0
    return ceil(float(_GROUP_SEPARATORS.sub('', found.group())))


def get_products(driver):
    """Scrape the product cards on the current page and sync them to MongoDB.

    For every ``section.card .prd`` element the title (``h3.name``), link
    (``a.core`` href), current price (``.prc``) and old price (``.old``) are
    read. Cards without a title, without a URL, or with a current price of 0
    are skipped.

    Each remaining product is looked up in ``db.products`` by title or URL:
    it is inserted when absent, and its prices/discount fields are updated
    (with ``published_at`` reset to False) when either stored price differs.

    Args:
        driver: object exposing ``find_elements(by, selector)`` for the
            loaded page.

    Returns:
        A list with one dict per accepted product (title, url, prices,
        discount figures, timestamps, ``published_at``).
    """
    products_info = []

    for product in driver.find_elements(By.CSS_SELECTOR, 'section.card .prd'):
        # Each selector is queried once and the result reused.
        title_el = _first_match(product, 'h3.name')
        link_el = _first_match(product, 'a.core')
        price_el = _first_match(product, '.prc')
        old_price_el = _first_match(product, '.old')

        product_title = title_el.text if title_el is not None else ''
        # A missing href attribute is treated like a missing link.
        product_url = (link_el.get_attribute('href') or '') if link_el is not None else ''
        current_price = _parse_price(price_el.text) if price_el is not None else 0
        old_price = _parse_price(old_price_el.text) if old_price_el is not None else 0

        if not product_title or not product_url or current_price == 0:
            continue

        discount_percentage = 0
        discount_quantity = 0
        # A discount is only recorded when the old price is present and
        # strictly higher than the current one.
        if old_price != 0 and current_price < old_price:
            discount_quantity = round(old_price - current_price)
            discount_percentage = round(100 - ((current_price / old_price) * 100))

        now = datetime.now()
        product_info = {
            'product_title': product_title,
            'product_url': product_url,
            'current_price': current_price,
            'old_price': old_price,
            'discount_percentage': discount_percentage,
            'discount_quantity': discount_quantity,
            'inserted_at': now,
            'updated_at': now,
            'published_at': False
        }

        # One lookup decides between insert and update.
        same_product = {'$or': [{'product_title': product_title},
                                {'product_url': product_url}]}
        existing = db.products.find_one(same_product)
        if existing is None:
            db.products.insert_one(product_info)
        elif (existing.get('current_price') != current_price
              or existing.get('old_price') != old_price):
            # Prices changed: refresh them and flag the product as unpublished.
            db.products.update_one(
                {'_id': existing['_id']},
                {'$set': {
                    'current_price': current_price,
                    'old_price': old_price,
                    'discount_percentage': discount_percentage,
                    'discount_quantity': discount_quantity,
                    'updated_at': now,
                    'published_at': False,
                }},
            )

        products_info.append(product_info)

    return products_info

In [132]:
from config import *

# Resolve the directory init_driver uses to locate geckodriver. __file__ is
# undefined inside a notebook cell (NameError), in which case the working
# directory is used.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = '.'
driver = init_driver(gecko_driver, user_agent=user_agent, is_headless=headless)

categories = ['phones-tablets', 'electronics']
# Running total of products returned by get_products across all pages.
len_product = 0
try:
    for category in categories:
        category_url = f"{jumia_base_url}/{category}"
        # Listing pages 1 through 5 of each category.
        for page in range(1, 6):
            page_url = f"{category_url}/?page={page}"
            get_url(page_url, driver)
            products = get_products(driver)
            len_product += len(products)
finally:
    # Always shut the browser down, even when a page fails, so no Firefox
    # process is left running.
    driver.quit()