In [1]:
from config import *
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import os
import pymongo
from time import sleep
import pandas as pd
import numpy as np
import re
import csv
import sys
from time import sleep
from multiprocessing import Process
from selenium.webdriver.common.by import By

In [2]:
# Open the MongoDB connection. mongo_user, mongo_pass and mongo_url are not
# defined in this notebook; presumably they come from `from config import *`.
client = pymongo.MongoClient(
    f"mongodb+srv://{mongo_user}:{mongo_pass}@{mongo_url}"
)
# Handle on the "jumia" database; products_info reads and writes db.products.
db = client["jumia"]

In [3]:
# Folder that contains this file; init_driver uses it to locate the
# geckodriver binary. __file__ is not defined inside a notebook kernel, so
# only that specific failure (NameError) falls back to the working directory.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = '.'

In [4]:
def init_driver(gecko_driver='', user_agent='', load_images=True, is_headless=False):
    '''
    Create a Firefox WebDriver configured with the scraper's default settings.

    Argument:
        gecko_driver: geckodriver file name, looked up under current_path
        user_agent: value for the general.useragent.override preference;
            '' leaves the preference untouched
        load_images: when False, permissions.default.image is set to 2
        is_headless: value assigned to the Firefox options' headless flag
    return:
        the webdriver.Firefox instance that was created
    '''
    # Preferences that are always applied, followed by the optional ones.
    preferences = [
        ('dom.ipc.plugins.enabled.libflashplayer.so', False),
        ("media.volume_scale", "0.0"),
        ("dom.webnotifications.enabled", False),
    ]
    if user_agent != '':
        preferences.append(("general.useragent.override", user_agent))
    if not load_images:
        preferences.append(('permissions.default.image', 2))

    firefox_profile = webdriver.FirefoxProfile()
    for pref_name, pref_value in preferences:
        firefox_profile.set_preference(pref_name, pref_value)

    options = Options()
    options.headless = is_headless

    return webdriver.Firefox(options=options,
                             executable_path=f'{current_path}/{gecko_driver}',
                             firefox_profile=firefox_profile)

In [5]:
def get_url(url, driver):
    '''
    Load a page in the given browser and then reload it once.

    Argument:
        url: address of the page to open
        driver: initialized driver exposing get() and refresh()
    return:
        True, always; any error raised by the driver propagates to the caller
    '''
    driver.get(url)
    driver.refresh()
    return True


In [6]:
def clean_number(money):
    '''
    Turn a price-like string into a float that can be used in arithmetic.

    Every character other than the ASCII digits 0-9 and '.' is discarded
    (currency text, spaces, thousands commas, ...) and what remains is
    parsed with float().

    Argument:
        money: text containing a number, e.g. "EGP 1,299.50"
    return:
        the number as a float
    raise:
        ValueError when the remaining characters are not a valid float
        (for example no digits at all, or more than one '.')
    '''
    digits_and_dots = re.sub('[^0-9.]', '', money)
    return float(digits_and_dots)

In [7]:
def main_feature(driver2):
    '''
    Collect the text of every feature bullet on a product page.

    Argument:
        driver2: driver positioned on the product page
    return:
        list with the text of each <li> found under the elements matching
        "#jm div.card div.markup ul", in page order. If a lookup fails, the
        error is appended to logs_files/main_feature_product.log and the
        items gathered so far (possibly none) are returned.
    '''
    genral_info = []
    try:
        for ul in driver2.find_elements_by_css_selector("#jm div.card div.markup ul"):
            for li in ul.find_elements_by_css_selector('li'):
                genral_info.append(li.text)
    except Exception as e:
        # Best-effort scraping: record the failure and return what we have.
        log_path = "logs_files/main_feature_product.log"
        # Create the log folder on first use so the handler itself cannot fail
        # with FileNotFoundError.
        os.makedirs(os.path.dirname(log_path), exist_ok=True)
        # The context manager guarantees the handle is flushed and closed.
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write("This error related to function main_feature of Jumia_scrapping_multithreading file\n"
                           + str(e) + "\n" + "#" * 99 + "\n")  # "#" * 99 as separator line
    return genral_info

In [8]:
def one_product_reviews(driver2):
    '''
    Collect the customer reviews of one product by walking the review pages.

    The function clicks the "عرض الكل" ("show all") link, then on every
    review page reads each 'article p.-pvs' element and clicks the
    'الصفحة التالية' ("next page") link. It stops when that click no longer
    changes the current url, or when any lookup/click raises (for example
    because there is no such link).

    Argument:
        driver2: driver positioned on the product page
    return:
        list with the text of every review that was read, in page order.
        Any exception is appended to logs_files/one_product_reviews.log and
        the reviews gathered so far (possibly none) are returned.
    '''
    all_reviews_for_one_pro = []
    try:
        # Open the paginated list that holds all reviews.
        driver2.find_element(By.LINK_TEXT, "عرض الكل").click()
        while True:
            # Read every review shown on the current page.
            for review in driver2.find_elements_by_css_selector('article p.-pvs'):
                all_reviews_for_one_pro.append(review.text)

            next_link = driver2.find_element_by_xpath('//a[@aria-label=\'الصفحة التالية\']')
            previews_url = driver2.current_url
            next_link.click()
            # Explicit check instead of `assert`: asserts are removed when
            # Python runs with -O, which would turn this into an endless loop.
            if driver2.current_url == previews_url:
                break
    except Exception as e:
        log_path = "logs_files/one_product_reviews.log"
        # Create the log folder on first use so the handler itself cannot fail.
        os.makedirs(os.path.dirname(log_path), exist_ok=True)
        # utf-8 because the error text can contain the Arabic selectors above;
        # the context manager closes the handle.
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write("This error related to function one_product_reviews of Jumia_scrapping_multithreading file\n"
                           + str(e) + "\n" + "#" * 99 + "\n")  # "#" * 99 as separator line
    return all_reviews_for_one_pro


In [9]:
def _log_products_info_error(error):
    '''Append one error entry to logs_files/products_info.log.'''
    log_path = "logs_files/products_info.log"
    # Create the log folder on first use so logging itself cannot fail.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    with open(log_path, "a", encoding="utf-8") as log_file:
        log_file.write("This error related to function products_info of Jumia_scrapping_multithreading file\n"
                       + str(error) + "\n" + "#" * 99 + "\n")  # "#" * 99 as separator line


def products_info(driver):
    '''
    Scrape every product card on a listing page and sync it with MongoDB.

    For each element matching ".products .sku" the url, brand, title, image
    and prices are read from the card. A product that is not stored yet
    (matched by url or title in db.products) is opened in its own browser
    session to collect features and reviews, then inserted. A stored product
    is updated when its prices changed or when its reviews were collected
    again.

    Argument:
        driver: driver positioned on a page that lists products
    return:
        list of the documents inserted for products that were new on this
        page (products that were only updated are not included). A failure on
        one product is appended to logs_files/products_info.log and the loop
        moves on to the next product.
    '''
    page_products_info = []

    for pro in driver.find_elements_by_css_selector('.products .sku'):
        driver2 = None
        try:
            selector = pro.find_elements_by_css_selector

            # Main info shown on the listing card.
            pro_url = selector('a.link')[0].get_attribute('href')
            pro_brand = selector('h2.title span.brand')[0].text
            pro_title = selector('h2.title span.name')[0].text
            new_price = clean_number(selector('div.price-container .price-box .price')[0].text)
            image_src = selector('div.image-wrapper img')[0].get_attribute('data-src')

            # Discount data; everything stays 0.0 when no old price can be read.
            old_price, pro_disc_prc, pro_disc_val = 0.0, 0.0, 0.0
            try:
                # Raises IndexError when the card shows no old price.
                selector('div.price-container .price-box span.-old')[0].text
                # NOTE(review): the old price is read through a different
                # selector than the one checked above - confirm against the page.
                listed_old_price = clean_number(
                    selector('div.col-buy ul.list-blocks li .price-inline span.itemOldPrice')[0].text)
                pro_disc_prc = round(100 - ((new_price / listed_old_price) * 100))
                pro_disc_val = listed_old_price - new_price
                old_price = listed_old_price
            except Exception:
                old_price, pro_disc_prc, pro_disc_val = 0.0, 0.0, 0.0

            # One query tells us both whether the product exists and what is stored.
            existing = db.products.find_one(
                {'$or': [{"product_url": pro_url}, {"product_title": pro_title}]})

            driver2 = init_driver(gecko_driver, user_agent=user_agent)
            get_url(pro_url, driver2)

            if existing is None:
                # New product: collect features and reviews, then store it.
                main_feature_of_product = main_feature(driver2)
                product_reviews = one_product_reviews(driver2)
                one_product_info = {
                    'pro_brand'                   : pro_brand,
                    'product_title'               : pro_title,
                    'product_url'                 : pro_url,
                    'image_src'                   : image_src,
                    'product_new_price'           : new_price,
                    'product_old_price'           : old_price,
                    'product_discount_percentage' : pro_disc_prc,
                    'product_discount_value'      : pro_disc_val,
                    'product_reviews'             : product_reviews,
                    'main_feature_of_product'     : main_feature_of_product,
                    'Uploaded_product'            : False
                }
                db.products.insert_one(one_product_info)
                page_products_info.append(one_product_info)
            else:
                # Known product: start from the stored reviews so the update
                # never writes an unbound value or another product's reviews.
                product_reviews = existing.get('product_reviews', [])
                reviews_number = int(clean_number(
                    driver2.find_element_by_css_selector("h2.-ptm").text))

                # Walking all review pages is slow, so only do it when the
                # count on the page differs from the stored one by more than 5.
                reviews_refreshed = abs(len(product_reviews) - reviews_number) > 5
                if reviews_refreshed:
                    product_reviews = one_product_reviews(driver2)

                price_changed = (existing.get('product_new_price') != new_price
                                 or existing.get('product_old_price') != old_price)

                if price_changed or reviews_refreshed:
                    db.products.update_one({'_id': existing['_id']}, {'$set': {
                        'product_title'               : pro_title,
                        'product_url'                 : pro_url,
                        'product_new_price'           : new_price,
                        'product_old_price'           : old_price,
                        'product_discount_percentage' : pro_disc_prc,
                        'product_discount_value'      : pro_disc_val,
                        'product_reviews'             : product_reviews,
                    }})
        except Exception as e:
            _log_products_info_error(e)
        finally:
            # Always end the per-product browser session exactly once, even
            # when scraping failed half way; quit() also ends the driver process.
            if driver2 is not None:
                try:
                    driver2.quit()
                except Exception as e:
                    _log_products_info_error(e)
    return page_products_info
        

      
    

In [10]:
def _log_scrap_pages_error(error):
    '''Append one error entry to logs_files/scrap_pages.log.'''
    log_path = "logs_files/scrap_pages.log"
    # Create the log folder on first use so logging itself cannot fail.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    with open(log_path, "a", encoding="utf-8") as log_file:
        log_file.write("This error related to function scrap_pages of Jumia_scrapping_multithreading file\n"
                       + str(error) + "\n" + "#" * 99 + "\n")  # "#" * 99 as separator line


def scrap_pages(page_url, next_page, restart_delay=10000):
    '''
    Walk a category page by page and scrape the products of every page.

    Each page is opened in its own browser session, handed to products_info,
    and then the pagination arrow is clicked to find the next page's url.
    When anything fails (including not finding the arrow), the error is
    logged, the browser is closed, and after `restart_delay` seconds the walk
    starts again from `page_url`.

    Argument:
        page_url: url of the first listing page
        next_page: ignored - it is overwritten with page_url on entry; kept so
            existing callers keep working
        restart_delay: seconds to wait before restarting from page_url after
            a failure (default 10000)
    return:
        list with one entry per scraped page, each entry being the list
        returned by products_info. Note that the loop only ends when the url
        to visit is empty, so with a non-empty page_url it keeps running.
    '''
    all_page_products = []
    next_page = page_url
    while len(next_page):
        driver = None
        failed = False
        try:
            driver = init_driver(gecko_driver, user_agent=user_agent)
            get_url(next_page, driver)
            all_page_products.append(products_info(driver))
            driver.find_element_by_css_selector(
                '.pagination .osh-pagination .item .osh-font-light-arrow-right').click()
            sleep(2)
            next_page = driver.current_url
        except Exception as e:
            failed = True
            # Log right away rather than after the long pause.
            _log_scrap_pages_error(e)
        finally:
            # Quit only a driver that was actually created in this iteration,
            # and do it before sleeping so no browser stays open meanwhile.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    _log_scrap_pages_error(e)
        if failed:
            next_page = page_url
            sleep(restart_delay)
    return all_page_products


### Run the main scraping function in parallel, one process per category (multiprocessing)

In [None]:
if __name__ == '__main__':
    # One worker process per category url. The second argument fills
    # scrap_pages' next_page parameter, which scrap_pages overwrites on entry.
    category_urls = (jumia_url_mobile, jumia_url_electronics, jumia_url_computing)
    p1, p2, p3 = (Process(target=scrap_pages, args=(url, 1)) for url in category_urls)
    # Start all workers first, then wait for each of them in turn.
    for worker in (p1, p2, p3):
        worker.start()
    for worker in (p1, p2, p3):
        worker.join()

In [35]:
# Scratch experiment: start a new browser session and open Google Translate.
# Note that this rebinds the module-level name `driver`.
driver = init_driver(gecko_driver,user_agent=user_agent)
get_url('https://translate.google.com/', driver)

True

In [36]:
# Click the element with id "sugg-item-en"
# (presumably the English language suggestion - TODO confirm against the page).
en_selctor = driver.find_element_by_css_selector('#sugg-item-en')
en_selctor.click()

In [37]:
# Click the element with id "sugg-item-ar"
# (presumably the Arabic language suggestion - TODO confirm against the page).
ar_selctor = driver.find_element_by_css_selector('#sugg-item-ar')
ar_selctor.click()

In [47]:
# Look up the element with id "source" (presumably the input text box - TODO confirm).
source = driver.find_element_by_css_selector('#source')

In [48]:
# Remove whatever text the `source` element currently holds.
source.clear()

In [50]:
# Type a sample sentence into the `source` element.
source.send_keys('Welcome to selnium')

In [None]:
# NOTE(review): unfinished cell - the CSS selector is still an empty string,
# so this lookup cannot locate the intended element. TODO: fill in the selector.
translate = driver.find_element_by_css_selector('')