# Scraping Souq Products

**Welcome to this scraping notebook. What is in this notebook:**

- Implement helper functions that scrape images for several product categories. Our image classification model depends on these categories, and in future work it will help us recommend images that match your style.

**Structure of the scraping workflow I used:**
- Load the packages we need.
- Set some Firefox preferences and options that help us during the scraping process.

### Loading the packages we need

In [None]:
from souq_configs import *
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import os
from time import sleep
import re
import sys
from time import sleep
from multiprocessing import Process
from selenium.webdriver.common.by import By
import urllib

### Path of the working directory

In [None]:
# Directory containing this file, used to locate the geckodriver binary.
# `__file__` is not defined when the code runs inside a notebook or an
# interactive session, so fall back to the current directory in that case.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = '.'

#### Set some Firefox preferences and options that help us during the scraping process

In [None]:
def init_driver(gecko_driver='', user_agent='', load_images=True, is_headless=False):
    '''
    Create a Firefox webdriver with the preferences we use while scraping.

    Argument:
        gecko_driver: geckodriver file name, looked up under `current_path`
        user_agent: user agent string to override the browser default;
            an empty string keeps the default
        load_images: when False, 'permissions.default.image' is set to 2
            so the browser does not load images
        is_headless: when True, start Firefox in headless mode
    return:
        the started webdriver.Firefox instance
    '''
    firefox_profile = webdriver.FirefoxProfile()

    # Quiet browser: no flash plugin, muted media, no web notifications.
    firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
    firefox_profile.set_preference("media.volume_scale", "0.0")
    firefox_profile.set_preference("dom.webnotifications.enabled", False)
    if user_agent != '':
        firefox_profile.set_preference("general.useragent.override", user_agent)
    if not load_images:
        firefox_profile.set_preference('permissions.default.image', 2)

    options = Options()
    # Headless mode is controlled only by `is_headless`. The previous
    # unconditional add_argument('headless') passed a stray argument to
    # Firefox regardless of this flag.
    options.headless = is_headless

    driver = webdriver.Firefox(options=options,
                               executable_path=f'{current_path}/{gecko_driver}',
                               firefox_profile=firefox_profile)

    return driver

In [None]:
def get_url(url, driver, wait_seconds=2):
    '''
    Load a page in the given driver, refresh it and pause briefly.

    Argument:
        url of any page to get
        driver that was initialized and that we use as the robot in the opened pages
        wait_seconds: how long to pause after the refresh, in seconds
            (default 2, the delay that used to be hard-coded)
    return:
        True
    '''
    driver.get(url)
    driver.refresh()
    # Pause after the refresh before the caller queries the page.
    if wait_seconds > 0:
        sleep(wait_seconds)
    return True


## function explanation

The driver passed as a parameter has a single page loaded, and for each page we receive this way we get all of its products:
- Loop over these products
- Extract the image src
- Extract the image name and extension using a regular expression
- Save the images in different directories based on their type

In [None]:
def _log_products_info_error(message):
    '''Append one entry to logs_files/products_info.log, creating the folder if needed.'''
    os.makedirs("logs_files", exist_ok=True)
    # `with` closes the log file after every entry instead of leaking the handle.
    with open("logs_files/products_info.log", "a") as log_file:
        log_file.write("This error related to function products_info of Souq_scrapping_multithreading file\n"
                       + str(message) + "\n" + "#" * 99 + "\n")  # "#" * 99 as separator line


def products_info(driver, image_type):
    '''
    Download the product images of the page currently loaded in the driver.

    Argument:
        driver: driver of the page we are in
        image_type: category of the images, one of 't_shirts', 'shirts'
            or 'pants'; any other value downloads nothing
    return:
        True, after saving every image found under images/<image_type>/.
        Problems with a single product are written to
        logs_files/products_info.log and do not stop the loop.
    '''
    # A bare `import urllib` does not load the `urllib.request` submodule,
    # so import it explicitly where urlretrieve is used.
    import urllib.request

    if image_type not in ('t_shirts', 'shirts', 'pants'):
        return True

    target_dir = "images/" + image_type
    os.makedirs(target_dir, exist_ok=True)

    products = driver.find_elements(
        By.CSS_SELECTOR, "div.tpl-results div.grid-list div.single-item")

    for product in products:
        try:
            images = product.find_elements(By.CSS_SELECTOR, 'a.img-link img')
            if not images:
                _log_products_info_error("no image element found for product")
                continue
            image_src = images[0].get_attribute('data-src')
            # data-src can be missing; there is nothing to download then
            if not image_src:
                continue
            # file name plus extension, e.g. "item_XL_123.jpg"
            img_name_ext = re.search(r'\w+\.jpg', image_src)
            if img_name_ext is None:
                _log_products_info_error("no .jpg file name in image src: " + image_src)
                continue
            urllib.request.urlretrieve(image_src, target_dir + "/" + img_name_ext[0])
        # best effort: one failing product must not abort the whole page
        except Exception as e:
            _log_products_info_error(e)
    return True

## Main Function

**This function works as follows:**
- Loop over all page URLs
- Create a driver for each page, then quit it after scraping all images of that page
- Check whether there is a new page or not, in order to reset the process
- For each page, call the product info function

In [None]:
def scrap_pages(page_url, image_type, next_page = 1):
    '''
    Scrape the product images of every result page of one section.

    Argument:
        page_url: section url ending right before the page number; the
            page number is appended to it
        image_type: category passed on to products_info
            ('t_shirts', 'shirts' or 'pants')
        next_page: page number to start from, 1 as default value
    return:
        dictionary mapping each visited page number (as a string) to the
        value returned by products_info for that page
    '''
    all_page_products = {}
    while next_page:
        # a fresh driver for every page, always released in `finally`
        driver = init_driver(gecko_driver, user_agent=user_agent)
        try:
            get_url(page_url + str(next_page), driver)
            # download the images of this page and store the call result
            all_page_products[str(next_page)] = products_info(driver, image_type)

            # check for new pages; an empty list means there is no next link
            next_links = driver.find_elements(By.CSS_SELECTOR, '.pagination-next a')
            next_page_url = next_links[0].get_attribute('href') if next_links else None
            if not next_page_url:
                break
            get_url(next_page_url, driver)
            landed_url = driver.current_url
        finally:
            driver.quit()

        next_page += 1
        # Page number of the url we actually landed on. If it is not the
        # page we expected, there are no further pages, so stop instead of
        # restarting from page 1 forever.
        landed_page = "".join(re.findall(r'page=([0-9]+)', landed_url))
        if landed_page != str(next_page):
            break
    return all_page_products


In [None]:
if __name__ == '__main__':
    # One worker process per clothing section, each starting at page 1.
    sections = (
        (souq__section_url_tishrt, 't_shirts'),
        (souq__section_url_shirts, 'shirts'),
        (souq__section_url_pants, 'pants'),
    )
    workers = []
    for section_url, category in sections:
        worker = Process(target=scrap_pages, args=(section_url, category, 1))
        worker.start()
        workers.append(worker)
    # Wait for the workers in the order they were started.
    for worker in workers:
        worker.join()