In [10]:
import time
import re
import requests 
import numpy as np
import pandas as pd
import concurrent.futures

from bs4 import BeautifulSoup as bs

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementNotInteractableException 

# Webscraping Dubizzle :^)

https://www.imperva.com/resources/customers/case-studies/dubizzle-case-study/

This is going to be a relatively long one. 

Dubizzle is salty about webscrapers (read link above), so they hired a company to make their website harder to scrape.

That won't stop me. I will still scrape the data I need from their website, using the most bootleg methods possible. 

## Scrape one webpage, then build on top of that.

In [15]:
# Webdriver settings
options = Options()
driver_manager = ChromeDriverManager().install()
driver = webdriver.Chrome(driver_manager,
                          options=options)

# Everything that talks to the browser lives inside try/finally so the
# Chrome process is shut down even when the page load or a lookup fails.
try:
    # Open website
    driver.implicitly_wait(5)
    driver.get('https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/?page=1')

    # Find the html blocks we need for the number plates
    border_selector = 'thumb'
    plate_borders = driver.find_elements(By.CLASS_NAME, border_selector)

    # Inner HTML of every block matched by the class name above.
    # Read them one by one, in page order, so that border_htmls[i] always
    # corresponds to plate_borders[i]; all calls go through the same driver
    # session anyway.
    border_htmls = [elem.get_attribute("innerHTML") for elem in plate_borders]
finally:
    # Close driver for good!
    driver.quit()



Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


In [20]:
# Count the thumbnail blocks found on the page (this run found 25).
# Each one carries a link that can be requested individually.
len(plate_borders)

25

In [18]:
# Inspect the raw inner HTML of the first scraped thumbnail block.
border_htmls[0]

'\n                    <a href="https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&amp;pos=0&amp;highlighted_ads=1">\n                        <div style="background-image:url(https://res.cloudinary.com/dubizzle-com/image/upload/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:86368,x_100,y_-50/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:P,x_-240,y_-50/c_fit,w_200/dubai-plate_private-car_classic);"></div>\n                    </a>\n                    <span class="thumb-image-count">\n                        1\n                    </span>\n                '

In [19]:
# Parse the first block's HTML with BeautifulSoup's built-in 'html.parser'
# so it can be queried by tag, then display the parsed result.
soup = bs(border_htmls[0], 'html.parser')
soup


<a href="https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&amp;pos=0&amp;highlighted_ads=1">
<div style="background-image:url(https://res.cloudinary.com/dubizzle-com/image/upload/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:86368,x_100,y_-50/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:P,x_-240,y_-50/c_fit,w_200/dubai-plate_private-car_classic);"></div>
</a>
<span class="thumb-image-count">
                        1
                    </span>

In [30]:
# Take the href of the first <a> tag that actually has an href attribute.
soup.find_all('a', href=True)[0]['href']

'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=0&highlighted_ads=1'

In [33]:
# Create a helper function for the task above
def extract_url(soup_object):
    """
    Return the href of the first <a> tag in a soup object.

    Only anchors that carry an href attribute are considered. Raises
    IndexError when the soup object contains no such anchor.
    """
    anchors_with_href = soup_object.find_all('a', href=True)
    first_anchor = anchors_with_href[0]
    return first_anchor['href']

# Try the helper on the soup object parsed earlier.
extract_url(soup)

'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=0&highlighted_ads=1'

In [38]:
# Make a url list: parse every scraped HTML block, then pull the link out of each.
soup_list = [bs(elem, 'html.parser') for elem in border_htmls]
url_strings = [extract_url(soup) for soup in soup_list]

# Preview the first two URLs.
url_strings[0:2]

## Need to repeat this for 44 pages, but first need to make sure I can extract the data I need from one link.

['https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=0&highlighted_ads=1',
 'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/8989-2-150---41a23c57e41e4c198e36010d578c9fc6/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=2']

In [50]:
# Attempt to fetch data from one url.
# If it works we can pretty much run this 2000 times.
options = Options()
driver_manager = ChromeDriverManager().install()
driver = webdriver.Chrome(driver_manager,
                          options=options)

# try/finally guarantees the browser is closed even when a selector matches
# nothing and the [0] lookups below raise IndexError.
try:
    driver.implicitly_wait(5)
    driver.get(url_strings[5])

    # NOTE(review): the dotted strings chain several class names that look
    # auto-generated, and the XPath is absolute; both presumably break when
    # the site layout changes - confirm before a long run.
    price_selector = "sc-1q498l3-0.sc-1q498l3-1.WNwBg.jwMZvh.sc-1pns9yx-3.cRlgZx"
    code_selector = '/html/body/div[2]/div/main/div/div[3]/div/div[4]/div[1]/div[3]/div/div[2]/div/p'
    number_selector = "sc-1q498l3-0.sc-1q498l3-1.eMVpmW.eoIXmr.sc-19hd12a-3.eLqWzV"

    # find_elements returns a list of matches; only the first one is used.
    price_element = driver.find_elements(By.CLASS_NAME, price_selector)
    code_element = driver.find_elements(By.XPATH, code_selector)
    number_element = driver.find_elements(By.CLASS_NAME, number_selector)

    print(price_element[0].text)
    print(code_element[0].text)
    print(number_element[0].text)
finally:
    driver.quit()



Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


AED 4,900
U
12130


In [52]:
# The code above worked with minimal hiccups.
# Let's turn that into a function.

# Webdriver settings
def get_data(url):
    """
    Open one listing page in a fresh Chrome session and read its details.

    Parameters
    ----------
    url : str
        Address of the listing page to load.

    Returns
    -------
    tuple
        ``(number, code, price)`` - the text of the first element matched
        by the number, code and price selectors respectively.

    Raises
    ------
    IndexError
        If any of the three selectors matches nothing on the page.

    The browser is always closed before returning or raising, so repeated
    calls do not pile up Chrome processes.
    """
    # Webdriver settings
    options = Options()
    driver_manager = ChromeDriverManager().install()
    driver = webdriver.Chrome(driver_manager,
                              options=options)

    try:
        # Open website
        driver.implicitly_wait(5)
        driver.get(url)

        # Locators for the three fields we want from the listing page.
        # NOTE(review): the class names look auto-generated and the XPath is
        # absolute; presumably both break on a site redesign - confirm.
        price_selector = "sc-1q498l3-0.sc-1q498l3-1.WNwBg.jwMZvh.sc-1pns9yx-3.cRlgZx"
        code_xpath = '/html/body/div[2]/div/main/div/div[3]/div/div[4]/div[1]/div[3]/div/div[2]/div/p'
        number_selector = "sc-1q498l3-0.sc-1q498l3-1.eMVpmW.eoIXmr.sc-19hd12a-3.eLqWzV"

        # find_elements returns a list of matches; only the first one is used.
        price_element = driver.find_elements(By.CLASS_NAME, price_selector)
        code_element = driver.find_elements(By.XPATH, code_xpath)
        number_element = driver.find_elements(By.CLASS_NAME, number_selector)

        price = price_element[0].text
        code = code_element[0].text
        number = number_element[0].text
    finally:
        # Runs on success and on failure alike.
        driver.quit()

    return (number, code, price)