In [134]:
import time
import re
import requests 
import numpy as np
import pandas as pd
import concurrent.futures

from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementNotInteractableException,NoSuchWindowException 

In [133]:
pip install fake_useragent

Note: you may need to restart the kernel to use updated packages.


# Webscraping Dubizzle :^)

https://www.imperva.com/resources/customers/case-studies/dubizzle-case-study/

This is going to be a relatively long one. 

Dubizzle is salty about webscrapers (read link above), so they hired a company to make their website harder to scrape.

That won't stop me. I will still scrape the data I need from their website, using the most bootleg methods possible. 

## Scrape one webpage, Then build on top of that.

In [15]:
# Webdriver settings
options = Options()
driver_manager = ChromeDriverManager().install()
driver = webdriver.Chrome(driver_manager,
                          options=options)

# Open website
driver.implicitly_wait(5)
driver.get('https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/?page=1')

# Find the html blocks we need for the number plates
border_selector = 'thumb'
plate_borders = driver.find_elements_by_class_name(border_selector)

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []       # List to hold the future objects
    for elem in plate_borders:
        futures.append(
            executor.submit(
                lambda elem: elem.get_attribute("innerHTML"),
                elem = elem
            )
        )
    
    # Html elements of the border we chose by css selector
    border_htmls =\
    [future.result() for future in concurrent.futures.as_completed(futures)]


#Close driver for good!
driver.quit()



Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


In [20]:
# Great!, 25 links in the page,
# I got 25 links, can send requests to these individually. 
len(plate_borders)

25

In [18]:
# Html looks good
border_htmls[0]

'\n                    <a href="https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&amp;pos=0&amp;highlighted_ads=1">\n                        <div style="background-image:url(https://res.cloudinary.com/dubizzle-com/image/upload/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:86368,x_100,y_-50/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:P,x_-240,y_-50/c_fit,w_200/dubai-plate_private-car_classic);"></div>\n                    </a>\n                    <span class="thumb-image-count">\n                        1\n                    </span>\n                '

In [19]:
# Turn it into a soup object
soup = bs(border_htmls[0], 'html.parser')
soup


<a href="https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&amp;pos=0&amp;highlighted_ads=1">
<div style="background-image:url(https://res.cloudinary.com/dubizzle-com/image/upload/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:86368,x_100,y_-50/co_rgb:242424,l_text:oswald_140_bold_letter_spacing_4:P,x_-240,y_-50/c_fit,w_200/dubai-plate_private-car_classic);"></div>
</a>
<span class="thumb-image-count">
                        1
                    </span>

In [30]:
# Extracting URL works
soup.find_all('a', href=True)[0]['href']

'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=0&highlighted_ads=1'

In [33]:
# Create a helper function for the task above
def extract_url(soup_object):
    """
    Extract the url from a the a href of a soup object
    """
    return soup_object.find_all('a', href=True)[0]['href']

extract_url(soup)

'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=0&highlighted_ads=1'

In [38]:
# Make a url list
soup_list = [bs(elem, 'html.parser') for elem in border_htmls]
url_strings = [extract_url(soup) for soup in soup_list]

url_strings[0:2]

## Need to repeat this for 44 pages, but but need to make sure I can extract data I need from one link.

['https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/86368-locked-number-2-670---a17457bf51fb4af584bf1ed8e5311b19/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=0&highlighted_ads=1',
 'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/8989-2-150---41a23c57e41e4c198e36010d578c9fc6/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=2']

In [50]:
# Attempt to fetch data from one url.
# If it works we can pretty much run this 2000 times.
options = Options()
driver_manager = ChromeDriverManager().install()
driver = webdriver.Chrome(driver_manager,
                          options=options)

driver.implicitly_wait(5)
driver.get(url_strings[5])

price_selector = "sc-1q498l3-0.sc-1q498l3-1.WNwBg.jwMZvh.sc-1pns9yx-3.cRlgZx"
code_selector = '/html/body/div[2]/div/main/div/div[3]/div/div[4]/div[1]/div[3]/div/div[2]/div/p'
number_selector = "sc-1q498l3-0.sc-1q498l3-1.eMVpmW.eoIXmr.sc-19hd12a-3.eLqWzV"

price_element = driver.find_elements_by_class_name(price_selector)
code_element = driver.find_elements_by_xpath(code_selector)
number_element = driver.find_elements_by_class_name(number_selector)

print(price_element[0].text)
print(code_element[0].text)
print(number_element[0].text)

driver.quit()



Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


AED 4,900
U
12130


In [136]:
# The code above worked with minimal hiccups.
# Let's turn that into a function.

# Webdriver settings
def get_data(url):
    """
    Gets the data I need from a number plate price listing
    
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("user-agent=[user-agent string]")
    chrome_options.add_argument('--disable-notifications')
    prefs = {"profile.default_content_setting_values.notifications" : 2}
    chrome_options.add_experimental_option("prefs",prefs)
    chrome_options.add_argument("window-size=1920,1080")

    driver_manager = ChromeDriverManager().install()
    driver = webdriver.Chrome(driver_manager,
                              options = chrome_options)


    driver.implicitly_wait(5)
    driver.get(url)
    


    # Find the html blocks we need for the number plates
    price_selector = "sc-1q498l3-0.sc-1q498l3-1.WNwBg.jwMZvh.sc-1pns9yx-3.cRlgZx"
    code_xpath = '/html/body/div[2]/div/main/div/div[3]/div/div[4]/div[1]/div[3]/div/div[2]/div/p'
    number_selector = "sc-1q498l3-0.sc-1q498l3-1.eMVpmW.eoIXmr.sc-19hd12a-3.eLqWzV"

    price_element = driver.find_elements_by_class_name(price_selector)
    code_element = driver.find_elements_by_xpath(code_xpath)
    number_element = driver.find_elements_by_class_name(number_selector)

    price = price_element[0].text
    code = code_element[0].text
    number = number_element[0].text

    driver.quit()
    
    return (number,code,price)

In [82]:
## Create one more helper function to get all the html elements from all 40 pages, and we will be set
def get_html(url):
    """
    gets html elements from all the main scroll pages
    
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("user-agent=[user-agent string]")
    chrome_options.add_argument('--disable-notifications')
    prefs = {"profile.default_content_setting_values.notifications" : 2}
    chrome_options.add_experimental_option("prefs",prefs)
    chrome_options.add_argument("window-size=1920,1080")

    driver_manager = ChromeDriverManager().install()
    driver = webdriver.Chrome(driver_manager,
                              options = chrome_options)


    driver.implicitly_wait(5)
    driver.get(url)


    border_selector = 'thumb'
    plate_borders = driver.find_elements_by_class_name(border_selector)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        time.sleep(1)
        futures = []       # List to hold the future objects
        for elem in plate_borders:
            futures.append(
                executor.submit(
                    lambda elem: elem.get_attribute("innerHTML"),
                    elem = elem
                )
            )

        # Html elements of the border we chose by css selector
        border_htmls =\
        [future.result() for future in concurrent.futures.as_completed(futures)]

    driver.quit()
    
    return border_htmls

## Time to generalize to all pages

### Get links to all the pages

In [57]:
# This will store the lists of lists with all the html elements 
# from the scroll down pages
htmls_master_list = []

for num in tqdm(range(1,41)):
    url_num = f'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/?page={num}'
    sub_list = get_html(url_num)
    htmls_master_list.append(sub_list)

  0%|          | 0/40 [00:00<?, ?it/s]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
  2%|▎         | 1/40 [00:06<03:55,  6.04s/it]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
  5%|▌         | 2/40 [00:11<03:33,  5.62s/it]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
  8%|▊         | 3/40 [00:17<03:35,  5.84s/it]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
 10%|█         | 4/40 [00:23<03:35,  5.99s/it]

Current google-chrome versio

 78%|███████▊  | 31/40 [03:48<01:06,  7.38s/it]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
 80%|████████  | 32/40 [03:56<01:01,  7.68s/it]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
 82%|████████▎ | 33/40 [04:15<01:16, 10.96s/it]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
 85%|████████▌ | 34/40 [04:25<01:03, 10.60s/it]

Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache
 88%|████████▊ | 35/40 [04:31<00:46,  9.40s/it]

Current google-

In [67]:
# Flatten the html list
# Soup it
# Get the urls
html_flat = np.array(htmls_master_list).ravel().tolist()
html_soups = [bs(elem, 'html.parser') for elem in html_array]
urls = [extract_url(soup) for soup in html_soups]

urls[0:3]

['https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/1418-2-909---0baaef4f35704021b6f8ab853539efd0/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=3',
 'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/11/17/8989-2-150---41a23c57e41e4c198e36010d578c9fc6/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=2',
 'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/12/31/b-2652-2-202---4c62bf4eeb174f6d8698b093466a88be/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTE%3D&pos=4']

In [None]:
# Get all data
data_list = []

In [157]:
# Missed elements
# Iterate through this again F dubizzle
missed_list = []

In [166]:
for ind,link in zip(range(111,1000),urls[111:]):
    response = requests.get(link)
    # check if link exists
    if response.status_code == 200:
        try:
            data = get_data(link) 
            data_list.append(data)
        except IndexError:
            missed_link.append(ind)
            continue
    # If link doesn't exist just continue iteration
    # This is needed incase someone removes their listing
    else:
        continue



Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


Current google-chrome version is 96.0.4664
Get LATEST driver ve

NameError: name 'missed_link' is not defined

In [161]:
data_list

[('11000', 'K', 'AED 255,000'),
 ('1114', 'D', 'AED 105,000'),
 ('66000', 'K', 'AED 160,000'),
 ('60007', 'Q', 'AED 31,000'),
 ('8773', 'M', 'AED 15,000'),
 ('1418', 'S', 'AED 37,000'),
 ('8989', 'H', 'AED 138,000'),
 ('2652', 'B', 'AED 21,000'),
 ('86368', 'P', 'AED 12,000'),
 ('60100', 'C', 'AED 14,500'),
 ('12130', 'U', 'AED 4,900'),
 ('82682', 'S', 'AED 4,500'),
 ('80156', 'L', 'AED 3,500'),
 ('60460', 'D', 'AED 7,000'),
 ('58886', 'F', 'AED 9,500'),
 ('44044', 'U', 'AED 69,000'),
 ('181', 'U', 'AED 555,000'),
 ('3385', 'K', 'AED 15,500'),
 ('44454', 'L', 'AED 43,000'),
 ('5010', 'A', 'AED 99,500'),
 ('1925', 'L', 'AED 15,000'),
 ('1996', 'White', 'AED 215,000'),
 ('9114', 'M', 'AED 75,500'),
 ('8864', 'H', 'AED 14,800'),
 ('1615', 'Q', 'AED 27,000'),
 ('778', 'P', 'AED 345,000'),
 ('717', 'U', 'AED 380,000'),
 ('84044', 'T', 'AED 9,000'),
 ('717', 'L', 'AED 379,000'),
 ('1313', 'F', 'AED 119,000'),
 ('55911', 'U', 'AED 14,000'),
 ('31110', 'N', 'AED 12,400'),
 ('50111', 'R', 'AED 

In [167]:
len(data_list)

116

In [169]:
urls[116]

'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/12/30/u6310-2-136---c13f1d62e5a2416ca1519da49f525a61/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTU%3D&pos=120'

In [None]:
'https://dubai.dubizzle.com/motors/number-plates/dubai-plate/private-car/2021/12/30/u6310-2-136---c13f1d62e5a2416ca1519da49f525a61/?back=L21vdG9ycy9udW1iZXItcGxhdGVzL2R1YmFpLXBsYXRlL3ByaXZhdGUtY2FyLz9wYWdlPTU%3D&pos=120'

In [164]:
len(urls[111:])

889

In [165]:
len(range(111,1000))

889

In [155]:
copy_list = [('11000', 'K', 'AED 255,000'),
 ('1114', 'D', 'AED 105,000'),
 ('66000', 'K', 'AED 160,000'),
 ('60007', 'Q', 'AED 31,000'),
 ('8773', 'M', 'AED 15,000'),
 ('1418', 'S', 'AED 37,000'),
 ('8989', 'H', 'AED 138,000'),
 ('2652', 'B', 'AED 21,000'),
 ('86368', 'P', 'AED 12,000'),
 ('60100', 'C', 'AED 14,500'),
 ('12130', 'U', 'AED 4,900'),
 ('82682', 'S', 'AED 4,500'),
 ('80156', 'L', 'AED 3,500'),
 ('60460', 'D', 'AED 7,000'),
 ('58886', 'F', 'AED 9,500'),
 ('44044', 'U', 'AED 69,000'),
 ('181', 'U', 'AED 555,000'),
 ('3385', 'K', 'AED 15,500'),
 ('44454', 'L', 'AED 43,000'),
 ('5010', 'A', 'AED 99,500'),
 ('1925', 'L', 'AED 15,000'),
 ('1996', 'White', 'AED 215,000'),
 ('9114', 'M', 'AED 75,500'),
 ('8864', 'H', 'AED 14,800'),
 ('1615', 'Q', 'AED 27,000'),
 ('778', 'P', 'AED 345,000'),
 ('717', 'U', 'AED 380,000'),
 ('84044', 'T', 'AED 9,000'),
 ('717', 'L', 'AED 379,000'),
 ('1313', 'F', 'AED 119,000'),
 ('55911', 'U', 'AED 14,000'),
 ('31110', 'N', 'AED 12,400'),
 ('50111', 'R', 'AED 22,000'),
 ('809', 'T', 'AED 300,000'),
 ('11000', 'K', 'AED 255,000'),
 ('1418', 'S', 'AED 37,000'),
 ('8989', 'H', 'AED 138,000'),
 ('2652', 'B', 'AED 21,000'),
 ('86368', 'P', 'AED 12,000'),
 ('60100', 'C', 'AED 14,500'),
 ('12130', 'U', 'AED 4,900'),
 ('82682', 'S', 'AED 4,500'),
 ('80156', 'L', 'AED 3,500'),
 ('60460', 'D', 'AED 7,000'),
 ('58886', 'F', 'AED 9,500'),
 ('44044', 'U', 'AED 69,000'),
 ('181', 'U', 'AED 555,000'),
 ('3385', 'K', 'AED 15,500'),
 ('44454', 'L', 'AED 43,000'),
 ('5010', 'A', 'AED 99,500'),
 ('1925', 'L', 'AED 15,000'),
 ('1996', 'White', 'AED 215,000'),
 ('9114', 'M', 'AED 75,500'),
 ('8864', 'H', 'AED 14,800'),
 ('1615', 'Q', 'AED 27,000'),
 ('778', 'P', 'AED 345,000'),
 ('717', 'U', 'AED 380,000'),
 ('84044', 'T', 'AED 9,000'),
 ('717', 'L', 'AED 379,000'),
 ('1313', 'F', 'AED 119,000'),
 ('55911', 'U', 'AED 14,000'),
 ('31110', 'N', 'AED 12,400'),
 ('50111', 'R', 'AED 22,000'),
 ('809', 'T', 'AED 300,000'),
 ('11000', 'K', 'AED 255,000'),
 ('1114', 'D', 'AED 105,000'),
 ('66000', 'K', 'AED 160,000'),
 ('60007', 'Q', 'AED 31,000'),
 ('8773', 'M', 'AED 15,000'),
 ('1418', 'S', 'AED 37,000'),
 ('8989', 'H', 'AED 138,000'),
 ('2652', 'B', 'AED 21,000'),
 ('86368', 'P', 'AED 12,000'),
 ('60100', 'C', 'AED 14,500'),
 ('12130', 'U', 'AED 4,900'),
 ('82682', 'S', 'AED 4,500'),
 ('80156', 'L', 'AED 3,500'),
 ('60460', 'D', 'AED 7,000'),
 ('58886', 'F', 'AED 9,500'),
 ('44044', 'U', 'AED 69,000'),
 ('181', 'U', 'AED 555,000'),
 ('3385', 'K', 'AED 15,500'),
 ('44454', 'L', 'AED 43,000'),
 ('5010', 'A', 'AED 99,500'),
 ('1925', 'L', 'AED 15,000'),
 ('1996', 'White', 'AED 215,000'),
 ('9114', 'M', 'AED 75,500'),
 ('8864', 'H', 'AED 14,800'),
 ('1615', 'Q', 'AED 27,000'),
 ('778', 'P', 'AED 345,000'),
 ('717', 'U', 'AED 380,000'),
 ('84044', 'T', 'AED 9,000'),
 ('717', 'L', 'AED 379,000'),
 ('1313', 'F', 'AED 119,000'),
 ('55911', 'U', 'AED 14,000'),
 ('31110', 'N', 'AED 12,400'),
 ('50111', 'R', 'AED 22,000'),
 ('809', 'T', 'AED 300,000'),
 ('11000', 'K', 'AED 255,000'),
 ('1114', 'D', 'AED 105,000'),
 ('66000', 'K', 'AED 160,000'),
 ('60007', 'Q', 'AED 31,000'),
 ('8773', 'M', 'AED 15,000')]