In [8]:
import time
import re
import requests 
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException 

## Webscraping notebook 

All the data will be scraped from 'https://plates.ae/plate.php'.

### Load webpage and pull html elements

In [2]:
# Launch Chrome, expand the full plate listing by repeatedly pressing the
# "load more" button, and collect the inner HTML of every plate block
# into `border_htmls`.

# Webdriver settings
options = Options()
driver_manager = ChromeDriverManager().install()
# NOTE(review): the driver path is passed positionally, as in the original
# cell; recent Selenium releases expect a Service object instead -- confirm
# against the installed Selenium version before changing this call.
driver = webdriver.Chrome(driver_manager,
                          options=options)

# Loop-invariant settings, defined once instead of on every iteration
SCROLL_PAUSE_TIME = 1
height_script = "return document.body.scrollHeight"
script = 'window.scrollTo(0, document.body.scrollHeight);'
# The button label is Arabic for "view more plates"
element_xpath = "//button[contains(text(), 'مشاهدة المزيد من اللوحات')]"
border_selector = '.bntborder.padding-single'

try:
    # Open website
    driver.implicitly_wait(5)
    driver.get('https://plates.ae/plate.php')

    # Infinite scroll down to the bottom of the website
    last_height = driver.execute_script(height_script)
    while True:
        # Scroll to the bottom of the page
        driver.execute_script(script)

        # Locate the "load more" button; an empty result means there is
        # nothing left to expand, so stop instead of raising IndexError
        load_more_buttons = driver.find_elements(By.XPATH, element_xpath)
        if not load_more_buttons:
            break

        # Click the button at the end of the page
        try:
            load_more_buttons[0].click()
        except ElementNotInteractableException:
            # Button is present but cannot be clicked: treat as end of list
            break
        time.sleep(SCROLL_PAUSE_TIME)

        # Exit clause if end of page is reached (page height stopped growing)
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height

    # Find the html blocks we need for the number plates
    plate_borders = driver.find_elements(By.CSS_SELECTOR, border_selector)
    border_htmls = [elem.get_attribute("innerHTML") for elem in plate_borders]
finally:
    # Always close the browser, even if scraping failed part-way through
    driver.quit()



Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


### Figure out how to parse the elements and pull out data

In [18]:
# Take the first scraped plate block as a sample for exploring its structure;
# the bare name on the last line displays the raw HTML string.
a = border_htmls[0]
a

'\n          <div class="number_plate ">\n          \n          <div class="fdg text-center even">\n          \n            <div class="duabi_new bigtxt ">\n             \n\t\t\t <div class="plate_code  one_sh sharjah_font1 sh_one web_shcode_on">3</div>\n              <div class="plate_img "> <img src="images/uae_sharjah.svg"> </div>\n              <div class="plate_nub one_sh sharjah_font1 web_shnum_on">344</div>\n\t\t\n              <div class="clearfix"></div>\n            </div>\n            \n            </div>\n          \n\t\t\t<div class="cftg text-center even">\n\t\t\t\t\t<div class="prices-single pdlft0"> \n\t\t\t\t\t\t<a href="tel:0509080500"><div class="robot  text-center txt17 textleft">0509080500</div> </a>\n\t\t\t\t\t\t<span class="txt24bld pricered text-center">\n\t\t\t\t\t\t\t<i>229,000 AED</i>\n\t\t\t\t\t\t</span>\n\t\t\t\t\t\t\n\t\t\t\t\t</div>\n\t\t     </div>\n\t\t\t \n          </div>\n          \n         '

In [19]:
# Parse the sample block with BeautifulSoup (using the 'html.parser' backend)
# and display the resulting tree.
soup = bs(a, 'html.parser')
soup


<div class="number_plate">
<div class="fdg text-center even">
<div class="duabi_new bigtxt">
<div class="plate_code one_sh sharjah_font1 sh_one web_shcode_on">3</div>
<div class="plate_img"> <img src="images/uae_sharjah.svg"/> </div>
<div class="plate_nub one_sh sharjah_font1 web_shnum_on">344</div>
<div class="clearfix"></div>
</div>
</div>
<div class="cftg text-center even">
<div class="prices-single pdlft0">
<a href="tel:0509080500"><div class="robot text-center txt17 textleft">0509080500</div> </a>
<span class="txt24bld pricered text-center">
<i>229,000 AED</i>
</span>
</div>
</div>
</div>

In [55]:
# Locate the <div> whose class attribute mentions "plate_code"
code_regex = re.compile('.*plate_code.*')
code_div = soup.find('div', class_=code_regex)
code_div

<div class="plate_code one_sh sharjah_font1 sh_one web_shcode_on">3</div>

In [30]:
# Locate the <div> whose class attribute mentions "plate_nub" (the plate number)
number_regex = re.compile('.*plate_nub.*')
number_div = soup.find('div', class_=number_regex)
number_div

<div class="plate_nub one_sh sharjah_font1 web_shnum_on">344</div>

In [56]:
# The price sits inside the red price <span>; .text yields its string content
# (surrounding newlines included), which is what the extraction loop stores.
soup.find('span', {"class" : 'txt24bld pricered text-center'}).text

'\n229,000 AED\n'

In [48]:
# The emirate is only identifiable from the plate image file name, so grab the
# image container and keep its HTML as a plain string for regex matching.
img_div = soup.find('div', class_='plate_img')
img_str = str(img_div)
img_str

'<div class="plate_img"> <img src="images/uae_sharjah.svg"/> </div>'

In [53]:
# Capture the file name between "images/" and ".svg".
# Raw string: `\.` must reach the regex engine as an escaped literal dot;
# in a normal string literal it is an invalid Python escape sequence.
img_regex = re.compile(r"images/(.*?)\.svg")
img_regex.findall(img_str)

['uae_sharjah']

### Pull out data from the html elements

In [58]:
# Patterns are compiled once here instead of on every loop iteration.
code_regex = re.compile('.*plate_code.*')
number_regex = re.compile('.*plate_nub.*')
# Raw string so `\.` is a regex-escaped dot, not an invalid string escape
img_regex = re.compile(r"images/(.*?)\.svg")


def parse_plate(plate_html):
    """Extract the fields of one plate block.

    Parameters
    ----------
    plate_html : str
        Inner HTML of a single plate block, as collected in `border_htmls`.

    Returns
    -------
    tuple of str
        ``(code, number, price, city)``. Each text value is returned exactly
        as it appears in the markup (not stripped); ``city`` is the image
        file name between ``images/`` and ``.svg``.

    Raises
    ------
    ValueError
        If any of the four fields cannot be found in the block.
    """
    soup = bs(plate_html, 'html.parser')

    code_tag = soup.find('div', {"class" : code_regex})
    number_tag = soup.find('div', {"class" : number_regex})
    price_tag = soup.find('span', {"class" : 'txt24bld pricered text-center'})
    # The city is only available through the plate image file name
    img_str = str(soup.find('div', {"class" : 'plate_img'}))
    city_match = img_regex.search(img_str)

    # Validate everything before returning so callers never store a
    # partially parsed plate.
    found = {'code': code_tag, 'number': number_tag,
             'price': price_tag, 'city': city_match}
    missing = [field for field, value in found.items() if value is None]
    if missing:
        raise ValueError(
            'Plate block is missing {}: {!r}'.format(missing, plate_html)
        )

    return code_tag.text, number_tag.text, price_tag.text, city_match.group(1)


code_list = []
number_list = []
price_list = []
city_list = []

for html_element in border_htmls:
    code, number, price, city = parse_plate(html_element)

    # Append all four fields together so the lists always stay aligned
    code_list.append(code)
    number_list.append(number)
    price_list.append(price)
    city_list.append(city)

### Convert