## Project 2. Part 1

**Use Selenium and BeautifulSoup**

In [234]:
import requests
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
from selenium.webdriver.common.by import By

### Get the information for a few cars (manually supplied ad URLs)

In [260]:
#Code that works for a given url (for one car ad)
def make_soup(html, timeout=30):
    """Download one page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    html : str
        URL of the page to download. Despite the name this is a URL, not
        markup: it is passed straight to ``requests.get``.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Without one a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        For connection failures and timeouts.
    """
    response = requests.get(html, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error
    # page, which would yield an advert with no features at all.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')


In [283]:
#SCRAPE CAR MAKE AND MODEL AND ENGINE SIZE 
def get_make_and_engine(soup, cars_dict):
    """Append the advert title's make/model and engine size to ``cars_dict``.

    The title is every ``<h1>`` found inside a
    ``<div class="col-5 classifieds-info">``. Its text is split on commas,
    e.g. ``'BMW 520, 2.0 l., wagon'`` -> ``['BMW 520', ' 2.0 l.', ' wagon']``.
    The first piece is appended to ``cars_dict['Make']`` and the second to
    ``cars_dict['Engine_Size']``; pieces are stored exactly as split (not
    stripped), and any further pieces are ignored.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed advert page.
    cars_dict : dict
        Mapping of column name -> list of values. It is updated in place;
        missing columns are created on first use.

    Returns
    -------
    dict
        The same ``cars_dict`` object. It is returned unchanged when the page
        has no matching title, so callers can always chain on the result.
    """
    cars = cars_dict
    # attrs must be a dict; the previous set literal {'class', '...'} only
    # worked because bs4 treats a non-dict as a list of class candidates.
    dividers = soup.find_all('div', {'class': 'col-5 classifieds-info'})
    for div in dividers:
        for title in div('h1'):
            title_list = title.text.split(',')
            # str.split always yields at least one piece.
            cars.setdefault('Make', []).append(title_list[0])
            # A title without a comma has no engine part; record None so the
            # 'Make' and 'Engine_Size' columns stay the same length.
            engine_size = title_list[1] if len(title_list) > 1 else None
            cars.setdefault('Engine_Size', []).append(engine_size)
    return cars


In [285]:
#SCRAPE THE REST OF THE FEATURES
def get_other_features(soup, cars_dict):
    """Append the advert's parameter-table values to ``cars_dict``.

    Walks every row of each ``<table class="announcement-parameters">``.
    The text of each ``<th>`` in a row is used as the column name, and the
    text of every ``<td>`` in that same row is appended to that column.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed advert page.
    cars_dict : dict
        Mapping of column name -> list of values. It is updated in place;
        missing columns are created on first use.

    Returns
    -------
    dict
        The same ``cars_dict`` object.
    """
    cars = cars_dict
    # attrs must be a dict; the previous set literal {'class', '...'} only
    # worked because bs4 treats a non-dict as a list of class candidates.
    params = soup.find_all('table', {'class': 'announcement-parameters'})
    for param in params:
        for tr in param('tr'):
            for th in tr('th'):
                col_title = th.text
                for td in tr('td'):
                    # setdefault creates the column on first sight without a
                    # bare except that could mask unrelated errors.
                    cars.setdefault(col_title, []).append(td.text)
    return cars

In [286]:
def scrape(soups, cars):
    """Collect every feature column for one advert page.

    ``soups`` is forwarded unchanged as the ``soup`` argument of both
    helpers, and ``cars`` is the column dictionary they fill in.

    Returns the dictionary produced by ``get_other_features``.
    """
    # Title-derived columns first (make/model and engine size) ...
    with_title = get_make_and_engine(soups, cars)
    # ... then everything listed in the parameter table.
    return get_other_features(soups, with_title)

In [291]:
#Manual list of advert URLs FOR NOW
html_list = [
    'https://en.autoplius.lt/ads/bmw-520-2-0-l-wagon-2018-diesel-7160889.html',
    'https://en.autoplius.lt/ads/volvo-s40-1-6-l-saloon-sedan-2006-diesel-7355867.html',
]

#Generate soup object list (one soup per advert URL).
#No visible cell defines `make_soups`, so build the list with `make_soup`;
#otherwise this cell raises NameError on a fresh kernel.
soups = [make_soup(html) for html in html_list]

#Every call appends one advert's values to the shared `cars` dictionary
cars = {}
for soup in soups:
    final_dataset = scrape(soup, cars)

print(len(final_dataset['Make']))

#Have to be careful with missing data: with a dictionary of lists it is hard
#to tell which record is actually missing a value.
#In my case though, I will be dropping any columns that aren't complete anyway

2


## Testing

In [282]:
# cars = {}
# for html in html_list:
#     print(html)
#     soup = make_soup(html)
#     dividers = soup.find_all('div', {'class', 'col-5 classifieds-info'})
#     for div in dividers:
#         for title in div('h1'):
#             print(title)
#             title_list = title.text.split(',')
#             print(title_list)
#             for i in range(3):
#                 if i == 0:
#                     try:
#                         cars['Make'].append(title_list[i])
#                     except:
#                         cars['Make'] = [title_list[i]]
#                 elif i == 1:
#                     try:
#                         cars['Engine_Size'].append(title_list[i])
#                     except:
#                         cars['Engine_Size'] = [title_list[i]]

https://en.autoplius.lt/ads/bmw-520-2-0-l-wagon-2018-diesel-7160889.html
<h1>BMW 520, 2.0 l., wagon</h1>
['BMW 520', ' 2.0 l.', ' wagon']
https://en.autoplius.lt/ads/volvo-s40-1-6-l-saloon-sedan-2006-diesel-7355867.html
<h1>Volvo S40, 1.6 l., saloon / sedan</h1>
['Volvo S40', ' 1.6 l.', ' saloon / sedan']


## Use selenium to step through website

In [299]:
autoplius_url = 'https://en.autoplius.lt/'

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os

# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to run on another machine; the default keeps the original location.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
# NOTE(review): the driver path is already handed to webdriver.Chrome below;
# this variable looks unused by the Python bindings - confirm before removing.
os.environ["webdriver.chrome.driver"] = chromedriver

# Opens a Chrome window controlled by Selenium and loads the site's home page.
driver = webdriver.Chrome(chromedriver)
driver.get(autoplius_url)

In [310]:
#Now filter on cars (not minivans, or trucks)
#find_element(By.ID, ...) is the same lookup as the deprecated
#find_element_by_id helper, which newer Selenium releases no longer provide.
car_type_form = driver.find_element(By.ID, "cats_search_1")
#Type the category name into the element, then confirm it with Enter
car_type_form.send_keys("Cars")
car_type_form.send_keys(Keys.RETURN)

In [363]:
#Find search button and click it
#find_element(By.LINK_TEXT, ...) is the same lookup as the deprecated
#find_element_by_link_text helper, which newer Selenium releases no longer provide.
search_button = driver.find_element(By.LINK_TEXT, 'Search')
search_button.click()

In [367]:
#Record the URL of the page currently loaded in the Selenium browser and show it.
#This cell only captures the URL; it does not collect the advert links yet.
current_url = driver.current_url
print(current_url)



https://en.autoplius.lt/ads/used-cars?page_nr=1
