## Project 2. Part 1 (Scrape & Pickle)

**Use Selenium and BeautifulSoup**

In [47]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import pickle
import pandas as pd

## Define all functions

In [48]:
#Get urls for all the pages and put them into a list
def get_url_list(max_pages=300, pause=5, retry_pause=10, max_failures=3):
    """Collect the url of every result page by repeatedly clicking "Next".

    Uses the notebook-level Selenium ``driver``, which must already be showing
    the first page of search results.

    Args:
        max_pages: Upper bound on the number of successful "Next" clicks.
        pause: Seconds to sleep after each successful click.
        retry_pause: Seconds to sleep after a failed attempt before retrying.
        max_failures: Number of failed attempts (counted over the whole run,
            not per page) after which the crawl stops.

    Returns:
        A list of urls: the starting page followed by one url per successful
        click. Meant to be fed to get_htmls one url at a time.
    """
    #Start with the page the driver is currently on
    url_list = [driver.current_url]

    failures = 0
    clicks_left = max_pages
    while clicks_left > 0:
        try:
            #Locate by strategy string: the find_element_by_* helpers were removed
            #in Selenium 4.3, while this form works on both Selenium 3 and 4
            next_button = driver.find_element('link text', 'Next')
            next_button.click()
            url_list.append(driver.current_url)
            clicks_left -= 1
            time.sleep(pause)
        except Exception:
            #Catch Exception rather than everything, so that Ctrl-C / a kernel
            #interrupt still stops the crawl instead of being counted as a failure
            failures += 1
            if failures >= max_failures:
                #"Next" is gone (last page) or the site keeps failing: stop here
                break
            time.sleep(retry_pause)
    return url_list


#Get the link of every car ad on one results page and add it to a list
def get_htmls(url, html_list, timeout=30):
    """Append the ad links found on one results page to html_list.

    Args:
        url: Address of a single results page.
        html_list: Existing list of ad links. It is extended in place (it is
            not copied), so callers may ignore the return value.
        timeout: Seconds to wait for the server before requests gives up;
            without it a stalled connection would block the notebook forever.

    Returns:
        The same html_list object, now including the href of every
        'announcement-item' anchor on the page.
    """
    #Download the page and soup it up
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html5lib')

    #Every ad on the page is an <a class="announcement-item"> element
    for ad in soup.find_all('a', class_='announcement-item'):
        href = ad.get('href')
        #Skip anchors without a link: a None entry would make the later
        #requests.get(html) call in make_soup fail
        if href:
            html_list.append(href)
    return html_list

#Makes the soup from all of the ad links
def make_soup(html_list, soup_list, timeout=30, delay=0):
    """Download every ad page and append its parsed soup to soup_list.

    Args:
        html_list: List of ad urls (as produced by get_htmls).
        soup_list: Existing list of soup objects. It is extended in place
            (not copied).
        timeout: Seconds to wait for each response before requests gives up.
        delay: Optional pause in seconds after each request. Defaults to 0
            (no throttling).

    Returns:
        The same soup_list object with one BeautifulSoup object appended per
        url, in the order of html_list.

    Note:
        The response is parsed as-is; nothing here checks that the server
        actually returned an ad page.
    """
    for html in html_list:
        source_code = requests.get(html, timeout=timeout)
        soup_list.append(BeautifulSoup(source_code.text, 'html5lib'))
        if delay:
            time.sleep(delay)
    return soup_list

#Get the make and engine size for each car
def get_make_and_engine(soup):
    """Read Make_Model and Engine_Size from the advert title.

    The title (the <h1> inside the 'col-5 classifieds-info' div) is split on
    commas: the first part is stored as 'Make_Model' and the second, when
    present, as 'Engine_Size'. The parts are stored exactly as split, without
    stripping whitespace.

    Args:
        soup: Soup object of one ad page.

    Returns:
        A dictionary with 'Make_Model' and, when the title has a second
        comma-separated part, 'Engine_Size'. Empty if the page has no such
        title (for example when the site served something other than an ad).
    """
    car = {}
    #attrs must be a dict; the former {'class', '...'} was a set literal that
    #only worked because BeautifulSoup treats a non-dict as a list of classes
    dividers = soup.find_all('div', {'class': 'col-5 classifieds-info'})
    for div in dividers:
        for title in div('h1'):
            title_list = title.text.split(',')
            #split() always yields at least one element
            car['Make_Model'] = title_list[0]
            #A title without a comma has no engine part; don't index past the end
            if len(title_list) > 1:
                car['Engine_Size'] = title_list[1]
    return car


#Get the rest of the features for each car
def get_other_features(soup, car_dict):
    """Add the rows of the ad's parameter table to car_dict.

    Every <tr> of each 'announcement-parameters' table contributes one entry:
    the text of its <th> is the key and the text of its <td> the value (if a
    row has several <td> cells, the last one wins). Texts are stored as-is,
    without stripping whitespace.

    Args:
        soup: Soup object of one ad page.
        car_dict: Dictionary to extend. It is updated in place.

    Returns:
        The same car_dict object, updated with the table entries.
    """
    #attrs must be a dict; the former {'class', '...'} was a set literal that
    #only worked because BeautifulSoup treats a non-dict as a list of classes
    tables = soup.find_all('table', {'class': 'announcement-parameters'})
    for table in tables:
        for row in table('tr'):
            for header in row('th'):
                col_title = header.text
                for cell in row('td'):
                    car_dict[col_title] = cell.text
    return car_dict


#Scrape all the features for all the cars
def scrape(soup_list):
    """Turn a list of ad-page soups into a list of feature dictionaries.

    For each soup the title fields are read first (get_make_and_engine) and
    the resulting dictionary is then completed with the remaining features
    (get_other_features). The output keeps the order of soup_list.
    """
    return [
        get_other_features(page, get_make_and_engine(page))
        for page in soup_list
    ]


### Get url list

In [49]:
#Define the website
website = 'https://en.autoplius.lt/'

##Could also have a website list
#website_list = ['https://en.autoplius.lt/', 'https://en.autogidas.lt/']
#for web in website_list:
#    website = web
#    driver.get(website)

#Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
#variable to use another location instead of editing the notebook.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver
#NOTE(review): passing the path positionally works on Selenium 3.x and early 4.x;
#newer releases expect a Service object instead - confirm the installed version
driver = webdriver.Chrome(chromedriver)

#Go to the website
driver.get(website)

In [50]:
#Now filter on cars (not minivans, or trucks)
#Locate by strategy string: find_element_by_id was removed in Selenium 4.3,
#while find_element('id', ...) works on both Selenium 3 and 4
car_type_form = driver.find_element('id', 'cats_search_1')
car_type_form.send_keys("Cars")
car_type_form.send_keys(Keys.RETURN)

In [51]:
#Find "Search" button and click it
#(find_element_by_link_text was removed in Selenium 4.3; this form works on 3 and 4)
search_button = driver.find_element('link text', 'Search')
search_button.click()

In [52]:
#Now get all the urls: get_url_list clicks "Next" through the result pages,
#starting from the page the driver is currently on.
#The printed number is how many page urls were collected.
url_list = get_url_list()
print(len(url_list))

65


In [53]:
#Save the page urls so the crawl does not have to be repeated
with open('all_my_urls_20180717.pkl', 'wb') as url_pickle:
    pickle.dump(url_list, url_pickle)

In [57]:
#Read the saved page urls back in
with open('all_my_urls_20180717.pkl', 'rb') as url_pickle:
    url_list = pickle.load(url_pickle)

### Scrape

In [58]:
#Time to get the ad links from each results page (approx. 20 / page)
html_list = []
for url in url_list:
    #get_htmls extends the list it is given and hands the same list back
    html_list = get_htmls(url, html_list)

print(len(html_list))

1300


In [102]:
#Let's look at the ad url at index 600 (uncomment the next line to display it)
#html_list[600]
#The url looks perfectly normal and opens fine in the browser

'https://en.autoplius.lt/ads/mercedes-benz-b200-2-0-l-mpv-minivan-2006-diesel-7312499.html'

In [59]:
#Let's make soup (yuck!): download and parse every ad page into a fresh list
soup_list = make_soup(html_list, [])
print(len(soup_list))

1300


In [106]:
#Let's look at the soup at index 600. Only the first ~1000 characters are
#printed: the interesting part is at the very top, and dumping the whole
#page would flood the cell output.
print(str(soup_list[600])[:1000])
#AHA! Good job Watson :) The website adds something called <!-- DYNAMICTAGS -->
#before the usual DOCTYPE after a certain number of scrapes

<!-- DYNAMICTAGS --><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="lt" xml:lang="lt" xmlns="https://www.w3.org/1999/xhtml"><head>
	<title>https://en.autoplius.lt/</title>
	<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
	<meta content="(c) 2003-2018 UAB Diginet LTU" name="copyright"/>
	<meta content="autoplius.lt" name="publisher"/>
	<meta content="UAB Diginet LTU, info(eta)autoplius.lt" lang="lt" name="author"/>
	<style type="text/css">
	body,img,h2,form{margin:0;padding:0}img{border:none}body{width:100%;background:#fff;color:#000;font:normal normal 12px Arial,sans-serif;text-align:center}.wrapper{margin:30px auto 10px;width:680px}.success{margin:16px 0;position:relative;clear:both;text-align:left}.success .success-msg{border:1px solid;padding:8px 8px 8px 78px;background:url(https://autoplius-static.dgn.lt/static/images/ico/message-spire.png) 16px -12px  no-repeat;border-radius

In [61]:
#Scrape the hell out of this website!!!
#Despite its name, cars_dict is a list: scrape returns one feature dictionary per soup
cars_dict = scrape(soup_list)
len(cars_dict)

1300

In [None]:
#Let's pickle this car list
#(the `as` keyword was missing here, which made the cell a SyntaxError)
with open('cars_dict.pkl', 'wb') as picklefile5:
    pickle.dump(cars_dict, picklefile5)

In [None]:
#Let's unpickle this car list
#(the `as` keyword was missing here, which made the cell a SyntaxError)
with open('cars_dict.pkl', 'rb') as picklefile5:
    my_cars_dict = pickle.load(picklefile5)

**Explore the cars_dict**

In [96]:
#How many cars don't have Make_Model, and which records are those?
no_make_indices = [
    idx for idx, car in enumerate(cars_dict)
    if 'Make_Model' not in car
]
no_make = len(no_make_indices)

print(str(no_make), "cars don't have a Make.")

#Show the positions of the incomplete records
no_make_indices
#Result - 600 onwards
#cars_dict[602]


385 cars don't have a Make.


[600,
 601,
 604,
 606,
 607,
 608,
 609,
 610,
 611,
 613,
 614,
 615,
 616,
 617,
 618,
 619,
 620,
 621,
 622,
 623,
 625,
 627,
 628,
 629,
 630,
 631,
 632,
 634,
 635,
 636,
 637,
 638,
 641,
 642,
 643,
 644,
 645,
 646,
 648,
 658,
 659,
 660,
 662,
 663,
 664,
 665,
 666,
 667,
 668,
 669,
 670,
 671,
 672,
 673,
 674,
 675,
 676,
 677,
 678,
 679,
 680,
 681,
 682,
 683,
 684,
 685,
 686,
 687,
 688,
 689,
 690,
 691,
 692,
 693,
 694,
 695,
 696,
 697,
 698,
 699,
 700,
 701,
 702,
 703,
 704,
 705,
 706,
 707,
 708,
 709,
 710,
 711,
 712,
 713,
 714,
 715,
 716,
 717,
 718,
 719,
 720,
 721,
 722,
 723,
 724,
 725,
 726,
 727,
 728,
 729,
 730,
 731,
 732,
 733,
 734,
 735,
 736,
 737,
 738,
 739,
 740,
 741,
 742,
 743,
 745,
 749,
 750,
 751,
 752,
 753,
 754,
 755,
 756,
 757,
 769,
 772,
 773,
 779,
 782,
 785,
 792,
 793,
 795,
 796,
 800,
 806,
 811,
 812,
 813,
 814,
 818,
 821,
 822,
 823,
 824,
 825,
 826,
 827,
 828,
 829,
 830,
 831,
 832,
 833,
 834,
 835,
 837

### Create my dataset (pandas dataframe)

In [63]:
#Now fill out the rows: one row per scraped ad, one column per feature key.
#info() lists how many ads actually have a value for each column.
car_ad_dataset = pd.DataFrame(cars_dict)
car_ad_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 35 columns):
Audio/video equipment         580 non-null object
Body type                     915 non-null object
Climate control               553 non-null object
Color                         709 non-null object
Combined                      370 non-null object
Damage                        915 non-null object
Date of manufacture           915 non-null object
Driven wheels                 640 non-null object
Electronics                   791 non-null object
Engine                        910 non-null object
Engine_Size                   915 non-null object
Euro standard                 234 non-null object
Export price                  176 non-null object
Exterior                      756 non-null object
Extra-urban                   366 non-null object
First registration country    420 non-null object
Fuel type                     915 non-null object
Gearbox                       915 non-null 

### Pickle

In [None]:
#Let's pickle the hell out of the result (don't want to have to run this again)
with open('car_ad_dataset_20180717.pkl', 'wb') as dataset_pickle:
    pickle.dump(car_ad_dataset, dataset_pickle)

In [98]:
#NOTE(review): this reads 'car_ad_dataset_bigger.pkl', not the
#'car_ad_dataset_20180717.pkl' file written in the previous cell - presumably
#a larger dataset saved by another run; confirm this is intended.
with open('car_ad_dataset_bigger.pkl', 'rb') as picklefile:
    my_car_ad_dataset = pickle.load(picklefile)
    
#Uncomment to preview the first rows of the loaded dataset
#my_car_ad_dataset.head()

Unnamed: 0,Audio/video equipment,Body type,Climate control,Color,Combined,Damage,Date of manufacture,Driven wheels,Electronics,Engine,...,Price in Lithuania,Safety,Security,Special price,Steering wheel,Tuning (improvements),Urban,VIN check,VIN number,Wheel size
0,"CD player,MP3 player,Sound amplifier,Subwoofer",Other,,White,,No damages,1996-07,Front wheel drive (FWD),,"2400 cm³, 74 HP (55kW)",...,2 050 €\n \n Sužinokite įmoką\n ...,"ABS,Airbags","Central locking,Alarm",,Left hand drive (LHD),Tuned interior,,\n \n Check vehicle history\...,,R15
1,"CD player,AUX input,Navigation/GPS,Handsfree kit",Wagon,Climate control,Black,,No damages,2018-02,All wheel (4х4),"Electric mirrors,Electric boot lid,Galinio vai...","1995 cm³, 190 HP (139kW)",...,44 900 €,"ABS,Traction control system,ESP iElectronic st...","Central locking,Immobilizer,Alarm",,Left hand drive (LHD),,,\n \n Check vehicle history\...,,
2,,Coupe,,Blue,,No damages,1992,Rear wheel drive (RWD),Electric windows,"1998 cm³, 156 HP (115kW)",...,4 400 €\n \n Sužinokite įmoką\n ...,,,,Left hand drive (LHD),,,\n \n Check vehicle history\...,,R17
3,CD player,MPV / minivan,Air conditioning,,,No damages,2008-04,Front wheel drive (FWD),"Electric mirrors,Electric windows,Automatic he...","2200 cm³, 135 HP (100kW)",...,4 400 €\n \n Sužinokite įmoką\n ...,Airbags,"Central locking,Immobilizer",,Left hand drive (LHD),,,\n \n Check vehicle history\...,,R15
4,"CD player,MP3 player,CD changer,Subwoofer,USB ...",Saloon / sedan,Climate control,Gray / silver,9.5,No damages,2009-11,Front wheel drive (FWD),"Electric mirrors,Electric windows,Automatic he...","2354 cm³, 201 HP (148kW)",...,7 990 €\n \n Sužinokite įmoką\n ...,"ABS,ESP iElectronic stability program$(""#rel_E...","Central locking,Immobilizer,Alarm",,Left hand drive (LHD),,10.5,\n \n Check vehicle history\...,JHMCU26809C230972,
