## Project 2. Part 1 (Scrape & Pickle)

**Use Selenium and BeautifulSoup**

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import pickle
import pandas as pd

## Define all functions (TODO: change the `while` loop in `get_url_list` from the `n` counter to `True`)

In [2]:
#Get urls for all the pages and put them into a list
def get_url_list(max_pages=500, page_load_delay=3):
    """Collect the url of every result page by repeatedly clicking "Next".

    Starts from the page the module-level ``driver`` is currently showing,
    records its url, then keeps clicking the link whose text is ``Next`` and
    recording the url after each click. The driver is always closed before
    this function returns or raises.

    Parameters
    ----------
    max_pages : int or None, optional
        Maximum number of "Next" clicks. Defaults to 500 (the original
        hard-coded limit). Pass ``None`` to keep going until no "Next"
        link can be clicked.
    page_load_delay : int or float, optional
        Seconds to sleep after each click before reading the url.
        Defaults to 3.

    Returns
    -------
    list of str
        The url of the starting page followed by the url seen after each
        successful click. Intended as input for ``get_htmls``.
    """
    try:
        #The page the driver is currently on is the first entry
        url_list = [driver.current_url]

        clicks_left = max_pages
        while clicks_left is None or clicks_left > 0:
            try:
                #Find "Next" button and click it
                driver.find_element_by_link_text('Next').click()
                time.sleep(page_load_delay)
                url_list.append(driver.current_url)
            except Exception:
                #Best effort: any failure while paging (typically the "Next"
                #link no longer existing) ends the crawl with what we have.
                #KeyboardInterrupt/SystemExit are deliberately not swallowed.
                break
            if clicks_left is not None:
                clicks_left -= 1
        return url_list
    finally:
        #Single exit point for releasing the browser window
        driver.close()


#Get all the html files for each car ad and put it into a list
def get_htmls(url, html_list):
    """Append the ad links found on one result page to ``html_list``.

    Downloads ``url``, parses it with the 'html5lib' parser and collects the
    ``href`` of every ``<a>`` tag carrying the ``announcement-item`` class.

    Note: ``html_list`` is extended in place (no copy is made) and the very
    same list object is returned, so callers may ignore the return value.
    """
    #Fetch the page and soup it up
    page_soup = BeautifulSoup(requests.get(url).text, 'html5lib')

    #Every announcement item contributes its href (None if the tag has none)
    html_list.extend(
        anchor.get('href')
        for anchor in page_soup.find_all('a', class_ = 'announcement-item')
    )
    return html_list

#Makes the soup from all of the html files
def make_soup(html_list):
    """Download every address in ``html_list`` and parse it.

    Returns a new list holding one BeautifulSoup object (built with the
    'html5lib' parser) per entry, in the same order as the input.
    """
    return [
        BeautifulSoup(requests.get(ad_url).text, 'html5lib')
        for ad_url in html_list
    ]


#Get the make and engine size for each car
def get_make_and_engine(soup):
    """Extract make/model and engine size from an advert's title.

    Looks at every ``<h1>`` inside ``<div class="col-5 classifieds-info">``
    and splits its text on commas: the first piece is stored under
    ``'Make_Model'`` and, when present, the second under ``'Engine_Size'``.
    Pieces are stored exactly as split (surrounding whitespace is kept).
    If several titles are found, later ones overwrite earlier ones.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed advert page.

    Returns
    -------
    dict
        A new dictionary with up to two keys, ``'Make_Model'`` and
        ``'Engine_Size'``. Empty if no matching title exists; only
        ``'Make_Model'`` if the title contains no comma.
    """
    car = {}
    #attrs must be a dict; the original passed a set literal by mistake
    for div in soup.find_all('div', {'class': 'col-5 classifieds-info'}):
        for title in div('h1'):
            title_parts = title.text.split(',')
            #split() always yields at least one piece
            car['Make_Model'] = title_parts[0]
            #Titles without a comma have no engine part - skip instead of raising IndexError
            if len(title_parts) > 1:
                car['Engine_Size'] = title_parts[1]
    return car


#Get the rest of the features for each car
def get_other_features(soup, car_dict):
    """Add the parameter-table features of an advert to ``car_dict``.

    Walks every row of every ``<table class="announcement-parameters">``.
    For each ``<th>`` in a row, its text becomes a key and the text of the
    row's last ``<td>`` becomes the value. Rows without a ``<td>`` add
    nothing. Texts are stored as-is (no whitespace stripping).

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed advert page.
    car_dict : dict
        Dictionary to update. It is modified in place.

    Returns
    -------
    dict
        The same ``car_dict`` object, after the update.
    """
    #attrs must be a dict; the original passed a set literal by mistake
    for table in soup.find_all('table', {'class': 'announcement-parameters'}):
        for row in table('tr'):
            #Look the value cells up once per row rather than once per header
            value_cells = row('td')
            for header in row('th'):
                for cell in value_cells:
                    #Later cells overwrite earlier ones, so the last <td> wins
                    car_dict[header.text] = cell.text
    return car_dict


#Scrape all the features for all the cars
def scrape(soup_list):
    """Build one feature dictionary per advert soup.

    For each soup the title fields (make/model, engine size) are extracted
    first and the remaining parameter-table features are then added to the
    same dictionary. Returns the dictionaries as a list, in input order.
    """
    return [
        get_other_features(ad_soup, get_make_and_engine(ad_soup))
        for ad_soup in soup_list
    ]


### Get url list

In [3]:
#Define the website to scrape
website = 'https://en.autoplius.lt/'

##Could also have a website list
#website_list = ['https://en.autoplius.lt/', 'https://en.autogidas.lt/']
#for web in website_list:
#    website = web
#    driver.get(website)

#NOTE(review): absolute, machine-specific path - adjust it to wherever chromedriver lives locally
chromedriver = "/Applications/chromedriver"                     #  path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
#Start a Chrome session. `driver` is a kernel-wide global: get_url_list() reads it
#and closes it when it is done, so this cell must be re-run before crawling again.
driver = webdriver.Chrome(chromedriver)

#Go to the website
driver.get(website)

In [4]:
#Now filter on cars (not minivans, or trucks)
#Types "Cars" into the element with id "cats_search_1", then sends Return to it
car_type_form = driver.find_element_by_id("cats_search_1")
car_type_form.send_keys("Cars")
car_type_form.send_keys(Keys.RETURN)

In [5]:
#Find "Search" button and click it
#(located by its link text; get_url_list() later starts from whatever page the browser shows after this click)
search_button = driver.find_element_by_link_text('Search')
search_button.click()

In [6]:
#Now get all the urls
#This pages through the site with the live browser session and closes the driver when it finishes
url_list = get_url_list()
print(len(url_list))

194


In [7]:
#Save the collected page urls to all_my_urls.pkl
with open('all_my_urls.pkl', 'wb') as picklefile2:
    pickle.dump(url_list, picklefile2)

In [8]:
#Reload the page urls from all_my_urls.pkl (overwrites url_list)
#NOTE: pickle.load can execute arbitrary code - only load pickle files you created yourself
with open('all_my_urls.pkl', 'rb') as picklefile2:
    url_list = pickle.load(picklefile2)

### Scrape

In [9]:
#It's to to get the htmls from each page
html_list = []
for url in url_list:
    get_htmls(url, html_list)

print(len(html_list))

#with open('all_my_html.pkl', 'wb') as picklefile2:
#    pickle.dump(html_list, picklefile2)

3879


In [10]:
#Let's make soup (yuck!)
#Downloads and parses every ad page: one request and one BeautifulSoup object per entry in html_list
soup_list = make_soup(html_list)
print(len(soup_list))

3879


In [11]:
#Scrape the hell out of this website!!!
#Despite its name, cars_dict is a list holding one feature dictionary per ad
cars_dict = scrape(soup_list)
len(cars_dict)

3879

**Explore the soups**

In [18]:
#soup_list[320]

### Create my dataset (pandas dataframe)

In [19]:
#Now fill out the rows: each dictionary in cars_dict becomes one row and the union of
#all dictionary keys becomes the columns (features missing from an ad show up as nulls)
car_ad_dataset = pd.DataFrame(cars_dict)
car_ad_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3879 entries, 0 to 3878
Data columns (total 37 columns):
Audio/video equipment         941 non-null object
Body type                     1388 non-null object
Climate control               874 non-null object
Color                         1134 non-null object
Combined                      543 non-null object
Damage                        1388 non-null object
Date of manufacture           1388 non-null object
Driven wheels                 1037 non-null object
Electronics                   1121 non-null object
Engine                        1366 non-null object
Engine_Size                   1388 non-null object
Euro standard                 440 non-null object
Export price                  382 non-null object
Exterior                      1114 non-null object
Extra-urban                   548 non-null object
First registration country    682 non-null object
Fuel type                     1388 non-null object
Gearbox                       138

### Pickle

In [13]:
#Let's pickle the hell out of the result (don't want to have to run this again)
#NOTE: the date suffix in the file name (20180715) is hard-coded
with open('car_ad_dataset_20180715.pkl', 'wb') as picklefile:
    pickle.dump(car_ad_dataset, picklefile)

In [14]:
# with open('car_ad_dataset.pkl', 'rb') as picklefile:
#     my_car_ad_dataset = pickle.load(picklefile)
    
#print(my_car_ad_dataset[:10])