## Project 2. Part 1 (Scrape & Pickle)

**Use Selenium and BeautifulSoup**

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import pickle
import pandas as pd

## Define all functions

In [None]:
#Get urls for all the pages and put them into a list
def get_url_list(browser=None, page_delay=5, retry_delay=10, max_failures=3):
    """Collect the url of every results page by repeatedly clicking "Next".

    Records the url of the page the browser is currently on, then keeps
    clicking the link whose text is 'Next', recording the url after each click.

    Args:
        browser: Selenium-style driver to use. When None (the default) the
            module-level ``driver`` is used, so ``get_url_list()`` keeps working.
        page_delay: Seconds to sleep after every successful click.
        retry_delay: Seconds to sleep after a failed attempt before retrying.
        max_failures: Total number of failed attempts (counted over the whole
            run, not consecutively) after which the collected list is returned.

    Returns:
        List of page urls, starting with the page the browser was on.
        The urls are meant to be fed to get_htmls one at a time.
    """
    if browser is None:
        browser = driver

    #Start the list with the page the browser is currently on
    url_list = [browser.current_url]

    failures = 0
    while True:
        try:
            #Find "Next" button and click it
            browser.find_element_by_link_text('Next').click()
            url_list.append(browser.current_url)
            time.sleep(page_delay)
        except Exception:
            #Only ordinary errors are caught here (e.g. the "Next" link is missing
            #on the last page); Ctrl-C / KeyboardInterrupt still stops the crawl
            failures += 1
            if failures >= max_failures:
                #"Next" button isn't there anymore or errors keep occurring
                return url_list
            #Give the page some time and then try again
            time.sleep(retry_delay)


#Get the link of each car ad on one results page and add it to a list
def get_htmls(url, html_list, timeout=30):
    """Append the link of every car ad found on one results page to html_list.

    Args:
        url: Address of one results page.
        html_list: Existing list of ad links. It is extended IN PLACE (it is
            not copied), so callers may ignore the return value.
        timeout: Seconds to wait for the server; without a timeout
            requests.get can block forever on a stalled connection.

    Returns:
        The same list object that was passed in as html_list.
    """
    #Download the page and then soup it up
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html5lib')

    #Every ad is an <a class="announcement-item"> whose href points to the ad page
    for ad in soup.find_all('a', class_='announcement-item'):
        link = ad.get('href')
        #Skip anchors without an href; a None entry would break the later download
        if link:
            html_list.append(link)
    return html_list

#Makes the soup from all of the ad links
def make_soup(html_list, soup_list, timeout=30):
    """Download every ad page in html_list and append its soup to soup_list.

    Args:
        html_list: Iterable of ad page urls (as collected by get_htmls).
        soup_list: Existing list of soup objects. It is extended IN PLACE.
        timeout: Seconds to wait for each response; without a timeout
            requests.get can block forever on a stalled connection.

    Returns:
        The same list object that was passed in as soup_list, with one
        BeautifulSoup object appended per url, in the order of html_list.
    """
    for html in html_list:
        source_code = requests.get(html, timeout=timeout)
        soup_list.append(BeautifulSoup(source_code.text, 'html5lib'))
    return soup_list

#Get the make and engine size for each car
def get_make_and_engine(soup):
    """Read the advert title and split it into make/model and engine size.

    The title is taken from every <h1> inside a
    <div class="col-5 classifieds-info"> and split on commas: the first piece
    becomes 'Make_Model' and the second (when present) 'Engine_Size'. The
    pieces are stored exactly as they appear (no whitespace stripping). If
    several titles are found, later ones overwrite earlier ones.

    Args:
        soup: Soup object of one ad page.

    Returns:
        Dictionary with the keys 'Make_Model' and 'Engine_Size'; keys that
        could not be found are simply absent (empty dict when no title exists).
    """
    car = {}
    #The attribute filter must be a dict ({'class': ...}), not a set
    for div in soup.find_all('div', {'class': 'col-5 classifieds-info'}):
        for title in div('h1'):
            title_parts = title.text.split(',')
            car['Make_Model'] = title_parts[0]
            #A title without a comma has no engine size part
            if len(title_parts) > 1:
                car['Engine_Size'] = title_parts[1]
    return car


#Get the rest of the features for each car
def get_other_features(soup, car_dict):
    """Add the parameters listed in the ad's parameter table to car_dict.

    Looks at every row of each <table class="announcement-parameters">: the
    text of each <th> is used as the key and the text of the row's last <td>
    as the value. Rows without a <th> or without a <td> are skipped. Texts
    are stored exactly as they appear (no whitespace stripping).

    Args:
        soup: Soup object of one ad page.
        car_dict: Dictionary of features collected so far. It is updated
            IN PLACE.

    Returns:
        The same dictionary object that was passed in as car_dict.
    """
    #The attribute filter must be a dict ({'class': ...}), not a set
    for table in soup.find_all('table', {'class': 'announcement-parameters'}):
        for row in table('tr'):
            cells = row('td')
            if not cells:
                continue
            #When a row has several value cells the last one wins
            value = cells[-1].text
            for header in row('th'):
                car_dict[header.text] = value
    return car_dict


#Turn every soup into a dictionary of car features
def scrape(soup_list):
    """Build one feature dictionary per soup and return them as a list.

    For each soup the title fields are read first (get_make_and_engine) and
    the resulting dictionary is then completed with the remaining parameters
    (get_other_features). The output keeps the order of soup_list.
    """
    return [
        get_other_features(page, get_make_and_engine(page))
        for page in soup_list
    ]


### Get url list

In [None]:
#Define the website whose car ads will be scraped
website = 'https://en.autoplius.lt/'

##Could also have a website list
#website_list = ['https://en.autoplius.lt/', 'https://en.autogidas.lt/']
#for web in website_list:
#    website = web
#    driver.get(website)

#Start the browser. `driver` is a module-level name: get_url_list() reads it directly
chromedriver = "/Applications/chromedriver"                     #  path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

#Go to the website
driver.get(website)

In [None]:
#Now filter on cars (not minivans, or trucks):
#type "Cars" into the element with id "cats_search_1" and confirm with Return
car_type_form = driver.find_element_by_id("cats_search_1")
car_type_form.send_keys("Cars")
car_type_form.send_keys(Keys.RETURN)

In [None]:
#Find the link whose text is "Search" and click it to load the first results page
search_button = driver.find_element_by_link_text('Search')
search_button.click()

In [None]:
#Now get the urls of all results pages, starting from the page the browser is on.
#get_url_list() sleeps between clicks, so this cell takes a while to finish
url_list = get_url_list()
print(len(url_list))

In [None]:
#Save the page urls so the browser crawl doesn't have to be repeated
with open('all_my_urls.pkl', 'wb') as picklefile2:
    pickle.dump(url_list, picklefile2)

In [None]:
#Reload the saved page urls (lets the notebook resume from here without the browser)
with open('all_my_urls.pkl', 'rb') as picklefile2:
    url_list = pickle.load(picklefile2)

### Scrape

In [None]:
#It's time to get the ad links from each results page (approx. 20 / page)
html_list = []
for url in url_list:
    #get_htmls appends to html_list in place, so its return value isn't needed
    get_htmls(url, html_list)

print(len(html_list))

In [None]:
#Let's look at 600th html
#html_list[600]
#The html seems perfectly normal and works when I open it in the browser

In [None]:
#Let's make soup (yuck!)
#One page is downloaded per ad link, so this cell is slow for a long html_list
soup_list = []
soup_list = make_soup(html_list, soup_list)
print(len(soup_list))

In [None]:
#Let's look at the soup at index 600 (needs at least 601 soups in the list)
soup_list[600]
#AHA! Good job Watson :) The website adds something called <!-- DYNAMICTAGS -->
#before the usual DOCTYPE after a certain number of scrapes

In [None]:
#Scrape the hell out of this website!!!
#Despite its name, cars_dict is a list holding one feature dictionary per soup
cars_dict = scrape(soup_list)
len(cars_dict)

In [None]:
#Let's pickle this car list (the list of per-car dictionaries built by scrape)
with open('cars_dict.pkl', 'wb') as picklefile5:
    pickle.dump(cars_dict, picklefile5)

In [None]:
#Let's unpickle this car list
#NOTE: pickle.load runs whatever the file contains - only load a cars_dict.pkl
#that was written by this notebook
with open('cars_dict.pkl', 'rb') as picklefile5:
    my_cars_dict = pickle.load(picklefile5)

**Explore the cars_dict**

In [None]:
#How many cars don't have Make_Model?
# no_make = 0
# no_make_indices = []
# for i in range(len(cars_dict)):
#     if 'Make_Model' not in list(cars_dict[i].keys()):
#         no_make += 1
#         no_make_indices.append(i)

# print(str(no_make), "cars don't have a Make.")

# #Which records are those?
# no_make_indices
# #Result - 600 onwards
# #cars_dict[602]


### Create my dataset (pandas dataframe)

In [None]:
#Now fill out the rows: cars_dict is a list of dictionaries, so each car becomes
#one row and each distinct key one column
car_ad_dataset = pd.DataFrame(cars_dict)
#Show the columns, their dtypes and non-null counts
car_ad_dataset.info()

### Pickle

In [None]:
#Let's pickle the hell out of the result (don't want to have to run this again)
#The whole DataFrame is written to car_ad_dataset.pkl
with open('car_ad_dataset.pkl', 'wb') as picklefile:
    pickle.dump(car_ad_dataset, picklefile)