# Data collecting

### Importing libraries and modules

In [None]:
import itertools
import pandas as pd
import numpy as np
import re
import requests
import math
from bs4 import BeautifulSoup
from selenium import webdriver
import time

### HTML soups from all 564 listing pages (page indices 0–563)

In [None]:
def get_soups(website_number):
    """Download one used-car listings page and parse it.

    Args:
        website_number: Page index appended to the ``page`` query parameter.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``lxml`` parser.
    """
    page_url = f'https://www.truecar.com/used-cars-for-sale/listings/?page={website_number!s}'
    response = requests.get(page_url)
    return BeautifulSoup(response.content, 'lxml')

In [None]:
# Fetch and parse every listings page (page indices 0..563) up front.
soups = [get_soups(page_number) for page_number in range(564)]

## URLs scraping

In [None]:
def urls_scraper(soup):
    """Collect the raw ``href`` snippets of the listings on one results page.

    Args:
        soup: Parsed listings page; only its ``find_all`` method is used.

    Returns:
        A list of exactly 30 entries, one per listing slot. Each entry is
        the text matched by ``href="/.+" style`` inside the first 280
        characters of the listing anchor, or the integer ``0`` when the slot
        does not exist or the anchor has no such match.
    """
    # Query the page once; the previous version re-ran find_all for every
    # one of the 30 slots.
    listings = soup.find_all('a', {'data-test': 'usedListing'})

    def urlsppage(nth):
        """Return the href snippet of the nth listing, or 0 if unavailable."""
        try:
            return re.findall('href="/.+" style', str(listings[nth])[:280])[0]
        except IndexError:
            # Either the page has fewer than nth + 1 listings or the anchor
            # carried no matching href; 0 is the placeholder callers filter on.
            return 0

    return [urlsppage(nth) for nth in range(30)]

# One list of 30 entries per page; an entry of 0 marks a listing slot for
# which no href could be extracted.
urls_list = list(map(urls_scraper, soups))

# Flatten into a 1-D object array. dtype=object keeps the 0 placeholders as
# integers: without it NumPy coerces a mixed str/int array to a string dtype,
# turning 0 into '0', so the "!= 0" test below would never drop anything.
pages_urls = np.array(list(itertools.chain.from_iterable(urls_list)), dtype=object)

# Boolean mask with exactly one flag per entry (named url_mask so the builtin
# ``filter`` is no longer shadowed).
url_mask = np.array([url != 0 for url in pages_urls], dtype=bool)
fpages_urls = pages_urls[url_mask]


def url_formatter(url):
    """Turn a raw 'href="/path" style' snippet into an absolute URL."""
    # Drop the leading 'href="' (6 chars) and the trailing '" style' (7 chars).
    return 'https://truecar.com' + url[6: -7]


# Built with a comprehension rather than np.vectorize, which raises on
# empty input.
urls = np.array([url_formatter(url) for url in fpages_urls], dtype=str)

In [None]:
def _extract_feature(page_html, feature):
    """Return the first ``<li>`` value under the ``feature`` heading, or NaN."""
    # The value is read from a capture group. The previous version parsed
    # str(match), i.e. the Match repr, which truncates the matched text to
    # 50 characters and therefore lost or mangled longer values.
    found = re.search(re.escape(feature) + r'</h4><ul><li>(.+?)</li', page_html)
    return found.group(1) if found else np.nan


def fscraper(url, feature):
    """Scrape one named feature from a vehicle page.

    Args:
        url: Address of the vehicle page to download.
        feature: Heading text to look for; it must be immediately followed
            by ``</h4><ul><li>`` in the serialised page.

    Returns:
        The text between that ``<li>`` and the next ``</li``, or ``np.nan``
        when the heading is not found.
    """
    nth_request = requests.get(url)
    nth_soup = BeautifulSoup(nth_request.content, 'lxml')
    return _extract_feature(str(nth_soup), feature)

## First vehicles features scraping

In [None]:
# One list per feature, aligned with ``urls``. Each comprehension calls
# fscraper once per vehicle URL for its feature heading.
drive_types = [fscraper(url, 'Drive Type') for url in urls]
fuel_types = [fscraper(url, 'Fuel Type') for url in urls]
mileages = [fscraper(url, 'Mileage') for url in urls]
transmissions = [fscraper(url, 'Transmission') for url in urls]
MPGs = [fscraper(url, 'MPG') for url in urls]
options_levels = [fscraper(url, 'Options Level') for url in urls]
bed_lengths = [fscraper(url, 'Bed Length') for url in urls]
engines = [fscraper(url, 'Engine') for url in urls]
exterior_colors = [fscraper(url, 'Exterior Color') for url in urls]
interior_colors = [fscraper(url, 'Interior Color') for url in urls]
styles = [fscraper(url, 'Style') for url in urls]

## Vehicles Years, Makes and Models scraping

In [None]:
def ymm_scraper(url, feat_name):
    """Scrape the year, make or model from a vehicle page heading.

    The heading ``div`` is serialised and the first ``>...</`` span in it is
    split on whitespace. The first word is taken as the year, the last word
    as the model, and everything in between as the make.

    Args:
        url: Address of the vehicle page to download.
        feat_name: ``'year'`` or ``'make'``; any other value selects the
            model.

    Returns:
        The requested part as a string, or ``np.nan`` when no heading text
        is found.
    """
    nth_request = requests.get(url)
    nth_soup = BeautifulSoup(nth_request.content, 'lxml')
    nth_finding = nth_soup.find_all('div', {'class': 'text-truncate heading-3 margin-right-2 margin-right-sm-3'})
    try:
        # For illustration: a heading reading "2018 Toyota Camry" would give
        # ['>2018', 'Toyota', 'Camry</'] here.
        words = re.findall('>.+</', str(nth_finding))[0].split()
    except IndexError:
        # No heading matched. np.nan is used because the np.NaN alias was
        # removed in NumPy 2.0.
        return np.nan

    if feat_name == 'year':
        return words[0][1:]  # strip the leading '>'
    if feat_name == 'make':
        return ' '.join(words[1: -1])
    return words[-1][: -2]  # strip the trailing '</'

In [None]:
# Year, make and model lists, aligned with ``urls``.
years = [ymm_scraper(url, 'year') for url in urls]
makes = [ymm_scraper(url, 'make') for url in urls]
models = [ymm_scraper(url, 'model') for url in urls]

## Vehicles Prices scraping

In [None]:
def prices_scraper(url):
    """Scrape the listed price from a vehicle page.

    Args:
        url: Address of the vehicle page to download.

    Returns:
        The first comma-grouped number found in the ``LabelBlock-text``
        elements, as a string with its commas kept (e.g. ``'12,345'``), or
        ``np.nan`` when there is none.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('div', {'data-qa': 'LabelBlock-text'})
    try:
        # Allow any number of comma-separated groups so a value such as
        # 1,234,567 is not cut down to 1,234.
        return re.findall(r'[0-9]+(?:,[0-9]+)+', str(nth_soup))[0]
    except IndexError:
        # No price-like number found. np.nan is used because the np.NaN
        # alias was removed in NumPy 2.0.
        return np.nan

In [None]:
# Price per vehicle, aligned with ``urls``.
prices = [prices_scraper(url) for url in urls]

## Vehicles Locations (Cities and States) scraping

In [None]:
def cities_scraper(url):
    """Scrape the city from the location header of a vehicle page.

    Args:
        url: Address of the vehicle page to download.

    Returns:
        The text between the first ``">`` and the last ``<!`` of the
        serialised location span, minus its final 12 characters, or
        ``np.nan`` when the span is missing or has no such match.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('span', {'data-qa': 'used-vdp-header-location'})
    try:
        # [2: -12] drops the leading '">' and 12 trailing characters.
        # NOTE(review): presumably those 12 are a '<!-- -->, <!' separator
        # between city and state - confirm against the live markup.
        return re.findall('">.+<!', str(nth_soup))[0][2: -12]
    except IndexError:
        # No location match. np.nan is used because the np.NaN alias was
        # removed in NumPy 2.0.
        return np.nan

In [None]:
def states_scraper(url):
    """Scrape the two-letter state code from the location header.

    Args:
        url: Address of the vehicle page to download.

    Returns:
        The first pair of consecutive capital letters in the serialised
        location span whose first letter is in ``A``-``W``, or ``np.nan``
        when there is none.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('span', {'data-qa': 'used-vdp-header-location'})
    try:
        return re.findall('[A-W][A-Z]', str(nth_soup))[0]
    except IndexError:
        # No state-like token found. np.nan is used because the np.NaN
        # alias was removed in NumPy 2.0.
        return np.nan

In [None]:
# City and state per vehicle, aligned with ``urls``.
cities = [cities_scraper(url) for url in urls]
states = [states_scraper(url) for url in urls]

## Vehicles Conditions scraping

In [None]:
def conditions_scraper(url):
    """Scrape the condition summary from a vehicle page.

    Only the first ``li`` element with class ``_h9wfdq`` is inspected.

    Args:
        url: Address of the vehicle page to download.

    Returns:
        A single digit (the one found between ``">`` and ``<!``) joined to
        the text that follows ``->`` up to ``</l``, or ``np.nan`` when the
        element or either part is missing.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('li', {'class': '_h9wfdq'})
    try:
        first_item = str(nth_soup[0])
        count = re.findall('">[0-9]<!', first_item)[0][2: -2]
        label = re.findall('->.+</l', first_item)[0][2: -3]
        return count + label
    except IndexError:
        # Element absent or a pattern did not match. np.nan is used because
        # the np.NaN alias was removed in NumPy 2.0.
        return np.nan

In [None]:
# Condition summary per vehicle, aligned with ``urls``.
conditions = [conditions_scraper(url) for url in urls]

# Dataset

In [None]:
# Column name -> scraped values. Insertion order fixes the column order of
# the resulting table.
features = {
    'Make': makes,
    'Model': models,
    'Year': years,
    'Mileage': mileages,
    'Transmission': transmissions,
    'Engine': engines,
    'Exterior Color': exterior_colors,
    'Interior Color': interior_colors,
    'MPG': MPGs,
    'Fuel Type': fuel_types,
    'Drive Type': drive_types,
    'Location (City)': cities,
    'Location (State)': states,
    'Style': styles,
    'Condition (Accidents)': conditions,
    'Options Level': options_levels,
    'Bed Length': bed_lengths,
    'Price': prices,
}

In [None]:
# Assemble the scraped columns into a table and write it out as CSV.
vehicles_data = pd.DataFrame(data=features)
vehicles_data.to_csv(path_or_buf='C:/.../.csv')